A cute little demo showing the simplest usage of minGPT. Configured to run fine on a MacBook Air in about a minute.

In [6]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from gqe.mingpt.utils import set_seed

# Seed before any random draw so the random target tensor created later is reproducible.
# NOTE(review): set_seed presumably seeds the Python, NumPy and torch RNGs -- confirm in gqe.mingpt.utils.
set_seed(3407)

In [7]:
# Create a GPT instance together with the cost object it is constructed with.
from gqe.mingpt.model import GPT
from gqe.mingpt.cost import IndicesCost

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-mini'  # preset name; the construction output below reports 2.67M parameters
model_config.vocab_size = 2  # two-token alphabet: every entry of `answers` is 0 or 1
model_config.block_size = 20  # presumably the maximum context length, as in upstream minGPT -- TODO confirm
model_config.n_gates = 20  # length of the target sequence drawn below; also the token count requested at evaluation
model_config.n_samples = 5  # NOTE(review): presumably the number of sequences sampled per step -- confirm in GPT
model_config.temperature = 2  # NOTE(review): presumably the initial sampling temperature -- confirm in GPT
# Random target of n_gates tokens in [0, vocab_size); relies on the seed set in the first cell for reproducibility.
answers = torch.randint(0, model_config.vocab_size, (model_config.n_gates,))
# NOTE(review): IndicesCost presumably scores generated token indices against `answers` -- confirm in gqe.mingpt.cost.
cost = IndicesCost(answers)
model = GPT(model_config, cost)

number of parameters: 2.67M


In [8]:
# Display the randomly drawn target sequence so it can be compared with the model's output later.
answers

tensor([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1])

In [9]:
# Create a Trainer object for `model`.
from gqe.mingpt.trainer import Trainer

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4  # the model we're using is so small that we can go a bit faster
train_config.max_iters = 400  # presumably the iteration count after which run() stops -- TODO confirm
train_config.num_workers = 0  # presumably the DataLoader worker count (0 = main process) -- TODO confirm
# Only the config and the model are passed; no dataset is supplied here.
trainer = Trainer(train_config, model)

running on device cpu


In [10]:
def batch_end_callback(trainer):
    """Report training progress and bump the sampling temperature every 10th iteration.

    Intended for use as the trainer's 'on_batch_end' callback. When
    ``trainer.iter_num`` is not a multiple of 10 the call is a no-op.
    Otherwise it prints ``trainer.iter_dt`` scaled by 1000 (labelled ms),
    the iteration number and the current training loss, and then adds 0.1
    to the ``temperature`` attribute of the notebook-level ``model``.
    """
    # Guard clause: stay silent on every iteration that is not a multiple of 10.
    if trainer.iter_num % 10 != 0:
        return

    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
    # Mutates the global `model` created in an earlier cell (not an attribute of `trainer`).
    model.temperature += 0.1


# Register the logging callback for the trainer's 'on_batch_end' event.
trainer.set_callback('on_batch_end', batch_end_callback)

# Start training. NOTE(review): the bare `tensor(...)` lines in the recorded output are
# not printed by any code visible in this notebook -- presumably a leftover debug print
# inside the library; confirm and remove it there.
trainer.run()

tensor(0.4900)
iter_dt 0.00ms; iter 0: train loss 0.31469
tensor(0.6000)
tensor(0.6000)
tensor(0.6000)
tensor(0.6000)
tensor(0.5700)
tensor(0.5900)
tensor(0.6000)
tensor(0.5500)
tensor(0.5900)
tensor(0.5800)
iter_dt 1579.76ms; iter 10: train loss 0.08830
tensor(0.5700)
tensor(0.5800)
tensor(0.5000)
tensor(0.5400)
tensor(0.4100)
tensor(0.4000)
tensor(0.3900)
tensor(0.3900)
tensor(0.4800)
tensor(0.4800)
iter_dt 1598.42ms; iter 20: train loss 0.00410
tensor(0.5100)
tensor(0.4800)
tensor(0.4300)
tensor(0.5300)
tensor(0.5300)
tensor(0.5300)
tensor(0.5800)
tensor(0.5400)
tensor(0.5200)
tensor(0.4500)
iter_dt 1561.71ms; iter 30: train loss 0.01863
tensor(0.4600)
tensor(0.5500)
tensor(0.5000)
tensor(0.4600)
tensor(0.4900)
tensor(0.4300)
tensor(0.4300)
tensor(0.4800)
tensor(0.4000)
tensor(0.4900)
iter_dt 1561.77ms; iter 40: train loss 0.00471
tensor(0.4500)
tensor(0.4900)
tensor(0.4900)
tensor(0.4700)
tensor(0.4900)
tensor(0.4300)
tensor(0.4500)
tensor(0.4900)
tensor(0.4500)
tensor(0.4600)
iter

In [11]:
# Evaluation: compare the target with a sequence generated by the trained model.
# NOTE(review): only the first target token is printed here, while the line below prints
# a full generated sequence; compare against the complete `answers` tensor shown earlier.
print(answers[0])
# Ask the model for n_gates tokens starting from the single-token prompt [[0]].
# NOTE(review): generate() presumably returns a tuple whose first item is a (batch, seq)
# token tensor, so [0][0] picks the first sequence and [1:] drops the prompt token --
# confirm against GPT.generate.
print(model.generate(torch.tensor([[0]]), model_config.n_gates)[0][0][1:])

tensor(1)
tensor([1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1])
