A cute little demo showing the simplest usage of minGPT. Configured to run fine on Macbook Air in like a minute.

In [1]:
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.utils import set_seed
import random
from mingpt.model import GPT
from mingpt.trainer import Trainer
import pickle


set_seed(3407)

In [2]:

class SortDataset(Dataset):
    """ 
    Dataset for the Sort problem. E.g. for problem length 6:
    Input: 0 0 2 1 0 1 -> Output: 0 0 0 1 1 2
    Which will feed into the transformer concatenated as:
    input:  0 0 2 1 0 1 0 0 0 1 1
    output: I I I I I 0 0 0 1 1 2
    where I is "ignore", as the transformer is reading the input sequence
    """

    def __init__(self, split, length=6, num_digits=3):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
        self.num_digits = num_digits
    
    def __len__(self):
        return 10000 # ...
    
    def get_vocab_size(self):
        return self.num_digits
    
    def get_block_size(self):
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return self.length * 2 - 1

    def __getitem__(self, idx):
        
        # use rejection sampling to generate an input example from the desired split
        while True:
            # generate some random integers
            inp = torch.randint(self.num_digits, size=(self.length,), dtype=torch.long)
            # half of the time let's try to boost the number of examples that 
            # have a large number of repeats, as this is what the model seems to struggle
            # with later in training, and they are kind of rate
            if torch.rand(1).item() < 0.5:
                if inp.unique().nelement() > self.length // 2:
                    # too many unqiue digits, re-sample
                    continue
            # figure out if this generated example is train or test based on its hash
            h = hash(pickle.dumps(inp.tolist()))
            inp_split = 'test' if h % 4 == 0 else 'train' # designate 25% of examples as test
            if inp_split == self.split:
                break # ok
        
        # solve the task: i.e. sort
        sol = torch.sort(inp)[0]

        # concatenate the problem specification and the solution
        cat = torch.cat((inp, sol), dim=0)

        # the inputs to the transformer will be the offset sequence
        x = cat[:-1].clone()
        y = cat[1:].clone()
        # we only want to predict at output locations, mask out the loss at the input locations
        y[:self.length-1] = -1
        return x, y


In [16]:

def random_mul_instance(length):
    a = [random.randint(0,9) for i in range(length)]
    b = [random.randint(0,9) for i in range(length)]
    val_a = int(''.join(str(d) for d in a))
    val_b = int(''.join(str(d) for d in b))
    val_c = val_a * val_b
    str_c = str(val_c)
    str_c = (length * 2 - len(str_c)) * '0' + str_c
    return a + b + [int(d) for d in str_c]

for i in range(10):
    print (random_mul_instance(3))

[6, 9, 2, 3, 2, 3, 2, 2, 3, 5, 1, 6]
[8, 9, 7, 3, 6, 8, 3, 3, 0, 0, 9, 6]
[8, 7, 0, 1, 7, 6, 1, 5, 3, 1, 2, 0]
[3, 2, 0, 5, 3, 9, 1, 7, 2, 4, 8, 0]
[5, 6, 5, 9, 8, 0, 5, 5, 3, 7, 0, 0]
[8, 7, 7, 6, 3, 5, 5, 5, 6, 8, 9, 5]
[9, 9, 3, 8, 1, 5, 8, 0, 9, 2, 9, 5]
[3, 2, 1, 4, 9, 5, 1, 5, 8, 8, 9, 5]
[9, 7, 5, 2, 9, 3, 2, 8, 5, 6, 7, 5]
[7, 2, 6, 7, 9, 4, 5, 7, 6, 4, 4, 4]


In [49]:
class MultDataset(Dataset):
    """ 
    Dataset for the Mult problem. E.g. for problem length 3:
    12 * 333 = 3996
    Input: 0 1 2 3 3 3 -> Output: 0 0 3 9 9 6
    Which will feed into the transformer concatenated as:
    input:  0 1 2 3 3 3 0 3 4
    output: I I I 0 0 3 9 9 6
    where I is "ignore", as the transformer is reading the input sequence
    """

    def __init__(self, split, length=3):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
    
    def __len__(self):
        return 30000 # ...
    
    def get_vocab_size(self):
        return 12
    
    def get_block_size(self):
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return 2 * self.length + 2 * self.length - 1

    def __getitem__(self, idx):
        while True:
            rai = random_mul_instance(self.length)
            h = hash(str(rai[:2*self.length]))
            
            inp_split = 'test' if h % 4 == 0 else 'train' # designate 25% of examples as test
            if inp_split == self.split:
                break # ok
        
        x = torch.tensor(rai[:-1], dtype=torch.long)
        y = torch.tensor(rai[1:], dtype=torch.long)
        
        # we only want to predict at output locations, mask out the loss at the input locations
        y[:2*self.length - 1] = -1
        return x, y

In [50]:
# print an example instance of the dataset
train_dataset = MultDataset('train')
test_dataset = MultDataset('test')
x, y = train_dataset[0]

print(len(train_dataset))

print (x)
for a, b in zip(x,y):
    print(int(a),int(b))

30000
tensor([7, 0, 7, 5, 3, 2, 3, 7, 6, 1, 2])
7 -1
0 -1
7 -1
5 -1
3 -1
2 3
3 7
7 6
6 1
1 2
2 4


In [51]:
# create a GPT instance

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-micro'
#model_config.model_type = 'gpt-nano'

model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model = GPT(model_config)

number of parameters: 0.80M


In [52]:
print (model_config.n_head, model_config.n_layer, model_config.n_embd)

4 4 128


In [53]:
# create a Trainer object

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 10000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cuda


In [54]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

iter_dt 0.00ms; iter 0: train loss 2.50639
iter_dt 13.18ms; iter 100: train loss 1.87751
iter_dt 11.01ms; iter 200: train loss 1.77292
iter_dt 11.07ms; iter 300: train loss 1.75499
iter_dt 11.15ms; iter 400: train loss 1.71079
iter_dt 12.94ms; iter 500: train loss 1.62218
iter_dt 10.89ms; iter 600: train loss 1.58352
iter_dt 10.98ms; iter 700: train loss 1.58070
iter_dt 11.10ms; iter 800: train loss 1.58736
iter_dt 11.98ms; iter 900: train loss 1.58493
iter_dt 21.06ms; iter 1000: train loss 1.51586
iter_dt 20.27ms; iter 1100: train loss 1.53417
iter_dt 13.52ms; iter 1200: train loss 1.48563
iter_dt 13.48ms; iter 1300: train loss 1.53177
iter_dt 17.66ms; iter 1400: train loss 1.53807
iter_dt 16.80ms; iter 1500: train loss 1.50627
iter_dt 13.69ms; iter 1600: train loss 1.56247
iter_dt 16.38ms; iter 1700: train loss 1.49763
iter_dt 12.06ms; iter 1800: train loss 1.49411
iter_dt 14.20ms; iter 1900: train loss 1.45839
iter_dt 12.42ms; iter 2000: train loss 1.51186
iter_dt 13.39ms; iter 2100

In [55]:
# now let's perform some evaluation
model.eval()
None

In [56]:
def eval_add_split(trainer, split, max_batches):
    dataset = {'train':train_dataset, 'test':test_dataset}[split]
    n = train_dataset.length # naugy direct access shrug
    results = []
    mistakes_printed_already = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    #loader = DataLoader(dataset, batch_size=1, num_workers=0, drop_last=False)
    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)

        inp = x[:, :2*n]
        sol = y[:, -2*n:]
        
        cat = model.generate(inp, 2*n, do_sample=False) # using greedy argmax, not sampling
        sol_candidate = cat[:, -2*n:]         
        correct = (sol == sol_candidate).all(1).cpu() 
        for i in range(x.size(0)):
            results.append(int(correct[i]))
    
    rt = torch.tensor(results, dtype=torch.float)
    print("%s final score: %d/%d = %.2f%% correct" % (split, rt.sum(), len(results), 100*rt.mean()))
    return rt.sum()

# run a lot of examples from both train and test through the model and verify the output correctness
with torch.no_grad():
    train_score = eval_add_split(trainer, 'train', max_batches=50)
    test_score  = eval_add_split(trainer, 'test',  max_batches=50)

train final score: 358/30000 = 1.19% correct
test final score: 370/30000 = 1.23% correct


In [11]:
def random_long_mul_instance(length):
    a = [random.randint(0,9) for i in range(length)]
    b = [random.randint(0,9) for i in range(length)]

    def str_c(t):
        stage, max_c = t
        s_c = str(stage)
        return (max_c - len(s_c)) * '0' + s_c

    val_a = int(''.join(str(d) for d in a))
    val_b = int(''.join(str(d) for d in b))
    val_c = val_a * val_b
    
    c_stage_1 = val_a * (val_b % 10)
    c_stage_2 = val_a * ((val_b // 10) % 10)
    c_stage_3 = val_a * ((val_b // 100) % 10)

    res = ''.join(map(str_c, [(c_stage_1, 4), (c_stage_2*10, 5), (c_stage_3*100, 6), (val_c, 6)]))

    return a + b + [int(d) for d in res]

for i in range(10):
    print (random_long_mul_instance(3))

[8, 1, 4, 9, 4, 0, 0, 0, 0, 0, 3, 2, 5, 6, 0, 7, 3, 2, 6, 0, 0, 7, 6, 5, 1, 6, 0]
[2, 1, 5, 0, 7, 1, 0, 2, 1, 5, 1, 5, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 1, 5, 2, 6, 5]
[9, 2, 2, 2, 0, 1, 0, 9, 2, 2, 0, 0, 0, 0, 0, 1, 8, 4, 4, 0, 0, 1, 8, 5, 3, 2, 2]
[9, 7, 8, 4, 8, 7, 6, 8, 4, 6, 7, 8, 2, 4, 0, 3, 9, 1, 2, 0, 0, 4, 7, 6, 2, 8, 6]
[2, 4, 8, 2, 1, 3, 0, 7, 4, 4, 0, 2, 4, 8, 0, 0, 4, 9, 6, 0, 0, 0, 5, 2, 8, 2, 4]
[6, 0, 2, 2, 7, 8, 4, 8, 1, 6, 4, 2, 1, 4, 0, 1, 2, 0, 4, 0, 0, 1, 6, 7, 3, 5, 6]
[9, 7, 8, 4, 7, 7, 6, 8, 4, 6, 6, 8, 4, 6, 0, 3, 9, 1, 2, 0, 0, 4, 6, 6, 5, 0, 6]
[4, 5, 3, 1, 8, 2, 0, 9, 0, 6, 3, 6, 2, 4, 0, 0, 4, 5, 3, 0, 0, 0, 8, 2, 4, 4, 6]
[3, 0, 4, 1, 3, 1, 0, 3, 0, 4, 0, 9, 1, 2, 0, 0, 3, 0, 4, 0, 0, 0, 3, 9, 8, 2, 4]
[1, 5, 2, 7, 5, 3, 0, 4, 5, 6, 0, 7, 6, 0, 0, 1, 0, 6, 4, 0, 0, 1, 1, 4, 4, 5, 6]


In [20]:
class MultLongDataset(Dataset):
    """ 
    Dataset for the Mult problem. E.g. for problem length 3:
    12 * 333 = 3996
    Input: 0 1 2 3 3 3 -> Output: 0 0 3 9 9 6
    Which will feed into the transformer concatenated as:
    input:  0 1 2 3 3 3 0 3 4
    output: I I I 0 0 3 9 9 6
    where I is "ignore", as the transformer is reading the input sequence
    """

    def __init__(self, split, length=3):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
    
    def __len__(self):
        return 20 # ...
    
    def get_vocab_size(self):
        return 10
    
    def get_block_size(self):
        # the length of the sequence that will feed into transformer, 
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return 2 * self.length + (((self.length + 1 + 2*self.length) * self.length) // 2) + 2 * self.length

    def __getitem__(self, idx):
        while True:
            rai = random_long_mul_instance(self.length)
            h = hash(str(rai[:2*self.length]))
            
            inp_split = 'test' if h % 4 == 0 else 'train' # designate 25% of examples as test
            if inp_split == self.split:
                break # ok
        
        x = torch.tensor(rai[:-1], dtype=torch.long)
        y = torch.tensor(rai[1:], dtype=torch.long)
        
        # we only want to predict at output locations, mask out the loss at the input locations
        y[:2*self.length - 1] = -1
        return x, y

In [1]:
# print an example instance of the dataset
train_dataset = MultLongDataset('train')
test_dataset = MultLongDataset('test')

for i in train_dataset:
    print (i)
x, y = train_dataset[0]

print(len(train_dataset))

print (x)
for a, b in zip(x,y):
    print(int(a),int(b))

NameError: name 'MultLongDataset' is not defined

In [22]:
# create a GPT instance

model_config = GPT.get_default_config()
model_config.model_type = 'gpt-micro'
#model_config.model_type = 'gpt-nano'

model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model = GPT(model_config)

number of parameters: 0.80M


In [23]:
print (model_config.n_head, model_config.n_layer, model_config.n_embd)

4 4 128


In [24]:
# create a Trainer object

train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 10000
train_config.num_workers = 0
trainer = Trainer(train_config, model, train_dataset)

running on device cuda


In [25]:
def batch_end_callback(trainer):
    if trainer.iter_num % 100 == 0:
        print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
trainer.set_callback('on_batch_end', batch_end_callback)

trainer.run()

iter_dt 0.00ms; iter 0: train loss 2.36801
iter_dt 16.80ms; iter 100: train loss 1.42925
iter_dt 15.49ms; iter 200: train loss 1.21153
iter_dt 14.82ms; iter 300: train loss 0.95736
iter_dt 16.02ms; iter 400: train loss 0.84157
iter_dt 22.30ms; iter 500: train loss 0.68790
iter_dt 15.19ms; iter 600: train loss 0.66285
iter_dt 19.38ms; iter 700: train loss 0.53967
iter_dt 16.11ms; iter 800: train loss 0.46876
iter_dt 17.43ms; iter 900: train loss 0.41489
iter_dt 17.40ms; iter 1000: train loss 0.38700
iter_dt 25.45ms; iter 1100: train loss 0.33108
iter_dt 15.80ms; iter 1200: train loss 0.31181
iter_dt 17.68ms; iter 1300: train loss 0.26817
iter_dt 16.17ms; iter 1400: train loss 0.25537
iter_dt 22.25ms; iter 1500: train loss 0.24916
iter_dt 18.00ms; iter 1600: train loss 0.22479
iter_dt 16.07ms; iter 1700: train loss 0.19845
iter_dt 15.32ms; iter 1800: train loss 0.21958
iter_dt 20.53ms; iter 1900: train loss 0.19916
iter_dt 16.77ms; iter 2000: train loss 0.17082
iter_dt 16.92ms; iter 2100

In [26]:
# now let's perform some evaluation
model.eval()
None

In [27]:
def eval_long_mult_split(trainer, split, max_batches):
    dataset = {'train':train_dataset, 'test':test_dataset}[split]
    n = train_dataset.length # naugy direct access shrug
    results = []
    mistakes_printed_already = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    #loader = DataLoader(dataset, batch_size=1, num_workers=0, drop_last=False)
    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)

        inp = x[:, :2*n]
        sol = y[:, -2*n:]
        
        cat = model.generate(inp, (2*n + (((3*n + 1) * n) // 2)), do_sample=False) # using greedy argmax, not sampling
        sol_candidate = cat[:, -2*n:]         
        correct = (sol == sol_candidate).all(1).cpu() 
        for i in range(x.size(0)):
            results.append(int(correct[i]))
    
    rt = torch.tensor(results, dtype=torch.float)
    print("%s final score: %d/%d = %.2f%% correct" % (split, rt.sum(), len(results), 100*rt.mean()))
    return rt.sum()

# run a lot of examples from both train and test through the model and verify the output correctness
with torch.no_grad():
    train_score = eval_long_mult_split(trainer, 'train', max_batches=50)
    test_score  = eval_long_mult_split(trainer, 'test',  max_batches=50)

train final score: 18/20 = 90.00% correct
test final score: 19/20 = 95.00% correct
