Secuencias ordenadas con diferentes tipos de modelos.

In [9]:
import zlib

import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import DataLoader
from mingpt.utils import set_seed
# fix the random seed up front so repeated runs draw the same random numbers
# (see mingpt.utils.set_seed for exactly which generators it seeds)
set_seed(3407)

In [10]:
import pickle

class SortDataset(Dataset):
    """
    Dataset for the Sort problem. E.g. for problem length 6:
    Input: 0 0 2 1 0 1 -> Output: 0 0 0 1 1 2
    Which will feed into the transformer concatenated as:
    input:  0 0 2 1 0 1 0 0 0 1 1
    output: I I I I I 0 0 0 1 1 2
    where I is "ignore", as the transformer is reading the input sequence

    Examples are sampled on the fly in __getitem__; the index is not used to
    pick them. Whether a sequence belongs to 'train' or 'test' is a fixed
    function of its digits (a CRC32 checksum), so the split is the same in
    every process and on every run.

    Args:
        split: 'train' or 'test'.
        length: number of digits in the sequence to sort.
        num_digits: digits are drawn from range(num_digits).
    """

    def __init__(self, split, length=6, num_digits=3):
        assert split in {'train', 'test'}
        self.split = split
        self.length = length
        self.num_digits = num_digits

    def __len__(self):
        return 10000 # nominal size only; every item is freshly sampled

    def get_vocab_size(self):
        return self.num_digits

    def get_block_size(self):
        # the length of the sequence that will feed into transformer,
        # containing concatenated input and the output, but -1 because
        # the transformer starts making predictions at the last input element
        return self.length * 2 - 1

    @staticmethod
    def _split_of(digits):
        """Return 'test' for roughly 25% of digit lists and 'train' otherwise.

        The built-in hash() of bytes is salted per interpreter process, so it
        cannot be used here: the split would change between runs and between
        worker processes. CRC32 of a canonical text encoding is stable.
        """
        key = ','.join(str(d) for d in digits).encode('ascii')
        return 'test' if zlib.crc32(key) % 4 == 0 else 'train'

    def __getitem__(self, idx):
        """Sample one (x, y) pair from this dataset's split; idx is ignored.

        Returns:
            x: LongTensor of length 2*length - 1, the input followed by all but
               the last element of the sorted solution.
            y: LongTensor of the same length holding the next-token targets,
               with the first length - 1 positions set to -1 (ignored in the loss).
        """
        # use rejection sampling to generate an input example from the desired split
        while True:
            # generate some random integers
            inp = torch.randint(self.num_digits, size=(self.length,), dtype=torch.long)
            # half of the time let's try to boost the number of examples that
            # have a large number of repeats, as this is what the model seems to struggle
            # with later in training, and they are kind of rare
            if torch.rand(1).item() < 0.5:
                if inp.unique().nelement() > self.length // 2:
                    # too many unique digits, re-sample
                    continue
            # keep the example only if it belongs to the split we were asked for
            if self._split_of(inp.tolist()) == self.split:
                break # ok

        # solve the task: i.e. sort
        sol = torch.sort(inp)[0]

        # concatenate the problem specification and the solution
        cat = torch.cat((inp, sol), dim=0)

        # the inputs to the transformer will be the offset sequence
        x = cat[:-1].clone()
        y = cat[1:].clone()
        # we only want to predict at output locations, mask out the loss at the input locations
        y[:self.length-1] = -1
        return x, y

In [11]:
# print an example instance of the dataset
train_dataset = SortDataset('train')
test_dataset = SortDataset('test')
# x is the model input, y the per-position target; -1 in y marks positions
# that are masked out of the loss (the model is still reading the input there)
x, y = train_dataset[0]
for a, b in zip(x,y):
    print(int(a),int(b)) # one "input target" pair per line

1 -1
0 -1
1 -1
0 -1
0 -1
0 0
0 0
0 0
0 0
0 1
1 1


gpt-nano: n_layer=3, n_head=3, n_embd=48

In [12]:
# create a GPT instance
from transform.model_tce import GPT

# start from the library's default configuration and select the 'gpt-nano' model type
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-nano'
# vocabulary size and context length come from the dataset, so the model
# matches its digit range (num_digits) and sequence length (2 * length - 1)
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model_nano = GPT(model_config)

number of parameters: 0.09M


In [13]:
# create a Trainer object
from mingpt.trainer import Trainer

# start from the trainer's default configuration and override a few fields
train_config = Trainer.get_default_config()
train_config.learning_rate = 3e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000 # the training log below ends at iteration 1900
train_config.num_workers = 2 # presumably data-loading worker processes; confirm in mingpt.trainer
trainer_nano = Trainer(train_config, model_nano, train_dataset)

running on device cuda


In [14]:
def batch_end_callback(trainer):
    """Print step time (ms), iteration number and training loss every 100 iterations.

    Reads `iter_num`, `iter_dt` (seconds) and `loss` from the trainer that
    invokes the callback; prints nothing on other iterations.
    """
    if trainer.iter_num % 100 != 0:
        return # stay quiet between reporting iterations
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
# register the logger under the trainer's 'on_batch_end' event
trainer_nano.set_callback('on_batch_end', batch_end_callback)

# start training; the log printed below comes from the callback registered above
trainer_nano.run()

iter_dt 0.00ms; iter 0: train loss 1.10802
iter_dt 3.39ms; iter 100: train loss 0.38102
iter_dt 3.39ms; iter 200: train loss 0.15310
iter_dt 3.85ms; iter 300: train loss 0.06243
iter_dt 3.43ms; iter 400: train loss 0.03350
iter_dt 3.41ms; iter 500: train loss 0.02456
iter_dt 3.54ms; iter 600: train loss 0.03938
iter_dt 3.47ms; iter 700: train loss 0.01617
iter_dt 3.31ms; iter 800: train loss 0.05884
iter_dt 3.42ms; iter 900: train loss 0.02484
iter_dt 3.35ms; iter 1000: train loss 0.02841
iter_dt 3.27ms; iter 1100: train loss 0.05692
iter_dt 3.33ms; iter 1200: train loss 0.01039
iter_dt 3.23ms; iter 1300: train loss 0.05498
iter_dt 6.43ms; iter 1400: train loss 0.00485
iter_dt 3.34ms; iter 1500: train loss 0.02731
iter_dt 6.39ms; iter 1600: train loss 0.01011
iter_dt 6.07ms; iter 1700: train loss 0.00976
iter_dt 7.11ms; iter 1800: train loss 0.01035
iter_dt 3.36ms; iter 1900: train loss 0.03309


In [15]:
# now let's perform some evaluation
# switch the model to evaluation mode before scoring; the trailing ';' keeps
# the notebook from echoing the call's return value
model_nano.eval();

In [16]:
def eval_split(trainer, split, max_batches):
    """
    Score the trainer's model on one dataset split by exact-match sorting accuracy.

    The unsorted prefix of every example is fed to the model, which greedily
    generates the remaining tokens. An example counts as correct only if every
    generated token equals the ground-truth sorted sequence.

    Args:
        trainer: object exposing `.model` (with a `generate` method) and
            `.device`. The model is taken from the trainer, so the function
            always scores the model that belongs to the trainer passed in.
        split: 'train' or 'test'; selects the module-level `train_dataset` or
            `test_dataset`.
        max_batches: stop after this many batches of 100 examples, or None to
            go through the whole dataset.

    Returns:
        0-dim float tensor holding the number of correctly sorted examples.
    """
    dataset = {'train':train_dataset, 'test':test_dataset}[split]
    n = dataset.length # number of digits to sort, read from the split being scored
    model = trainer.model
    max_mistakes_shown = 3 # only print a few mistakes to get a sense
    results = []
    mistakes_printed_already = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)
        # isolate the input pattern alone
        inp = x[:, :n]
        # the last n targets are the sorted solution
        sol = y[:, -n:]
        # let the model fill in the rest of the sequence
        cat = model.generate(inp, n, do_sample=False) # using greedy argmax, not sampling
        sol_candidate = cat[:, n:] # isolate the filled in sequence
        # compare the predicted sequence to the true sequence, one flag per example
        correct = (sol == sol_candidate).all(1).cpu()
        for i in range(x.size(0)):
            results.append(int(correct[i]))
            if not correct[i] and mistakes_printed_already < max_mistakes_shown:
                mistakes_printed_already += 1
                print("GPT claims that %s sorted is %s but gt is %s" % (inp[i].tolist(), sol_candidate[i].tolist(), sol[i].tolist()))
        if max_batches is not None and b+1 >= max_batches:
            break
    rt = torch.tensor(results, dtype=torch.float)
    print("%s final score: %d/%d = %.2f%% correct" % (split, rt.sum(), len(results), 100*rt.mean()))
    return rt.sum()

# run a lot of examples from both train and test through the model and verify the output correctness
# (at most 50 batches of 100 examples per split; gradient tracking is switched off because nothing is trained here)
with torch.no_grad():
    train_score = eval_split(trainer_nano, 'train', max_batches=50)
    test_score  = eval_split(trainer_nano, 'test',  max_batches=50)

train final score: 5000/5000 = 100.00% correct
test final score: 5000/5000 = 100.00% correct


gpt-micro: n_layer=4, n_head=4, n_embd=128

In [17]:
# create a GPT instance
from transform.model_tce import GPT

# start from the library's default configuration and select the 'gpt-micro' model type
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-micro'
# vocabulary size and context length come from the dataset, so the model
# matches its digit range (num_digits) and sequence length (2 * length - 1)
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model_micro = GPT(model_config)

number of parameters: 0.80M


In [18]:
# create a Trainer object
from mingpt.trainer import Trainer

# start from the trainer's default configuration and override a few fields
train_config = Trainer.get_default_config()
train_config.learning_rate = 3e-4 # the model we're using is so small that we can go a bit faster
train_config.max_iters = 2000 # the training log below ends at iteration 1900
train_config.num_workers = 2 # presumably data-loading worker processes; confirm in mingpt.trainer
trainer_micro = Trainer(train_config, model_micro, train_dataset)

running on device cuda


In [19]:
def batch_end_callback(trainer):
    """Print step time (ms), iteration number and training loss every 100 iterations.

    Reads `iter_num`, `iter_dt` (seconds) and `loss` from the trainer that
    invokes the callback; prints nothing on other iterations.
    """
    if trainer.iter_num % 100 != 0:
        return # stay quiet between reporting iterations
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
# register the logger under the trainer's 'on_batch_end' event
trainer_micro.set_callback('on_batch_end', batch_end_callback)

# start training; the log printed below comes from the callback registered above
trainer_micro.run()

iter_dt 0.00ms; iter 0: train loss 1.14815
iter_dt 4.14ms; iter 100: train loss 0.54070
iter_dt 4.20ms; iter 200: train loss 0.31869
iter_dt 4.38ms; iter 300: train loss 0.10073
iter_dt 4.13ms; iter 400: train loss 0.07262
iter_dt 4.16ms; iter 500: train loss 0.06950
iter_dt 4.13ms; iter 600: train loss 0.02049
iter_dt 4.06ms; iter 700: train loss 0.08846
iter_dt 8.12ms; iter 800: train loss 0.02523
iter_dt 4.17ms; iter 900: train loss 0.05560
iter_dt 4.34ms; iter 1000: train loss 0.02421
iter_dt 4.25ms; iter 1100: train loss 0.00775
iter_dt 4.26ms; iter 1200: train loss 0.03190
iter_dt 4.17ms; iter 1300: train loss 0.03272
iter_dt 4.18ms; iter 1400: train loss 0.00704
iter_dt 4.09ms; iter 1500: train loss 0.01057
iter_dt 4.03ms; iter 1600: train loss 0.01725
iter_dt 4.05ms; iter 1700: train loss 0.00758
iter_dt 4.32ms; iter 1800: train loss 0.02172
iter_dt 4.33ms; iter 1900: train loss 0.00960


In [20]:
# now let's perform some evaluation
# switch the model to evaluation mode before scoring; the trailing ';' keeps
# the notebook from echoing the call's return value
model_micro.eval();

In [21]:
def eval_split(trainer, split, max_batches):
    """
    Score the trainer's model on one dataset split by exact-match sorting accuracy.

    The unsorted prefix of every example is fed to the model, which greedily
    generates the remaining tokens. An example counts as correct only if every
    generated token equals the ground-truth sorted sequence.

    Args:
        trainer: object exposing `.model` (with a `generate` method) and
            `.device`. The model is taken from the trainer, so the function
            always scores the model that belongs to the trainer passed in.
        split: 'train' or 'test'; selects the module-level `train_dataset` or
            `test_dataset`.
        max_batches: stop after this many batches of 100 examples, or None to
            go through the whole dataset.

    Returns:
        0-dim float tensor holding the number of correctly sorted examples.
    """
    dataset = {'train':train_dataset, 'test':test_dataset}[split]
    n = dataset.length # number of digits to sort, read from the split being scored
    model = trainer.model
    max_mistakes_shown = 3 # only print a few mistakes to get a sense
    results = []
    mistakes_printed_already = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)
        # isolate the input pattern alone
        inp = x[:, :n]
        # the last n targets are the sorted solution
        sol = y[:, -n:]
        # let the model fill in the rest of the sequence
        cat = model.generate(inp, n, do_sample=False) # using greedy argmax, not sampling
        sol_candidate = cat[:, n:] # isolate the filled in sequence
        # compare the predicted sequence to the true sequence, one flag per example
        correct = (sol == sol_candidate).all(1).cpu()
        for i in range(x.size(0)):
            results.append(int(correct[i]))
            if not correct[i] and mistakes_printed_already < max_mistakes_shown:
                mistakes_printed_already += 1
                print("GPT claims that %s sorted is %s but gt is %s" % (inp[i].tolist(), sol_candidate[i].tolist(), sol[i].tolist()))
        if max_batches is not None and b+1 >= max_batches:
            break
    rt = torch.tensor(results, dtype=torch.float)
    print("%s final score: %d/%d = %.2f%% correct" % (split, rt.sum(), len(results), 100*rt.mean()))
    return rt.sum()

# run a lot of examples from both train and test through the model and verify the output correctness
# (at most 50 batches of 100 examples per split; gradient tracking is switched off because nothing is trained here)
with torch.no_grad():
    train_score = eval_split(trainer_micro, 'train', max_batches=50)
    test_score  = eval_split(trainer_micro, 'test',  max_batches=50)

train final score: 5000/5000 = 100.00% correct
test final score: 5000/5000 = 100.00% correct


gpt-mini: n_layer=6, n_head=6, n_embd=192

In [22]:
# create a GPT instance
from transform.model_tce import GPT

# start from the library's default configuration and select the 'gpt-mini' model type
model_config = GPT.get_default_config()
model_config.model_type = 'gpt-mini'
# vocabulary size and context length come from the dataset, so the model
# matches its digit range (num_digits) and sequence length (2 * length - 1)
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model_mini = GPT(model_config)

number of parameters: 2.67M


In [23]:
# create a Trainer object
from mingpt.trainer import Trainer

# start from the trainer's default configuration and override a few fields
train_config = Trainer.get_default_config()
train_config.learning_rate = 3e-4 # same rate as used for the smaller models in this notebook
train_config.max_iters = 2000 # the training log below ends at iteration 1900
train_config.num_workers = 2 # presumably data-loading worker processes; confirm in mingpt.trainer
trainer_mini = Trainer(train_config, model_mini, train_dataset)

running on device cuda


In [24]:
def batch_end_callback(trainer):
    """Print step time (ms), iteration number and training loss every 100 iterations.

    Reads `iter_num`, `iter_dt` (seconds) and `loss` from the trainer that
    invokes the callback; prints nothing on other iterations.
    """
    if trainer.iter_num % 100 != 0:
        return # stay quiet between reporting iterations
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
# register the logger under the trainer's 'on_batch_end' event
trainer_mini.set_callback('on_batch_end', batch_end_callback)

# start training; the log printed below comes from the callback registered above
trainer_mini.run()

iter_dt 0.00ms; iter 0: train loss 1.10289
iter_dt 5.87ms; iter 100: train loss 1.09844
iter_dt 5.86ms; iter 200: train loss 1.04299
iter_dt 5.72ms; iter 300: train loss 0.57473
iter_dt 5.71ms; iter 400: train loss 0.37487
iter_dt 5.78ms; iter 500: train loss 0.17507
iter_dt 5.76ms; iter 600: train loss 0.11273
iter_dt 5.86ms; iter 700: train loss 0.11443
iter_dt 5.77ms; iter 800: train loss 0.05292
iter_dt 5.72ms; iter 900: train loss 0.07133
iter_dt 5.79ms; iter 1000: train loss 0.02398
iter_dt 5.75ms; iter 1100: train loss 0.04186
iter_dt 5.78ms; iter 1200: train loss 0.01225
iter_dt 5.78ms; iter 1300: train loss 0.00619
iter_dt 5.85ms; iter 1400: train loss 0.08079
iter_dt 5.79ms; iter 1500: train loss 0.01665
iter_dt 5.86ms; iter 1600: train loss 0.00657
iter_dt 5.89ms; iter 1700: train loss 0.05709
iter_dt 6.09ms; iter 1800: train loss 0.00692
iter_dt 5.77ms; iter 1900: train loss 0.00625


In [25]:
# now let's perform some evaluation
# switch the model to evaluation mode before scoring; the trailing ';' keeps
# the notebook from echoing the call's return value
model_mini.eval();

In [26]:
def eval_split(trainer, split, max_batches):
    """
    Score the trainer's model on one dataset split by exact-match sorting accuracy.

    The unsorted prefix of every example is fed to the model, which greedily
    generates the remaining tokens. An example counts as correct only if every
    generated token equals the ground-truth sorted sequence.

    Args:
        trainer: object exposing `.model` (with a `generate` method) and
            `.device`. The model is taken from the trainer, so the function
            always scores the model that belongs to the trainer passed in.
        split: 'train' or 'test'; selects the module-level `train_dataset` or
            `test_dataset`.
        max_batches: stop after this many batches of 100 examples, or None to
            go through the whole dataset.

    Returns:
        0-dim float tensor holding the number of correctly sorted examples.
    """
    dataset = {'train':train_dataset, 'test':test_dataset}[split]
    n = dataset.length # number of digits to sort, read from the split being scored
    model = trainer.model
    max_mistakes_shown = 3 # only print a few mistakes to get a sense
    results = []
    mistakes_printed_already = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)
        # isolate the input pattern alone
        inp = x[:, :n]
        # the last n targets are the sorted solution
        sol = y[:, -n:]
        # let the model fill in the rest of the sequence
        cat = model.generate(inp, n, do_sample=False) # using greedy argmax, not sampling
        sol_candidate = cat[:, n:] # isolate the filled in sequence
        # compare the predicted sequence to the true sequence, one flag per example
        correct = (sol == sol_candidate).all(1).cpu()
        for i in range(x.size(0)):
            results.append(int(correct[i]))
            if not correct[i] and mistakes_printed_already < max_mistakes_shown:
                mistakes_printed_already += 1
                print("GPT claims that %s sorted is %s but gt is %s" % (inp[i].tolist(), sol_candidate[i].tolist(), sol[i].tolist()))
        if max_batches is not None and b+1 >= max_batches:
            break
    rt = torch.tensor(results, dtype=torch.float)
    print("%s final score: %d/%d = %.2f%% correct" % (split, rt.sum(), len(results), 100*rt.mean()))
    return rt.sum()

# run a lot of examples from both train and test through the model and verify the output correctness
# (at most 50 batches of 100 examples per split; gradient tracking is switched off because nothing is trained here)
with torch.no_grad():
    train_score = eval_split(trainer_mini, 'train', max_batches=50)
    test_score  = eval_split(trainer_mini, 'test',  max_batches=50)

train final score: 5000/5000 = 100.00% correct
test final score: 5000/5000 = 100.00% correct


gopher-44m: n_layer=8, n_head=16, n_embd=512

In [45]:
# create a GPT instance
from transform.model_tce import GPT

# start from the library's default configuration and select the 'gopher-44m' model type
# NOTE(review): despite the name, the constructor reports 25.23M parameters for this vocabulary/block size
model_config = GPT.get_default_config()
model_config.model_type = 'gopher-44m'
# vocabulary size and context length come from the dataset, so the model
# matches its digit range (num_digits) and sequence length (2 * length - 1)
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model_gopher = GPT(model_config)

number of parameters: 25.23M


In [46]:
# create a Trainer object
from mingpt.trainer import Trainer

# start from the trainer's default configuration and override a few fields
train_config = Trainer.get_default_config()
train_config.learning_rate = 5e-4 # higher than the 3e-4 used for the smaller models; the logged loss stays near 1.10 for about 600 iterations before it starts to fall
train_config.max_iters = 2000 # the training log below ends at iteration 1900
train_config.num_workers = 2 # presumably data-loading worker processes; confirm in mingpt.trainer
trainer_gopher = Trainer(train_config, model_gopher, train_dataset)

running on device cuda


In [47]:
def batch_end_callback(trainer):
    """Print step time (ms), iteration number and training loss every 100 iterations.

    Reads `iter_num`, `iter_dt` (seconds) and `loss` from the trainer that
    invokes the callback; prints nothing on other iterations.
    """
    if trainer.iter_num % 100 != 0:
        return # stay quiet between reporting iterations
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
# register the logger under the trainer's 'on_batch_end' event
trainer_gopher.set_callback('on_batch_end', batch_end_callback)

# start training; the log printed below comes from the callback registered above
trainer_gopher.run()

iter_dt 0.00ms; iter 0: train loss 1.10300
iter_dt 10.14ms; iter 100: train loss 1.10951
iter_dt 10.06ms; iter 200: train loss 1.09702
iter_dt 10.06ms; iter 300: train loss 1.10143
iter_dt 10.18ms; iter 400: train loss 1.10241
iter_dt 10.15ms; iter 500: train loss 1.09867
iter_dt 10.15ms; iter 600: train loss 1.09124
iter_dt 10.03ms; iter 700: train loss 1.04532
iter_dt 10.13ms; iter 800: train loss 0.94652
iter_dt 10.17ms; iter 900: train loss 0.73639
iter_dt 10.00ms; iter 1000: train loss 0.57808
iter_dt 10.03ms; iter 1100: train loss 0.34231
iter_dt 10.14ms; iter 1200: train loss 0.20982
iter_dt 10.11ms; iter 1300: train loss 0.20980
iter_dt 10.28ms; iter 1400: train loss 0.12443
iter_dt 10.35ms; iter 1500: train loss 0.18815
iter_dt 10.15ms; iter 1600: train loss 0.13721
iter_dt 10.22ms; iter 1700: train loss 0.10104
iter_dt 10.22ms; iter 1800: train loss 0.18866
iter_dt 10.12ms; iter 1900: train loss 0.24501


In [48]:
# now let's perform some evaluation
# switch the model to evaluation mode before scoring; the trailing ';' keeps
# the notebook from echoing the call's return value
model_gopher.eval();

In [49]:
def eval_split(trainer, split, max_batches):
    """
    Score the trainer's model on one dataset split by exact-match sorting accuracy.

    The unsorted prefix of every example is fed to the model, which greedily
    generates the remaining tokens. An example counts as correct only if every
    generated token equals the ground-truth sorted sequence.

    Args:
        trainer: object exposing `.model` (with a `generate` method) and
            `.device`. The model is taken from the trainer, so the function
            always scores the model that belongs to the trainer passed in.
        split: 'train' or 'test'; selects the module-level `train_dataset` or
            `test_dataset`.
        max_batches: stop after this many batches of 100 examples, or None to
            go through the whole dataset.

    Returns:
        0-dim float tensor holding the number of correctly sorted examples.
    """
    dataset = {'train':train_dataset, 'test':test_dataset}[split]
    n = dataset.length # number of digits to sort, read from the split being scored
    model = trainer.model
    max_mistakes_shown = 3 # only print a few mistakes to get a sense
    results = []
    mistakes_printed_already = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)
        # isolate the input pattern alone
        inp = x[:, :n]
        # the last n targets are the sorted solution
        sol = y[:, -n:]
        # let the model fill in the rest of the sequence
        cat = model.generate(inp, n, do_sample=False) # using greedy argmax, not sampling
        sol_candidate = cat[:, n:] # isolate the filled in sequence
        # compare the predicted sequence to the true sequence, one flag per example
        correct = (sol == sol_candidate).all(1).cpu()
        for i in range(x.size(0)):
            results.append(int(correct[i]))
            if not correct[i] and mistakes_printed_already < max_mistakes_shown:
                mistakes_printed_already += 1
                print("GPT claims that %s sorted is %s but gt is %s" % (inp[i].tolist(), sol_candidate[i].tolist(), sol[i].tolist()))
        if max_batches is not None and b+1 >= max_batches:
            break
    rt = torch.tensor(results, dtype=torch.float)
    print("%s final score: %d/%d = %.2f%% correct" % (split, rt.sum(), len(results), 100*rt.mean()))
    return rt.sum()

# run a lot of examples from both train and test through the model and verify the output correctness
# (at most 50 batches of 100 examples per split; gradient tracking is switched off because nothing is trained here)
with torch.no_grad():
    train_score = eval_split(trainer_gopher, 'train', max_batches=50)
    test_score  = eval_split(trainer_gopher, 'test',  max_batches=50)

train final score: 5000/5000 = 100.00% correct
test final score: 5000/5000 = 100.00% correct


GPT-1, openai-gpt: n_layer=12, n_head=12, n_embd=768

In [64]:
# create a GPT instance
from transform.model_tce import GPT

# start from the library's default configuration and select the 'openai-gpt' (GPT-1) model type
model_config = GPT.get_default_config()
model_config.model_type = 'openai-gpt'
# vocabulary size and context length come from the dataset, so the model
# matches its digit range (num_digits) and sequence length (2 * length - 1)
model_config.vocab_size = train_dataset.get_vocab_size()
model_config.block_size = train_dataset.get_block_size()
model_gpt1 = GPT(model_config)

number of parameters: 85.07M


In [65]:
# create a Trainer object
from mingpt.trainer import Trainer

# start from the trainer's default configuration and override a few fields
train_config = Trainer.get_default_config()
train_config.learning_rate = 3e-4 # same rate as the small models; NOTE(review): with 85.07M parameters the logged loss is still about 1.10 (close to ln 3, i.e. chance level for 3 digits) at iteration 2000
train_config.max_iters = 10000 # five times the budget given to the smaller models
train_config.num_workers = 8 # presumably data-loading worker processes; confirm in mingpt.trainer
trainer_gpt1 = Trainer(train_config, model_gpt1, train_dataset)

running on device cuda


In [66]:
def batch_end_callback(trainer):
    """Print step time (ms), iteration number and training loss every 100 iterations.

    Reads `iter_num`, `iter_dt` (seconds) and `loss` from the trainer that
    invokes the callback; prints nothing on other iterations.
    """
    if trainer.iter_num % 100 != 0:
        return # stay quiet between reporting iterations
    print(f"iter_dt {trainer.iter_dt * 1000:.2f}ms; iter {trainer.iter_num}: train loss {trainer.loss.item():.5f}")
# register the logger under the trainer's 'on_batch_end' event
trainer_gpt1.set_callback('on_batch_end', batch_end_callback)

# start training; the log printed below comes from the callback registered above
trainer_gpt1.run()

iter_dt 0.00ms; iter 0: train loss 1.12391
iter_dt 26.66ms; iter 100: train loss 1.09791
iter_dt 25.87ms; iter 200: train loss 1.09673
iter_dt 25.81ms; iter 300: train loss 1.09701
iter_dt 25.87ms; iter 400: train loss 1.10424
iter_dt 25.95ms; iter 500: train loss 1.10726
iter_dt 26.06ms; iter 600: train loss 1.09843
iter_dt 25.95ms; iter 700: train loss 1.10070
iter_dt 26.08ms; iter 800: train loss 1.09836
iter_dt 28.63ms; iter 900: train loss 1.09896
iter_dt 25.93ms; iter 1000: train loss 1.10455
iter_dt 26.05ms; iter 1100: train loss 1.09940
iter_dt 25.99ms; iter 1200: train loss 1.10078
iter_dt 26.09ms; iter 1300: train loss 1.10492
iter_dt 26.23ms; iter 1400: train loss 1.10310
iter_dt 26.21ms; iter 1500: train loss 1.09734
iter_dt 26.47ms; iter 1600: train loss 1.10424
iter_dt 25.97ms; iter 1700: train loss 1.09907
iter_dt 26.23ms; iter 1800: train loss 1.10394
iter_dt 26.16ms; iter 1900: train loss 1.10229
iter_dt 25.94ms; iter 2000: train loss 1.09994
iter_dt 26.12ms; iter 2100

In [67]:
# now let's perform some evaluation
# switch the model to evaluation mode before scoring; the trailing ';' keeps
# the notebook from echoing the call's return value
model_gpt1.eval();

In [68]:
def eval_split(trainer, split, max_batches):
    """
    Score the trainer's model on one dataset split by exact-match sorting accuracy.

    The unsorted prefix of every example is fed to the model, which greedily
    generates the remaining tokens. An example counts as correct only if every
    generated token equals the ground-truth sorted sequence.

    Args:
        trainer: object exposing `.model` (with a `generate` method) and
            `.device`. The model is taken from the trainer, so the function
            always scores the model that belongs to the trainer passed in.
        split: 'train' or 'test'; selects the module-level `train_dataset` or
            `test_dataset`.
        max_batches: stop after this many batches of 100 examples, or None to
            go through the whole dataset.

    Returns:
        0-dim float tensor holding the number of correctly sorted examples.
    """
    dataset = {'train':train_dataset, 'test':test_dataset}[split]
    n = dataset.length # number of digits to sort, read from the split being scored
    model = trainer.model
    max_mistakes_shown = 3 # only print a few mistakes to get a sense
    results = []
    mistakes_printed_already = 0
    loader = DataLoader(dataset, batch_size=100, num_workers=0, drop_last=False)
    for b, (x, y) in enumerate(loader):
        x = x.to(trainer.device)
        y = y.to(trainer.device)
        # isolate the input pattern alone
        inp = x[:, :n]
        # the last n targets are the sorted solution
        sol = y[:, -n:]
        # let the model fill in the rest of the sequence
        cat = model.generate(inp, n, do_sample=False) # using greedy argmax, not sampling
        sol_candidate = cat[:, n:] # isolate the filled in sequence
        # compare the predicted sequence to the true sequence, one flag per example
        correct = (sol == sol_candidate).all(1).cpu()
        for i in range(x.size(0)):
            results.append(int(correct[i]))
            if not correct[i] and mistakes_printed_already < max_mistakes_shown:
                mistakes_printed_already += 1
                print("GPT claims that %s sorted is %s but gt is %s" % (inp[i].tolist(), sol_candidate[i].tolist(), sol[i].tolist()))
        if max_batches is not None and b+1 >= max_batches:
            break
    rt = torch.tensor(results, dtype=torch.float)
    print("%s final score: %d/%d = %.2f%% correct" % (split, rt.sum(), len(results), 100*rt.mean()))
    return rt.sum()

# run a lot of examples from both train and test through the model and verify the output correctness
# (at most 50 batches of 100 examples per split; gradient tracking is switched off because nothing is trained here)
with torch.no_grad():
    train_score = eval_split(trainer_gpt1, 'train', max_batches=50)
    test_score  = eval_split(trainer_gpt1, 'test',  max_batches=50)

train final score: 5000/5000 = 100.00% correct
GPT claims that [2, 2, 2, 2, 2, 1] sorted is [2, 2, 2, 2, 2, 2] but gt is [1, 2, 2, 2, 2, 2]
GPT claims that [2, 2, 2, 2, 2, 1] sorted is [2, 2, 2, 2, 2, 2] but gt is [1, 2, 2, 2, 2, 2]
GPT claims that [2, 2, 2, 2, 2, 1] sorted is [2, 2, 2, 2, 2, 2] but gt is [1, 2, 2, 2, 2, 2]
test final score: 4978/5000 = 99.56% correct
