Can the model learn how to add two 2-digit numbers?

In [11]:
from gptbench import Train, empty_config, LogFlag

To create train and validation datasets run in ../dataprep/:
```
python prepare_addition.py ../data/add2.txt 2 --sep="\n" --split=0.9
```
This creates add2.train.txt and add2.val.txt with entries in the form a+b=cc, one per line.

In [58]:
# Peek at the validation file: read it whole and show its first 100 characters.
# The encoding is passed explicitly so the read does not depend on the platform's locale default.
with open('../data/add2.val.txt', 'r', encoding='utf-8', newline=None) as f:
    val_data = f.read()
print(val_data[:100])

90+0=90
90+1=91
90+2=92
90+3=93
90+4=94
90+5=95
90+6=96
90+7=97
90+8=98
90+9=99
90+10=100
90+11=101



In [None]:
# We'll load samples via the PaddedLineCharDataset: each sample line read is stored in a 16-character block, padded at the end.

In [13]:
# create the GPTBench object - we'll name this model add2
ben = Train('add2', log_mask=LogFlag.ALL)

# set train and validation datasets, both read through the 'padlinechar' dataset class
ben.set_datasets('padlinechar', 
                 train_path='../data/add2.train.txt', 
                 val_path='../data/add2.val.txt')

# set config settings that will override the default values
cfg = empty_config()
cfg.train.log_period=0
cfg.model.set(n_layer=6, n_head=6, n_embd=90, block_size=16) # our model parameters - block_size is big enough for aa+bb=ccc (9 characters)
cfg.sample.set(top=1, max_batch_size=256) # note the top_k(1) - always pick the best item
# NOTE(review): presumably a negative sample_period means a multiple of eval_period
# (5 x 100 = 500 iters between samples) - confirm against the gptbench docs
cfg.train.set(sample_period=-5)
cfg.trainer.set(batch_size=128)

# and init a new model with config
ben.init_new(cfg)

New random seed 1648103466
Initializing new model add2
Dataset train_path: ../data/add2.train.txt, val_path: ../data/add2.val.txt, train_split: None, vocab_size: 13
Model params: 0.59M


In [14]:
# Let's train for 3000 batch iterations.
# Each dot means a batch was trained.
# Train and validation losses are evaluated every 100 iterations (iters).
# Also, every 500 iters a random sample is taken.
ben.train(iter_count=3000)

Training
Batches per epoch: 70
iter 0 (0.000 epoch): loss train=2.1801, val=2.2212, eval->2.2212
==> Saving model at iter=0, eval loss->2.2212 
9
.CUDA max memory used: 174.47M
...................................................................................................iter 100 (1.422 epoch): loss train=1.0779, val=1.1047, eval->1.1047
==> Saving model at iter=100, eval loss->1.1047 
....................................................................................................iter 200 (2.844 epoch): loss train=0.8542, val=0.9584, eval->0.9584
==> Saving model at iter=200, eval loss->0.9584 
....................................................................................................iter 300 (4.267 epoch): loss train=0.7791, val=0.9301, eval->0.9301
==> Saving model at iter=300, eval loss->0.9301 
....................................................................................................iter 400 (5.689 epoch): loss train=0.7270, val=0.8859, eval->0.8859
==> S

In [15]:
# Loss info for the model's current state:
ben.state

{'n_samples': 383872,
 'train_loss': 0.44791367650032043,
 'val_loss': 0.9759516716003418,
 'eval_loss': 0.9759516716003418}

In [16]:
# The last saved checkpoint info - the best performing model we got.
# Its validation loss is lower than the current state's above; its train loss is slightly higher.
ben.last_saved_state

{'n_samples': 256000,
 'train_loss': 0.4747387766838074,
 'val_loss': 0.8504394292831421,
 'eval_loss': 0.8504394292831421}

In [17]:
# last saved checkpoint has lower validation loss: let's load it
ben.load()
# show the state again, to check it now matches the saved checkpoint
ben.state

New random seed 542600063
Loading checkpoint from ./models/add2/
Checkpoint: iter=2000 (28.444 epoch), loss train=0.4747 val=0.8504 eval->0.8504
Dataset train_path: ../data/add2.train.txt, val_path: ../data/add2.val.txt, train_split: None, vocab_size: 13
Model params: 0.59M


{'n_samples': 256000,
 'train_loss': 0.4747387766838074,
 'val_loss': 0.8504394292831421,
 'eval_loss': 0.8504394292831421}

In [24]:
# take a few samples, one prompt at a time:
for prompt in ('1+1=', '34+7=', '78+99='):
    ben.sample(prompt)

1+1=1
34+7=41
78+99=177


In [25]:
# Start with accuracy on the training dataset - mostly a memorization check, as the model trained on these data
train_ds = ben.train_dataset

# split every aa+bb=cc line into its prompt 'aa+bb=' and its answer 'cc'
q, a = train_ds.sample_split(0, len(train_ds), sep='=', sep_included=-1)

# peek at the first three prompts and their expected answers
print(q[:3], a[:3], sep='\n')

['0+0=', '0+1=', '0+2=']
['0', '1', '2']


In [26]:
# Measure the accuracy - how good was the memorization?
# This may take a while, and a re-run may give a different result than the number shown below.
ben.measure_accuracy(q,a)

0.9073333333333333

In [27]:
# Accuracy on the train dataset should come out above 90%. Further training would improve it,
# but the model would be overfitting - memorizing the given samples.
# So what about the validation dataset, on which the model never trained?
val_ds = ben.val_dataset

# split every aa+bb=cc line into its prompt 'aa+bb=' and its answer 'cc'
q, a = val_ds.sample_split(0, len(val_ds), sep='=', sep_included=-1)

# peek at the first three prompts and their expected answers
print(q[:3], a[:3], sep='\n')

['90+0=', '90+1=', '90+2=']
['90', '91', '92']


In [22]:
# The validation dataset holds the sums whose first number is 90..99, for example 90+2=92.
# The model did however see the reversed additions of the 90..99 numbers, for example 2+90=92.
# Did it somehow learn the commutative property of addition?
ben.measure_accuracy(q,a)

0.592

In [33]:
# How is the model failing - let's see some incorrect answers:

wrongs = []
def test(q,a,g):
    global wrongs
    res = float(a == g)
    if not res: wrongs += [f"{q}{a} != {g}"]
    return res

# measure accuracy again, this time scoring through test() so the wrong answers get collected in wrongs
ben.measure_accuracy(q,a, test_fn=test)

0.592

In [36]:
# let's see some examples - each entry reads '<prompt><expected answer> != <generated answer>':
wrongs[40:50]

['91+11=102 != 92',
 '91+12=103 != 93',
 '91+13=104 != 94',
 '91+14=105 != 94',
 '91+15=106 != 96',
 '91+16=107 != 97',
 '91+17=108 != 98',
 '91+18=109 != 99',
 '91+19=110 != 100',
 '91+29=120 != 110']

In [38]:
# and a few more, from further down the list:
wrongs[200:210]

['95+55=150 != 141',
 '95+56=151 != 141',
 '95+65=160 != 150',
 '95+66=161 != 151',
 '95+75=170 != 160',
 '95+85=180 != 170',
 '95+86=181 != 171',
 '95+95=190 != 180',
 '95+96=191 != 181',
 '95+97=192 != 182']

In [None]:
# In many cases it's off by -10...

In [46]:
# let's try increasing dropout from its 0.1 default to improve generalization

# set config settings that will override existing values - only dropout
cfg = empty_config()
cfg.model.set(dropout=0.2)

# init a new model with config, under a new name (add2drop)
ben.init_new(cfg, name='add2drop')

# see total config:
print(ben.get_config().dump(1))

New random seed 2474415964
Initializing new model add2drop
Dataset train_path: ../data/add2.train.txt, val_path: ../data/add2.val.txt, train_split: None, vocab_size: 13
Model params: 0.59M
seed: 0 (int) 
sample: 
    max_len: 100 (int) 
    count: 1 (int) 
    start_text: None (NoneType) 
    start_text_sep: | (str) 
    emit_start: True (bool) 
    emit_after: None (NoneType) 
    emit_before: None (NoneType) 
    flush: True (bool) 
    eot_stop: 0 (int) 
    top: 1.0 (float) 
    temp: 1.0 (float) 
    max_batch_size: 256 (int) 
    multiline_prompt: False (bool) 
train: 
    eval_period: 100 (int) 
    eval_type: 1.0 (float) 
    eval_iters: 100 (int) 
    eval_save_checkpt: 1 (int) 
    eval_save_loss: csv,tensorboard (str) 
    sample_period: -5.0 (float) 
    log_period: 0.0 (float) 
dataset: 
    class_name: padlinechar (str) 
    train_path: ../data/add2.train.txt (str) 
    train_split: None (NoneType) 
    val_path: ../data/add2.val.txt (str) 
    params: None (NoneType) 
mo

In [47]:
# train for longer this time - 5000 batch iterations to give it time to converge
ben.train(iter_count=5000)

Training
Batches per epoch: 70
iter 0 (0.000 epoch): loss train=2.2807, val=2.2979, eval->2.2979
==> Saving model at iter=0, eval loss->2.2979 
4+
.CUDA max memory used: 164.88M
...................................................................................................iter 100 (1.422 epoch): loss train=1.0749, val=1.1097, eval->1.1097
==> Saving model at iter=100, eval loss->1.1097 
....................................................................................................iter 200 (2.844 epoch): loss train=0.8587, val=0.9122, eval->0.9122
==> Saving model at iter=200, eval loss->0.9122 
....................................................................................................iter 300 (4.267 epoch): loss train=0.7904, val=0.8829, eval->0.8829
==> Saving model at iter=300, eval loss->0.8829 
....................................................................................................iter 400 (5.689 epoch): loss train=0.7530, val=0.8817, eval->0.8817
==> 

In [48]:
# What are the losses of the best saved state (the last saved checkpoint)?
ben.last_saved_state

{'n_samples': 281600,
 'train_loss': 0.5101192593574524,
 'val_loss': 0.8103063106536865,
 'eval_loss': 0.8103063106536865}

In [49]:
# Previous model has train_loss=0.47 and val_loss=0.85 - we got an improvement in validation loss (now 0.81).
val_ds = ben.val_dataset

#split each aa+bb=cc into a prompt: 'aa+bb=' and an answer 'cc'
q,a=val_ds.sample_split(0, len(val_ds), sep='=', sep_included=-1)

# measure accuracy on the validation dataset with the new model
ben.measure_accuracy(q,a)

0.916

In [51]:
# Wow! Accuracy jumped to 91%. Let's get an idea of which cases are giving the model a hard time:
# reset the list first, so that it only holds the wrong answers from this run
wrongs = []
ben.measure_accuracy(q,a, test_fn=test)

0.916

In [55]:
# Great, we jumped from 59% accuracy to over 91%! Is there any pattern in the wrong additions?
wrongs

['90+1=91 != 90',
 '90+2=92 != 90',
 '90+3=93 != 90',
 '90+4=94 != 90',
 '90+5=95 != 90',
 '90+6=96 != 95',
 '91+1=92 != 90',
 '91+2=93 != 90',
 '91+3=94 != 90',
 '91+4=95 != 90',
 '91+5=96 != 94',
 '91+6=97 != 95',
 '91+7=98 != 96',
 '91+8=99 != 98',
 '92+1=93 != 90',
 '92+2=94 != 90',
 '92+3=95 != 90',
 '92+4=96 != 90',
 '92+5=97 != 94',
 '92+6=98 != 95',
 '92+7=99 != 96',
 '92+8=100 != 97',
 '92+9=101 != 109',
 '93+1=94 != 90',
 '93+2=95 != 90',
 '93+3=96 != 90',
 '93+4=97 != 90',
 '93+5=98 != 90',
 '93+6=99 != 95',
 '93+7=100 != 97',
 '93+8=101 != 97',
 '93+9=102 != 108',
 '94+1=95 != 90',
 '94+2=96 != 90',
 '94+3=97 != 90',
 '94+4=98 != 90',
 '94+5=99 != 90',
 '94+6=100 != 95',
 '94+7=101 != 106',
 '94+8=102 != 107',
 '94+9=103 != 108',
 '95+1=96 != 90',
 '95+2=97 != 10',
 '95+3=98 != 112',
 '95+4=99 != 12',
 '95+5=100 != 90',
 '95+6=101 != 105',
 '95+7=102 != 107',
 '95+8=103 != 107',
 '95+9=104 != 108',
 '96+1=97 != 10',
 '96+2=98 != 111',
 '96+3=99 != 12',
 '96+4=100 != 13',
 '

In [None]:
# Trouble happens when the second number is single-digit...
# Perhaps using a zero-padded data format would allow better accuracy, like 99+07=106 ?