Learn to add two 2-digit numbers using fixed-size blocks padded on the right.

In [1]:
from gptbench import Sample, LogFlag, Train, empty_config

To create the train and validation datasets:
python prepare_addition.py ../data/add2.txt 2 --sep="\n" --split=0.9

This creates add2.train.txt and add2.val.txt.

In [2]:
# Load the whole validation split as one string and preview its first 500 characters.
# newline=None is open()'s default (universal-newline translation when reading).
with open('../data/add2.val.txt', 'r', newline=None) as f:
    val_data = f.read()
val_data[:500]

'90+0=90\n90+1=91\n90+2=92\n90+3=93\n90+4=94\n90+5=95\n90+6=96\n90+7=97\n90+8=98\n90+9=99\n90+10=100\n90+11=101\n90+12=102\n90+13=103\n90+14=104\n90+15=105\n90+16=106\n90+17=107\n90+18=108\n90+19=109\n90+20=110\n90+21=111\n90+22=112\n90+23=113\n90+24=114\n90+25=115\n90+26=116\n90+27=117\n90+28=118\n90+29=119\n90+30=120\n90+31=121\n90+32=122\n90+33=123\n90+34=124\n90+35=125\n90+36=126\n90+37=127\n90+38=128\n90+39=129\n90+40=130\n90+41=131\n90+42=132\n90+43=133\n90+44=134\n90+45=135\n90+46=136\n90+47=137\n90+48=138\n90+49=139\n90+50=140\n90+51=141\n'

In [3]:
# Create the Train object named 'add2pad' with every log flag enabled.
do = Train('add2pad', log_mask=LogFlag.ALL)

# set datasets
# Both splits are the files produced by prepare_addition.py (see the command above).
# 'padlinechar' presumably selects a per-line, padded, character-level dataset -- TODO confirm.
do.set_datasets('padlinechar', train_path='../data/add2.train.txt', val_path='../data/add2.val.txt')

# set config settings
cfg = empty_config()
cfg.train.log_period=0
# block_size=16 matches the length of the tensors returned by do.val_dataset[0] further down.
cfg.model.set(n_layer=6, n_head=6, n_embd=90, block_size=16)
cfg.train.set(sample_period=-5)
# NOTE(review): batch_size is set on cfg.trainer while the settings above use cfg.train --
# confirm both config sections exist and this is not a typo.
cfg.trainer.set(batch_size=128)

# and init a new model with config
# Takes the resume branch when do.can_resume() is true; the recorded output below
# ("Loading checkpoint from ./models/add2pad/") shows that branch was taken in this run.
if do.can_resume():
    do.init_resume(cfg)
else:
    do.init_new(cfg)
# print(do.get_config().dump(1))

New random seed 3952773132
Loading checkpoint from ./models/add2pad/
Checkpoint: iter=1900 (27.022 epoch), loss train=0.4691 val=0.7730 eval->0.7730
Dataset train_path: ../data/add2.train.txt, val_path: ../data/add2.val.txt, train_split: None, vocab_size: 13
Model params: 0.59M


In [4]:
# Ask three questions in a single call; '|' separates the individual prompts.
# Generated texts are collected in `ans` (dest=ans); with emit_after='=' only the part
# after '=' is kept, as the recorded output below shows.
start_text='30+6=|92+17=|1+1='
ans=[]
do.sample(start_text, dest=ans, emit_after='=')
ans

['36', '109', '2']

In [46]:
# Same call with a single prompt.
# NOTE(review): the recorded output ['11'] is wrong and carries execution count 46, so it
# presumably comes from a different kernel/model state than the In [4] cell above -- re-run in order.
start_text='1+1='
ans=[]
do.sample(start_text, dest=ans, emit_after='=')
ans

['11']

In [42]:
# Show the validation dataset's vocabulary (13 items in the recorded output).
# '\x00' is presumably the padding character -- TODO confirm.
do.val_dataset.get_vocab_items()

['\x00', '+', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '=']

In [43]:
# Inspect the first validation sample: per the recorded output, a pair of length-16 tensors
# where the second is the first shifted left by one token, both right-padded with index 0.
do.val_dataset[0]

(tensor([11,  2,  1,  2, 12, 11,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0]),
 tensor([ 2,  1,  2, 12, 11,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]))

In [44]:
# Run training with iter_count=10000. The recorded log below ends shortly after iter 300,
# so this run was presumably interrupted before completing.
do.train(iter_count=10000)

Training
Batches per epoch: 70
iter 0 (0.000 epoch): loss train=2.2203, val=2.2427, eval->2.2427
==> Saving model at iter=0, eval loss->2.2427 
751=07+91209+10=2142+8+977915+51319264313433122355++5=28+4145+=7==30852=0+45661779997273423+9
.CUDA max memory used: 167.94M
...................................................................................................iter 100 (1.422 epoch): loss train=1.0493, val=1.0841, eval->1.0841
==> Saving model at iter=100, eval loss->1.0841 
....................................................................................................iter 200 (2.844 epoch): loss train=0.8439, val=0.9244, eval->0.9244
==> Saving model at iter=200, eval loss->0.9244 
....................................................................................................iter 300 (4.267 epoch): loss train=0.7623, val=0.8565, eval->0.8565
==> Saving model at iter=300, eval loss->0.8565 
.................................................................................

In [64]:
# Without dest/emit options the sample is printed with the prompt included (see output below).
do.sample(start_text="10+20=")

10+20=30


In [56]:
# emit_start=False: the recorded output contains only the generated answer, not the prompt.
do.sample(start_text="94+97=", emit_start=False)

191


In [5]:
def test_accuracy(first_number, second_number):
    qa = []
    for a in range(100):
        for b in range(100):
            qa.append((f'{a}+{b}=', f'{a+b}'))

    qa=qa[first_number*100:second_number*100]
    
    sep = do.get_config().sample.start_text_sep
    start_text = sep.join([q for q,a in qa])

    ans = []
    do.sample(start_text, dest=ans, emit_after='=')

    corr=0
    for i in range(len(qa)):
        q,a = qa[i]
        #print(q,a,ans[i])
        if a == ans[i]:
            corr+=1
        
    print(f'{corr/len(qa):.3f}')


In [7]:
# Questions whose first operand is 90..99; presumably the validation range, since
# add2.val.txt starts at '90+0=90'.
test_accuracy(90, 100)

0.422


In [8]:
# Questions whose first operand is 0..9.
test_accuracy(0, 10)

0.537


In [9]:
# Questions whose first operand is 10..89.
test_accuracy(10, 90)

0.621
