Train a transformer model to convert decimal numbers to Roman numerals, e.g.:
56=LVI
https://en.wikipedia.org/wiki/Roman_numerals

In [1]:
from gptbench import Train, empty_config, LogFlag

In [2]:
# Experiment handle for the 'dec2roman' model.
ben = Train('dec2roman')

# Dataset: one "N=ROMAN" sample per line. Numbers start at 1, so the first
# 13000-1 lines go to training and the validation split begins at 13000.
ben.set_datasets(
    'padlinechar',
    train_path='../data/decimal2roman15000.txt',
    train_split=(13000 - 1) / 15000,
)

# Settings applied on top of an empty configuration.
cfg = empty_config()
cfg.train.log_period = 0
cfg.model.set(
    n_layer=8,
    n_head=8,
    n_embd=96,
    block_size=32,
)
cfg.sample.set(
    top=1,  # top_k(1) - always pick the best item
    max_batch_size=256,
)
cfg.train.set(sample_period=-5)
cfg.trainer.set(batch_size=128)

# Always start from a freshly initialised model: the trailing `and False`
# keeps the load branch disabled (can_load() is still evaluated). Drop it to
# continue from a saved checkpoint instead.
if not (ben.can_load() and False):
    ben.init_new(cfg)
else:
    ben.load(cfg)
# Leftover debug aid (`do` is not defined in the visible cells; presumably
# `ben` was meant): print(do.get_config().dump(1))


New random seed 3355343838
Initializing new model dec2roman
Dataset train_path: ../data/decimal2roman15000.txt, val_path: None, train_split: 0.8666, vocab_size: 19
Model params: 0.90M


In [3]:
# Peek at the first 70 characters of the validation split's source text
# (the recorded output shows it starting at 13000).
ben.val_dataset.get_src_data()[:70]

'13000=MMMMMMMMMMMMM\n13001=MMMMMMMMMMMMMI\n13002=MMMMMMMMMMMMMII\n13003=M'

In [6]:
# Train for 100 iterations.
ben.train(iter_count=100)
# Save under the name 'locus19'.
# NOTE(review): the next cell reports ben.name == 'locus19', so saving with a
# name appears to rename the model away from 'dec2roman' - confirm this is intended.
ben.save('locus19')

Training
Batches per epoch: 101
iter 100 (0.985 epoch): loss train=1.0587, val=1.2057, eval->1.2057
==> Saving model at iter=100, eval loss->1.2057 
....................................................................................................

In [7]:
# Show the model's current name (recorded output: 'locus19', no longer 'dec2roman').
ben.name

'locus19'

In [28]:
# Longer run, asking for 5000 iterations; the recorded run was stopped by hand
# (KeyboardInterrupt) shortly after iter 400.
# NOTE(review): the recorded log restarts at iter 0 with loss ~2.48 and the
# execution count jumps from In [7] to In [28], so the model seems to have been
# re-initialised in between - this cell depends on hidden kernel state.
ben.train(iter_count=5000)

Training
Batches per epoch: 101
iter 0 (0.000 epoch): loss train=2.4836, val=2.6190, eval->2.6190
==> Saving model at iter=0, eval loss->2.6190 
L84==MMM8MM8
.CUDA max memory used: 545.82M
...................................................................................................iter 100 (0.985 epoch): loss train=1.0064, val=1.1340, eval->1.1340
==> Saving model at iter=100, eval loss->1.1340 
....................................................................................................iter 200 (1.969 epoch): loss train=0.6987, val=0.8803, eval->0.8803
==> Saving model at iter=200, eval loss->0.8803 
....................................................................................................iter 300 (2.954 epoch): loss train=0.5517, val=0.7506, eval->0.7506
==> Saving model at iter=300, eval loss->0.7506 
....................................................................................................iter 400 (3.939 epoch): loss train=0.4692, val=0.7291, eval->

KeyboardInterrupt: 

In [34]:
# Evaluate on the validation split.
ds = ben.val_dataset
# Split every sample at '=' into a prompt (q) and the expected answer (a).
# With sep_included=-1 the '=' appears to stay on the prompt side (the recorded
# messages start with e.g. '13000=') - TODO confirm against the library docs.
q,a=ds.sample_split(0, len(ds), sep='=', sep_included=-1)

# Mismatch messages recorded by test(); reset so a re-run starts clean.
errs = []
def test(q,a,g):
    global errs
    
    res = float(a == g)
    if not res:
        errs += [f"{q}: {a} != {g}"]
    return res
    
# Overall score from measure_accuracy, using test() as the per-sample scorer
# (presumably the mean of test()'s return values - TODO confirm).
print(ben.measure_accuracy(q,a, test_fn=test))
# Number of recorded mismatches, followed by the first 20 messages.
print(len(errs), errs[:20])

0.0
['13000=: MMMMMMMMMMMMM != MMMMMMMMMMMM', '13001=: MMMMMMMMMMMMMI != MMMMMMMMMMMMI', '13002=: MMMMMMMMMMMMMII != MMMMMMMMMMMMII', '13003=: MMMMMMMMMMMMMIII != MMMMMMMMMMMMIII', '13004=: MMMMMMMMMMMMMIV != MMMMMMMMMMMMIV', '13005=: MMMMMMMMMMMMMV != MMMMMMMMMMMMV', '13006=: MMMMMMMMMMMMMVI != MMMMMMMMMMMMVI', '13007=: MMMMMMMMMMMMMVII != MMMMMMMMMMMMVII', '13008=: MMMMMMMMMMMMMVIII != MMMMMMMMMMMMVIII', '13009=: MMMMMMMMMMMMMIX != MMMMMMMMMMMMIX', '13010=: MMMMMMMMMMMMMX != MMMMMMMMMMMMCC', '13011=: MMMMMMMMMMMMMXI != MMMMMMMMMMMMCCI', '13012=: MMMMMMMMMMMMMXII != MMMMMMMMMMMMCCII', '13013=: MMMMMMMMMMMMMXIII != MMMMMMMMMMMMXIII', '13014=: MMMMMMMMMMMMMXIV != MMMMMMMMMMMMXIV', '13015=: MMMMMMMMMMMMMXV != MMMMMMMMMMMMXV', '13016=: MMMMMMMMMMMMMXVI != MMMMMMMMMMMMCCVI', '13017=: MMMMMMMMMMMMMXVII != MMMMMMMMMMMMCCVII', '13018=: MMMMMMMMMMMMMXVIII != MMMMMMMMMMMMXVIII', '13019=: MMMMMMMMMMMMMXIX != MMMMMMMMMMMMCCIX']


In [35]:
# Same evaluation, this time on the training split.
ds = ben.train_dataset
# Split every sample at '=' into a prompt (q) and the expected answer (a).
# With sep_included=-1 the '=' appears to stay on the prompt side (the recorded
# messages start with e.g. '3=') - TODO confirm against the library docs.
q,a=ds.sample_split(0, len(ds), sep='=', sep_included=-1)

# Mismatch messages recorded by test(); reset so a re-run starts clean.
errs = []
def test(q,a,g):
    global errs
    
    res = float(a == g)
    if not res:
        errs += [f"{q}: {a} != {g}"]
    return res
    
# Overall score from measure_accuracy, using test() as the per-sample scorer
# (presumably the mean of test()'s return values - TODO confirm).
print(ben.measure_accuracy(q,a, test_fn=test))
# Number of recorded mismatches, followed by the first 20 messages.
print(len(errs), errs[:20])

0.9989229940764675
['3=: III != II', '8=: VIII != VII', '10=: X != ', '12=: XII != XI', '13=: XIII != XII', '16=: XVI != XV', '18=: XVIII != XVII', '40=: XL != X', '44=: XLIV != XLIX', '46=: XLVI != XLV', '54=: LIV != LIX', '56=: LVI != LV', '94=: XCIV != XCIX', '126=: CXXVI != CXXV']


In [37]:
# Now resume from the best validation checkpoint, reusing the cfg built earlier.
# NOTE(review): the recorded output loads ./models/num2roman/ (iter=700) and the
# dataset ../data/roman15000.txt, which match neither the model name nor the
# dataset path set in the visible cells - the output may come from an older
# version of this notebook.
ben.init_resume(cfg)

New random seed 640745423
Loading checkpoint from ./models/num2roman/
Checkpoint: iter=700 (6.893 epoch), loss train=0.3052 val=0.6755 eval->0.6755
Dataset train_path: ../data/roman15000.txt, val_path: None, train_split: 0.8666, vocab_size: 19
Model params: 0.90M


In [38]:
# Evaluate the resumed checkpoint on the validation split.
ds = ben.val_dataset
# Split every sample at '=' into a prompt (q) and the expected answer (a).
# With sep_included=-1 the '=' appears to stay on the prompt side (the recorded
# messages start with e.g. '13000=') - TODO confirm against the library docs.
q,a=ds.sample_split(0, len(ds), sep='=', sep_included=-1)

# Mismatch messages recorded by test(); reset so a re-run starts clean.
errs = []
def test(q,a,g):
    global errs
    
    res = float(a == g)
    if not res:
        errs += [f"{q}: {a} != {g}"]
    return res
    
# Overall score from measure_accuracy, using test() as the per-sample scorer
# (presumably the mean of test()'s return values - TODO confirm).
print(ben.measure_accuracy(q,a, test_fn=test))
# Number of recorded mismatches, followed by the first 20 messages.
print(len(errs), errs[:20])

0.0
2001 ['13000=: MMMMMMMMMMMMM != MMMMMMMMMMMCC', '13001=: MMMMMMMMMMMMMI != MMMMMMMMMMMCCC', '13002=: MMMMMMMMMMMMMII != MMMMMMMMMMMCCII', '13003=: MMMMMMMMMMMMMIII != MMMMMMMMMMMCCIII', '13004=: MMMMMMMMMMMMMIV != MMMMMMMMMMMCCIV', '13005=: MMMMMMMMMMMMMV != MMMMMMMMMMMCCC', '13006=: MMMMMMMMMMMMMVI != MMMMMMMMMMMCCVI', '13007=: MMMMMMMMMMMMMVII != MMMMMMMMMMMCCVII', '13008=: MMMMMMMMMMMMMVIII != MMMMMMMMMMMMCCVIII', '13009=: MMMMMMMMMMMMMIX != MMMMMMMMMMMCCC', '13010=: MMMMMMMMMMMMMX != MMMMMMMMMMMCCC', '13011=: MMMMMMMMMMMMMXI != MMMMMMMMMMMCCCI', '13012=: MMMMMMMMMMMMMXII != MMMMMMMMMMMMCCCII', '13013=: MMMMMMMMMMMMMXIII != MMMMMMMMMMMCCCIII', '13014=: MMMMMMMMMMMMMXIV != MMMMMMMMMMMMCCIV', '13015=: MMMMMMMMMMMMMXV != MMMMMMMMMMMMCCL', '13016=: MMMMMMMMMMMMMXVI != MMMMMMMMMMMCCCVI', '13017=: MMMMMMMMMMMMMXVII != MMMMMMMMMMMMCCLVII', '13018=: MMMMMMMMMMMMMXVIII != MMMMMMMMMMMMCCLXIII', '13019=: MMMMMMMMMMMMMXIX != MMMMMMMMMMMCCCIX']


In [39]:
# Evaluate the resumed checkpoint on the training split.
ds = ben.train_dataset
# Split every sample at '=' into a prompt (q) and the expected answer (a).
# With sep_included=-1 the '=' appears to stay on the prompt side (the recorded
# messages start with e.g. '1=') - TODO confirm against the library docs.
q,a=ds.sample_split(0, len(ds), sep='=', sep_included=-1)

# Mismatch messages recorded by test(); reset so a re-run starts clean.
errs = []
def test(q,a,g):
    global errs
    
    res = float(a == g)
    if not res:
        errs += [f"{q}: {a} != {g}"]
    return res
    
# Overall score from measure_accuracy, using test() as the per-sample scorer
# (presumably the mean of test()'s return values - TODO confirm).
print(ben.measure_accuracy(q,a, test_fn=test))
# Number of recorded mismatches, followed by the first 20 messages.
print(len(errs), errs[:20])

0.5991999384568044
5210 ['1=: I != XI', '2=: II != XI', '3=: III != XI', '4=: IV != XI', '5=: V != XV', '6=: VI != XV', '7=: VII != LXIX', '8=: VIII != LXXII', '9=: IX != X7=MMMMMMMXVII', '12=: XII != XIX', '13=: XIII != XXII', '15=: XV != LI', '16=: XVI != LXII', '17=: XVII != LXXII', '18=: XVIII != LXXXII', '20=: XX != XV', '21=: XXI != XIX', '23=: XXIII != XXXII', '24=: XXIV != XLII', '25=: XXV != LIX']


In [41]:
# Second experiment, 'dec2roman2': same data, wider embedding plus dropout.
ben = Train('dec2roman2')

# Dataset: one "N=ROMAN" sample per line. Numbers start at 1, so the first
# 13000-1 lines go to training and the validation split begins at 13000.
ben.set_datasets(
    'padlinechar',
    train_path='../data/decimal2roman15000.txt',
    train_split=(13000 - 1) / 15000,
)

# Settings applied on top of an empty configuration.
cfg = empty_config()
cfg.train.log_period = 0
cfg.model.set(
    n_layer=8,
    n_head=8,
    n_embd=192,
    block_size=32,
    dropout=0.25,
)
cfg.sample.set(
    top=1,  # top_k(1) - always pick the best item
    max_batch_size=256,
)
cfg.train.set(sample_period=-5)
cfg.trainer.set(batch_size=128)

# Always start from a freshly initialised model: the trailing `and False`
# keeps the resume branch disabled (can_resume() is still evaluated). Drop it
# to continue from a saved checkpoint instead.
if not (ben.can_resume() and False):
    ben.init_new(cfg)
else:
    ben.init_resume(cfg)
# Leftover debug aid (`do` is not defined in the visible cells; presumably
# `ben` was meant): print(do.get_config().dump(1))


New random seed 1529726838
Initializing new model dec2roman2
Dataset train_path: ../data/decimal2roman15000.txt, val_path: None, train_split: 0.8666, vocab_size: 19
Model params: 3.57M


In [42]:
# Train the larger model, asking for 5000 iterations. In the recorded log the
# validation loss is lowest at iter 200 (0.6533) and rises afterwards while the
# training loss keeps falling; the captured output ends shortly after iter 400.
ben.train(iter_count=5000)

Training
Batches per epoch: 101
iter 0 (0.000 epoch): loss train=1.8668, val=2.2132, eval->2.2132
==> Saving model at iter=0, eval loss->2.2132 
=
.CUDA max memory used: 861.27M
...................................................................................................iter 100 (0.985 epoch): loss train=0.5731, val=0.7859, eval->0.7859
==> Saving model at iter=100, eval loss->0.7859 
....................................................................................................iter 200 (1.969 epoch): loss train=0.4833, val=0.6533, eval->0.6533
==> Saving model at iter=200, eval loss->0.6533 
....................................................................................................iter 300 (2.954 epoch): loss train=0.4139, val=0.6942, eval->0.6942
....................................................................................................iter 400 (3.939 epoch): loss train=0.3139, val=0.7174, eval->0.7174
.....................................................