Train a transformer model to convert decimal numbers to Roman numerals, e.g.:

56=LVI


https://en.wikipedia.org/wiki/Roman_numerals

In [1]:
from gptbench import Train, empty_config, LogFlag

In [2]:
# Trainer object for the 'dec2roman' experiment, created with a fixed seed.
ben = Train('dec2roman', seed=0xbeebaca)

# Dataset: a single text file read through the 'charline' dataset class and
# split into train/validation parts. pre_shuffle=True requests shuffling
# (the validation sample printed further down is indeed unordered).
ben.set_datasets(
    class_name='charline',
    train_path='../data/decimal2roman10k.txt',
    train_split=(9000 - 1) / 10000,  # -1 because numbers start at 1
    pre_shuffle=True,
)

# Start from an empty config and override only what this experiment needs.
cfg = empty_config()
cfg.train.log_period = 0
cfg.model.set(
    n_layer=6,
    n_head=6,
    n_embd=90,
    block_size=32,
)
# top=1 is top_k(1): always pick the best item when sampling.
cfg.sample.set(
    top=1,
    max_batch_size=256,
)
cfg.train.set(sample_period=-5)
cfg.trainer.set(batch_size=128)

# Initialise a brand-new model from the config above.
ben.init_new(cfg)

Initializing new model dec2roman
Dataset train_path: ../data/decimal2roman10k.txt, val_path: None, train_split: 0.8999, vocab_size: 19
Model params: 0.59M


In [3]:
# Peek at the first 20 validation samples; each one is a "decimal=ROMAN" line.
ben.val_dataset.get_data()[:20]

['2209=MMCCIX',
 '5913=MMMMMCMXIII',
 '507=DVII',
 '8029=MMMMMMMMXXIX',
 '3685=MMMDCLXXXV',
 '7422=MMMMMMMCDXXII',
 '8805=MMMMMMMMDCCCV',
 '8390=MMMMMMMMCCCXC',
 '4128=MMMMCXXVIII',
 '7937=MMMMMMMCMXXXVII',
 '4076=MMMMLXXVI',
 '8075=MMMMMMMMLXXV',
 '5783=MMMMMDCCLXXXIII',
 '6607=MMMMMMDCVII',
 '3620=MMMDCXX',
 '6623=MMMMMMDCXXIII',
 '651=DCLI',
 '2822=MMDCCCXXII',
 '7117=MMMMMMMCXVII',
 '9709=MMMMMMMMMDCCIX']

In [4]:
# First training run: 3000 iterations. In the log below a checkpoint is saved
# whenever the reported eval loss improves.
ben.train(iter_count=3000)

Training
Iters per epoch: 70
iter 0 (0.000 epoch): loss train=2.6152, val=2.6174, eval->2.6174
==> Saving model at iter=0, eval loss->2.6174 
D
CUDA max memory used: 331.81M
....................................................................................................iter 100 (1.422 epoch): loss train=1.1048, val=1.1043, eval->1.1043
==> Saving model at iter=100, eval loss->1.1043 
....................................................................................................iter 200 (2.845 epoch): loss train=0.7043, val=0.7023, eval->0.7023
==> Saving model at iter=200, eval loss->0.7023 
....................................................................................................iter 300 (4.267 epoch): loss train=0.5250, val=0.5249, eval->0.5249
==> Saving model at iter=300, eval loss->0.5249 
....................................................................................................iter 400 (5.690 epoch): loss train=0.4119, val=0.4125, eval->0.4125
==> Sav

In [16]:
# Reload the saved checkpoint from disk (the log below reports iter=2900).
ben.load()

Loading checkpoint from ./models/dec2roman/
Checkpoint: iter=2900 (41.249 epoch), loss train=0.2199 val=0.2202 eval->0.2202
Dataset train_path: ../data/decimal2roman10k.txt, val_path: None, train_split: 0.8999, vocab_size: 19
Model params: 0.59M


In [18]:
# Evaluate on the validation split.
ds = ben.val_dataset
# Split every sample at '=' into parallel lists: questions (q) and expected answers (a).
# NOTE(review): sep_included=-1 appears to keep the '=' on the question side
# (the mismatch strings printed below read like "331=CCCXXXI") - confirm in gptbench.
q,a=ds.get_data_split(0, len(ds), sep='=', sep_included=-1)

errs = []
def test(q,a,g):
    global errs
    
    res = float(a == g)
    if not res:
        errs += [f"{q}{a} != {g}"]
    return res
    
# Print the accuracy reported by measure_accuracy (test() scores each sample),
# then the mismatch count and the first 20 recorded mismatches.
print(ben.measure_accuracy(q,a, test_fn=test))
print(f'{len(errs)}/{len(ds)} errors: {errs[:20]}')

0.998001998001998
2/1001 errors: ['331=CCCXXXI != CCCXXI', '4=IV != I']


In [19]:
# Same accuracy check as the previous cell, this time on the training split.
ds = ben.train_dataset
q,a=ds.get_data_split(0, len(ds), sep='=', sep_included=-1)

# Start with a fresh mismatch list; test() (defined in the previous cell) adds to it.
errs = []
print(ben.measure_accuracy(q,a, test_fn=test))
print(f'{len(errs)}/{len(ds)} errors: {errs[:20]}')

0.9958884320480054
37/8999 errors: ['831=DCCCXXXI != DCCCXXI', '37=XXXVII != XXVII', '21=XXI != XII', '79=LXXIX != LXIX', '381=CCCLXXXI != CCCLXXI', '33=XXXIII != XXIII', '36=XXXVI != XXVI', '881=DCCCLXXXI != DCCCLXXI', '39=XXXIX != XXIX', '96=XCVI != XVI', '9=IX != I', '989=CMLXXXIX != CMLXXIX', '89=LXXXIX != XXXIX', '3=III != II', '31=XXXI != XXII', '26=XXVI != XVI', '46=XLVI != XVIV', '38=XXXVIII != XXVIII', '75=LXXV != LXV', '481=CDLXXXI != CDLXXI']


In [11]:
ben.sample('225=')

225=CCXXV


In [12]:
# A bit more training on top of the loaded checkpoint, up to about 10k iterations
# in total (the log below shows the optimizer state being resumed).
ben.train(iter_count=7000)

Training
Resumed optimizer state
Iters per epoch: 70
CUDA max memory used: 336.55M
....................................................................................................iter 3000 (42.671 epoch): loss train=0.2197, val=0.2206, eval->0.2206
300=CCC
....................................................................................................iter 3100 (44.094 epoch): loss train=0.2198, val=0.2211, eval->0.2211
....................................................................................................iter 3200 (45.516 epoch): loss train=0.2199, val=0.2209, eval->0.2209
....................................................................................................iter 3300 (46.939 epoch): loss train=0.2196, val=0.2212, eval->0.2212
....................................................................................................iter 3400 (48.361 epoch): loss train=0.2196, val=0.2219, eval->0.2219
...........................................................

In [13]:
# No better state was saved

{'n_samples': 1267072,
 'train_loss': 0.21919450163841248,
 'val_loss': 0.22224929928779602,
 'eval_loss': 0.22224929928779602}