Train a transformer model to convert Roman numerals to decimal numbers, e.g.:

LVII=57

https://en.wikipedia.org/wiki/Roman_numerals


In [1]:
from gptbench import Train, empty_config, LogFlag

In [2]:
# Benchmark object for this experiment, created with a fixed seed.
ben = Train('roman2dec', seed=0xb0bb1a5)

# Dataset: samples come from a single text file, which is split into
# train and validation parts by train_split.
ben.set_datasets(
    class_name='charline',
    train_path='../data/roman2decimal10k.txt',
    train_split=(9000-1)/10000,  # -1 because numbers start at 1
    pre_shuffle=True,
)

# Start from an empty config and fill in model / sampling / training settings.
cfg = empty_config()
cfg.model.set(
    n_layer=6,
    n_head=6,
    n_embd=90,
    block_size=32,
)
# top=1 means top_k(1): always pick the best item when sampling.
cfg.sample.set(top=1, max_batch_size=256)
cfg.train.set(log_period=0, sample_period=-5)
cfg.trainer.set(batch_size=128)

# Initialize a brand-new model from the config above.
ben.init_new(cfg)

Initializing new model roman2dec
Dataset train_path: ../data/roman2decimal10k.txt, val_path: None, train_split: 0.8999, vocab_size: 19
Model params: 0.59M


In [3]:
# Show the first 20 entries of the validation dataset as a sanity check.
ben.val_dataset.get_data()[:20]

['MMMMMMMMDCCCV=8805',
 'MMMMMMDCCLXXIV=6774',
 'MMMMMMDXIII=6513',
 'MCMXI=1911',
 'MMMMMMMMMDCVII=9607',
 'MMMMMMMMMDXXXVI=9536',
 'MMMMDCIII=4603',
 'MMMMLXXIX=4079',
 'MMDCCLXXXV=2785',
 'MMMMMMMMCDXXII=8422',
 'MMMMMMMCCLXIX=7269',
 'MMDXXVII=2527',
 'MMMMMDCCX=5710',
 'MMMCDXLII=3442',
 'MMMMMMDLXXXIV=6584',
 'MDCCVII=1707',
 'MMMMMMCMLXXIX=6979',
 'MMMCCIII=3203',
 'DCLXV=665',
 'MMMMMMMMCCCLVI=8356']

In [4]:
# Train the freshly initialized model for 5000 iterations.
ben.train(iter_count=5000)

Training
Iters per epoch: 70
iter 0 (0.000 epoch): loss train=2.5839, val=2.5864, eval->2.5864
==> Saving model at iter=0, eval loss->2.5864 
VVMMMMMMC5MMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMM
CUDA max memory used: 331.81M
....................................................................................................iter 100 (1.422 epoch): loss train=1.1103, val=1.1166, eval->1.1166
==> Saving model at iter=100, eval loss->1.1166 
....................................................................................................iter 200 (2.845 epoch): loss train=0.7847, val=0.7901, eval->0.7901
==> Saving model at iter=200, eval loss->0.7901 
....................................................................................................iter 300 (4.267 epoch): loss train=0.6495, val=0.6543, eval->0.6543
==> Saving model at iter=300, eval loss->0.6543 
................................................................................

In [7]:
# Evaluate on the validation set: split every sample at '=' into a question
# part (q) and an expected-answer part (a).
# NOTE(review): the exact meaning of sep_included=-1 is defined by gptbench;
# the logged errors suggest q keeps the trailing '=' — confirm.
ds = ben.val_dataset
q,a=ds.get_data_split(0, len(ds), sep='=', sep_included=-1)

errs = []
def test(q,a,g):
    global errs
    
    res = float(a == g)
    if not res:
        errs += [f"{q}={a} != {g}"]
    return res
    
# Print the accuracy computed with test() as the per-sample scoring function,
# then the number of mismatches it logged and the first 20 of them.
print(ben.measure_accuracy(q,a, test_fn=test))
print(f'{len(errs)}/{len(ds)} errors: {errs[:20]}')

0.999000999000999
1/1001 errors: ['VI==6 != 5']


In [6]:
# Same evaluation on the training set. Relies on test() from the
# validation cell, so that cell must have been run first.
ds = ben.train_dataset
q,a=ds.get_data_split(0, len(ds), sep='=', sep_included=-1)

# Start with an empty mismatch log; test() appends to this global list.
errs = []
print(ben.measure_accuracy(q,a, test_fn=test))
print(f'{len(errs)}/{len(ds)} errors: {errs[:20]}')

0.9993332592510279
6 ['III==3 != 2', 'VIII==8 != 08', 'MMMMMMMMMM==10000 != 9000', 'VII==7 != 07', 'IX==9 != 1', 'II==2 != 1']


In [None]:
# train a bit more?