In [46]:
import os
from tqdm import tqdm
import numpy as np
from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)
import tiktoken
from datasets import load_dataset # huggingface datasets
from tokenizers.processors import TemplateProcessing


In [47]:
num_proc = 8

# number of workers in load_dataset() call
# best number might be different from num_proc above as it also depends on NW speed.
# it is better than 1 usually though
num_proc_load_dataset = num_proc

In [48]:
dataset = load_dataset("evanfrick/lichess", num_proc=num_proc_load_dataset)

# owt by default only contains the 'train' split, so create a test split
split_dataset = dataset["train"].train_test_split(test_size=0.00025, seed=2357, shuffle=True)
split_dataset['val'] = split_dataset.pop('test') # rename the test split to val

# this results in:
# >>> split_dataset
# DatasetDict({
#     train: Dataset({
#         features: ['text'],
#         num_rows: 8009762
#     })
#     val: Dataset({
#         features: ['text'],
#         num_rows: 4007
#     })
# })

# we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
tokenizer = Tokenizer.from_file("/data/evan/chess-llm/tokenizer.model")
tokenizer.build

Resolving data files: 100%|██████████| 40/40 [00:01<00:00, 26.84it/s]


In [6]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'game', 'result'],
        num_rows: 19052702
    })
    val: Dataset({
        features: ['id', 'game', 'result'],
        num_rows: 4765
    })
})

In [7]:
enc = tiktoken.get_encoding("gpt2")

In [16]:
split_dataset['train'][0]['game']

'<w>e2-e4<b>e7-e6<w>d2-d3<b>d7-d5<w>Nb1-d2<b>Ng8-f6<w>Ng1-f3<b>c7-c5<w>g2-g3<b>Nb8-c6<w>Bf1-g2<b>Bf8-e7<w>O-O<b>O-O<w>Rf1-e1<b>b7-b5<w>e4-e5<b>Nf6-d7<w>Nd2-f1<b>a7-a5<w>h2-h4<b>c5-c4<w>d3-d4<b>b5-b4<w>c2-c3<b>a5-a4<w>a2-a3<b>b4xc3<w>b2xc3<b>Nc6-a5<w>h4-h5<b>Na5-b3<w>Ra1-a2<b>Nd7-b8<w>Bc1-f4<b>Nb8-a6<w>h5-h6<b>g7-g6<w>Nf1-h2<b>Na6-c7<w>Nh2-g4<b>Nc7-b5<w>Qd1-c2<b>Nb5xa3<w>Ra2xa3<b>Be7xa3<w>Ng4-f6+<b>Kg8-h8<w>Nf3-g5<b>Ba3-e7<w>Nf6xh7<b>a4-a3<w>Nh7xf8<b>Qd8xf8<w>Qc2-a2<b>Bc8-d7<w>Bg2-f3<b>Ra8-b8<w>Bf3-d1<b>Bd7-a4<w>Kg1-g2<b>Ba4-c6<w>Ng5-f3<b>Nb3-a5<w>Re1-h1<b>Rb8-b2<w>Qa2-a1<b>Na5-b3<w>Bd1xb3<b>c4xb3<w>Bf4-g5<b>Rb2-a2<w>Qa1-b1<b>Ra2-b2<w>Qb1-c1<b>Rb2-c2<w>Qc1-f4<b>b3-b2<w>Bg5-f6+'

In [22]:
len(enc.encode_ordinary(split_dataset['train'][0]['game']))

663

In [43]:
ids = tokenizer.encode(split_dataset['train'][0]['game'], add_special_tokens=True).ids
ids

[0,
 22908,
 1,
 22560,
 0,
 22974,
 1,
 22725,
 0,
 10215,
 1,
 14211,
 0,
 18693,
 1,
 23034,
 0,
 23235,
 1,
 21117,
 0,
 14625,
 1,
 14475,
 0,
 23247,
 1,
 23247,
 0,
 14403,
 1,
 22239,
 0,
 22911,
 1,
 9795,
 0,
 12549,
 1,
 23025,
 0,
 22134,
 1,
 22038,
 0,
 22140,
 1,
 22185,
 0,
 22587,
 1,
 22254,
 0,
 23031,
 1,
 22758,
 0,
 22707,
 1,
 17571,
 0,
 22896,
 1,
 8241,
 0,
 13743,
 1,
 20433,
 0,
 21189,
 1,
 9945,
 0,
 22833,
 1,
 22266,
 0,
 9597,
 1,
 5421,
 0,
 11523,
 1,
 19743,
 0,
 8871,
 1,
 546,
 0,
 19194,
 1,
 10830,
 0,
 18568,
 1,
 14991,
 0,
 18651,
 1,
 10845,
 0,
 19710,
 1,
 22809,
 0,
 5124,
 1,
 19974,
 0,
 18807,
 1,
 5799,
 0,
 7095,
 1,
 2421,
 0,
 18081,
 1,
 1461,
 0,
 5715,
 1,
 14721,
 0,
 16851,
 1,
 12495,
 0,
 11931,
 1,
 1089,
 0,
 5433,
 1,
 8241,
 0,
 16776,
 1,
 22932,
 0,
 12105,
 1,
 9027,
 0,
 21051,
 1,
 11379,
 0,
 12333,
 1,
 3003,
 0,
 5019,
 1,
 22986,
 0,
 8338]

In [44]:
tokenizer.decode(ids, skip_special_tokens=False)

'<w> e2-e4 <b> e7-e6 <w> d2-d3 <b> d7-d5 <w> Nb1-d2 <b> Ng8-f6 <w> Ng1-f3 <b> c7-c5 <w> g2-g3 <b> Nb8-c6 <w> Bf1-g2 <b> Bf8-e7 <w> O-O <b> O-O <w> Rf1-e1 <b> b7-b5 <w> e4-e5 <b> Nf6-d7 <w> Nd2-f1 <b> a7-a5 <w> h2-h4 <b> c5-c4 <w> d3-d4 <b> b5-b4 <w> c2-c3 <b> a5-a4 <w> a2-a3 <b> b4xc3 <w> b2xc3 <b> Nc6-a5 <w> h4-h5 <b> Na5-b3 <w> Ra1-a2 <b> Nd7-b8 <w> Bc1-f4 <b> Nb8-a6 <w> h5-h6 <b> g7-g6 <w> Nf1-h2 <b> Na6-c7 <w> Nh2-g4 <b> Nc7-b5 <w> Qd1-c2 <b> Nb5xa3 <w> Ra2xa3 <b> Be7xa3 <w> Ng4-f6+ <b> Kg8-h8 <w> Nf3-g5 <b> Ba3-e7 <w> Nf6xh7 <b> a4-a3 <w> Nh7xf8 <b> Qd8xf8 <w> Qc2-a2 <b> Bc8-d7 <w> Bg2-f3 <b> Ra8-b8 <w> Bf3-d1 <b> Bd7-a4 <w> Kg1-g2 <b> Ba4-c6 <w> Ng5-f3 <b> Nb3-a5 <w> Re1-h1 <b> Rb8-b2 <w> Qa2-a1 <b> Na5-b3 <w> Bd1xb3 <b> c4xb3 <w> Bf4-g5 <b> Rb2-a2 <w> Qa1-b1 <b> Ra2-b2 <w> Qb1-c1 <b> Rb2-c2 <w> Qc1-f4 <b> b3-b2 <w> Bg5-f6+'

In [45]:
tokenizer.eos_token

AttributeError: 'tokenizers.Tokenizer' object has no attribute 'eos_token'

In [None]:
def process(example):
        #ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
        ids = tokenizer.batch_encode(example['game'])
        #ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
        # note: I think eot should be prepended not appended... hmm. it's called "eot" though...
        out = {'ids': ids, 'len': len(ids)}
        return out