In [51]:
from itertools import chain
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

import numpy as np

In [None]:
data = Path.home() / "github_data"

paths = [
    [
        str(x) for x in lang_dir.glob("*")
    ] 
    for lang_dir in data.iterdir()
    if lang_dir.is_dir()
]
paths = list(chain(*paths))

In [4]:
len(paths)

52870

In [7]:
2**15

32768

In [9]:
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=2**15, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
], show_progress=True)






In [18]:
tokenizer.save_model(".", "tokenizer")

['./tokenizer-vocab.json', './tokenizer-merges.txt']

In [19]:
tokenizer = ByteLevelBPETokenizer(
    "./tokenizer-vocab.json",
    "./tokenizer-merges.txt",
)

In [27]:
s = 'print("Hello, World!")'
encoding = tokenizer.encode('print("Hello, World!")')

print(len(s), len(encoding.ids))

22 6


In [42]:
max(len(token) for token in tokenizer.get_vocab().keys())

128

In [53]:
tokens, ids = zip(*tokenizer.get_vocab().items())
tokens = np.array(tokens)
ids = np.array(ids)

In [55]:
order = np.argsort(ids)
tokens = tokens[order]
ids = ids[order]

In [67]:
chr(288)

'Ġ'

In [94]:
tokens_bytes = []
lengths = []

for token, token_id in zip(tokens, ids):
    token_bytes = tokenizer.decode([token_id]).encode("utf-8")
    lengths.append(len(token_bytes))
    tokens_bytes.append(token_bytes)
    # break

In [101]:
lengths = np.array(lengths)

In [107]:
pointers = np.r_[0, (lengths + 1).cumsum()[:-1]]

In [113]:
len(tokens_bytes).to_bytes(length=4, signed=False, byteorder="little")

b'\x00\x80\x00\x00'

In [121]:

with open("tokenizer_vocab.bin", "wb") as file:
    n_tokens = len(tokens_bytes)
    sum_len = (lengths + 1).sum()

    file.write(n_tokens.to_bytes(length=4, signed=False, byteorder="little"))
    file.write(int(sum_len).to_bytes(length=4, signed=False, byteorder="little"))
    
    for offset in pointers:
        file.write(int(offset).to_bytes(length=8, signed=False, byteorder="little"))
        
    for token_bytes in tokens_bytes:
        file.write(token_bytes)
        file.write(b"\0")

In [128]:
tokenizer.encode("H").ids

[44]

In [120]:
tokens[-1]

'10010'

In [82]:
tokenizer.encode('ĠHello, world').ids

[133, 259, 12945, 16, 8044]

In [99]:
tokens_bytes[133]

b'\xef\xbf\xbd'

In [83]:
[
    tokenizer.decode([token_id])
    for token_id in tokenizer.encode('ĠHello, world').ids
]

['�', '�', 'Hello', ',', ' world']

In [74]:
list(token_bytes)

[60, 115, 62]

In [43]:
tokenizer.get_vocab_size()

32768

In [39]:
for id in tokenizer.encode("кек").ids:
    print(tokenizer.decode([id]))

к
ек


In [35]:
tokenizer.decode([225]).encode("utf8")[0]

32

In [None]:
import torch
from torch import nn

model = nn.Sequential(
    nn.Embedding()
)

In [29]:
for id in encoding.ids:
    print(tokenizer.decode([id]))

print
("
Hello
,
 World
!")
