# **byteBuddy**

### **tokenization**
*inspiration:* https://tiktokenizer.vercel.app

* why can't LLMs spell words?
* why can't LLMs do super-simple string processing tasks like reversing a string?
* why are LLMs worse at non-English languages (e.g. Japanese)?
* why are LLMs bad at simple arithmetic?
* why did GPT-2 have more trouble than necessary coding in Python?
* why did my LLM abruptly halt when it saw the string "<|endoftext|>"?
* what is this weird warning I get about a "trailing whitespace"?
* why does the LLM break if I ask about "SolidGoldMagikarp"?
* why should I prefer to use YAML over JSON with LLMs?
* why is an LLM not actually end-to-end language modelling?

## unicode code points

In [1]:
# round-trip a string through its Unicode code points
s = 'hello, world'
t = list(map(ord, s))       # character -> integer code point
s = list(map(chr, t))       # code point -> character (`s` is now a list of 1-char strings)
s, t

(['h', 'e', 'l', 'l', 'o', ',', ' ', 'w', 'o', 'r', 'l', 'd'],
 [104, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100])

## unicode byte encodings

In [2]:
# show the same string under three Unicode byte encodings
s = 'こんにちは世界'
for codec in ('utf-8', 'utf-16', 'utf-32'):
    print(s.encode(codec))

b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf\xe4\xb8\x96\xe7\x95\x8c'
b'\xff\xfeS0\x930k0a0o0\x16NLu'
b'\xff\xfe\x00\x00S0\x00\x00\x930\x00\x00k0\x00\x00a0\x00\x00o0\x00\x00\x16N\x00\x00Lu\x00\x00'


In [3]:
# the same three encodings, shown as lists of integer byte values
for codec in ('utf-8', 'utf-16', 'utf-32'):
    print(list(s.encode(codec)))

[227, 129, 147, 227, 130, 147, 227, 129, 171, 227, 129, 161, 227, 129, 175, 228, 184, 150, 231, 149, 140]
[255, 254, 83, 48, 147, 48, 107, 48, 97, 48, 111, 48, 22, 78, 76, 117]
[255, 254, 0, 0, 83, 48, 0, 0, 147, 48, 0, 0, 107, 48, 0, 0, 97, 48, 0, 0, 111, 48, 0, 0, 22, 78, 0, 0, 76, 117, 0, 0]


In [4]:
text = 'Hello, 世界! 🌍😊 Привет мир! 🚀🌟 नमस्ते दुनिया! 🎉📚 こんにちは世界! 🎌🐉 안녕하세요 세계! 🎵🖋️'
# UTF-8 encode the text and view the raw bytes as integers in the range 0..255
tokens = list(text.encode('utf-8'))
divider = '----------'
# first the text with its length in code points, then the byte values with their count
for shown in (text, tokens):
    print(divider)
    print(shown)
    print('length:', len(shown))

----------
Hello, 世界! 🌍😊 Привет мир! 🚀🌟 नमस्ते दुनिया! 🎉📚 こんにちは世界! 🎌🐉 안녕하세요 세계! 🎵🖋️
length: 72
----------
[72, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140, 33, 32, 240, 159, 140, 141, 240, 159, 152, 138, 32, 208, 159, 209, 128, 208, 184, 208, 178, 208, 181, 209, 130, 32, 208, 188, 208, 184, 209, 128, 33, 32, 240, 159, 154, 128, 240, 159, 140, 159, 32, 224, 164, 168, 224, 164, 174, 224, 164, 184, 224, 165, 141, 224, 164, 164, 224, 165, 135, 32, 224, 164, 166, 224, 165, 129, 224, 164, 168, 224, 164, 191, 224, 164, 175, 224, 164, 190, 33, 32, 240, 159, 142, 137, 240, 159, 147, 154, 32, 227, 129, 147, 227, 130, 147, 227, 129, 171, 227, 129, 161, 227, 129, 175, 228, 184, 150, 231, 149, 140, 33, 32, 240, 159, 142, 140, 240, 159, 144, 137, 32, 236, 149, 136, 235, 133, 149, 237, 149, 152, 236, 132, 184, 236, 154, 148, 32, 236, 132, 184, 234, 179, 132, 33, 32, 240, 159, 142, 181, 240, 159, 150, 139, 239, 184, 143]
length: 169


In [5]:
text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'
# UTF-8 encode the text and view the raw bytes as integers in the range 0..255
tokens = list(text.encode('utf-8'))
divider = '----------'
# first the text with its length in code points, then the byte values with their count
for shown in (text, tokens):
    print(divider)
    print(shown)
    print('length:', len(shown))

----------
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
length: 445
----------
[76, 111, 114, 101, 109, 32, 105, 112, 115, 117, 109, 32, 100, 111, 108, 111, 114, 32, 115, 105, 116, 32, 97, 109, 101, 116, 44, 32, 99, 111, 110, 115, 101, 99, 116, 101, 116, 117, 114, 32, 97, 100, 105, 112, 105, 115, 99, 105, 110, 103, 32, 101, 108, 105, 116, 44, 32, 115, 101, 100, 32, 100, 111, 32, 101, 105, 117, 115, 109, 111, 100, 32, 116, 101, 109, 112, 111, 114, 32, 105, 110, 99, 105, 100, 105, 100, 117, 110, 116, 32, 117, 116, 32, 108, 97, 98, 111, 114, 101, 32, 101, 116, 32, 100, 111, 108, 111, 114, 101, 3

## **byte-pair encoding (BPE) algorithm**

In [6]:
def get_stats(ids):
    """Return a dict mapping each adjacent pair in `ids` to its number of occurrences.

    Keys are `(ids[k], ids[k + 1])` tuples, inserted in order of first
    appearance. An input with fewer than two elements yields an empty dict.
    """
    pair_counts = {}
    successors = ids[1:]
    # zip stops at the shorter sequence, so every element is paired with its right neighbour
    for left, right in zip(ids, successors):
        key = (left, right)
        if key in pair_counts:
            pair_counts[key] += 1
        else:
            pair_counts[key] = 1
    return pair_counts

In [7]:
# count adjacent byte pairs, then list them from most to least frequent
stats = get_stats(tokens)
print(stats)
by_count = [(count, pair) for pair, count in stats.items()]
by_count.sort(reverse=True)
print(by_count)

{(76, 111): 1, (111, 114): 9, (114, 101): 7, (101, 109): 2, (109, 32): 6, (32, 105): 7, (105, 112): 3, (112, 115): 1, (115, 117): 2, (117, 109): 3, (32, 100): 6, (100, 111): 6, (111, 108): 6, (108, 111): 4, (114, 32): 5, (32, 115): 4, (115, 105): 3, (105, 116): 6, (116, 32): 16, (32, 97): 7, (97, 109): 3, (109, 101): 1, (101, 116): 3, (116, 44): 3, (44, 32): 4, (32, 99): 6, (99, 111): 4, (111, 110): 4, (110, 115): 2, (115, 101): 5, (101, 99): 2, (99, 116): 1, (116, 101): 5, (116, 117): 2, (117, 114): 4, (97, 100): 2, (100, 105): 2, (112, 105): 2, (105, 115): 5, (115, 99): 1, (99, 105): 5, (105, 110): 7, (110, 103): 1, (103, 32): 1, (32, 101): 10, (101, 108): 2, (108, 105): 5, (101, 100): 1, (100, 32): 5, (111, 32): 3, (101, 105): 1, (105, 117): 1, (117, 115): 1, (115, 109): 1, (109, 111): 3, (111, 100): 2, (32, 116): 1, (109, 112): 1, (112, 111): 1, (110, 99): 1, (105, 100): 5, (100, 117): 1, (117, 110): 3, (110, 116): 5, (32, 117): 3, (117, 116): 3, (32, 108): 3, (108, 97): 5, (97, 98

In [8]:
# the most frequent pair; on ties max() keeps the first pair encountered
top_pair = max(stats.items(), key=lambda item: item[1])[0]
top_pair

(116, 32)

In [9]:
def merge(ids, pair, idx):
    """Return a new list in which each occurrence of `pair` in `ids` is replaced by `idx`.

    The scan runs left to right and a matched pair is consumed as a whole, so
    overlapping candidates are resolved in favour of the leftmost one
    (e.g. merging (1, 1) in [1, 1, 1] gives [idx, 1]). `ids` is not modified.
    """
    merged = []
    total = len(ids)
    pos = 0
    while pos < total:
        # a pair can only start here if there is still an element to the right
        starts_pair = (
            pos + 1 < total
            and pair[0] == ids[pos]
            and pair[1] == ids[pos + 1]
        )
        if starts_pair:
            merged.append(idx)
            pos += 2        # skip both members of the matched pair
            continue
        merged.append(ids[pos])
        pos += 1
    return merged

In [10]:
# byte values occupy 0..255, so 256 is the first unused token id
new_id = 256
tokens = merge(tokens, top_pair, new_id)
print(tokens)
print('length:', len(tokens))

[76, 111, 114, 101, 109, 32, 105, 112, 115, 117, 109, 32, 100, 111, 108, 111, 114, 32, 115, 105, 256, 97, 109, 101, 116, 44, 32, 99, 111, 110, 115, 101, 99, 116, 101, 116, 117, 114, 32, 97, 100, 105, 112, 105, 115, 99, 105, 110, 103, 32, 101, 108, 105, 116, 44, 32, 115, 101, 100, 32, 100, 111, 32, 101, 105, 117, 115, 109, 111, 100, 32, 116, 101, 109, 112, 111, 114, 32, 105, 110, 99, 105, 100, 105, 100, 117, 110, 256, 117, 256, 108, 97, 98, 111, 114, 101, 32, 101, 256, 100, 111, 108, 111, 114, 101, 32, 109, 97, 103, 110, 97, 32, 97, 108, 105, 113, 117, 97, 46, 32, 85, 256, 101, 110, 105, 109, 32, 97, 100, 32, 109, 105, 110, 105, 109, 32, 118, 101, 110, 105, 97, 109, 44, 32, 113, 117, 105, 115, 32, 110, 111, 115, 116, 114, 117, 100, 32, 101, 120, 101, 114, 99, 105, 116, 97, 116, 105, 111, 110, 32, 117, 108, 108, 97, 109, 99, 111, 32, 108, 97, 98, 111, 114, 105, 115, 32, 110, 105, 115, 105, 32, 117, 256, 97, 108, 105, 113, 117, 105, 112, 32, 101, 120, 32, 101, 97, 32, 99, 111, 109, 109, 1

### training the tokenizer

In [11]:
# read the training corpus; the context manager closes the file handle as soon as
# the read finishes instead of leaving it open until garbage collection
with open('tiny_shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()
tokens = list(text.encode('utf-8'))     # raw UTF-8 bytes as integers in 0..255
print(tokens[:80])
print(len(tokens))

[70, 105, 114, 115, 116, 32, 67, 105, 116, 105, 122, 101, 110, 58, 10, 66, 101, 102, 111, 114, 101, 32, 119, 101, 32, 112, 114, 111, 99, 101, 101, 100, 32, 97, 110, 121, 32, 102, 117, 114, 116, 104, 101, 114, 44, 32, 104, 101, 97, 114, 32, 109, 101, 32, 115, 112, 101, 97, 107, 46, 10, 10, 65, 108, 108, 58, 10, 83, 112, 101, 97, 107, 44, 32, 115, 112, 101, 97, 107, 46]
1115394


In [12]:
vocab_size = 324                    # the desired final vocabulary size
num_merges = vocab_size - 256       # ids 0..255 are the raw bytes; each merge adds one new id
ids = list(tokens)                  # work on a copy so `tokens` stays untouched

merges = {}                         # (int, int) pair -> id of the merged token
for i, idx in enumerate(range(256, 256 + num_merges)):
    # pick the most frequent adjacent pair in the current sequence
    stats = get_stats(ids)
    pair = max(stats, key=stats.get)

    # record the rule, then rewrite the sequence using the new token id
    merges[pair] = idx
    print(f'merging {pair} into a new token {idx}')
    ids = merge(ids, pair, idx)

merging (101, 32) into a new token 256
merging (116, 104) into a new token 257
merging (116, 32) into a new token 258
merging (115, 32) into a new token 259
merging (100, 32) into a new token 260
merging (44, 32) into a new token 261
merging (111, 117) into a new token 262
merging (101, 114) into a new token 263
merging (105, 110) into a new token 264
merging (121, 32) into a new token 265
merging (97, 110) into a new token 266
merging (58, 10) into a new token 267
merging (111, 114) into a new token 268
merging (111, 32) into a new token 269
merging (101, 110) into a new token 270
merging (10, 10) into a new token 271
merging (97, 114) into a new token 272
merging (32, 257) into a new token 273
merging (111, 110) into a new token 274
merging (108, 108) into a new token 275
merging (104, 97) into a new token 276
merging (44, 10) into a new token 277
merging (46, 271) into a new token 278
merging (105, 259) into a new token 279
merging (101, 115) into a new token 280
merging (121, 262) 

In [13]:
# seed the vocabulary with the 256 single-byte tokens (id -> bytes)
vocab = dict((byte, bytes((byte,))) for byte in range(256))
# a merged token's bytes are the concatenation of its two children's bytes;
# `merges` is iterated in insertion order, so children are defined before they are needed
for (p0, p1), idx in merges.items():
    vocab[idx] = b''.join((vocab[p0], vocab[p1]))

In [14]:
# compare sequence lengths before (raw bytes) and after (merged ids) applying the merges
n_bytes = len(tokens)
n_ids = len(ids)
print('token length:', n_bytes)
print('ids length:', n_ids)
print(f'compression ratio: {n_bytes/n_ids:.2f}x')

token length: 1115394
ids length: 736256
compression ratio: 1.51x


**Note:** The tokenizer is a completely separate, independent module from the LLM. It has its own training dataset of text (which could be different from that of the LLM), on which you train the vocabulary using the Byte Pair Encoding (BPE) algorithm. It then translates back and forth between raw text and sequences of tokens. The LLM later only ever sees the tokens and never directly deals with any text.

<img src="images/tokenizer_llm.png"/>

## **decoding**

In [15]:
def decode(ids):
    """Turn a sequence of token ids back into a Python string.

    Each id is looked up in the module-level `vocab` (id -> bytes), the byte
    chunks are concatenated, and the result is decoded as UTF-8.
    """
    chunks = [vocab[token_id] for token_id in ids]
    raw = b''.join(chunks)
    # errors='replace' substitutes U+FFFD for invalid byte sequences instead of
    # raising; an arbitrary id sequence need not form valid UTF-8
    return raw.decode('utf-8', errors='replace')

In [16]:
print(decode(ids)[:80])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.


## **encoding**

In [17]:
def encode(text):
    """Convert a string into a list of token ids using the learned `merges`.

    The text is UTF-8 encoded into byte values, then the merge rules from the
    module-level `merges` dict are applied repeatedly, always choosing the
    applicable rule with the lowest merged-token id, until none applies.
    """
    ids = list(text.encode('utf-8'))

    # with fewer than two tokens there is no pair left to merge
    while len(ids) >= 2:
        candidates = get_stats(ids)
        # among the pairs present, take the one with the lowest merged-token id;
        # pairs without a merge rule rank as infinity
        best = min(candidates, key=lambda p: merges.get(p, float('inf')))
        if best not in merges:
            # even the best candidate has no rule, so nothing else can be merged
            break
        ids = merge(ids, best, merges[best])

    return ids

In [18]:
print(encode(text)[:80])

[70, 299, 310, 67, 316, 105, 122, 270, 267, 66, 101, 300, 256, 119, 256, 112, 114, 111, 99, 101, 101, 260, 266, 265, 102, 117, 114, 257, 263, 261, 104, 314, 287, 256, 115, 112, 286, 107, 278, 65, 275, 267, 83, 112, 286, 107, 261, 115, 112, 286, 107, 278, 70, 299, 310, 67, 316, 105, 122, 270, 267, 89, 262, 32, 272, 256, 97, 308, 114, 280, 111, 108, 118, 101, 260, 114, 97, 257, 322, 283]


In [19]:
val_text = 'Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.'
print(val_text == decode(encode(val_text)))

True


### forced splits using regular expressions
**GitHub repository:** https://github.com/openai/gpt-2

**regular expression pattern**

1. `'s|'t|'re|'ve|'m|'ll|'d` - Matches contractions like 's, 't, 're, 've, 'm, 'll, 'd.
2. ` ?\p{L}+` - Matches sequences of one or more letters (\p{L}), optionally preceded by a space.
3. ` ?\p{N}+` - Matches sequences of one or more digits (\p{N}), optionally preceded by a space.
4. ` ?[^\s\p{L}\p{N}]+` - Matches sequences of one or more characters that are neither whitespace, letters (\p{L}), nor digits (\p{N}), optionally preceded by a space.
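5. `\s+(?!\S)` - Matches a run of whitespace that is not immediately followed by a non-whitespace character. For a run of spaces in front of a word this leaves the last space unmatched, so it can attach to the following word through the ` ?` prefix of patterns 2-4; whitespace at the very end of the text is matched in full.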
6. `\s+` - Matches one or more whitespace characters.

In [20]:
import regex as re
# compile the GPT-2 split pattern once, then list every chunk it finds in a sample sentence
gpt2pattern = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
print(gpt2pattern.findall("hello, world! I'm GPT-2 tokenizer!"))

['hello', ',', ' world', '!', ' I', "'m", ' GPT', '-', '2', ' tokenizer', '!']


### tiktoken OpenAI library
**GitHub repository:** https://github.com/openai/tiktoken

In [21]:
import tiktoken
# GPT-2 tokenizer: consecutive spaces are not merged into a single token
enc = tiktoken.get_encoding('r50k_base')
sample_ids = enc.encode('hello, world!   ')
print(sample_ids)

[31373, 11, 995, 0, 220, 220, 220]


In [22]:
import tiktoken
# GPT-4 tokenizer: a run of spaces can be merged into a single token
enc = tiktoken.get_encoding('cl100k_base')
sample_ids = enc.encode('hello, world!   ')
print(sample_ids)

[15339, 11, 1917, 0, 262]


### GPT-2 tokenizer inference demonstration

In [23]:
import os
import json

# equivalent to `vocab` implementation (here the mapping goes token string -> id)
# Explicit UTF-8: the file contains non-ASCII keys such as '¡', and the default
# text encoding of open() depends on the platform locale.
with open('gpt2_files/encoder.json', 'r', encoding='utf-8') as f:
    encoder = json.load(f)

# equivalent to `merges` implementation
with open('gpt2_files/vocab.bpe', 'r', encoding='utf-8') as f:
    bpe_data = f.read()

# The first line is dropped, as in the original code (presumably a header line).
# Blank lines are skipped too, so a trailing newline at the end of the file does
# not produce a spurious empty tuple `()` as the last entry.
bpe_merges = [
    tuple(merge_str.split())
    for merge_str in bpe_data.split('\n')[1:]
    if merge_str.strip()
]

In [24]:
encoder

{'!': 0,
 '"': 1,
 '#': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 '(': 7,
 ')': 8,
 '*': 9,
 '+': 10,
 ',': 11,
 '-': 12,
 '.': 13,
 '/': 14,
 '0': 15,
 '1': 16,
 '2': 17,
 '3': 18,
 '4': 19,
 '5': 20,
 '6': 21,
 '7': 22,
 '8': 23,
 '9': 24,
 ':': 25,
 ';': 26,
 '<': 27,
 '=': 28,
 '>': 29,
 '?': 30,
 '@': 31,
 'A': 32,
 'B': 33,
 'C': 34,
 'D': 35,
 'E': 36,
 'F': 37,
 'G': 38,
 'H': 39,
 'I': 40,
 'J': 41,
 'K': 42,
 'L': 43,
 'M': 44,
 'N': 45,
 'O': 46,
 'P': 47,
 'Q': 48,
 'R': 49,
 'S': 50,
 'T': 51,
 'U': 52,
 'V': 53,
 'W': 54,
 'X': 55,
 'Y': 56,
 'Z': 57,
 '[': 58,
 '\\': 59,
 ']': 60,
 '^': 61,
 '_': 62,
 '`': 63,
 'a': 64,
 'b': 65,
 'c': 66,
 'd': 67,
 'e': 68,
 'f': 69,
 'g': 70,
 'h': 71,
 'i': 72,
 'j': 73,
 'k': 74,
 'l': 75,
 'm': 76,
 'n': 77,
 'o': 78,
 'p': 79,
 'q': 80,
 'r': 81,
 's': 82,
 't': 83,
 'u': 84,
 'v': 85,
 'w': 86,
 'x': 87,
 'y': 88,
 'z': 89,
 '{': 90,
 '|': 91,
 '}': 92,
 '~': 93,
 '¡': 94,
 '¢': 95,
 '£': 96,
 '¤': 97,
 '¥': 98,
 '¦': 99,
 '§': 100

In [25]:
bpe_merges

[('Ġ', 't'),
 ('Ġ', 'a'),
 ('h', 'e'),
 ('i', 'n'),
 ('r', 'e'),
 ('o', 'n'),
 ('Ġt', 'he'),
 ('e', 'r'),
 ('Ġ', 's'),
 ('a', 't'),
 ('Ġ', 'w'),
 ('Ġ', 'o'),
 ('e', 'n'),
 ('Ġ', 'c'),
 ('i', 't'),
 ('i', 's'),
 ('a', 'n'),
 ('o', 'r'),
 ('e', 's'),
 ('Ġ', 'b'),
 ('e', 'd'),
 ('Ġ', 'f'),
 ('in', 'g'),
 ('Ġ', 'p'),
 ('o', 'u'),
 ('Ġa', 'n'),
 ('a', 'l'),
 ('a', 'r'),
 ('Ġt', 'o'),
 ('Ġ', 'm'),
 ('Ġo', 'f'),
 ('Ġ', 'in'),
 ('Ġ', 'd'),
 ('Ġ', 'h'),
 ('Ġan', 'd'),
 ('i', 'c'),
 ('a', 's'),
 ('l', 'e'),
 ('Ġt', 'h'),
 ('i', 'on'),
 ('o', 'm'),
 ('l', 'l'),
 ('en', 't'),
 ('Ġ', 'n'),
 ('Ġ', 'l'),
 ('s', 't'),
 ('Ġ', 're'),
 ('v', 'e'),
 ('Ġ', 'e'),
 ('r', 'o'),
 ('l', 'y'),
 ('Ġb', 'e'),
 ('Ġ', 'g'),
 ('Ġ', 'T'),
 ('c', 't'),
 ('Ġ', 'S'),
 ('i', 'd'),
 ('o', 't'),
 ('Ġ', 'I'),
 ('u', 't'),
 ('e', 't'),
 ('Ġ', 'A'),
 ('Ġ', 'is'),
 ('Ġ', 'on'),
 ('i', 'm'),
 ('a', 'm'),
 ('o', 'w'),
 ('a', 'y'),
 ('a', 'd'),
 ('s', 'e'),
 ('Ġth', 'at'),
 ('Ġ', 'C'),
 ('i', 'g'),
 ('Ġf', 'or'),
 ('a', 'c'),
 ('Ġ

#### special tokens

In [26]:
# 256 raw byte tokens + 50,000 merges + 1 special token
len(encoder)

50257

In [27]:
encoder['<|endoftext|>']

50256

### sentencepiece tokenizer
**GitHub repository:** https://github.com/google/sentencepiece 

Commonly used because, unlike tiktoken, it can efficiently both train BPE tokenizers and run inference with them. It is used in both the **Llama** and **Mistral** series. sentencepiece runs BPE on the Unicode code points directly! It then has an option `character_coverage` for what to do with very rare code points that appear only a few times: it either maps them onto an UNK token or, if `byte_fallback` is turned on, encodes them with UTF-8 and emits the individual raw bytes as tokens instead.

In other words, tiktoken first encodes the text to UTF-8 and then runs BPE on the bytes. sentencepiece runs BPE on the code points and optionally falls back to UTF-8 bytes for rare code points (rarity is determined by the `character_coverage` hyperparameter), which then get translated to byte tokens.


**sentencepiece** Training Options markdown [LINK](https://github.com/google/sentencepiece/blob/master/doc/options.md)
<br>
**sentencepiece** Training Options [protocol](https://protobuf.dev) buffer [LINK](https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto#L193)

In [28]:
import sentencepiece as spm

with open('tiny_shakespeare.txt', 'r', encoding='utf-8') as f:
    text = f.read()

#### Llama2 tokenizer

In [29]:
import os

# train a sentencepiece model on the corpus file
options = {
    # input specifications
    'input': 'tiny_shakespeare.txt',
    'input_format': 'text',

    # output specifications
    'model_prefix': 't1k_bfT',                  # output filename prefix

    # algorithm specifications
    'model_type': 'bpe',                        # bpe algorithm
    'vocab_size': 1000,

    # normalization
    'normalization_rule_name': 'identity',      # turn off normalization
    'remove_extra_whitespaces': False,
    'input_sentence_size': 200000000,           # max number of training sentences
    'max_sentence_length': 4192,                # max number of bytes per sentence
    'seed_sentencepiece_size': 1000000,
    'shuffle_input_sentence': True,

    # rare word treatment
    'character_coverage': 0.99995,
    'byte_fallback': True,                      # uncovered characters become <0xNN> byte pieces

    # merge rules
    'split_digits': True,
    'split_by_unicode_script': True,
    'split_by_whitespace': True,
    'split_by_number': True,
    'max_sentencepiece_length': 16,
    'add_dummy_prefix': True,
    'allow_whitespace_only_pieces': True,

    # special tokens
    'unk_id': 0,                                # the UNK token must exist
    'bos_id': 1,
    'eos_id': 2,
    'pad_id': -1,                               # others are optional, set to -1 to turn off

    # systems
    'num_threads': os.cpu_count(),              # use all system resources
}

spm.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: tiny_shakespeare.txt
  input_format: text
  model_prefix: t1k_bfT
  model_type: BPE
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.99995
  input_sentence_size: 200000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 11
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 1
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_pr

In [30]:
sp = spm.SentencePieceProcessor()
sp.load('t1k_bfT.model')

iner.cc(268) LOG(INFO) Added: freq=190 size=540 all=11320 active=1409 piece=uke
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=181 size=560 all=11524 active=1613 piece=▁true
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=175 size=580 all=11646 active=1735 piece=▁doth
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=165 size=600 all=11775 active=1864 piece=▁Ed
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=165 min_freq=55
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=162 size=620 all=11911 active=1134 piece=▁MENEN
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=158 size=640 all=12015 active=1238 piece=▁PETRUCHIO
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=150 size=660 all=12268 active=1491 piece=ilt
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=145 size=680 all=12379 active=1602 piece=less
trainer_interface.cc(687) LOG(INFO) Saving model: t1k_bfT.model
trainer_interface.cc(699) LOG(INFO) Saving vocabs: t1k_bfT.vocab


True

In [31]:
# list every (piece, id) pair of the loaded sentencepiece model
# NOTE: this rebinds the module-level name `vocab`; decode() defined earlier in this
# notebook looks up `vocab[idx]` expecting bytes values, so rebuild the BPE vocab
# before calling decode() again
piece_ids = range(sp.get_piece_size())
vocab = [(sp.id_to_piece(piece_id), piece_id) for piece_id in piece_ids]
vocab

[('<unk>', 0),
 ('<s>', 1),
 ('</s>', 2),
 ('<0x00>', 3),
 ('<0x01>', 4),
 ('<0x02>', 5),
 ('<0x03>', 6),
 ('<0x04>', 7),
 ('<0x05>', 8),
 ('<0x06>', 9),
 ('<0x07>', 10),
 ('<0x08>', 11),
 ('<0x09>', 12),
 ('<0x0A>', 13),
 ('<0x0B>', 14),
 ('<0x0C>', 15),
 ('<0x0D>', 16),
 ('<0x0E>', 17),
 ('<0x0F>', 18),
 ('<0x10>', 19),
 ('<0x11>', 20),
 ('<0x12>', 21),
 ('<0x13>', 22),
 ('<0x14>', 23),
 ('<0x15>', 24),
 ('<0x16>', 25),
 ('<0x17>', 26),
 ('<0x18>', 27),
 ('<0x19>', 28),
 ('<0x1A>', 29),
 ('<0x1B>', 30),
 ('<0x1C>', 31),
 ('<0x1D>', 32),
 ('<0x1E>', 33),
 ('<0x1F>', 34),
 ('<0x20>', 35),
 ('<0x21>', 36),
 ('<0x22>', 37),
 ('<0x23>', 38),
 ('<0x24>', 39),
 ('<0x25>', 40),
 ('<0x26>', 41),
 ('<0x27>', 42),
 ('<0x28>', 43),
 ('<0x29>', 44),
 ('<0x2A>', 45),
 ('<0x2B>', 46),
 ('<0x2C>', 47),
 ('<0x2D>', 48),
 ('<0x2E>', 49),
 ('<0x2F>', 50),
 ('<0x30>', 51),
 ('<0x31>', 52),
 ('<0x32>', 53),
 ('<0x33>', 54),
 ('<0x34>', 55),
 ('<0x35>', 56),
 ('<0x36>', 57),
 ('<0x37>', 58),
 ('<0x38>', 5

In [32]:
ids = sp.encode('Hello, 世界! 🌍😊 Привет мир! 🚀🌟 नमस्ते दुनिया! 🎉📚 こんにちは世界! 🎌🐉 안녕하세요 세계! 🎵🖋️')
print(ids)

[329, 429, 942, 954, 939, 231, 187, 153, 234, 152, 143, 984, 939, 243, 162, 143, 144, 243, 162, 155, 141, 939, 211, 162, 212, 131, 211, 187, 211, 181, 211, 184, 212, 133, 939, 211, 191, 211, 187, 212, 131, 984, 939, 243, 162, 157, 131, 243, 162, 143, 162, 939, 227, 167, 171, 227, 167, 177, 227, 167, 187, 227, 168, 144, 227, 167, 167, 227, 168, 138, 939, 227, 167, 169, 227, 168, 132, 227, 167, 171, 227, 167, 194, 227, 167, 178, 227, 167, 193, 984, 939, 243, 162, 145, 140, 243, 162, 150, 157, 939, 230, 132, 150, 230, 133, 150, 230, 132, 174, 230, 132, 164, 230, 132, 178, 231, 187, 153, 234, 152, 143, 984, 939, 243, 162, 145, 143, 243, 162, 147, 140, 939, 239, 152, 139, 238, 136, 152, 240, 152, 155, 239, 135, 187, 239, 157, 151, 939, 239, 135, 187, 237, 182, 135, 984, 939, 243, 162, 145, 184, 243, 162, 153, 142, 242, 187, 146]


In [33]:
print([sp.id_to_piece(idx) for idx in ids])

['▁H', 'ell', 'o', ',', '▁', '<0xE4>', '<0xB8>', '<0x96>', '<0xE7>', '<0x95>', '<0x8C>', '!', '▁', '<0xF0>', '<0x9F>', '<0x8C>', '<0x8D>', '<0xF0>', '<0x9F>', '<0x98>', '<0x8A>', '▁', '<0xD0>', '<0x9F>', '<0xD1>', '<0x80>', '<0xD0>', '<0xB8>', '<0xD0>', '<0xB2>', '<0xD0>', '<0xB5>', '<0xD1>', '<0x82>', '▁', '<0xD0>', '<0xBC>', '<0xD0>', '<0xB8>', '<0xD1>', '<0x80>', '!', '▁', '<0xF0>', '<0x9F>', '<0x9A>', '<0x80>', '<0xF0>', '<0x9F>', '<0x8C>', '<0x9F>', '▁', '<0xE0>', '<0xA4>', '<0xA8>', '<0xE0>', '<0xA4>', '<0xAE>', '<0xE0>', '<0xA4>', '<0xB8>', '<0xE0>', '<0xA5>', '<0x8D>', '<0xE0>', '<0xA4>', '<0xA4>', '<0xE0>', '<0xA5>', '<0x87>', '▁', '<0xE0>', '<0xA4>', '<0xA6>', '<0xE0>', '<0xA5>', '<0x81>', '<0xE0>', '<0xA4>', '<0xA8>', '<0xE0>', '<0xA4>', '<0xBF>', '<0xE0>', '<0xA4>', '<0xAF>', '<0xE0>', '<0xA4>', '<0xBE>', '!', '▁', '<0xF0>', '<0x9F>', '<0x8E>', '<0x89>', '<0xF0>', '<0x9F>', '<0x93>', '<0x9A>', '▁', '<0xE3>', '<0x81>', '<0x93>', '<0xE3>', '<0x82>', '<0x93>', '<0xE3>', '<0x81

In [34]:
import os

# train a sentencepiece model on the corpus file
options = {
    # input specifications
    'input': 'tiny_shakespeare.txt',
    'input_format': 'text',

    # output specifications
    'model_prefix': 't1k_bfF',                  # output filename prefix

    # algorithm specifications
    'model_type': 'bpe',                        # bpe algorithm
    'vocab_size': 1000,

    # normalization
    'normalization_rule_name': 'identity',      # turn off normalization
    'remove_extra_whitespaces': False,
    'input_sentence_size': 200000000,           # max number of training sentences
    'max_sentence_length': 4192,                # max number of bytes per sentence
    'seed_sentencepiece_size': 1000000,
    'shuffle_input_sentence': True,

    # rare word treatment
    'character_coverage': 0.99995,
    'byte_fallback': False,                     # uncovered characters map to <unk>

    # merge rules
    'split_digits': True,
    'split_by_unicode_script': True,
    'split_by_whitespace': True,
    'split_by_number': True,
    'max_sentencepiece_length': 16,
    'add_dummy_prefix': True,
    'allow_whitespace_only_pieces': True,

    # special tokens
    'unk_id': 0,                                # the UNK token must exist
    'bos_id': 1,
    'eos_id': 2,
    'pad_id': -1,                               # others are optional, set to -1 to turn off

    # systems
    'num_threads': os.cpu_count(),              # use all system resources
}

spm.SentencePieceTrainer.train(**options)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: tiny_shakespeare.txt
  input_format: text
  model_prefix: t1k_bfF
  model_type: BPE
  vocab_size: 1000
  self_test_sample_size: 0
  character_coverage: 0.99995
  input_sentence_size: 200000000
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 4192
  num_threads: 11
  num_sub_iterations: 2
  max_sentencepiece_length: 16
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 1
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 1
  required_chars: 
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 0
  bos_id: 1
  eos_id: 2
  pad_id: -1
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_pr

In [35]:
sp = spm.SentencePieceProcessor()
sp.load('t1k_bfF.model')

True

In [36]:
# list every (piece, id) pair of the loaded sentencepiece model
# NOTE: this rebinds the module-level name `vocab`; decode() defined earlier in this
# notebook looks up `vocab[idx]` expecting bytes values, so rebuild the BPE vocab
# before calling decode() again
piece_ids = range(sp.get_piece_size())
vocab = [(sp.id_to_piece(piece_id), piece_id) for piece_id in piece_ids]
vocab

[('<unk>', 0),
 ('<s>', 1),
 ('</s>', 2),
 ('▁t', 3),
 ('he', 4),
 ('▁a', 5),
 ('ou', 6),
 ('▁s', 7),
 ('▁m', 8),
 ('▁w', 9),
 ('in', 10),
 ('re', 11),
 ('ha', 12),
 ('▁the', 13),
 ('nd', 14),
 ('▁b', 15),
 ('is', 16),
 ('or', 17),
 ('▁f', 18),
 ('▁I', 19),
 ('er', 20),
 ('ll', 21),
 ('it', 22),
 ('on', 23),
 ('▁d', 24),
 ('▁c', 25),
 ('▁n', 26),
 ('▁l', 27),
 ('▁y', 28),
 ('es', 29),
 ('en', 30),
 ('▁th', 31),
 ('ar', 32),
 ('▁h', 33),
 ('▁o', 34),
 ('▁to', 35),
 ('▁you', 36),
 ('▁p', 37),
 ('▁T', 38),
 ('hat', 39),
 ('▁A', 40),
 ('▁he', 41),
 ('st', 42),
 ('ve', 43),
 ('ot', 44),
 ('▁and', 45),
 ('ow', 46),
 ('ing', 47),
 ('▁of', 48),
 ('an', 49),
 ('om', 50),
 ('▁g', 51),
 ('at', 52),
 ('▁be', 53),
 ('▁W', 54),
 ('se', 55),
 ('▁my', 56),
 ('▁in', 57),
 ('▁ha', 58),
 ('ce', 59),
 ('le', 60),
 ('ay', 61),
 ('ld', 62),
 ('ir', 63),
 ('et', 64),
 ('ed', 65),
 ('ut', 66),
 ('▁B', 67),
 ('▁me', 68),
 ('im', 69),
 ('▁S', 70),
 ('ith', 71),
 ('▁not', 72),
 ('▁H', 73),
 ('ch', 74),
 ('▁that'

In [37]:
ids = sp.encode('Hello, 世界! 🌍😊 Привет мир! 🚀🌟 नमस्ते दुनिया! 🎉📚 こんにちは世界! 🎌🐉 안녕하세요 세계! 🎵🖋️')
print(ids)

[73, 173, 942, 954, 939, 0, 984, 939, 0, 939, 0, 939, 0, 984, 939, 0, 939, 0, 939, 0, 984, 939, 0, 939, 0, 984, 939, 0, 939, 0, 939, 0, 984, 939, 0]


In [38]:
print([sp.id_to_piece(idx) for idx in ids])

['▁H', 'ell', 'o', ',', '▁', '<unk>', '!', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '!', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '!', '▁', '<unk>', '▁', '<unk>', '!', '▁', '<unk>', '▁', '<unk>', '▁', '<unk>', '!', '▁', '<unk>']
