# Tokenization
```
intro_to_tokenization()
tokenization_examples()
character_tokenizer()
byte_tokenizer()
word_tokenizer()
bpe_tokenizer()
```

In [18]:
from abc import ABC
import tiktoken
import regex

class Tokenizer(ABC):
    """Common interface shared by every tokenizer in this file.

    A tokenizer converts text to a list of integer token indices (`encode`)
    and converts such a list back to text (`decode`). The methods are not
    marked abstract, so this base class can be instantiated; calling either
    method on it raises NotImplementedError until a subclass overrides it.
    """

    def encode(self, string: str) -> list[int]:
        """Convert `string` into a list of integer token indices."""
        raise NotImplementedError()

    def decode(self, indices: list[int]) -> str:
        """Convert a list of token indices back into a string."""
        raise NotImplementedError()

## tokenization_examples()

In [2]:
def get_gpt2_tokenizer():
    """Return the GPT-2 encoding object provided by tiktoken.

    Code: https://github.com/openai/tiktoken
    """
    # You can use "cl100k_base" instead for the gpt3.5-turbo or gpt4 tokenizer.
    encoding_name = "gpt2"
    return tiktoken.get_encoding(encoding_name)


# Module-level tokenizer used by the demo cells that follow.
tokenizer = get_gpt2_tokenizer()

In [None]:
# Bytes per token: a larger ratio means each token covers more text, i.e. the text is compressed more.

def get_compression_ratio(string: str, indices: list[int]) -> float:
    """Return how many UTF-8 bytes of `string` each token covers on average.

    `indices` is expected to be the tokenization of `string`. The result is
    `len(UTF-8 bytes of string) / len(indices)`; a larger value means the
    tokenizer packs more text into each token.

    Raises:
        ZeroDivisionError: if `indices` is empty.
    """
    num_bytes = len(string.encode("utf-8"))  # @inspect num_bytes
    num_tokens = len(indices)  # @inspect num_tokens
    return num_bytes / num_tokens

In [None]:
string = "Hello, 🌍! 你好!"

# Look at the raw UTF-8 bytes behind the text (and behind a pure-ASCII word for comparison).
print(string.encode("utf-8"))
print("Hello".encode("utf-8"))
print(len(string.encode("utf-8")))

# Round-trip the text through whichever `tokenizer` is currently bound at module level.
indices = tokenizer.encode(string)
print(indices)
reconstructed_string = tokenizer.decode(indices)
print(reconstructed_string)

# UTF-8 stores an ASCII character in 1 byte and every other code point in 2-4 bytes,
# so the byte count (not the character count) is the numerator of the ratio.
compression_ratio = get_compression_ratio(string, indices)
print(compression_ratio)

b'Hello, \xf0\x9f\x8c\x8d! \xe4\xbd\xa0\xe5\xa5\xbd!'
b'Hello'
20
[15496, 11, 12520, 234, 235, 0, 220, 19526, 254, 25001, 121, 0]
Hello, 🌍! 你好!
1.6666666666666667


## character_tokenizer()
* each character can be converted into code point (int)
    * code point: unique int value that represents unicode text
    * `ord` str->int, `chr` int-> str

Issues with character_tokenizer
* has very large vocabulary (approx 150K unicode characters)
    * allocate one slot for every character -> inefficient
* many characters are rare -> inefficient

In [10]:
class CharacterTokenizer(Tokenizer):
    """Tokenizer in which every token is the Unicode code point of one character."""

    def encode(self, string: str) -> list[int]:
        """Return the code point (`ord`) of each character of `string`, in order."""
        return [ord(character) for character in string]

    def decode(self, indices: list[int]) -> str:
        """Rebuild the text by turning each code point back into a character (`chr`)."""
        return "".join(chr(code_point) for code_point in indices)


# Rebind the module-level tokenizer for the character-level demo cell.
tokenizer = CharacterTokenizer()

In [None]:
# Round-trip the sample text through the `tokenizer` currently bound at module level
# (presumably the CharacterTokenizer created in the previous cell).
string = "Hello, 🌍! 你好!"

indices = tokenizer.encode(string)
reconstructed_string = tokenizer.decode(indices)

# Largest index seen plus one: a lower bound on the vocabulary size needed for this text.
vocabulary_size = max(indices) + 1
compression_ratio = get_compression_ratio(string, indices)
print(compression_ratio) # UTF-8 bytes per token

1.5384615384615385


## byte_tokenizer()
* unicode (utf-8) -> **sequence of bytes** -> each byte is integer between 0~255 (8 bits)
    * utf-8: a single unicode code point is represented by 1~4 bytes (up to 32 bits)
* vocab size is small (256), but compression ratio is terrible (1)
    * -> token sequence will be too long

In [14]:
class ByteTokenizer(Tokenizer):
    """Tokenizer in which every token is one byte of the UTF-8 encoding of the text."""

    def encode(self, string: str) -> list[int]:
        """Return the UTF-8 bytes of `string` as a list of integers in 0..255."""
        string_bytes = bytes(string, encoding="utf-8")  # @inspect string_bytes
        indices = [int(byte) for byte in string_bytes]  # @inspect indices
        return indices

    def decode(self, indices: list[int]) -> str:
        """Reassemble `indices` into bytes and decode them as UTF-8.

        Raises:
            ValueError: if an index is outside 0..255.
            UnicodeDecodeError: if the bytes are not valid UTF-8.
        """
        string_bytes = bytes(indices)  # @inspect string_bytes
        string = str(string_bytes, encoding="utf-8")  # @inspect string
        return string


# Rebind the module-level tokenizer for the byte-level demo cell.
tokenizer = ByteTokenizer()

In [17]:
# Encode the sample text with the `tokenizer` currently bound at module level
# and show both the number of tokens and the tokens themselves.
string = "Hello, 🌍! 你好!"
indices = tokenizer.encode(string)
print(len(indices), indices)

20 [72, 101, 108, 108, 111, 44, 32, 240, 159, 140, 141, 33, 32, 228, 189, 160, 229, 165, 189, 33]


## word_tokenizer()
* split strings into words (`r"\w+|."`)
    * `\w+` (a run of word characters) or `.` (any single character)
    * includes whitespace

In [22]:
# Pre-tokenization pattern named after GPT-2. The \p{...} Unicode classes need the
# third-party `regex` module (the stdlib `re` does not support them).
# Alternatives, tried left to right:
#   '(?:[sdmt]|ll|ve|re)   contraction suffixes: 's 'd 'm 't 'll 've 're
#    ?\p{L}+               a run of letters, with an optional leading space
#    ?\p{N}+               a run of numeric characters, with an optional leading space
#    ?[^\s\p{L}\p{N}]+     a run of other non-whitespace symbols, with an optional leading space
#   \s+(?!\S)              whitespace that is not directly followed by a non-space character
#   \s+                    any remaining whitespace
GPT2_TOKENIZER_REGEX=r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

In [20]:
# Naive word-level split: each run of word characters becomes one segment, and every
# other character (space, apostrophe, punctuation) becomes a segment of its own.
string = "I'll say supercalifragilisticexpialidocious!"
segments = regex.findall(r"\w+|.", string)
print(segments)

['I', "'", 'll', ' ', 'say', ' ', 'supercalifragilisticexpialidocious', '!']


In [None]:
# Fancier split using the GPT-2 pattern: contraction suffixes stay in one piece and
# a leading space is attached to the word that follows it.

segments = regex.findall(GPT2_TOKENIZER_REGEX, string)
print(segments)

['I', "'ll", ' say', ' supercalifragilisticexpialidocious', '!']


## bpe_tokenizer()
* byte-pair encoding: train the tokenizer on raw text
    * common sequences will merge

In [27]:
def merge(indices: list[int], pair: tuple[int, int], new_index: int) -> list[int]:  # @inspect indices, @inspect pair, @inspect new_index
    """Return a copy of `indices` with each occurrence of `pair` collapsed to `new_index`.

    The scan runs left to right and matches never overlap: once two adjacent
    elements are merged, scanning resumes after the second one (so merging
    (1, 1) in [1, 1, 1] gives [new_index, 1]). `indices` is not modified.
    """
    merged = []  # @inspect merged
    position = 0  # @inspect position
    total = len(indices)
    while position < total:
        # A match needs a right-hand neighbour, so the last element can never start one.
        has_neighbour = position + 1 < total
        if has_neighbour and indices[position] == pair[0] and indices[position + 1] == pair[1]:
            merged.append(new_index)
            position += 2  # Skip both halves of the merged pair.
            continue
        merged.append(indices[position])
        position += 1
    return merged

In [28]:
from dataclasses import dataclass

@dataclass(frozen=True)
class BPETokenizerParams:
    """Immutable bundle of the two tables that fully specify a BPETokenizer.

    Attributes:
        vocab: maps a token index to the byte string that token stands for.
        merges: maps a pair of token indices (index1, index2) to the index
            of the token that replaces the pair.
    """

    # index -> bytes
    vocab: dict[int, bytes]
    # (index1, index2) -> new_index
    merges: dict[tuple[int, int], int]
    
class BPETokenizer(Tokenizer):
    """Byte-pair-encoding tokenizer driven by a fixed vocabulary and merge table."""

    def __init__(self, params: BPETokenizerParams):
        """Store `params`, which supplies the `vocab` and `merges` tables."""
        self.params = params

    def encode(self, string: str) -> list[int]:
        """Turn `string` into token indices.

        Starts from the UTF-8 bytes of `string` and applies every merge in
        the iteration order of `params.merges`.
        """
        merge_table = self.params.merges
        indices = list(string.encode("utf-8"))  # @inspect indices
        # Note: this is a very slow implementation (one full pass over the sequence per merge).
        for pair, new_index in merge_table.items():  # @inspect pair, @inspect new_index
            indices = merge(indices, pair, new_index)
        return indices

    def decode(self, indices: list[int]) -> str:
        """Turn token indices back into text.

        Each index is looked up in `params.vocab`, the byte strings are
        concatenated, and the result is decoded as UTF-8. An index missing
        from the vocabulary is looked up as None, which makes the join raise
        TypeError; bytes that are not valid UTF-8 raise UnicodeDecodeError.
        """
        vocab = self.params.vocab
        bytes_list = [vocab.get(index) for index in indices]  # @inspect bytes_list
        string = b"".join(bytes_list).decode("utf-8")  # @inspect string
        return string

In [None]:
from collections import defaultdict

def train_bpe(string: str, num_merges: int) -> BPETokenizerParams:  # @inspect string, @inspect num_merges
    """Learn up to `num_merges` byte-pair merges from `string`.

    Training starts from the UTF-8 bytes of `string` and a vocabulary holding
    the 256 single-byte tokens. Each round counts every adjacent pair of
    tokens, picks the most frequent one (on a tie, the pair that was counted
    first), gives it the next free index (256, 257, ...), and rewrites the
    token sequence with that pair merged.

    Training stops early when the sequence has fewer than two tokens left,
    because no pair remains to merge; in that case fewer than `num_merges`
    merges are returned. This covers an empty or one-byte `string` and a
    `num_merges` larger than the number of merges the text can support.

    Args:
        string: training text.
        num_merges: maximum number of merges to learn.

    Returns:
        A BPETokenizerParams holding the final vocabulary and the merges in
        the order they were learned.
    """
    # Start with the list of bytes of string.
    indices = list(map(int, string.encode("utf-8")))  # @inspect indices
    merges: dict[tuple[int, int], int] = {}  # index1, index2 => merged index
    vocab: dict[int, bytes] = {x: bytes([x]) for x in range(256)}  # index -> bytes
    for i in range(num_merges):
        # Count the number of occurrences of each pair of tokens
        counts = defaultdict(int)
        for index1, index2 in zip(indices, indices[1:]):  # For each adjacent pair
            counts[(index1, index2)] += 1  # @inspect counts
        if not counts:
            # Fewer than two tokens remain, so nothing can be merged;
            # max() on an empty mapping would raise ValueError.
            break
        # Find the most common pair.
        pair = max(counts, key=counts.get)  # @inspect pair
        index1, index2 = pair

        # Merge that pair.
        new_index = 256 + i  # @inspect new_index
        merges[pair] = new_index  # @inspect merges
        vocab[new_index] = vocab[index1] + vocab[index2]  # @inspect vocab
        indices = merge(indices, pair, new_index)  # @inspect indices
    return BPETokenizerParams(vocab=vocab, merges=merges)

In [32]:
# Train a tiny BPE tokenizer on a toy sentence, asking for three merges.
string = "the cat in the hat"  # @inspect string
params = train_bpe(string, num_merges=3)

In [None]:
# Vocabulary size: the 256 single-byte tokens plus one new token per learned merge.
print(len(params.vocab)) # 256 + 3 merges
# Bare expression: in a notebook the merges dict is displayed as the cell's value.
params.merges # 3 merges

259


{(116, 104): 256, (256, 101): 257, (257, 32): 258}

In [38]:
# Build a BPE tokenizer from `params` (presumably the result of the training cell)
# and rebind the module-level `tokenizer` to it.
tokenizer = BPETokenizer(params)

# Encode text that was not part of the training string; byte sequences that match
# no learned merge stay as raw byte values.
string = "the quick brown fox"
indices = tokenizer.encode(string)
print(indices)

[258, 113, 117, 105, 99, 107, 32, 98, 114, 111, 119, 110, 32, 102, 111, 120]
