In [None]:
# Unicode -> Defines ~155k chars across 168 scripts
# Python -> ord() -> gives the integer Unicode code point of a char, chr() -> converts an integer Unicode code point to the corresponding char
# 
# Impractical to train tokenizer directly on these code points -> very large vocab, and very sparse
# 
# Instead use unicode encoding -> convert unicode char into sequence of bytes (UTF-8, UTF-16, UTF-32)
# UTF-8 -> dominant encoding for the Internet (vocab size of 256 more manageable)
# When using byte-level tokenization, don't have to worry about OOV tokens, since any input text can be expressed as sequence of integers from 0-255

# Encode a sample string as UTF-8. The literal is used directly instead of out[0]: `out` is only created by the
# pre-tokenization cell further down, so reading it here raises NameError on Restart Kernel -> Run All.
a = "some".encode("utf-8")  # str.encode returns a Python bytes object
list(a)  # iterating a bytes object yields its byte values as integers in 0-255
# If you prefix a string literal with b, e.g. b"Hello There", you get the bytes object directly, with each ASCII character converted to a single byte. However, this does not work for non-ASCII characters (which may map to more than
# one byte in UTF-8): bytes literals may only contain ASCII characters, so such text has to go through .encode("utf-8") instead.

[115, 111, 109, 101]

In [None]:
# Round trip: rebuild a bytes object from the list of integer byte values, then decode it back into a str.
# a.decode("utf-8") gives the same result without the detour through a list.
b = str(bytes(list(a)), encoding="utf-8")
b

'some'

In [None]:
# Subword tokenization -> midpoint between word-level and byte-level tokenization; it trades a slightly bigger vocab size for better compression of the input byte sequence. Byte-level tokenization has a vocab size of 256; a subword vocab is slightly larger, with
# byte pairs that frequently appear together in the input added as extra vocab entries.

# Byte-level BPE tokenizer -> vocab items are bytes or merged sequences of bytes -> gives the best of both worlds in terms of out-of-vocab handling and manageable input sequence length.

In [14]:
"""
Pre-tokenization
"""

import regex as re

# Pre-tokenization pattern; the alternatives are tried left to right:
#   '(?:[sdmt]|ll|ve|re)   contraction suffixes such as 's, 't, 'll, 've, 're
#    ?\p{L}+               a run of letters, optionally preceded by one space
#    ?\p{N}+               a run of numeric characters, optionally preceded by one space
#    ?[^\s\p{L}\p{N}]+     a run of anything else that is not whitespace, optionally preceded by one space
#   \s+(?!\S)              whitespace that is not followed by a non-whitespace character
#   \s+                    any remaining run of whitespace
PAT = r"""'(?:[sdmt]|ll|ve|re)| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""

# PAT has no capturing groups, so collecting match.group() from re.finditer yields the same list as re.findall.
# Consuming re.finditer lazily would avoid storing every pre-tokenized word at once; a list is built here so
# `out` keeps the same type that findall returned.
out = [match.group() for match in re.finditer(PAT, "some text that i'll pre-tokenize")]

In [None]:
"""
Computing BPE Merges
"""

# When several pairs have the same frequency, break the tie by merging the lexicographically greater pair.
# For efficiency during BPE training -> don't consider pairs that cross pre-token boundaries
# Some tokens should never be split into multiple tokens -> e.g. end_of_text (these should be added to the vocab so they have a fixed token ID)
# 

In [15]:
# Bytes literal: the special token written directly as a bytes object (every character in it is ASCII).
b"<|endoftext|>"

b'<|endoftext|>'

In [16]:
# Integer byte values of the special token: 13 ASCII characters -> 13 bytes, one per character.
list(b"<|endoftext|>")

[60, 124, 101, 110, 100, 111, 102, 116, 101, 120, 116, 124, 62]

In [17]:
a = "<|endoftext|>".encode("utf-8")

In [18]:
# Byte values of the encoded token. The output matches list(b"<|endoftext|>"): for this all-ASCII text,
# encoding the str as UTF-8 gives the same bytes as the bytes literal.
list(a)

[60, 124, 101, 110, 100, 111, 102, 116, 101, 120, 116, 124, 62]