## Tokenizer Analysis

In [1]:
!pip install --upgrade tiktoken
!pip install transformers

Collecting tiktoken
  Downloading tiktoken-0.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.9.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Downloading tiktoken-0.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.9.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (781 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m782.0/782.0 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: regex, tiktoken
Successfully installed regex-2024.9.11 tiktoken-0.7.0
[0mCollecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.24.7

## OpenAI tokenizer

In [2]:
import tiktoken

encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

In [14]:
len(encoding.token_byte_values()) + len(encoding.special_tokens_set)

100261

In [37]:
print(f"Vocab Size :: {encoding.n_vocab}")
print(f"Max Allowed tokenizer :: {encoding.max_token_value}")

Vocab Size :: 100277
Max Allowed tokenizer :: 100276


In [52]:
encoding.special_tokens_set

{'<|endofprompt|>',
 '<|endoftext|>',
 '<|fim_middle|>',
 '<|fim_prefix|>',
 '<|fim_suffix|>'}

OpenAI tokenizer contains number tokenzs from `000 - 9999`
`00-99`
`0-9`

## Google Gemma Tokenizer

In [39]:
from transformers import AutoTokenizer

----


User need to be loged in to huggingface using acces token before using gemma model

```python
import huggingface_hub

huggingface_hub.login()
```



----

In [43]:
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [49]:
print(f"Vocab Size :: {tokenizer.vocab_size}")
print(f"Max Model Length :: {tokenizer.model_max_length}")

Vocab Size :: 256000
Max Model Length :: 1000000000000000019884624838656



`gemma tokenizer is 2x larger than openAI tiktokens`

In [51]:
tokenizer.special_tokens_map

{'bos_token': '<bos>',
 'eos_token': '<eos>',
 'unk_token': '<unk>',
 'pad_token': '<pad>',
 'additional_special_tokens': ['<start_of_turn>', '<end_of_turn>']}

## Tokenizers Comparision

In [73]:
def print_comparision(text):
    print(f"Original text :: {text}")
    print("-"*50)
    print(f"OpenaAi Tokenizer :: {[encoding.decode_single_token_bytes(i) for i in encoding.encode(text)]}")
    print("\n")
    print(f"Gemma Tokenizer :: {tokenizer.tokenize(text)}")

In [74]:
text = "hello world this is xxxname"

print_comparision(text)

Original text :: hello world this is xxxname
--------------------------------------------------
OpenaAi Tokenizer :: [b'hello', b' world', b' this', b' is', b' xxx', b'name']


Gemma Tokenizer :: ['hello', '▁world', '▁this', '▁is', '▁xxx', 'name']


In [75]:
text = "https://www.cdfvfvfd.com"

print_comparision(text)

Original text :: https://www.cdfvfvfd.com
--------------------------------------------------
OpenaAi Tokenizer :: [b'https', b'://', b'www', b'.c', b'df', b'v', b'fv', b'fd', b'.com']


Gemma Tokenizer :: ['https', '://', 'www', '.', 'cdf', 'v', 'fv', 'fd', '.', 'com']


In [79]:
text = "http://www.cdfvfvfd.us"

print_comparision(text)

Original text :: http://www.cdfvfvfd.us
--------------------------------------------------
OpenaAi Tokenizer :: [b'http', b'://', b'www', b'.c', b'df', b'v', b'fv', b'fd', b'.us']


Gemma Tokenizer :: ['http', '://', 'www', '.', 'cdf', 'v', 'fv', 'fd', '.', 'us']


In [81]:
text = """class BoxFactory(DjangoModelFactory):

    class Meta:
        model = Box
        django_get_or_create = ('label',)

    creator = factory.SubFactory(UserFactory)
    content = factory.Faker('sentence', nb_words=10)
"""

print_comparision(text)

Original text :: class BoxFactory(DjangoModelFactory):

    class Meta:
        model = Box
        django_get_or_create = ('label',)

    creator = factory.SubFactory(UserFactory)
    content = factory.Faker('sentence', nb_words=10)

--------------------------------------------------
OpenaAi Tokenizer :: [b'class', b' Box', b'Factory', b'(D', b'jango', b'Model', b'Factory', b'):\n\n', b'   ', b' class', b' Meta', b':\n', b'       ', b' model', b' =', b' Box', b'\n', b'       ', b' django', b'_get', b'_or', b'_create', b' =', b" ('", b'label', b"',", b')\n\n', b'   ', b' creator', b' =', b' factory', b'.Sub', b'Factory', b'(User', b'Factory', b')\n', b'   ', b' content', b' =', b' factory', b'.F', b'aker', b"('", b'sentence', b"',", b' nb', b'_words', b'=', b'10', b')\n']


Gemma Tokenizer :: ['class', '▁Box', 'Factory', '(', 'Django', 'Model', 'Factory', '):', '\n\n', '▁▁▁▁', 'class', '▁Meta', ':', '\n', '▁▁▁▁▁▁▁▁', 'model', '▁=', '▁Box', '\n', '▁▁▁▁▁▁▁▁', 'django', '_', 'get', '_', '

In [82]:
text = """bool InternalPackFrom(const T& message, URL_TYPE* dst_url,
                      VALUE_TYPE* dst_value) {
  return InternalPackFromLite(message, kTypeGoogleApisComPrefix,
                              GetAnyMessageName<T>(), dst_url, dst_value);
}
PROTOBUF_EXPORT bool InternalPackFrom(const Message& message, URL_TYPE* dst_url,
                                      VALUE_TYPE* dst_value);

"""

print_comparision(text)

Original text :: bool InternalPackFrom(const T& message, URL_TYPE* dst_url,
                      VALUE_TYPE* dst_value) {
  return InternalPackFromLite(message, kTypeGoogleApisComPrefix,
                              GetAnyMessageName<T>(), dst_url, dst_value);
}
PROTOBUF_EXPORT bool InternalPackFrom(const Message& message, URL_TYPE* dst_url,
                                      VALUE_TYPE* dst_value);


--------------------------------------------------
OpenaAi Tokenizer :: [b'bool', b' Internal', b'Pack', b'From', b'(const', b' T', b'&', b' message', b',', b' URL', b'_TYPE', b'*', b' dst', b'_url', b',\n', b'                     ', b' VALUE', b'_TYPE', b'*', b' dst', b'_value', b')', b' {\n', b' ', b' return', b' Internal', b'Pack', b'From', b'Lite', b'(message', b',', b' k', b'Type', b'Google', b'Apis', b'Com', b'Prefix', b',\n', b'                             ', b' Get', b'Any', b'Message', b'Name', b'<T', b'>(),', b' dst', b'_url', b',', b' dst', b'_value', b');\n', b'}\n', b'PR

In [83]:
text = """<doctype>
<html>
    <table>
        <thead>
        </thead>
        <tr></tr>
    </table>
    <sample-component>
    </sample-component>
</html>
"""

print_comparision(text)

Original text :: <doctype>
<html>
    <table>
        <thead>
        </thead>
        <tr></tr>
    </table>
    <sample-component>
    </sample-component>
</html>

--------------------------------------------------
OpenaAi Tokenizer :: [b'<', b'doctype', b'>\n', b'<html', b'>\n', b'   ', b' <', b'table', b'>\n', b'       ', b' <', b'thead', b'>\n', b'       ', b' </', b'thead', b'>\n', b'       ', b' <', b'tr', b'></', b'tr', b'>\n', b'   ', b' </', b'table', b'>\n', b'   ', b' <', b'sample', b'-component', b'>\n', b'   ', b' </', b'sample', b'-component', b'>\n', b'</', b'html', b'>\n']


Gemma Tokenizer :: ['<', 'doctype', '>', '\n', '<', 'html', '>', '\n', '▁▁▁▁', '<table>', '\n', '▁▁▁▁▁▁▁▁', '<thead>', '\n', '▁▁▁▁▁▁▁▁', '</thead>', '\n', '▁▁▁▁▁▁▁▁', '<tr>', '</tr>', '\n', '▁▁▁▁', '</table>', '\n', '▁▁▁▁', '<', 'sample', '-', 'component', '>', '\n', '▁▁▁▁', '</', 'sample', '-', 'component', '>', '\n', '</', 'html', '>', '\n']


In [95]:
gemma_vocab = set(i.replace("▁", " ") for i in tokenizer.get_vocab())
openai_vocab = set(i.decode("utf-8", errors="replace") for i in encoding.token_byte_values())

In [106]:
common_vocab = gemma_vocab.intersection(openai_vocab)
openai_other_vocab = openai_vocab - gemma_vocab

print("No of common vocab :: ", len(common_vocab))
print(f"No of different vocab in openai :: ", len(openai_other_vocab))

No of common vocab ::  67651
No of different vocab in openai ::  31906


In [109]:
list(openai_other_vocab)[-100:-1]

['.userName',
 '.question',
 '.database',
 ' buz',
 '(suffix',
 ' Lorem',
 ' unus',
 'NewItem',
 '\tdst',
 '998',
 '(se',
 'Seleccione',
 '_INSTANCE',
 '}\\.[',
 '.swing',
 '\\Category',
 'ialect',
 '_Array',
 '.training',
 '755',
 ' onHide',
 'isode',
 '[W',
 'ssql',
 'ItemAt',
 ' ...)\n',
 '_available',
 '_vehicle',
 '-menu',
 ' suprem',
 ' beforeSend',
 'istributor',
 '.ASCII',
 '/screens',
 '.stop',
 ' Interr',
 '(h',
 '_COMPILE',
 'razier',
 '.Result',
 '.present',
 'phoon',
 '(/^\\',
 '.filters',
 '<Select',
 '/class',
 '_RECEIVED',
 'verting',
 '_Number',
 '.clipsToBounds',
 '*T',
 '_shapes',
 'mpr',
 ',list',
 '\tvalue',
 '//**\n',
 '>Contact',
 '_JSON',
 '(per',
 ' quotid',
 ")'),",
 'IRONMENT',
 '.CreateDirectory',
 '////\n',
 '.BorderFactory',
 '雅黑',
 '-defined',
 '(cm',
 '()")\n',
 'FRINGEMENT',
 '(input',
 ' attributeName',
 'xBF',
 '_binding',
 'nonnull',
 "'>\n\n",
 'ег',
 '_Account',
 '-build',
 'uentes',
 "'}}>\n",
 '.Surface',
 ' Positioned',
 '\t\t\t\t           ',
 

based on the sample analysis all the other remaining tokens are representing the code base

In [120]:
print_comparision("000")

Original text :: 000
--------------------------------------------------
OpenaAi Tokenizer :: [b'000']


Gemma Tokenizer :: ['0', '0', '0']


In [122]:
print_comparision("whatareyoudoingfortodayjhon")

Original text :: whatareyoudoingfortodayjhon
--------------------------------------------------
OpenaAi Tokenizer :: [b'wh', b'ata', b'rey', b'oud', b'o', b'ing', b'fort', b'oday', b'jh', b'on']


Gemma Tokenizer :: ['what', 'are', 'you', 'doing', 'for', 'today', 'j', 'hon']


In [124]:
print_comparision("🦖🥲")

Original text :: 🦖🥲
--------------------------------------------------
OpenaAi Tokenizer :: [b'\xf0\x9f', b'\xa6', b'\x96', b'\xf0\x9f', b'\xa5', b'\xb2']


Gemma Tokenizer :: ['🦖', '🥲']


In [134]:
len(encoding.token_byte_values())

100256

In [197]:
import regex as re

pat_str = "|".join(
        [
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n/]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )

In [199]:
for i in re.findall(pat_str, "Hi this is \\n jagan. how are man!42@#43434\n\n\n\n\n"):
    print(i, end="||")

Hi|| this|| is|| \||n|| jagan||.|| how|| are|| man||!||42||@#||434||34||




||

In [254]:
text = "Hello 我今天能为你做什么🤣🦖"

In [255]:
"뫐뻐닐".encode("utf-8")

b'\xeb\xab\x90\xeb\xbb\x90\xeb\x8b\x90'

In [256]:
b'\xd0\xba\xd0\xbe\xd0\xb2'.decode("utf-16")

'뫐뻐닐'

In [260]:
for i in text:
    print(f"{i} -> {i.encode('utf-8')}")

H -> b'H'
e -> b'e'
l -> b'l'
l -> b'l'
o -> b'o'
  -> b' '
我 -> b'\xe6\x88\x91'
今 -> b'\xe4\xbb\x8a'
天 -> b'\xe5\xa4\xa9'
能 -> b'\xe8\x83\xbd'
为 -> b'\xe4\xb8\xba'
你 -> b'\xe4\xbd\xa0'
做 -> b'\xe5\x81\x9a'
什 -> b'\xe4\xbb\x80'
么 -> b'\xe4\xb9\x88'
🤣 -> b'\xf0\x9f\xa4\xa3'
🦖 -> b'\xf0\x9f\xa6\x96'


In [258]:
mergeable_ranks = encoding._mergeable_ranks

In [215]:
def recover_merges(mergeable_ranks):
    # the `merges` are already the byte sequences in their merged state.
    # so we have to recover the original pairings. We can do this by doing
    # a small BPE training run on all the tokens, in their order.
    # also see https://github.com/openai/tiktoken/issues/60
    # also see https://github.com/karpathy/minbpe/issues/11#issuecomment-1950805306
    merges = {}
    for token, rank in mergeable_ranks.items():
        if len(token) == 1:
            continue # skip raw bytes
        pair = tuple(bpe(mergeable_ranks, token, max_rank=rank))
        assert len(pair) == 2
        # recover the integer ranks of the pair
        ix0 = mergeable_ranks[pair[0]]
        ix1 = mergeable_ranks[pair[1]]
        merges[(ix0, ix1)] = rank

    return merges


def bpe(mergeable_ranks, token, max_rank):
    # helper function used in get_gpt4_merges() to reconstruct the merge forest
    parts = [bytes([b]) for b in token]
    while True:
        min_idx = None
        min_rank = None
        for i, pair in enumerate(zip(parts[:-1], parts[1:])):
            rank = mergeable_ranks.get(pair[0] + pair[1])
            if rank is not None and (min_rank is None or rank < min_rank):
                min_idx = i
                min_rank = rank
        if min_rank is None or (max_rank is not None and min_rank >= max_rank):
            break
        assert min_idx is not None
        parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:]
    return parts


In [272]:
mergeable_ranks

{b'!': 0,
 b'"': 1,
 b'#': 2,
 b'$': 3,
 b'%': 4,
 b'&': 5,
 b"'": 6,
 b'(': 7,
 b')': 8,
 b'*': 9,
 b'+': 10,
 b',': 11,
 b'-': 12,
 b'.': 13,
 b'/': 14,
 b'0': 15,
 b'1': 16,
 b'2': 17,
 b'3': 18,
 b'4': 19,
 b'5': 20,
 b'6': 21,
 b'7': 22,
 b'8': 23,
 b'9': 24,
 b':': 25,
 b';': 26,
 b'<': 27,
 b'=': 28,
 b'>': 29,
 b'?': 30,
 b'@': 31,
 b'A': 32,
 b'B': 33,
 b'C': 34,
 b'D': 35,
 b'E': 36,
 b'F': 37,
 b'G': 38,
 b'H': 39,
 b'I': 40,
 b'J': 41,
 b'K': 42,
 b'L': 43,
 b'M': 44,
 b'N': 45,
 b'O': 46,
 b'P': 47,
 b'Q': 48,
 b'R': 49,
 b'S': 50,
 b'T': 51,
 b'U': 52,
 b'V': 53,
 b'W': 54,
 b'X': 55,
 b'Y': 56,
 b'Z': 57,
 b'[': 58,
 b'\\': 59,
 b']': 60,
 b'^': 61,
 b'_': 62,
 b'`': 63,
 b'a': 64,
 b'b': 65,
 b'c': 66,
 b'd': 67,
 b'e': 68,
 b'f': 69,
 b'g': 70,
 b'h': 71,
 b'i': 72,
 b'j': 73,
 b'k': 74,
 b'l': 75,
 b'm': 76,
 b'n': 77,
 b'o': 78,
 b'p': 79,
 b'q': 80,
 b'r': 81,
 b's': 82,
 b't': 83,
 b'u': 84,
 b'v': 85,
 b'w': 86,
 b'x': 87,
 b'y': 88,
 b'z': 89,
 b'{': 90,
 b'|': 9

In [217]:
recover_merges(mergeable_ranks)

{(220, 220): 256,
 (256, 256): 257,
 (72, 77): 258,
 (220, 83): 259,
 (257, 257): 260,
 (68, 81): 261,
 (256, 220): 262,
 (78, 77): 263,
 (220, 64): 264,
 (81, 68): 265,
 (64, 83): 266,
 (82, 83): 267,
 (68, 77): 268,
 (78, 81): 269,
 (259, 71): 270,
 (198, 198): 271,
 (220, 66): 272,
 (75, 68): 273,
 (220, 82): 274,
 (72, 83): 275,
 (64, 77): 276,
 (64, 81): 277,
 (64, 75): 278,
 (270, 68): 279,
 (26, 198): 280,
 (220, 79): 281,
 (220, 69): 282,
 (78, 84): 283,
 (220, 28): 284,
 (72, 82): 285,
 (257, 262): 286,
 (258, 70): 287,
 (68, 82): 288,
 (220, 86): 289,
 (72, 263): 290,
 (68, 67): 291,
 (72, 66): 292,
 (220, 65): 293,
 (220, 67): 294,
 (68, 83): 295,
 (220, 76): 296,
 (220, 78): 297,
 (197, 197): 298,
 (81, 78): 299,
 (64, 82): 300,
 (68, 75): 301,
 (66, 83): 302,
 (77, 67): 303,
 (220, 258): 304,
 (220, 71): 305,
 (268, 83): 306,
 (72, 67): 307,
 (220, 77): 308,
 (64, 76): 309,
 (260, 262): 310,
 (259, 78): 311,
 (220, 265): 312,
 (12, 12): 313,
 (220, 90): 314,
 (297, 69): 31

https://github.com/karpathy/minbpe/issues/25