|<h2>Course:</h2>|<h1><a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">A deep understanding of AI language model mechanisms</a></h1>|
|-|:-:|
|<h2>Part 1:</h2>|<h1>Tokenizations and embeddings</h1>|
|<h2>Section:</h2>|<h1>Words to tokens to numbers</h1>|
|<h2>Lecture:</h2>|<h1><b>Exploring ChatGPT4's tokenizer</b></h1>|

<br>

<h5><b>Teacher:</b> Mike X Cohen, <a href="https://sincxpress.com" target="_blank">sincxpress.com</a></h5>
<h5><b>Course URL:</b> <a href="https://udemy.com/course/dullms_x/?couponCode=202508" target="_blank">udemy.com/course/dullms_x/?couponCode=202508</a></h5>
<i>Using the code without the course may lead to confusion or errors.</i>

In [1]:
import numpy as np
import matplotlib.pyplot as plt

# matplotlib defaults
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats('svg')

In [2]:
# need to install the tiktoken library to get OpenAI's tokenizer
# note: it's tik-token, not tiktok-en :P
#!pip install tiktoken
import tiktoken

In [3]:
# GPT-4's tokenizer: load the 'cl100k_base' encoding from tiktoken
tokenizer = tiktoken.get_encoding('cl100k_base')
# list the attributes and methods of the tokenizer object (shown as the cell output)
dir(tokenizer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_core_bpe',
 '_encode_bytes',
 '_encode_only_native_bpe',
 '_encode_single_piece',
 '_mergeable_ranks',
 '_pat_str',
 '_special_tokens',
 'decode',
 'decode_batch',
 'decode_bytes',
 'decode_bytes_batch',
 'decode_single_token_bytes',
 'decode_tokens_bytes',
 'decode_with_offsets',
 'encode',
 'encode_batch',
 'encode_ordinary',
 'encode_ordinary_batch',
 'encode_single_token',
 'encode_to_numpy',
 'encode_with_unstable',
 'eot_token',
 'is_special_token',
 'max_token_value',
 'n_vocab',
 'name',
 'special_tokens_set',
 'token_byte_values']

In [4]:
# get help (IPython-only syntax: a trailing ?? prints the object's type, file, and source code)
tokenizer??

[31mType:[39m           Encoding
[31mString form:[39m    <Encoding 'cl100k_base'>
[31mFile:[39m           ~/Code/LLM_course/Part1_TokensEmbeddings/text2numbers/.venv/lib/python3.12/site-packages/tiktoken/core.py
[31mSource:[39m        
[38;5;28;01mclass[39;00m Encoding:
    [38;5;28;01mdef[39;00m __init__(
        self,
        name: str,
        *,
        pat_str: str,
        mergeable_ranks: dict[bytes, int],
        special_tokens: dict[str, int],
        explicit_n_vocab: int | [38;5;28;01mNone[39;00m = [38;5;28;01mNone[39;00m,
    ):
        [33m"""Creates an Encoding object.[39m

[33m        See openai_public.py for examples of how to construct an Encoding object.[39m

[33m        Args:[39m
[33m            name: The name of the encoding. It should be clear from the name of the encoding[39m
[33m                what behaviour to expect, in particular, encodings with different special tokens[39m
[33m                should have different names.[39m
[33m

In [5]:
# vocab size, as reported by the tokenizer's n_vocab attribute
tokenizer.n_vocab

100277

In [6]:
# decode the token ID stored in tokenizer.eot_token to see its text
tokenizer.decode([tokenizer.eot_token])

'<|endoftext|>'

In [7]:
# but not all token IDs below n_vocab are valid, e.g.,
print(tokenizer.n_vocab)

# Decoding an unassigned ID raises a KeyError. The error *is* the demonstration,
# so catch and print it instead of letting it halt a "Restart & Run All".
try:
  print(tokenizer.decode([100261]))
except KeyError as err:
  print(f'KeyError: {err}')

100277


KeyError: 'Invalid token for decoding: 100261'

In [None]:
# list of all tokens:
# https://github.com/vnglst/gpt4-tokens/blob/main/decode-tokens.ipynb

# Explore some tokens

In [8]:
# print the decoded text of 50 consecutive token IDs, one per line
for token_id in range(1000,1050):
  token_text = tokenizer.decode([token_id])
  print(f'{token_id} = {token_text}')

1000 = indow
1001 = lement
1002 = pect
1003 = ash
1004 = [i
1005 =  use
1006 = .F
1007 = pec
1008 =  ad
1009 = ove
1010 = ception
1011 = ength
1012 = include
1013 = ader
1014 =                            
1015 = atus
1016 = Th
1017 = itle
1018 = rit
1019 = void
1020 = ().
1021 = (

1022 =  off
1023 =  other
1024 =  &&
1025 = ';

1026 = ms
1027 =  been
1028 =  te
1029 = ml
1030 = co
1031 = nc
1032 = 13
1033 = ervice
1034 =  %
1035 = **

1036 = ann
1037 = ade
1038 = 




1039 = lock
1040 = const
1041 = 100
1042 = ponse
1043 =  sup
1044 = ++
1045 = date
1046 =  acc
1047 =  had
1048 =  bu
1049 = 200


# Tokenization!

In [14]:
# a short example sentence
text = "My name is Mike and I like toothpaste-flavored chocolate."

# text -> list of integer token IDs
tokens = tokenizer.encode(text)
print(tokens)

# decode the IDs one at a time to see the text of each individual token
for token_id in tokens:
    token_text = tokenizer.decode([token_id])
    print(token_text)

[5159, 836, 374, 11519, 323, 358, 1093, 26588, 57968, 12556, 76486, 18414, 13]
My
 name
 is
 Mike
 and
 I
 like
 tooth
paste
-fl
avored
 chocolate
.


In [10]:
# split the sentence on whitespace (punctuation stays attached to the neighboring word)
text.split()

['My',
 'name',
 'is',
 'Mike',
 'and',
 'I',
 'like',
 'toothpaste-flavored',
 'chocolate.']

In [11]:
# tokenize each whitespace-separated word on its own (i.e., without its preceding space)
for word in text.split():
  word_tokens = tokenizer.encode(word)
  print(f'"{word}" comprises token(s) {word_tokens}')

"My" comprises token(s) [5159]
"name" comprises token(s) [609]
"is" comprises token(s) [285]
"Mike" comprises token(s) [35541]
"and" comprises token(s) [438]
"I" comprises token(s) [40]
"like" comprises token(s) [4908]
"toothpaste-flavored" comprises token(s) [998, 8942, 57968, 12556, 76486]
"chocolate." comprises token(s) [331, 14136, 13]


In [12]:
# show each token ID (right-aligned) next to the text it decodes to
for token_id in tokens:
  token_text = tokenizer.decode([token_id])
  print(f'Token {token_id:>6} is "{token_text}"')

Token   5159 is "My"
Token    836 is " name"
Token    374 is " is"
Token  11519 is " Mike"
Token    323 is " and"
Token    358 is " I"
Token   1093 is " like"
Token  26588 is " tooth"
Token  57968 is "paste"
Token  12556 is "-fl"
Token  76486 is "avored"
Token  18414 is " chocolate"
Token     13 is "."


In [None]:
# with special (non-ASCII) characters: the cell output is the list of token IDs for this one character
tokenizer.encode('â')

# How long are the tokens?

In [None]:
# initialize lengths vector with NaN, so that IDs that cannot be decoded stay marked as missing
token_lengths = np.full(tokenizer.n_vocab,np.nan)

# get the number of characters in each token
for idx in range(tokenizer.n_vocab):
  try:
    token_lengths[idx] = len(tokenizer.decode([idx]))
  except KeyError:
    # invalid (unassigned) token IDs raise a KeyError when decoded; leave them as NaN.
    # Catching only KeyError (not a bare except) keeps Ctrl-C and real bugs visible.
    pass

# count unique lengths, using only the tokens that could be decoded
validLengths = token_lengths[np.isfinite(token_lengths)]
uniqueLengths,tokenCount = np.unique(validLengths,return_counts=True)


# visualize
fig,axs = plt.subplots(1,2,figsize=(12,4))

# left panel: length of every token by index (NaN entries are simply not drawn)
axs[0].plot(token_lengths,'k.',markersize=3,alpha=.4)
axs[0].set(xlim=[0,tokenizer.n_vocab],xlabel='Token index',ylabel='Token length (characters)',
           title='GPT4 token lengths')

# right panel: number of tokens at each length; the y-axis is set to log so it matches its label
axs[1].bar(uniqueLengths,tokenCount,color='k',edgecolor='gray')
axs[1].set(xlim=[0,uniqueLengths.max()],xlabel='Token length (chars)',ylabel='Token count (log scale)',
           title='Distribution of token lengths',yscale='log')

plt.tight_layout()
plt.show()

# Many word-tokens start with spaces

In [None]:
# single-token words with vs. without spaces
for name in [' Michael','Michael']:
  print( tokenizer.encode(name) )

In [None]:
# multi-token words without a space
for fruit in [' Peach','Peach']:
  print( tokenizer.encode(fruit) )

In [None]:
# token IDs of 'Peach' (no leading space)
peach = tokenizer.encode('Peach')
# decode each ID separately to see the text pieces that make up the word
[tokenizer.decode([p]) for p in peach]

# The Time Machine book encoded

In [None]:
import requests
import re

# download the book as plain text from Project Gutenberg
# (note: this overwrites the example sentence previously stored in `text`)
response = requests.get('https://www.gutenberg.org/files/35/35-0.txt',timeout=30)
response.raise_for_status() # fail loudly on an HTTP error instead of tokenizing an error page
text = response.text

# split by punctuation
# (the capturing group keeps the punctuation marks as separate items in the list;
#  whitespace-only pieces are stripped to empty strings and dropped on the next line)
words = re.split(r'([,.:;—?_!"“()\']|--|\s)',text)
words = [item.strip() for item in words if item.strip()]
print(f'There are {len(words)} words.')

# peek at a slice from the middle of the book
words[10000:10050]

In [None]:
# tokens of a random word in the text
# (no seed is set, so a different word is picked on each run)
someRandomWord = np.random.choice(words)
randomWordTokens = tokenizer.encode(someRandomWord)
print(f'"{someRandomWord}" has token {randomWordTokens}')

In [None]:
# number of tokens in each of the first 20 items of the word list
for word in words[:20]:
  token_count = len(tokenizer.encode(word))
  print(f'"{word}" has {token_count} tokens')

In [None]:
# the same word with different capitalizations
for spelling in ('book','Book','bOok'):
  spelling_tokens = tokenizer.encode(spelling)
  print(f'"{spelling}" has tokens {spelling_tokens}')

# But do we need to separate the text into words?

In [None]:
# what happens if we just tokenize the raw (unprocessed) text?
tmTokens = tokenizer.encode(text)

# compare the token count against the word count from the manual split
numTokens,numWords = len(tmTokens),len(words)
print(f'The text has {numTokens:,} tokens and {numWords:,} words.')

In [None]:
# check out some tokens: 30 consecutive token IDs and the text each one decodes to
for token_id in tmTokens[9990:10020]:
  token_text = tokenizer.decode([token_id])
  print(f'Token {token_id:>6}: "{token_text}"')

In [None]:
# decode the same slice of token IDs in a single call, to read it as continuous text
print(tokenizer.decode(tmTokens[9990:10020]))