# Example code showing how Llama tokenizes text

The Llama 3 family uses a version of BPE that is similar enough to the one in OpenAI's tiktoken, so we can build a toy demonstration of it using tiktoken directly.
(See https://github.com/meta-llama/llama3/blob/main/llama/tokenizer.py)

In [1]:
# imports
import tiktoken


In [2]:
# Load the encoding that tiktoken associates with the "gpt-4o" model name.
# NOTE: this is used as a stand-in for the toy demo; it is not Llama's own vocabulary.
tokenizer = tiktoken.encoding_for_model("gpt-4o")

# Encode a short English sentence into a list of integer token ids and print it.
text = "Hello, how are you?"
tokens = tokenizer.encode(text)
print(tokens)

[13225, 11, 1495, 553, 481, 30]


In [4]:
# Print, one per line, the raw bytes that each token id stands for.
for token_id in tokens:
    piece = tokenizer.decode_single_token_bytes(token_id)
    print(piece)

b'Hello'
b','
b' how'
b' are'
b' you'
b'?'


In [3]:
# Encode a Portuguese sample containing an accented character, digits and
# punctuation; this rebinds `text` and `tokens` for the cells that follow.
text = "José da Silva Souza Monteiro, brasileiro, advogado, 43 anos, CPF 053.123.456-78"
tokens = tokenizer.encode(text)
print(tokens)


[155983, 1033, 54886, 151240, 35632, 4509, 11, 59965, 11, 138381, 11, 220, 5320, 11680, 11, 85805, 220, 41954, 13, 7633, 13, 19354, 12, 4388]


In [5]:
# Collect the raw bytes behind every token id, preserving token order.
token_bytes = list(map(tokenizer.decode_single_token_bytes, tokens))


In [6]:
# Bare expression: as the last statement of a notebook cell it displays the list.
token_bytes

[b'Jos\xc3\xa9',
 b' da',
 b' Silva',
 b' Souza',
 b' Monte',
 b'iro',
 b',',
 b' brasileiro',
 b',',
 b' advogado',
 b',',
 b' ',
 b'43',
 b' anos',
 b',',
 b' CPF',
 b' ',
 b'053',
 b'.',
 b'123',
 b'.',
 b'456',
 b'-',
 b'78']

In [7]:
# Convert each token's bytes to text. errors="replace" substitutes U+FFFD for
# bytes that are not valid UTF-8 by themselves instead of raising an error.
token_s_l = [piece.decode("utf-8", errors="replace") for piece in token_bytes]
# Bare expression so the notebook displays the resulting list of strings.
token_s_l

['José',
 ' da',
 ' Silva',
 ' Souza',
 ' Monte',
 'iro',
 ',',
 ' brasileiro',
 ',',
 ' advogado',
 ',',
 ' ',
 '43',
 ' anos',
 ',',
 ' CPF',
 ' ',
 '053',
 '.',
 '123',
 '.',
 '456',
 '-',
 '78']

In [21]:
# Simulate LLM-style embeddings: one random vector per token position.
# The matrix has len(tokens) rows and llama_embedding_dim columns
# (4096 is meant to be the Llama 3.1 8B embedding dimension).
import numpy as np
llama_embedding_dim = 4096
# random_sample draws uniform floats in [0, 1); it takes the shape as one tuple.
embedding_matrix = np.random.random_sample((len(tokens), llama_embedding_dim))



In [16]:
# Map each token id to its row of the embedding matrix. Dict keys are unique,
# so a token id that occurs more than once in `tokens` keeps only the row of
# its last occurrence.
embedding_dict = dict(zip(tokens, embedding_matrix))

# Print the distinct token ids that ended up as keys, in insertion order.
for token_id in embedding_dict:
    print(token_id)


32698
986
35932
1036
6808
63037
1194
430
190647
171448
11
59965
138381
220
5320
11680
85805
41954
13
7633
19354
12
4388


In [25]:
# Pick one token id at random, then print its embedding vector followed by
# the text that the token id decodes to.
import random
random_key = random.choice(list(embedding_dict))
print(embedding_dict[random_key])
random_piece = tokenizer.decode_single_token_bytes(random_key)
# errors="replace" keeps the print from failing on bytes that are not valid UTF-8.
print(random_piece.decode("utf-8", errors="replace"))


[0.71597359 0.12929455 0.43666185 ... 0.51584709 0.68081917 0.12534674]
 CPF
