# 1. Data Preparation and Sampling

In [7]:
with open('the-verdict.txt', 'r') as file:
    raw_text = file.read()

print(f"Total number of characters: {len(raw_text)}")
print(raw_text[:100])

Total number of characters: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


# Tokenization


In [13]:
import re

text = "Hello World, This is a test"

# Split the text into tokens using regex based on whitespace
result = re.split(r'(\s)', text)
print(result)

['Hello', ' ', 'World,', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [14]:
# Split the text into tokens using regex based on whitespace, comma and period
result = re.split(r'(\s|[.,])', text)
print(result)


['Hello', ' ', 'World', ',', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test']


In [15]:
# Remove white space and empty string   
result = [item.strip() for item in result if item.strip()]
print(result)



['Hello', 'World', ',', 'This', 'is', 'a', 'test']


Keep or removing whitespace for embeddding depends on the strcuture of the text and application requirements.
For eg - In python code white space can represent a new line or a new statement or indentation and thus it is important to keep it.

In [16]:
# We also want question marks, double quotes, single quotes, commas, periods, double dashes, and other punctuation marks.
text = "Hello World, How is this? This is a test!"
result = re.split(r'(\s|--|[.,?:;!"()\'])', text)
result = [item.strip() for item in result if item.strip()]
print(result)



['Hello', 'World', ',', 'How', 'is', 'this', '?', 'This', 'is', 'a', 'test', '!']


In [17]:
preprocessed_text = re.split(r'(\s|--|[.,?:/;_!"()\'])', raw_text)
preprocessed_text = [item.strip() for item in preprocessed_text if item.strip()]
print(preprocessed_text[:30])



['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [18]:
print(len(preprocessed_text))

4690


# Converting tokens to ids

In [19]:
all_words = sorted(set(preprocessed_text))
vocab_size = len(all_words)
print(f"Vocabulary size: {vocab_size}")




Vocabulary size: 1130


In [37]:
vocab = {vocab: index for index, vocab in enumerate(all_words)}


In [38]:
for i, item in enumerate(vocab):
    print(f"({item}, {i})")




(!, 0)
(", 1)
(', 2)
((, 3)
(), 4)
(,, 5)
(--, 6)
(., 7)
(:, 8)
(;, 9)
(?, 10)
(A, 11)
(Ah, 12)
(Among, 13)
(And, 14)
(Are, 15)
(Arrt, 16)
(As, 17)
(At, 18)
(Be, 19)
(Begin, 20)
(Burlington, 21)
(But, 22)
(By, 23)
(Carlo, 24)
(Chicago, 25)
(Claude, 26)
(Come, 27)
(Croft, 28)
(Destroyed, 29)
(Devonshire, 30)
(Don, 31)
(Dubarry, 32)
(Emperors, 33)
(Florence, 34)
(For, 35)
(Gallery, 36)
(Gideon, 37)
(Gisburn, 38)
(Gisburns, 39)
(Grafton, 40)
(Greek, 41)
(Grindle, 42)
(Grindles, 43)
(HAD, 44)
(Had, 45)
(Hang, 46)
(Has, 47)
(He, 48)
(Her, 49)
(Hermia, 50)
(His, 51)
(How, 52)
(I, 53)
(If, 54)
(In, 55)
(It, 56)
(Jack, 57)
(Jove, 58)
(Just, 59)
(Lord, 60)
(Made, 61)
(Miss, 62)
(Money, 63)
(Monte, 64)
(Moon-dancers, 65)
(Mr, 66)
(Mrs, 67)
(My, 68)
(Never, 69)
(No, 70)
(Now, 71)
(Nutley, 72)
(Of, 73)
(Oh, 74)
(On, 75)
(Once, 76)
(Only, 77)
(Or, 78)
(Perhaps, 79)
(Poor, 80)
(Professional, 81)
(Renaissance, 82)
(Rickham, 83)
(Riviera, 84)
(Rome, 85)
(Russian, 86)
(Sevres, 87)
(She, 88)
(Stroud, 89

In [39]:
vocab_id_to_str = {index: item for index, item in enumerate(vocab)}

vocab_id_to_str

{0: '!',
 1: '"',
 2: "'",
 3: '(',
 4: ')',
 5: ',',
 6: '--',
 7: '.',
 8: ':',
 9: ';',
 10: '?',
 11: 'A',
 12: 'Ah',
 13: 'Among',
 14: 'And',
 15: 'Are',
 16: 'Arrt',
 17: 'As',
 18: 'At',
 19: 'Be',
 20: 'Begin',
 21: 'Burlington',
 22: 'But',
 23: 'By',
 24: 'Carlo',
 25: 'Chicago',
 26: 'Claude',
 27: 'Come',
 28: 'Croft',
 29: 'Destroyed',
 30: 'Devonshire',
 31: 'Don',
 32: 'Dubarry',
 33: 'Emperors',
 34: 'Florence',
 35: 'For',
 36: 'Gallery',
 37: 'Gideon',
 38: 'Gisburn',
 39: 'Gisburns',
 40: 'Grafton',
 41: 'Greek',
 42: 'Grindle',
 43: 'Grindles',
 44: 'HAD',
 45: 'Had',
 46: 'Hang',
 47: 'Has',
 48: 'He',
 49: 'Her',
 50: 'Hermia',
 51: 'His',
 52: 'How',
 53: 'I',
 54: 'If',
 55: 'In',
 56: 'It',
 57: 'Jack',
 58: 'Jove',
 59: 'Just',
 60: 'Lord',
 61: 'Made',
 62: 'Miss',
 63: 'Money',
 64: 'Monte',
 65: 'Moon-dancers',
 66: 'Mr',
 67: 'Mrs',
 68: 'My',
 69: 'Never',
 70: 'No',
 71: 'Now',
 72: 'Nutley',
 73: 'Of',
 74: 'Oh',
 75: 'On',
 76: 'Once',
 77: 'Only',
 7

# Creating a tokenizer class for encoding and decoding

In [46]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_id = vocab # mappings from tokens to token ids
        self.id_to_str = {index: item for index, item in enumerate(vocab)} # mappings from token ids to tokens

    def encode(self, text):
        preprocessed_text = re.split(r'(\s|--|[.,?:/;_!"()\'])', text)
        preprocessed_text = [item.strip() for item in preprocessed_text if item.strip()]
        return [self.str_to_id[item] for item in preprocessed_text]

    def decode(self, ids):
        text =  " ".join([self.id_to_str[id] for id in ids])
        text = re.sub(r'\s+([,.?!";:()\'])', r'\1', text)
        return text


In [47]:

tokenizer = SimpleTokenizerV1(vocab)

text = """ 
I found the couple at tea beneath their palm-trees; and Mrs. Gisburn's welcome was so genial that, in the ensuing weeks, I claimed it frequently. 
"""
encoded_text = tokenizer.encode(text)
print(encoded_text)

[53, 469, 988, 296, 180, 975, 215, 989, 751, 9, 157, 67, 7, 38, 2, 850, 1086, 1077, 908, 485, 987, 5, 568, 988, 393, 1085, 5, 53, 268, 585, 475, 7]


In [48]:
print(tokenizer.decode(encoded_text))

I found the couple at tea beneath their palm-trees; and Mrs. Gisburn' s welcome was so genial that, in the ensuing weeks, I claimed it frequently.


If the the text given consist of a word that is not in the vocabulary, it will lead to an error.
That is the reason why LLMs a re trained on a large corpus of text.

In [50]:
text = """ 
Hello World, How is this? This is a test!
"""
encoded_text = tokenizer.encode(text)
print(encoded_text)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/Users/ualguest/Desktop/Building_LLM/venv/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code
  File "/var/folders/3g/r3ht76p907vdqxh325xx85tm0000ks/T/ipykernel_21159/1300363719.py", line 4, in <module>
    encoded_text = tokenizer.encode(text)
                   ^^^^^^^^^^^^^^^^^^^^^^
  File "/var/folders/3g/r3ht76p907vdqxh325xx85tm0000ks/T/ipykernel_21159/3007820361.py", line 9, in encode
    return [self.str_to_id[item] for item in preprocessed_text]
            ~~~~~~~~~~~~~~^^^^^^
KeyError: 'Hello'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/ualguest/Desktop/Building_LLM/venv/lib/python3.12/site-packages/pygments/styles/__init__.py", line 45, in get_style_by_name
ModuleNotFoundError: No module named 'pygments.styles.default'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
 

## Special context tokens

LLMs use Special context tokens to handle words which are not in the vocabulary.

Adding two more tokens, <|unk|> and <|endoftext|> to the vocabulary.

In [51]:
all_words = sorted(set(preprocessed_text))
all_words.extend(['<|unk|>', '<|endoftext|>'])

vocab = {vocab: index for index, vocab in enumerate(all_words)}

In [52]:
vocab_size = len(vocab.items())
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 1132


In [53]:
for i, item in enumerate(vocab):
    print(f"({item}, {i})")


(!, 0)
(", 1)
(', 2)
((, 3)
(), 4)
(,, 5)
(--, 6)
(., 7)
(:, 8)
(;, 9)
(?, 10)
(A, 11)
(Ah, 12)
(Among, 13)
(And, 14)
(Are, 15)
(Arrt, 16)
(As, 17)
(At, 18)
(Be, 19)
(Begin, 20)
(Burlington, 21)
(But, 22)
(By, 23)
(Carlo, 24)
(Chicago, 25)
(Claude, 26)
(Come, 27)
(Croft, 28)
(Destroyed, 29)
(Devonshire, 30)
(Don, 31)
(Dubarry, 32)
(Emperors, 33)
(Florence, 34)
(For, 35)
(Gallery, 36)
(Gideon, 37)
(Gisburn, 38)
(Gisburns, 39)
(Grafton, 40)
(Greek, 41)
(Grindle, 42)
(Grindles, 43)
(HAD, 44)
(Had, 45)
(Hang, 46)
(Has, 47)
(He, 48)
(Her, 49)
(Hermia, 50)
(His, 51)
(How, 52)
(I, 53)
(If, 54)
(In, 55)
(It, 56)
(Jack, 57)
(Jove, 58)
(Just, 59)
(Lord, 60)
(Made, 61)
(Miss, 62)
(Money, 63)
(Monte, 64)
(Moon-dancers, 65)
(Mr, 66)
(Mrs, 67)
(My, 68)
(Never, 69)
(No, 70)
(Now, 71)
(Nutley, 72)
(Of, 73)
(Oh, 74)
(On, 75)
(Once, 76)
(Only, 77)
(Or, 78)
(Perhaps, 79)
(Poor, 80)
(Professional, 81)
(Renaissance, 82)
(Rickham, 83)
(Riviera, 84)
(Rome, 85)
(Russian, 86)
(Sevres, 87)
(She, 88)
(Stroud, 89

In [54]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_id = vocab
        self.id_to_str = {index: item for index, item in enumerate(vocab)}

    def encode(self, text):
        preprocessed_text = re.split(r'(\s|--|[.,?:/;_!"()\'])', text)
        preprocessed_text = [item.strip() for item in preprocessed_text if item.strip()] # remove white space
        preprocessed_text = [item if item in self.str_to_id else '<|unk|>' for item in preprocessed_text] # replace unknown words with <|unk|>
        token_ids = [self.str_to_id[item] for item in preprocessed_text]
        return token_ids

    def decode(self, ids):
        text = " ".join([self.id_to_str[id] for id in ids])
        text = re.sub(r'\s+([,.?!";:()\'])', r'\1', text)
        return text

In [55]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = """ 
Hello World, How is this? This is a test!
"""
text2 = """ 
I found the couple at tea beneath their palm-trees; and Mrs. Gisburn's welcome was so genial that, in the ensuing weeks, I claimed it frequently. 
"""

text = "<|endoftext|>".join((text1, text2))


In [56]:
print(text)

 
Hello World, How is this? This is a test!
<|endoftext|> 
I found the couple at tea beneath their palm-trees; and Mrs. Gisburn's welcome was so genial that, in the ensuing weeks, I claimed it frequently. 



In [57]:
encoded_text = tokenizer.encode(text)
print(encoded_text)

[1130, 1130, 5, 52, 584, 999, 10, 97, 584, 115, 1130, 0, 1131, 53, 469, 988, 296, 180, 975, 215, 989, 751, 9, 157, 67, 7, 38, 2, 850, 1086, 1077, 908, 485, 987, 5, 568, 988, 393, 1085, 5, 53, 268, 585, 475, 7]


In [58]:
print(tokenizer.decode(encoded_text))

<|unk|> <|unk|>, How is this? This is a <|unk|>! <|endoftext|> I found the couple at tea beneath their palm-trees; and Mrs. Gisburn' s welcome was so genial that, in the ensuing weeks, I claimed it frequently.


So far, we have discussed tokenization as an essential step in processing text as input to LLMs. Depending on the LLM, some researchers also consider additional special tokens such as the following:

[BOS] (beginning of sequence): This token marks the start of a text. It signifies to the LLM where a piece of content begins.

[EOS] (end of sequence): This token is positioned at the end of a text, and is especially useful when concatenating multiple unrelated texts, similar to <|endoftext|>. For instance, when combining two different Wikipedia articles or books, the [EOS] token indicates where one article ends and the next one begins.

[PAD] (padding): When training LLMs with batch sizes larger than one, the batch might contain texts of varying lengths. To ensure all texts have the same length, the shorter texts are extended or "padded" using the [PAD] token, up to the length of the longest text in the batch

Tokenzier used for GPT doesnt use [BOS] and [EOS] [PAD] <unk> tokens. It only uses <|endoftext|> token for simplicity.

GPT uses Byte Pair Encoding (BPE) for tokenization. (next notebook)