### Load dataset

In [1]:
with open("data/the-verdict.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()
    
print ("Number of characters:", len("".join(lines)))
print("Number of lines:", len(lines))
print("Number of words:", sum(len(line.split()) for line in lines))
print("Number of paragraphs:", sum(1 for line in lines if line.strip() == ""))
print("Number of sentences:", sum(line.count('.') + line.count('!') + line.count('?') for line in lines))

Number of characters: 20479
Number of lines: 165
Number of words: 3634
Number of paragraphs: 82
Number of sentences: 260


### Tokenization
**Goal:** Tokenise 20479 characters

In [2]:
import re # Tokenization
# Tokenize the text into words
# This regex matches words, including those with apostrophes and hyphens

text = "".join(lines)
tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
tokens = [token for token in tokens if token.strip()]  # Remove empty tokens
print("Number of tokens:", len(tokens))

Number of tokens: 4690


**Goal:** Converting tokens into token IDs
1. Build vocabulary: List of tokens sorted in an alphabetical order
2. Assign a number to each token

In [3]:
all_words = sorted(set(tokens))
print("Vocabulary size:", len(all_words))

Vocabulary size: 1130


**Goal:** Create vocabulary itself.

In [4]:
vocab= {token: integer for integer, token in enumerate(all_words)}
for i,item in enumerate(vocab.items()):
    print(f"{i}: {item}")
    if i>=10:
        break

0: ('!', 0)
1: ('"', 1)
2: ("'", 2)
3: ('(', 3)
4: (')', 4)
5: (',', 5)
6: ('--', 6)
7: ('.', 7)
8: (':', 8)
9: (';', 9)
10: ('?', 10)


**Goal:** Create a tokenizer class
1. Encoder
2. Decoder

In [5]:
class TokenizerV1:
    def __init__(self, vocab): # Initialize the tokenizer with a vocabulary
        """        Args:
            vocab (dict): A dictionary mapping tokens to unique integer IDs.
        """
        self.str_to_int = vocab
        self.int_to_str = {v: k for k, v in vocab.items()}

    def encode(self, text): # Tokenization
        # Split the text into tokens using regex
        # This regex matches words, including those with apostrophes and hyphens
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [token for token in tokens if token.strip()]
        token_ids = [self.str_to_int.get(token, -1) for token in tokens]
        return [token_id for token_id in token_ids if token_id != -1]

    def decode(self, token_ids):
        text = ' '.join(self.int_to_str.get(token_id, '') for token_id in token_ids)
        return text.replace('  ', ' ').strip() # Replace double spaces with single space

In [6]:
tokenizer = TokenizerV1(vocab)
# Example usage
text = "Hello, world! This is a test."
encoded = tokenizer.encode(text)
print("Encoded:", encoded)
decoded = tokenizer.decode(encoded)
print("Decoded:", decoded)
print("Original text:", text)

Encoded: [5, 0, 97, 584, 115, 7]
Decoded: , ! This is a .
Original text: Hello, world! This is a test.


### Adding special context tokens
**Handle unknown tokens or unknown words**<br>
Include 2 special tokens <|unknown|> and <|endoftext|> to the exsisting vocabulary

In [7]:
print("Vocabulary size:", len(all_words))
all_tokens = sorted(list(set(tokens)))  
all_tokens.extend(['<|unk|>', '<|endoftext|>'])  # Add special tokens
vocab = {token: integer for integer, token in enumerate(all_tokens)}
print("New vocabulary size:", len(vocab.items()))  # Check the size of the vocabulary

Vocabulary size: 1130
New vocabulary size: 1132


In [8]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(f"{i}: {item}")
    if i >= 5:
        break

0: ('younger', 1127)
1: ('your', 1128)
2: ('yourself', 1129)
3: ('<|unk|>', 1130)
4: ('<|endoftext|>', 1131)


**Goal:** Extend the previous tokeniser class
1. Replace unknown/new words with <|unk|>
2. Replace spaces

In [9]:
class TokenizerV2:
    def __init__(self, vocab): # Initialize the tokenizer with a vocabulary
        """        Args:
            vocab (dict): A dictionary mapping tokens to unique integer IDs.
        """
        self.str_to_int = vocab
        self.int_to_str = {v: k for k, v in vocab.items()}
    
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [
            item if item in self.str_to_int 
            else "<|unk|>" for item in preprocessed
        ]

        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

Test the tokeniser

In [10]:
tokenizer = TokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))
print("Original text:", text)
encoded = tokenizer.encode(text)
print("Encoded:", encoded)
decoded = tokenizer.decode(encoded)
print("Decoded:", decoded)
print("Original text:", text)

Original text: Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
Encoded: [1130, 5, 355, 1126, 628, 975, 10, 1131, 55, 988, 956, 984, 722, 988, 1130, 7]
Decoded: <|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.
Original text: Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


### Byte-pair encoding (BPE)
This encoding is used for training LLMs

In [13]:
import importlib
import tiktoken

print("tiktoken version:", importlib.metadata.version("tiktoken"))

tiktoken version: 0.9.0


In [14]:
tokenizer = tiktoken.get_encoding("gpt2")

In [15]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
     "of someunknownPlace."
)

integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})

print(integers)

[15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]


**Test the encoder decoder model**

In [16]:
integers = tokenizer.encode("Akwirw ier")
print(integers)

strings = tokenizer.decode(integers)
print(strings)

[33901, 86, 343, 86, 220, 959]
Akwirw ier
