### Creating tokens

In [1]:
import re
class Vocabulary:
    """Build a token -> integer-id vocabulary from a UTF-8 text file.

    The file content is split on punctuation, double dashes and whitespace.
    The distinct tokens are sorted and numbered from 0, and the two special
    tokens ``UNKNOWN`` and ``END_OF_TEXT`` (in that order) receive the two
    highest ids.
    """

    # The capturing group makes re.split keep the delimiters, so punctuation
    # marks become tokens of their own.
    SPLIT_PATTERN = re.compile(r'([,.:;?!_"()\']|--|\s)')
    END_OF_TEXT = '<|endoftext|>'
    UNKNOWN = '<|unk|>'

    def __init__(self, filename):
        """Store the path of the source text; the file is read lazily.

        Args:
            filename: Path of the text file the vocabulary is built from.
        """
        self.filename = filename

    def __read_content__(self):
        """Return the entire content of ``self.filename`` decoded as UTF-8."""
        with open(self.filename, 'r', encoding='utf-8') as f:
            content = f.read()
        return content

    def create_vocabulary(self):
        """Tokenize the file and map every distinct token to an integer id.

        Returns:
            dict: ``{token: id}`` with contiguous ids ``0..len-1``. Regular
            tokens are numbered in sorted order; ``UNKNOWN`` and
            ``END_OF_TEXT`` are always the last two entries.
        """
        content = self.__read_content__()
        special_tokens = (self.UNKNOWN, self.END_OF_TEXT)

        stripped = (piece.strip() for piece in self.SPLIT_PATTERN.split(content))
        unique_tokens = {token for token in stripped if token}
        # The corpus may already contain a literal special marker. Listing it
        # twice would leave a hole in the id range (the dict keeps only the
        # later index), so drop it here and append it exactly once below.
        unique_tokens.difference_update(special_tokens)

        ordered_tokens = sorted(unique_tokens)
        ordered_tokens.extend(special_tokens)  # unknown and end-of-text markers
        return {token: idx for idx, token in enumerate(ordered_tokens)}


In [2]:
class TokenizerV1:
    """Word-level tokenizer backed by a ``{token: id}`` dictionary.

    The vocabulary is either supplied directly as a dict or built from a text
    file through ``Vocabulary``. Tokens missing from the vocabulary are
    encoded as the id of ``Vocabulary.UNKNOWN``.
    """

    ENCODE_SPLIT_PATTERN = re.compile(r'([,.:;?!_"()\']|--|\s)')
    # Whitespace in front of these marks is removed when decoding. ':' and ';'
    # are included because ENCODE_SPLIT_PATTERN splits them off as well.
    DECODE_SUB_PATTERN = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocabulary_or_file_name):
        """Set up the forward and reverse vocabularies.

        Args:
            vocabulary_or_file_name: Either a path (``str``) to a text file
                from which a vocabulary is created, or a ready-made
                ``{token: id}`` dict.

        Raises:
            ValueError: If the argument is neither a ``str`` nor a ``dict``.
        """
        if isinstance(vocabulary_or_file_name, str):
            creator = Vocabulary(vocabulary_or_file_name)
            self.vocabulary = creator.create_vocabulary()
        elif isinstance(vocabulary_or_file_name, dict):
            self.vocabulary = vocabulary_or_file_name
        else:
            raise ValueError("Vocabulary must be a filename or a dictionary.")
        self.reverse_vocabulary = {idx: token for token, idx in self.vocabulary.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            list: One id per non-blank token. Tokens absent from the
            vocabulary map to the id of ``Vocabulary.UNKNOWN``, or to ``-1``
            when the vocabulary has no such entry.
        """
        ids = []
        for piece in self.ENCODE_SPLIT_PATTERN.split(text):
            token = piece.strip()
            if not token:
                continue  # whitespace delimiters and empty split results
            if token not in self.vocabulary:
                token = Vocabulary.UNKNOWN
            ids.append(self.vocabulary.get(token, -1))
        return ids

    def decode(self, ids):
        """Convert token ids back into text.

        Args:
            ids: Iterable of token ids.

        Returns:
            str: The tokens joined by single spaces, with the whitespace in
            front of punctuation removed. Ids that are not in the vocabulary
            are skipped, so they do not leave doubled spaces behind.
        """
        known_tokens = (
            self.reverse_vocabulary[idx]
            for idx in ids
            if idx in self.reverse_vocabulary
        )
        text = ' '.join(known_tokens)
        return self.DECODE_SUB_PATTERN.sub(r'\1', text)
    

In [5]:
class TokenizerTeset:
    """Small harness that round-trips text through a ``TokenizerV1``."""

    def __init__(self, filename):
        """Create the tokenizer under test from ``filename``."""
        self.tokenizer = TokenizerV1(filename)

    def test_tokenizer(self, test_texts):
        """Encode and decode ``test_texts`` and print every stage.

        Args:
            test_texts: A single string, or an iterable of strings that are
                joined with the end-of-text marker (padded by one space on
                each side) before encoding.
        """
        if isinstance(test_texts, str):
            joined = test_texts
        else:
            separator = f' {Vocabulary.END_OF_TEXT} '
            joined = separator.join(test_texts)

        token_ids = self.tokenizer.encode(joined)
        round_trip = self.tokenizer.decode(token_ids)

        print(f"Original text: {joined}")
        print(f"Encoded tokens: {token_ids}")
        print(f"Decoded text: {round_trip}\n\n")

# Round-trip one plain string and two multi-segment inputs through the tokenizer.
tester = TokenizerTeset('verdict.txt')
sample_inputs = [
    """It's the last he painted, you know,"  Mrs. Gisburn said with pardonable pride.""",
    ["It's the last he painted, you know,", "Mrs. Gisburn said with pardonable pride."],
    ['Dawud says Hello', 'In the sunlit terraces of the palace.'],
]
for sample in sample_inputs:
    tester.test_tokenizer(sample)

Original text: It's the last he painted, you know,"  Mrs. Gisburn said with pardonable pride.
Encoded tokens: [56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
Decoded text: It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


Original text: It's the last he painted, you know, <|endoftext|> Mrs. Gisburn said with pardonable pride.
Encoded tokens: [56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1131, 67, 7, 38, 851, 1108, 754, 793, 7]
Decoded text: It' s the last he painted, you know, <|endoftext|> Mrs. Gisburn said with pardonable pride.


Original text: Dawud says Hello <|endoftext|> In the sunlit terraces of the palace.
Encoded tokens: [1130, 858, 1130, 1131, 55, 988, 956, 984, 722, 988, 1130, 7]
Decoded text: <|unk|> says <|unk|> <|endoftext|> In the sunlit terraces of the <|unk|>.




## BYTE PAIR ENCODING (BPE)

In [6]:
! pip3 install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting requests>=2.26.0 (from tiktoken)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests>=2.26.0->tiktoken)
  Downloading charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (35 kB)
Collecting idna<4,>=2.5 (from requests>=2.26.0->tiktoken)
  Downloading idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests>=2.26.0->tiktoken)
  Downloading urllib3-2.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests>=2.26.0->tiktoken)
  Downloading certifi-2025.4.26-py3-none-any.whl.metadata (2.5 kB)
Downloading tiktoken-0.9.0-cp312-cp312-manylinux_2_17_x8

In [28]:
import tiktoken
print(f"tiktoken version: {tiktoken.__version__}\n")

# Supported encodings - https://github.com/openai/tiktoken/blob/main/tiktoken/model.py
# Examples: gpt2, o200k_base, cl100k_base, r50k_base, p50k_base, p50k_edit
tokenizer = tiktoken.get_encoding("gpt2")

# Sample text with two '<|endoftext|>' markers and a made-up word
# ("SomeunkownPlace") that is unlikely to exist as a single vocabulary entry.
# NOTE(review): the adjacent literals are concatenated without separating
# spaces, so some sentences run together (e.g. "models.Tokenization") —
# confirm this is intended; the recorded output depends on it.
text =  ( "Hello, world! This is a test of the tiktoken library. <|endoftext|>"
            "It is designed to tokenize text efficiently for use with OpenAI's GPT models."
            "Tokenization is the process of converting text into tokens, which are the basic units of meaning. <|endoftext|>"
            "This library supports various encodings, including GPT-2 and GPT-3."
            "I'm from SomeunkownPlace, and I love coding!"
)

# Encode the text; '<|endoftext|>' is explicitly allowed as a special token
# (it shows up as id 50256 in the recorded output).
tokens = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
print(f'Encoding:\nNumber of tokens: {len(tokens)}\nTokens: {tokens}\n')

# Decode the ids back to text and check that the round trip is lossless.
decoded_t =  tokenizer.decode(tokens)
print(f'Decoding:\nDecoded text: {decoded_t}\nDecoded text matches original: {decoded_t == text}\n')

tiktoken version: 0.9.0

Encoding:
Number of tokens: 88
Tokens: [15496, 11, 995, 0, 770, 318, 257, 1332, 286, 262, 256, 1134, 30001, 5888, 13, 220, 50256, 1026, 318, 3562, 284, 11241, 1096, 2420, 18306, 329, 779, 351, 4946, 20185, 338, 402, 11571, 4981, 13, 30642, 1634, 318, 262, 1429, 286, 23202, 2420, 656, 16326, 11, 543, 389, 262, 4096, 4991, 286, 3616, 13, 220, 50256, 1212, 5888, 6971, 2972, 2207, 375, 654, 11, 1390, 402, 11571, 12, 17, 290, 402, 11571, 12, 18, 13, 40, 1101, 422, 2773, 2954, 593, 27271, 11, 290, 314, 1842, 19617, 0]

Decoding:
Decoded text: Hello, world! This is a test of the tiktoken library. <|endoftext|>It is designed to tokenize text efficiently for use with OpenAI's GPT models.Tokenization is the process of converting text into tokens, which are the basic units of meaning. <|endoftext|>This library supports various encodings, including GPT-2 and GPT-3.I'm from SomeunkownPlace, and I love coding!
Decoded text matches original: True

