### Creating tokens

In [7]:
# Read the entire contents of verdict.txt into a single string.
with open('verdict.txt','r', encoding='utf-8') as f:
    raw_text = f.read()
# NOTE(review): "Verdit" in the message below looks like a typo for "Verdict".
print(f'Verdit has {len(raw_text)} characters')
print(raw_text[:99])  # Print the first 99 characters for a quick check

Verdit has 20479 characters
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [None]:
import re
def convert_text_to_tokens(text):
    """Break `text` into a flat list of word and punctuation tokens.

    The text is split on single whitespace characters, on the double dash
    ``--`` and on each of the characters ``, . : ; ? ! _ " ( ) '``. Because
    the split pattern is a capturing group, the punctuation delimiters are
    returned as tokens of their own. Whitespace-only and empty pieces are
    discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty strings, in their original order.
    """
    pieces = re.split(r'([,.:;?!_"()\']|--|\s)', text)
    kept = []
    for piece in pieces:
        trimmed = piece.strip()
        # Skip the empty strings and bare whitespace produced by the split.
        if trimmed:
            kept.append(trimmed)
    return kept

# Tokenize raw_text into a flat list of word and punctuation tokens.
tokens = convert_text_to_tokens(raw_text)

print(f'The text has been broken down into {len(tokens)} tokens.')
print(tokens[:99])  # Print the first 99 tokens for a quick check

The text has been broken down into 4690 tokens.
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself', 'in', 'a', 'villa', 'on', 'the', 'Riviera', '.', '(', 'Though', 'I', 'rather', 'thought', 'it', 'would', 'have', 'been', 'Rome', 'or', 'Florence', '.', ')', '"', 'The', 'height', 'of', 'his', 'glory', '"', '--', 'that', 'was', 'what', 'the', 'women', 'called', 'it', '.', 'I', 'can', 'hear', 'Mrs', '.', 'Gideon', 'Thwing', '--', 'his', 'last', 'Chicago', 'sitter']


#### Assign token ID

In [40]:
def create_vocabulary(tokens):
    """Build a token -> integer ID mapping from an iterable of tokens.

    The distinct tokens are sorted and numbered from 0 in that order. Two
    special entries, '<|endoftext|>' and '<|unk|>', are then appended after
    the sorted tokens and receive the next two IDs.

    Args:
        tokens: Iterable of token strings (duplicates are allowed).

    Returns:
        A dict mapping each token string to its integer ID.
    """
    ordered = sorted(set(tokens)) + ['<|endoftext|>', '<|unk|>']
    return dict(zip(ordered, range(len(ordered))))

# Build the token -> ID mapping from the token list.
vocabulary = create_vocabulary(tokens)
print(f'Vocabulary size: {len(vocabulary)}')
# Spot-check the entries at positions 9 through 18; each `token` here is a
# (token string, ID) pair taken from vocabulary.items().
for token in list(vocabulary.items())[9:19]:
    print(f'{token}')


Vocabulary size: 1132
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)


In [None]:
class TokenizerV1:
    """Word-level tokenizer backed by a fixed token -> ID vocabulary.

    Text is split into word and punctuation tokens, and each token is looked
    up in the vocabulary passed to the constructor. Tokens missing from the
    vocabulary are encoded as the '<|unk|>' entry.
    """

    def __init__(self, vocabulary):
        """Store the vocabulary and build the inverse (ID -> token) mapping.

        Args:
            vocabulary: dict mapping token strings to integer IDs.
        """
        self.vocabulary = vocabulary
        self.reverse_vocabulary = {idx: token for token, idx in vocabulary.items()}

    def encode(self, text):
        """Convert `text` into a list of token IDs.

        Args:
            text: The string to encode.

        Returns:
            A list of integer IDs, one per token. Tokens that are not in the
            vocabulary get the ID of '<|unk|>'; if the vocabulary has no
            '<|unk|>' entry either, they get -1.
        """
        preprocessed = re.split(r'([,.:;?!_"()\']|--|\s)', text)
        preprocessed = [word.strip() for word in preprocessed if word.strip()]
        # Keep known tokens and substitute '<|unk|>' for unknown ones. The
        # lookup goes through self.vocabulary so the instance does not depend
        # on a module-level variable that happens to share the name.
        preprocessed = [
            item if item in self.vocabulary else '<|unk|>'
            for item in preprocessed
        ]
        return [self.vocabulary.get(word, -1) for word in preprocessed]

    def decode(self, ids):
        """Convert a sequence of token IDs back into text.

        Tokens are joined with single spaces, then whitespace directly in
        front of any of ``, . ? ! " ( ) '`` is removed. IDs that are not in
        the vocabulary contribute an empty string.

        Args:
            ids: Iterable of integer token IDs.

        Returns:
            The reconstructed string.
        """
        text = ' '.join(self.reverse_vocabulary.get(idx, '') for idx in ids)
        # replace spaces before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text
    

In [42]:
# Tokenizer instance used by the round-trip demo below.
tokenizer = TokenizerV1(vocabulary)


def test_tokenizer(test_text:str):
    """Encode `test_text`, decode the IDs again, and print all three forms.

    Uses the module-level `tokenizer`. Nothing is returned; the original
    text, the encoded ID list and the decoded text are written to stdout.
    """
    token_ids = tokenizer.encode(test_text)
    round_trip = tokenizer.decode(token_ids)

    print(f"Original text: {test_text}")
    print(f"Encoded tokens: {token_ids}")
    print(f"Decoded text: {round_trip}")


test_tokenizer("""It's the last he painted, you know,"  Mrs. Gisburn said with pardonable pride.""")

Original text: It's the last he painted, you know,"  Mrs. Gisburn said with pardonable pride.
Encoded tokens: [56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
Decoded text: It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


In [None]:
# special context tokens