In [1]:
# Read the whole book into a single string.
with open("the-verdict.txt", encoding="utf-8") as fd:
    text = fd.read()

print("Total number of characters: ", len(text))
# The slice end is exclusive, so [:100] yields exactly the 100 characters
# that the label promises (the previous [:99] showed only 99).
print("Sample Text(first 100 characters): ", text[:100])

Total number of characters:  20479
Sample Text(first 100 characters):  I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
# Tokenize the raw text: split on whitespace, on the punctuation characters
# , . : ; ? _ ! " ( ) ' and on the double dash "--".
# The pattern is a capturing group, so re.split keeps the delimiters in the
# result and punctuation marks become tokens of their own.
import re

expr = r'([,.:;?_!"()\'\s]|--)'

# Split and, in the same pass, discard empty strings and whitespace-only pieces.
preproccessed_text = [piece for piece in re.split(expr, text) if piece.strip()]

print("Total length of preprocced text: ", len(preproccessed_text))
print("Sample preproccessed text: ", preproccessed_text[:50])

Total length of preprocced text:  4690
Sample preproccessed text:  ['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in', 'the', 'height', 'of', 'his', 'glory', ',', 'he', 'had', 'dropped', 'his', 'painting', ',', 'married', 'a', 'rich', 'widow', ',', 'and', 'established', 'himself']


In [3]:

# Convert tokens to token IDs

# Unique tokens in sorted order; a token's position in this list is its ID.
sorted_tokens = list(set(preproccessed_text))
sorted_tokens.sort()
print("Vocabulary size: ", len(sorted_tokens))

# Build the vocabulary: a mapping from each token string to its integer ID.
vocabulary = dict(zip(sorted_tokens, range(len(sorted_tokens))))

# Preview only the entries whose ID is below 10.
print('Sample vocabulary:')
for token, token_id in vocabulary.items():
    if token_id >= 10:
        continue
    print(f'({token}, {token_id})')


class SimpleTokenizerV1:
    """Minimal word-level tokenizer driven by a fixed vocabulary.

    Text is split on whitespace, on the punctuation characters
    ``, . : ; ? _ ! " ( ) '`` and on ``--``; every non-blank piece is then
    looked up in the vocabulary.

    Args:
        vocabulary: Mapping from token string to integer token ID.
    """

    # Delimiters are captured so that re.split keeps punctuation as tokens.
    _SPLIT_PATTERN = r'([,.:;?_!"()\'\s]|--)'
    # Whitespace that precedes a punctuation token. ':' and ';' are included
    # because encode() emits them as separate tokens; without them decode()
    # would turn "a; b" into "a ; b".
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocabulary):
        self.str_to_int = vocabulary
        # Inverse mapping (ID -> token) used by decode().
        self.int_to_str = { token_id: token for token, token_id in vocabulary.items() }

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: The string to tokenize.

        Returns:
            A list of integer token IDs, one per non-blank token.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        tokens = [piece for piece in re.split(self._SPLIT_PATTERN, text) if piece.strip()]
        return [ self.str_to_int[token] for token in tokens ]

    def decode(self, ids):
        """Convert a sequence of token IDs back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation marks is removed.

        Args:
            ids: Iterable of integer token IDs.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        text = " ".join([ self.int_to_str[id_] for id_ in ids ])
        # Remove the space that the join inserted before punctuation marks.
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

# Build a V1 tokenizer from the vocabulary and encode `text` into token IDs.
# NOTE(review): assumes `text` still holds the corpus the vocabulary was built
# from; encode() raises KeyError for any token that is missing from it.
tokenizer = SimpleTokenizerV1(vocabulary)
token_ids = tokenizer.encode(text)

Vocabulary size:  1130
Sample vocabulary:
(!, 0)
(", 1)
(', 2)
((, 3)
(), 4)
(,, 5)
(--, 6)
(., 7)
(:, 8)
(;, 9)


In [4]:
# Special context tokens: <|unk|> stands in for words that are not part of
# the vocabulary; <|endoftext|> is a marker placed between joined texts.
special_tokens = ["<|endoftext|>", "<|unk|>"]

# Append only the tokens that are still missing, so re-running this cell does
# not add duplicates. Duplicates would map each special token to its last
# position and leave earlier IDs without any token.
sorted_tokens.extend([tok for tok in special_tokens if tok not in sorted_tokens])
print(len(sorted_tokens))
vocabulary = { token: token_id for token_id, token in enumerate(sorted_tokens) }

# Show the last five entries, which include the special tokens just added.
for entry in list(vocabulary.items())[-5:]:
    print(entry)


class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Text is split on whitespace, on the punctuation characters
    ``, . : ; ? _ ! " ( ) '`` and on ``--``; every non-blank piece is then
    looked up in the vocabulary, falling back to the ``<|unk|>`` entry.

    Args:
        vocabulary: Mapping from token string to integer token ID. It must
            contain ``'<|unk|>'`` for unknown tokens to be encodable.
    """

    # Delimiters are captured so that re.split keeps punctuation as tokens.
    _SPLIT_PATTERN = r'([,.:;?_!"()\'\s]|--)'
    # Whitespace that precedes a punctuation token. ':' and ';' are included
    # because encode() emits them as separate tokens; without them decode()
    # would turn "a; b" into "a ; b".
    _SPACE_BEFORE_PUNCT = r'\s+([,.:;?!"()\'])'

    def __init__(self, vocabulary):
        self.str_to_int = vocabulary
        # Inverse mapping (ID -> token) used by decode().
        self.int_to_str = { token_id: token for token, token_id in vocabulary.items() }

    def encode(self, text):
        """Convert ``text`` into a list of token IDs.

        Args:
            text: The string to tokenize.

        Returns:
            A list of integer token IDs; tokens missing from the vocabulary
            are replaced by the ID of ``'<|unk|>'``.

        Raises:
            KeyError: If an unknown token is met and the vocabulary has no
                ``'<|unk|>'`` entry.
        """
        vocab = self.str_to_int
        tokens = [piece for piece in re.split(self._SPLIT_PATTERN, text) if piece.strip()]
        # The '<|unk|>' lookup stays lazy: it only happens for unknown tokens.
        return [ vocab[token] if token in vocab else vocab['<|unk|>'] for token in tokens ]

    def decode(self, ids):
        """Convert a sequence of token IDs back into a string.

        Tokens are joined with single spaces, then the space in front of
        punctuation marks is removed.

        Args:
            ids: Iterable of integer token IDs.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an ID is not present in the vocabulary.
        """
        text = " ".join([ self.int_to_str[id_] for id_ in ids ])
        # Remove the space that the join inserted before punctuation marks.
        return re.sub(self._SPACE_BEFORE_PUNCT, r'\1', text)

# Two unrelated sentences, joined by the <|endoftext|> marker, to exercise the
# special tokens: words that are not in the vocabulary come back as <|unk|>.
first_phrase = "Hello! do you like tea?"
second_phrase = "In the sunlit terraces of the palace."
text = f"{first_phrase} <|endoftext|> {second_phrase}"

tokenizer = SimpleTokenizerV2(vocabulary)
token_ids = tokenizer.encode(text)
print(token_ids)
# Last expression of the cell, so the notebook displays the decoded string.
tokenizer.decode(token_ids)

1132
('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)
[1131, 0, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


'<|unk|>! do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

In [11]:
# Use Tiktoken
import tiktoken

# Load the GPT-2 encoding shipped with tiktoken.
tokenizer = tiktoken.get_encoding("gpt2")

# Sample input containing the special marker and a made-up word.
text = "Hello! do you like tea? <|endoftext|> In the sunlit terraces of the of someunknownplace."

# The special marker has to be allowed explicitly through `allowed_special`.
token_ids = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(token_ids)

# Decode the IDs again to check the round trip.
print(tokenizer.decode(token_ids))

[15496, 0, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 286, 262, 286, 617, 34680, 5372, 13]
Hello! do you like tea? <|endoftext|> In the sunlit terraces of the of someunknownplace.
