In [1]:
import os
import urllib.request
import re

In [2]:
file_path = "the-verdict.txt"

# Fetch the text only when there is no local copy yet.
if not os.path.isfile(file_path):
    url = ("https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/refs/heads/main/ch02/01_main-chapter-code/the-verdict.txt")
    print("Downloading data :", file_path, "\nat :", url)
    # Download to a temporary sibling and move it into place only on success.
    # Writing straight to file_path would leave a truncated file behind after
    # an interrupted download, and the isfile() guard above would then accept
    # that truncated file on every later run.
    partial_path = file_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, file_path)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)


In [3]:
# Read the whole file into memory as a single string.
with open(file_path, mode="r", encoding="utf-8") as text_file:
    raw_text = text_file.read()

# Sanity checks: overall size, the Python type, and the opening characters.
print("Total number of characters :", len(raw_text))
print("type :", type(raw_text))
print("sample:", raw_text[:86])

Total number of characters : 20479
type : <class 'str'>
sample: I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--


In [4]:
# Split on punctuation, "--" and whitespace. The capturing group makes
# re.split return the delimiters too, so punctuation marks become tokens.
split_pieces = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
# Strip every piece and drop the ones that end up empty (whitespace
# delimiters and the empty strings between adjacent delimiters).
preprocessed = list(filter(None, map(str.strip, split_pieces)))
print(len(preprocessed), preprocessed[:10])

4690 ['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius']


In [5]:
# Deduplicate the tokens, then order them so that ids are reproducible.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)
print(vocab_size)

1130


In [6]:
# Map each token to its position in the sorted token list.
vocab = {token: token_id for token_id, token in enumerate(all_words)}

In [35]:
class TokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on whitespace, ``--`` and the punctuation characters
    ``, . : ; ? _ ! " ( ) '``; whitespace is discarded and every other
    delimiter becomes a token of its own. Tokens that are not in the
    vocabulary are encoded with the id of ``<|unk|>``.
    """

    # Fallback token used for words that are not in the vocabulary.
    UNK_TOKEN = "<|unk|>"
    # The capturing group makes split() return the delimiters as well.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # Whitespace sitting directly in front of a punctuation mark.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab: dict[str, int]):
        """Store ``vocab`` and build the reverse id-to-token mapping.

        Args:
            vocab: Mapping from token string to integer id. The dict is
                stored by reference, not copied.
        """
        self.str_to_int: dict[str, int] = vocab
        self.int_to_str: dict[int, str] = {
            token_id: token for token, token_id in vocab.items()
        }

    def encode(self, text: str) -> list[int]:
        """Convert ``text`` into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            One id per token, in order of appearance. Empty or
            whitespace-only text yields an empty list.

        Raises:
            KeyError: If a token is missing from the vocabulary and the
                vocabulary has no ``<|unk|>`` entry to fall back on.
        """
        stripped_pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        tokens = [piece for piece in stripped_pieces if piece]

        # None when the vocabulary was built without the fallback token.
        unk_id = self.str_to_int.get(self.UNK_TOKEN)

        ids = []
        for token in tokens:
            token_id = self.str_to_int.get(token, unk_id)
            if token_id is None:
                # Name the token that failed instead of surfacing a bare
                # KeyError on the fallback token.
                raise KeyError(
                    f"token {token!r} is not in the vocabulary and the "
                    f"vocabulary has no {self.UNK_TOKEN!r} entry to fall back on"
                )
            ids.append(token_id)

        return ids

    def decode(self, ids: list[int]) -> str:
        """Convert token ids back into a string.

        Tokens are joined with single spaces, then the whitespace in front
        of ``, . : ; ? ! " ( ) '`` is removed. The original spacing and line
        breaks are not restored, and ``--`` and ``_`` keep their surrounding
        spaces.

        Args:
            ids: Token ids as produced by ``encode``.

        Returns:
            The reconstructed text.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[token_id] for token_id in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

In [36]:
tokenizer = TokenizerV1(vocab)
text = "great surprise to me to hear that, in the height of"
# Round trip: encode to ids, then decode the ids back to text.
round_trip_ids = tokenizer.encode(text)
tokenizer.decode(round_trip_ids)

'great surprise to me to hear that, in the height of'

In [37]:
# Append the two special tokens after the sorted corpus tokens so they
# receive the two highest ids.
all_tokens = [*sorted(set(preprocessed)), "<|endoftext|>", "<|unk|>"]

vocab = {entry: position for position, entry in enumerate(all_tokens)}
print(len(vocab))

1132


In [40]:
# Two independent snippets; both use words that are not in the corpus.
text1 = "Hello."    # unseen word
text2 = "Cool dog." # unseen words
# Join the snippets with the end-of-text marker between them.
doc_separator = " <|endoftext|> "
text = doc_separator.join((text1, text2))
text

'Hello. <|endoftext|> Cool dog.'

In [41]:
# TokenizerV1 builds its id-to-token mapping at construction time, so a new
# instance is created for the current vocab.
tokenizer = TokenizerV1(vocab)
encoded_ids = tokenizer.encode(text)
print(encoded_ids, "->", tokenizer.decode(encoded_ids))

[1131, 7, 1130, 1131, 1131, 7] -> <|unk|>. <|endoftext|> <|unk|> <|unk|>.
