In [4]:
import numpy
import torch
import re

The corpus file `the-verdict.txt` is downloaded using the `corpus_file_download.py` script.
It will serve as our training data for the exercises.

In [3]:

# Read the entire corpus into memory as a single string.
with open("the-verdict.txt", mode="r", encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()

# Report the corpus size and preview its first 42 characters.
num_chars = len(raw_text)
print(f"Length of text: {num_chars} characters")
print(raw_text[:42])


Length of text: 20479 characters
I HAD always thought Jack Gisburn rather a


In [6]:
# Split on single whitespace characters; the capturing group makes re.split
# return each whitespace character as its own list element.
text = "Hello, world. This is a test."
whitespace_pattern = re.compile(r'(\s)')
result = whitespace_pattern.split(text)
print(result)

['Hello,', ' ', 'world.', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test.']


In [8]:
# Treat commas and periods as delimiters in addition to whitespace. The capturing
# group keeps every delimiter in the output; two adjacent delimiters (e.g. ", ")
# leave an empty string between them, which is visible in the printed list.
text = "Hello, world. This is a test."
delimiter_pattern = re.compile(r'([,.]|\s)')
result = delimiter_pattern.split(text)
print(result)

['Hello', ',', '', ' ', 'world', '.', '', ' ', 'This', ' ', 'is', ' ', 'a', ' ', 'test', '.', '']


In [11]:
# More comprehensive tokenizer: punctuation, quotes, "--" and whitespace each become
# a separate token, and empty / whitespace-only pieces are dropped afterwards.
# The character class includes the apostrophe so that this demo uses exactly the same
# pattern as the corpus tokenization cell (where "It's" splits into It / ' / s).
text = "Hello, world. Is this-- a test?"
token_pattern = r'([,.:;?_!"()\'`]|--|\s)'
result = re.split(token_pattern, text)
result = [tok.strip() for tok in result if tok.strip()]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


In [39]:
# Tokenize the real corpus, then report the token count, the first tokens,
# and the resulting vocabulary.
with open("the-verdict.txt", "r", encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()

# The capturing group makes re.split return punctuation, "--" and whitespace as
# pieces of their own; each piece is stripped and the ones that end up empty are dropped.
pieces = re.split(r'([,.:;?_!"()\'`]|--|\s)', raw_text)
preprocessed = []
for piece in pieces:
    stripped = piece.strip()
    if stripped:
        preprocessed.append(stripped)
print(len(preprocessed))
print(preprocessed[:30])

# Vocabulary: the unique tokens in sorted order.
unique_tokens = set(preprocessed)
all_words = sorted(unique_tokens)
vocab_size = len(all_words)
print(f"Vocab size: {vocab_size}")
print(all_words[:20])


4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']
Vocab size: 1130
['!', '"', "'", '(', ')', ',', '--', '.', ':', ';', '?', 'A', 'Ah', 'Among', 'And', 'Are', 'Arrt', 'As', 'At', 'Be']


In [47]:
# Map every vocabulary token to its position in the sorted list, and build the
# reverse (position -> token) lookup as well.
vocab = dict(zip(all_words, range(len(all_words))))
inverse_vocab = dict(enumerate(all_words))

# Preview the first 18 (token, id) pairs.
for pair in list(vocab.items())[:18]:
    print(pair)


('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)


In [None]:
import simple_tokenizer_v1

# To pick up external edits to the module without restarting the kernel, reload it:
# import importlib
# importlib.reload(simple_tokenizer_v1)

# Encode a short two-line passage with a tokenizer built from the `vocab` mapping.
text = """"It's the last he painted, you know,"
          Mrs. Gisburn said with pardonable pride."""
tokenizer = simple_tokenizer_v1.SimpleTokenizerV1(vocab)
ids = tokenizer.encode(text)
print(ids)
    

encoded tokens: ['"', 'It', "'", 's', 'the', 'last', 'he', 'painted', ',', 'you', 'know', ',', '"', 'Mrs', '.', 'Gisburn', 'said', 'with', 'pardonable', 'pride', '.']
[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]


Now we will check what happens when we use an unseen token (not present in the corpus).

In [56]:
# Encoding a word that is missing from the vocabulary makes this tokenizer raise
# KeyError. The error is the point of this cell, so it is caught and displayed
# instead of being left uncaught, which would abort "Restart & Run All".
text = "This is an unseenword test."
try:
    ids = tokenizer.encode(text)
    print(ids)
except KeyError as err:
    # str(KeyError) is the repr of the missing key, so this prints e.g. KeyError: 'unseenword'
    print(f"KeyError: {err}")

encoded tokens: ['This', 'is', 'an', 'unseenword', 'test', '.']


KeyError: 'unseenword'

The next tokenizer version supports unknown and end-of-text tokens, so now it
can handle unseen words and separate token blocks.

In [57]:
import simple_tokenizer_v2

# Round-trip the same out-of-vocabulary sentence through the v2 tokenizer:
# encode it to ids, then decode those ids back into text.
text = "This is an unseenword test."
tokenizer_v2 = simple_tokenizer_v2.SimpleTokenizerV2(vocab)

ids = tokenizer_v2.encode(text)
print("ids:", ids)

decoded_text = tokenizer_v2.decode(ids)
print("decoded text:", decoded_text)


encoded tokens: ['This', 'is', 'an', '<|unk|>', '<|unk|>', '.']
ids: [97, 584, 156, 1130, 1130, 7]
decoded text: This is an <|unk|> <|unk|>.


In [59]:
import simple_tokenizer_v2

# Join two unrelated sentences with the end-of-text marker between them, then
# encode the combined text and decode it back.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))

tokenizer_v2 = simple_tokenizer_v2.SimpleTokenizerV2(vocab)

ids = tokenizer_v2.encode(text)
print("ids:", ids)

decoded_text = tokenizer_v2.decode(ids)
print("decoded text:", decoded_text)

encoded tokens: ['<|unk|>', ',', 'do', 'you', 'like', 'tea', '?', '<|endoftext|>', 'In', 'the', 'sunlit', 'terraces', 'of', 'the', '<|unk|>', '.']
ids: [1130, 5, 355, 1126, 628, 975, 10, 1131, 55, 988, 956, 984, 722, 988, 1130, 7]
decoded text: <|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.
