In [1]:
import re

In [3]:
# Read the whole short story into memory as a single string.
with open('../../the-verdict.txt', mode='r', encoding='utf-8') as story_file:
    text = story_file.read()

# Report how many characters were loaded.
print("Total Number of Characters:", len(text))

Total Number of Characters: 20480


### Creating a basic tokenizer

In [4]:
test_text = "Hello, world. Is this-- a test?"

# The capturing group makes re.split keep the delimiters (punctuation, "--",
# whitespace); pieces that are empty or whitespace-only are then discarded.
result = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', test_text)
    if piece.strip()
]
print(result)

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']


### Using the basic tokenizer on the text file

In [5]:
# Tokenize the loaded text: split on punctuation, "--" and whitespace (the
# capturing group keeps the delimiters), then drop empty/whitespace-only pieces.
preprocessed = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text)
    if piece.strip()
]

# Token count followed by a short preview of the first tokens.
print(len(preprocessed))
print(preprocessed[:15])

4690
['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow']


### Extracting only unique tokens and sorting them alphabetically to determine the vocabulary size

In [6]:
# Deduplicate the tokens, then sort them in place to get an ordered word list.
all_words = list(set(preprocessed))
all_words.sort()
len(all_words)

1130

### Creating a vocabulary and printing its first 21 entries for illustration purposes.

In [6]:
# Pair every token in all_words with its position, giving a token -> id mapping.
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the mapping: print entries until the one at index 20 has been shown.
for i, item in enumerate(vocab.items()):
    print(item)
    if i == 20:
        break


('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)
('A', 11)
('Ah', 12)
('Among', 13)
('And', 14)
('Are', 15)
('Arrt', 16)
('As', 17)
('At', 18)
('Be', 19)
('Begin', 20)


### Implementing a full simple text tokenizer (Encoder and Decoder)

In [7]:
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on punctuation, double dashes and whitespace. Punctuation
    and "--" are kept as their own tokens, whitespace is discarded. This
    version has no fallback for unknown tokens: encoding a token that is not
    in the vocabulary raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build its inverse mapping.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.token_to_id = vocab
        # Inverse lookup used by decode(); `token_id` avoids shadowing the builtin `id`.
        self.id_to_token = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Args:
            text: string to tokenize.

        Returns:
            List of integer ids, one per token, in order of appearance.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        # The capturing group makes re.split return the delimiters as well.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        # Drop empty strings and whitespace-only pieces.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]

        return [self.token_to_id[word] for word in preprocessed]

    def decode(self, ids):
        """Convert a list of token ids back into a string.

        Args:
            ids: iterable of integer ids produced by ``encode``.

        Returns:
            The tokens joined by single spaces, with the space in front of
            punctuation removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        text = " ".join([self.id_to_token[i] for i in ids])

        # Remove the space that the join inserted before punctuation. ':' and ';'
        # are included because encode() splits on them too; without them
        # "a: b" would round-trip to "a : b".
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

### Encoding and decoding text using the SimpleTokenizer

In [8]:
tokenizer = SimpleTokenizerV1(vocab)

# Sample text to run through the tokenizer.
text = """"It's the last he painted, you know,"
Mrs. Gisburn said with pardonable pride."""

# Text -> ids
ids = tokenizer.encode(text)
print(ids)

# ids -> text
decoded_text = tokenizer.decode(ids)
print(decoded_text)

[1, 56, 2, 850, 988, 602, 533, 746, 5, 1126, 596, 5, 1, 67, 7, 38, 851, 1108, 754, 793, 7]
" It' s the last he painted, you know," Mrs. Gisburn said with pardonable pride.


### Applying the tokenizer to a new text

In [9]:
text = "Hello, do you want some tea"

# Encoding this text raises KeyError because the tokenizer's vocabulary has no
# entry for "Hello". The exception is the point of this cell, so catch it and
# display it instead of letting it abort a "Restart & Run All".
try:
    print(tokenizer.encode(text))
except KeyError as err:
    print("KeyError:", err)

KeyError: 'Hello'

`KeyError: 'Hello'` is raised because the word "Hello" does not occur in the short story “The Verdict” and is therefore not contained in the vocabulary.

### Adding special context tokens

In [12]:
# Append the two special tokens after the existing words.
all_tokens = [*all_words, '<|endoftext|>', '<|unk|>']
print(all_tokens[-10:])
len(all_tokens)

['year', 'years', 'yellow', 'yet', 'you', 'younger', 'your', 'yourself', '<|endoftext|>', '<|unk|>']


1132

### Updating the Vocab with the special tokens

In [13]:
# Rebuild the token -> id mapping from all_tokens, pairing each token with its position.
vocab = dict(zip(all_tokens, range(len(all_tokens))))

In [14]:
# Show the last five (token, id) pairs of the vocabulary.
tail_entries = list(vocab.items())[-5:]
for i, item in enumerate(tail_entries):
    print(item)

('younger', 1127)
('your', 1128)
('yourself', 1129)
('<|endoftext|>', 1130)
('<|unk|>', 1131)


### Updating the simple tokenizer to handle unknown words

In [33]:
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    Splitting follows the same rule in both directions: punctuation and "--"
    become separate tokens, whitespace is discarded.
    """

    def __init__(self, vocab):
        """Keep the token -> id mapping and derive the id -> token mapping.

        Args:
            vocab: dict mapping token strings to integer ids.
        """
        self.token_to_id = vocab
        self.id_to_token = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Return the list of token ids for ``text``.

        Tokens that are not in the vocabulary are looked up as ``<|unk|>``;
        if the vocabulary lacks ``<|unk|>`` that lookup raises ``KeyError``.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                # Empty or whitespace-only piece produced by the split.
                continue
            if token not in self.token_to_id:
                token = "<|unk|>"
            ids.append(self.token_to_id[token])
        return ids

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by spaces, without the
        space in front of punctuation.
        """
        tokens = [self.id_to_token[i] for i in ids]
        joined = " ".join(tokens)
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [34]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

# Join the two texts with the <|endoftext|> marker between them.
text = text1 + " <|endoftext|> " + text2
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [35]:
# Switch to the tokenizer that maps out-of-vocabulary tokens to <|unk|>.
tokenizer = SimpleTokenizerV2(vocab)
token_ids = tokenizer.encode(text)
print(token_ids)

[1131, 5, 355, 1126, 628, 975, 10, 1130, 55, 988, 956, 984, 722, 988, 1131, 7]


In [36]:
# Round trip: encode the text, decode the resulting ids, and show the result.
round_trip = tokenizer.decode(tokenizer.encode(text))
print(round_trip)

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.
