#### Reading in a short story as text sample into Python.

##### Creating Tokens

In [2]:
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read();
    
print("Total numbers of characters: ", len(raw_text));
print("\nFirst 100 characters are: \n")
print(raw_text[:99]);

Total numbers of characters:  20479

First 100 characters are: 

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


##### text splitting, generating tokens

In [6]:
import re

text = "Hello, it's me!, I have been wondering, if after all these years you still remeber me."
result = re.split(r'(\s)', text)
print(result)

['Hello,', ' ', "it's", ' ', 'me!,', ' ', 'I', ' ', 'have', ' ', 'been', ' ', 'wondering,', ' ', 'if', ' ', 'after', ' ', 'all', ' ', 'these', ' ', 'years', ' ', 'you', ' ', 'still', ' ', 'remeber', ' ', 'me.']


**comma and period**

In [7]:
result = re.split(r'[,.]|\s', text)
print(result)

['Hello', '', "it's", 'me!', '', 'I', 'have', 'been', 'wondering', '', 'if', 'after', 'all', 'these', 'years', 'you', 'still', 'remeber', 'me', '']


**remove white spaces from the tokens generated above**

In [8]:
items = [item for item in result if item.strip()];
print(items)

['Hello', "it's", 'me!', 'I', 'have', 'been', 'wondering', 'if', 'after', 'all', 'these', 'years', 'you', 'still', 'remeber', 'me']


In [9]:
result = re.split(r'([,.:;?_!"()\']|--|\s)', text);
result = [item for item in result if item.strip()]
print(result);

['Hello', ',', 'it', "'", 's', 'me', '!', ',', 'I', 'have', 'been', 'wondering', ',', 'if', 'after', 'all', 'these', 'years', 'you', 'still', 'remeber', 'me', '.']


**apply this to raw text we extracted above**

In [10]:
preprocessed = re.split(r'([,.:;"?_()/\']|--|\s)', raw_text);
preprocessed = [item for item in preprocessed if item.strip()];
print(preprocessed[:30]);

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


**Token IDs**

In [12]:
all_words = sorted(set(preprocessed));
vocab_size = len(all_words);

print(vocab_size);

1148


In [13]:
vocab = {token: integer for integer, token in enumerate(all_words)};

In [14]:
for i, item in enumerate(vocab.items()):
    print(item)
    if i >= 50:
        break;

('"', 0)
("'", 1)
('(', 2)
(')', 3)
(',', 4)
('--', 5)
('.', 6)
(':', 7)
(';', 8)
('?', 9)
('A', 10)
('Ah', 11)
('Among', 12)
('And', 13)
('Are', 14)
('Arrt', 15)
('As', 16)
('At', 17)
('Be', 18)
('Begin', 19)
('Burlington', 20)
('But', 21)
('By', 22)
('Carlo', 23)
('Chicago', 24)
('Claude', 25)
('Come', 26)
('Croft', 27)
('Destroyed', 28)
('Devonshire', 29)
('Don', 30)
('Dubarry', 31)
('Emperors', 32)
('Florence', 33)
('For', 34)
('Gallery', 35)
('Gideon', 36)
('Gisburn', 37)
('Gisburn!', 38)
('Gisburns', 39)
('Grafton', 40)
('Greek', 41)
('Grindle', 42)
('Grindles', 43)
('HAD', 44)
('Had', 45)
('Hang', 46)
('Has', 47)
('He', 48)
('Her', 49)
('Hermia', 50)


#### Tokenizer Class

In [15]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
        
    def encode(self, text):
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids;
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # replace the space before the specified punctuations
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text;

**instantiate a new tokenizer object from SimpleTokenizerV1 class and tokenize the raw text**

In [17]:
tokenizer = SimpleTokenizerV1(vocab);

# out of this tokenizer object, we'll now pass a text to its encoder and get it ids
text = """"It's the last he painted, you know", 
        Mrs. Gisburn said with pardonable pride."""
        
ids = tokenizer.encode(text);
print(ids);

[0, 56, 1, 861, 1001, 609, 538, 756, 4, 1144, 603, 0, 4, 69, 6, 37, 862, 1123, 764, 804, 6]


In [None]:
# decodin the ids back to original text
tokenizer.decode(ids)

'" It\' s the last he painted, you know", Mrs. Gisburn said with pardonable pride.'

**what if we pass some text which is already in the vocab**

In [21]:
# sample_text = "Hello, how are you?"
# tokenizer.encode(sample_text)

# NOTE: this will throw an error

#### Adding Special Context Tokens, (Handling Unknow Words)

In [23]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

# now vocab is
vocab = {token: integer for integer, token in enumerate(all_tokens)}

In [24]:
len(vocab.items())

1150

In [25]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1145)
('your', 1146)
('yourself', 1147)
('<|endoftext|>', 1148)
('<|unk|>', 1149)


**construct Tokenizer version 2**

In [26]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = { i:s for s, i in vocab.items() }
        
    def encode(self, text):
        preprocessed = re.split(r'([,.:;"?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in preprocessed]
        
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        # Replace space before the specified punctuation
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [27]:
tokenizer = SimpleTokenizerV2(vocab)

text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."

text = " <|endoftext|> ".join((text1, text2))

print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [31]:
tokenizer.encode(text)

[1149, 4, 360, 1144, 635, 988, 9, 1148, 55, 1001, 969, 997, 731, 1001, 1149, 6]

In [32]:
tokenizer.decode(tokenizer.encode(text))

'<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.'

**Next we'll see BP-Encoding, Byte-Pair Encoding**