In [1]:
# Load the whole text file into memory as one string.
# The encoding is passed explicitly: without it, open() falls back to the
# platform's locale encoding, so the same file could decode differently (or
# fail to decode) depending on the machine the notebook runs on.
with open('the-verdict.txt', "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total number of characters: ", len(raw_text))

Total number of characters:  20479


In [2]:
import re
# Tokenize the raw text: split on punctuation, double dashes and whitespace.
# The capturing group makes re.split return the delimiters as items too, and
# filter(str.strip, ...) then drops the empty and whitespace-only items.
# Note: ':' and ';' are not delimiters in this pattern, so they stay attached
# to a word they directly follow.
preprocessed = list(
    filter(str.strip, re.split(r'([,.?!"\'()_]|--|\s)', raw_text))
)
print("Total number of tokens: ", len(preprocessed))

Total number of tokens:  4649


In [3]:
# Unique tokens in sorted order, followed by the two special tokens
# (end-of-text marker and unknown-word placeholder), which therefore
# receive the two highest ids.
all_words = sorted(set(preprocessed)) + ["<|endoftext|>", "<|unk|>"]
vocab_size = len(all_words)
print("Vocabulary size: ", vocab_size)
# Map every token to its position in the list, giving each a unique integer id.
vocab = dict(zip(all_words, range(vocab_size)))
print("Vocabulary: ", vocab)

Vocabulary size:  1161
Vocabulary:  {'!': 0, '"': 1, "'": 2, '(': 3, ')': 4, ',': 5, '--': 6, '.': 7, ':': 8, ';': 9, '?': 10, 'A': 11, 'Ah': 12, 'Among': 13, 'And': 14, 'Are': 15, 'Arrt': 16, 'As': 17, 'At': 18, 'Be': 19, 'Begin': 20, 'Burlington': 21, 'But': 22, 'By': 23, 'Carlo': 24, 'Carlo;': 25, 'Chicago': 26, 'Claude': 27, 'Come': 28, 'Croft': 29, 'Destroyed': 30, 'Devonshire': 31, 'Don': 32, 'Dubarry': 33, 'Emperors': 34, 'Florence': 35, 'For': 36, 'Gallery': 37, 'Gideon': 38, 'Gisburn': 39, 'Gisburns': 40, 'Grafton': 41, 'Greek': 42, 'Grindle': 43, 'Grindle:': 44, 'Grindles': 45, 'HAD': 46, 'Had': 47, 'Hang': 48, 'Has': 49, 'He': 50, 'Her': 51, 'Hermia': 52, 'His': 53, 'How': 54, 'I': 55, 'If': 56, 'In': 57, 'It': 58, 'Jack': 59, 'Jove': 60, 'Just': 61, 'Lord': 62, 'Made': 63, 'Miss': 64, 'Money': 65, 'Monte': 66, 'Moon-dancers': 67, 'Mr': 68, 'Mrs': 69, 'My': 70, 'Never': 71, 'No': 72, 'Now': 73, 'Nutley': 74, 'Of': 75, 'Oh': 76, 'On': 77, 'Once': 78, 'Only': 79, 'Or': 80, 'Pe

In [4]:
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Tokens that are not in the vocabulary are encoded as "<|unk|>", so the
    vocabulary must contain that entry for such input to be encodable.
    """

    def __init__(self, vocab):
        """Keep the token -> id mapping and build the inverse id -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return the list of their ids."""
        ids = []
        # The capturing group keeps the delimiters (punctuation, "--") as tokens.
        for piece in re.split(r'([,.?!"\'()_]|--|\s)', text):
            if not piece.strip():
                # Empty strings and pure whitespace are not tokens.
                continue
            key = piece if piece in self.str_to_int else "<|unk|>"
            ids.append(self.str_to_int[key])
        return ids

    def decode(self, ids):
        """Turn a sequence of ids back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        , . ? ! " and ' is removed so punctuation attaches to the previous word.
        """
        spaced = " ".join(self.int_to_str[token_id] for token_id in ids)
        return re.sub(r'\s+([,.?!"\'])', r'\1', spaced)

In [5]:
# Two independent sample sentences, joined by the end-of-text marker.
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join([text1, text2])
print(text)

# Round-trip the sample through the vocabulary-based tokenizer. Words that are
# missing from the vocabulary are mapped to "<|unk|>" and decode as such.
tokenizer = SimpleTokenizer(vocab)
token_ids = tokenizer.encode(text)
print(token_ids)
print(tokenizer.decode(token_ids))

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.
[1160, 5, 362, 1155, 642, 1000, 10, 1159, 57, 1013, 981, 1009, 738, 1013, 1160, 7]
<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [6]:
# Switch from the word-level tokenizer to byte-pair encoding (BPE): load
# tiktoken's "gpt2" encoding and encode the full raw text with it.
# Note: this rebinds `tokenizer`, which an earlier cell assigned to a
# SimpleTokenizer instance.
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
enc_text = tokenizer.encode(raw_text)

In [7]:
# building a dataset for batched inputs and targets
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer.encode``. A window of
    ``max_length`` ids is then moved over the ids in steps of ``stride``;
    each target is its input window shifted one position to the right.
    A text with no more than ``max_length`` ids produces an empty dataset.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        token_ids = tokenizer.encode(txt)

        # Stopping at len - max_length guarantees that the shifted target
        # window of the last sample still lies fully inside token_ids.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``idx``-th (input, target) pair of tensors."""
        return self.input_ids[idx], self.target_ids[idx]

In [8]:
def create_dataloader(txt, batch_size=4, max_length=256, 
                      stride=128, shuffle=True, drop_last=True, 
                      num_workers=0):
    """Build a DataLoader of (input, target) token-id windows over ``txt``.

    The text is tokenized with tiktoken's "gpt2" encoding and cut into windows
    by ``GPTDataset`` using ``max_length`` and ``stride``. ``batch_size``,
    ``shuffle``, ``drop_last`` and ``num_workers`` are passed through to
    ``DataLoader`` unchanged.
    """
    bpe_tokenizer = tiktoken.get_encoding("gpt2")
    return DataLoader(
        GPTDataset(txt, bpe_tokenizer, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [9]:
# Window length, defined once and shared by the dataloader and the positional
# embedding. The two must agree: if the batches had a different sequence
# length than the positional table, the addition at the end of this cell
# could not broadcast.
context_length = 4

# stride == window length gives non-overlapping windows.
dataloader = create_dataloader(raw_text, batch_size=8, max_length=context_length,
                               stride=context_length, shuffle=False)
data_iter = iter(dataloader)

# token embedding
output_dim = 256
# Number of tokens in the GPT-2 BPE vocabulary that create_dataloader tokenizes
# with. Note: this overrides the word-level vocab_size computed in an earlier cell.
vocab_size = 50257
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
inputs, targets = next(data_iter)
token_embeddings = token_embedding_layer(inputs)  # (batch_size, context_length, output_dim)

# positional embedding: one learned vector per position in the window
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))  # (context_length, output_dim)

# input embedding: the positional vectors are broadcast over the batch dimension
input_embedding = token_embeddings + pos_embeddings
print(input_embedding.shape)

torch.Size([8, 4, 256])
