In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Load only the first 1000 lines of the corpus so the experiment stays small.
corpus_path = "../../Data/sherlock-holm.es_stories_plain-text_advs.txt"
with open(corpus_path, mode='r', encoding='utf-8') as corpus:
    # zip() stops as soon as range() is exhausted, so at most 1000 lines are
    # pulled from the file iterator.
    limited_lines = [text_line for _, text_line in zip(range(1000), corpus)]

# Lines keep their trailing newlines, so a plain join restores the text.
raw_text = ''.join(limited_lines)


In [3]:
# Inspect the loaded text. NOTE(review): this echoes the entire string; a slice would keep the output short.
raw_text

'\n\n\n\n                        THE ADVENTURES OF SHERLOCK HOLMES\n\n                               Arthur Conan Doyle\n\n\n\n                                Table of contents\n\n               A Scandal in Bohemia\n               The Red-Headed League\n               A Case of Identity\n               The Boscombe Valley Mystery\n               The Five Orange Pips\n               The Man with the Twisted Lip\n               The Adventure of the Blue Carbuncle\n               The Adventure of the Speckled Band\n               The Adventure of the Engineer\'s Thumb\n               The Adventure of the Noble Bachelor\n               The Adventure of the Beryl Coronet\n               The Adventure of the Copper Beeches\n\n\n\n\n\n\n\n\n\n\n                              A SCANDAL IN BOHEMIA\n\n\n\n\n\n                                Table of contents\n                                     Chapter 1\n                                     Chapter 2\n                                     Chapt

In [4]:
import tiktoken
# Load tiktoken's "gpt2" encoding.
# NOTE(review): `tokenize` is not referenced by the cells visible here
# (create_dataloader builds its own encoder) — confirm it is still needed.
tokenize = tiktoken.get_encoding("gpt2")

In [5]:
# Number of rows used for the token-embedding table further down.
# NOTE(review): presumably the vocabulary size of the "gpt2" encoding — confirm,
# and consider reading it from the tokenizer instead of hard-coding it.
vocab_size = 50257
vocab_size

50257

In [6]:
from torch.utils.data import Dataset, DataLoader
import torch

In [7]:
class CustomDataset(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer.encode``. A window of
    ``max_length`` token ids is then taken every ``stride`` tokens; the
    matching target is the same window shifted one token to the right.

    Args:
        text: Raw string to encode.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence of token ids.
        max_length: Number of token ids in every input and target window.
        stride: Distance, in tokens, between the starts of consecutive windows.
    """

    def __init__(self,text,tokenizer,max_length,stride):
        ids = tokenizer.encode(text)

        # A start is only admissible if max_length + 1 ids are available from
        # it, so the shifted target window is never shorter than the input.
        starts = range(0, len(ids) - max_length, stride)

        self.input_ids = [torch.tensor(ids[s : s + max_length]) for s in starts]
        self.target_ids = [torch.tensor(ids[s + 1 : s + 1 + max_length]) for s in starts]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, index):
        """Return the ``(input_ids, target_ids)`` tensor pair stored at ``index``."""
        sample = self.input_ids[index]
        shifted = self.target_ids[index]
        return sample, shifted
            

In [8]:
def create_dataloader(text, batch_size=4, max_length=256,stride=128,shuffle=True, drop_last=True,num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from raw text.

    The text is encoded with tiktoken's 'gpt2' encoding, wrapped in a
    CustomDataset, and served through a torch DataLoader.

    Args:
        text: Raw string to tokenize.
        batch_size: Number of samples per batch.
        max_length: Window length handed to CustomDataset.
        stride: Step between window starts, handed to CustomDataset.
        shuffle: Passed to DataLoader; whether samples are reshuffled each epoch.
        drop_last: Passed to DataLoader; whether a trailing batch smaller than
            batch_size is dropped.
        num_workers: Passed to DataLoader; number of loader subprocesses
            (0 loads in the main process).

    Returns:
        The configured DataLoader.
    """
    tokenizer = tiktoken.get_encoding('gpt2')
    dataset = CustomDataset(text,tokenizer, max_length,stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        # Previously accepted but never forwarded, so the argument had no effect.
        num_workers=num_workers,
    )
    return dataloader

In [9]:
# Tiny non-overlapping windows (stride == max_length), served in corpus order
# so the printed batches are easy to follow.
dataloader = create_dataloader(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

In [10]:
# Walk the loader once and print every batch with its index. Each batch is the
# [inputs, targets] pair produced from CustomDataset samples.
# NOTE(review): this prints every batch in the loader; output can get long.
data_iter = iter(dataloader)
for i, batch in enumerate(data_iter):
    print(f"Batch {i}:", batch)

Batch 0: [tensor([[  628,   628,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,  3336, 43685,  3525],
        [29514,  3963,  6006,  1137]]), tensor([[  628,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [ 3336, 43685,  3525, 29514],
        [ 3963,  6006,  1137, 36840]])]
Batch 1: [tensor([[36840, 49707,    44,  1546],
        [  628,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220],
        [  220,   220,   220,   220]]), tensor([[49707,    44,  1546,   628],
        [  220,   220,  

In [11]:
# Grab only the first batch; both names fall back to None when the loader
# yields nothing.
inputs, targets = next(iter(dataloader), (None, None))

# Token Embedding

In [12]:
# Seed the global RNG so the randomly initialised embedding weights (and the
# values displayed below) are the same on every Restart & Run All.
torch.manual_seed(123)

output_dim = 256  # length of each embedding vector
print(vocab_size, output_dim)
# Lookup table with one learnable output_dim-sized row per token id.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

50257 256


In [13]:
# Look up the embedding row for every token id in `inputs`; the result gains a
# trailing output_dim axis (its shape is shown in the next cell).
token_embedding = token_embedding_layer(inputs)
token_embedding

tensor([[[ 0.5168, -0.3948,  0.9135,  ...,  0.4169,  0.1081, -0.0491],
         [ 0.5168, -0.3948,  0.9135,  ...,  0.4169,  0.1081, -0.0491],
         [ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.7980, -0.6868],
         [ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.7980, -0.6868]],

        [[ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.7980, -0.6868],
         [ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.7980, -0.6868],
         [ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.7980, -0.6868],
         [ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.7980, -0.6868]],

        [[ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.7980, -0.6868],
         [ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.7980, -0.6868],
         [ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.7980, -0.6868],
         [ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.7980, -0.6868]],

        ...,

        [[ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.7980, -0.6868],
         [ 0.1061, -0.5216, -0.6567,  ...,  1.2846,  0.79

In [14]:
# Sanity check: 8 samples per batch, 4 tokens per sample, 256 values per token.
token_embedding.shape

torch.Size([8, 4, 256])

# Positional Embedding

In [15]:
# Every position inside a window gets its own learnable vector, so the table
# needs exactly one row per position.
max_length = 4  # must equal the max_length used when building `dataloader`
context_length = max_length
context_legth = context_length  # misspelled alias kept so existing references still resolve
positional_embedding_layer = torch.nn.Embedding(context_length, output_dim)
# Look up positions 0 .. context_length - 1 in order.
positional_embedding = positional_embedding_layer(torch.arange(context_length))
print(positional_embedding)

tensor([[ 2.0976, -1.8155,  1.3291,  ...,  1.2820, -0.1454, -0.4901],
        [-0.2013,  2.4991,  0.7980,  ..., -0.2987, -1.0903,  1.3917],
        [-0.0936, -0.2354, -0.1271,  ..., -1.2937, -0.4619,  0.6261],
        [-0.0539,  0.1563,  0.5221,  ..., -0.4740,  0.7659, -1.3016]],
       grad_fn=<EmbeddingBackward0>)


In [16]:
# Sanity check: one 256-value vector for each of the 4 positions.
positional_embedding.shape

torch.Size([4, 256])

In [17]:
# token_embedding is (8, 4, 256) and positional_embedding is (4, 256), so the
# addition broadcasts the same position vectors across every sample in the batch.
input_embedding = token_embedding + positional_embedding
input_embedding

tensor([[[ 2.6144e+00, -2.2104e+00,  2.2426e+00,  ...,  1.6989e+00,
          -3.7258e-02, -5.3921e-01],
         [ 3.1546e-01,  2.1043e+00,  1.7115e+00,  ...,  1.1816e-01,
          -9.8216e-01,  1.3426e+00],
         [ 1.2495e-02, -7.5694e-01, -7.8381e-01,  ..., -9.0404e-03,
           3.3614e-01, -6.0691e-02],
         [ 5.2240e-02, -3.6530e-01, -1.3462e-01,  ...,  8.1067e-01,
           1.5639e+00, -1.9884e+00]],

        [[ 2.2037e+00, -2.3371e+00,  6.7240e-01,  ...,  2.5666e+00,
           6.5262e-01, -1.1770e+00],
         [-9.5237e-02,  1.9776e+00,  1.4131e-01,  ...,  9.8593e-01,
          -2.9229e-01,  7.0486e-01],
         [ 1.2495e-02, -7.5694e-01, -7.8381e-01,  ..., -9.0404e-03,
           3.3614e-01, -6.0691e-02],
         [ 5.2240e-02, -3.6530e-01, -1.3462e-01,  ...,  8.1067e-01,
           1.5639e+00, -1.9884e+00]],

        [[ 2.2037e+00, -2.3371e+00,  6.7240e-01,  ...,  2.5666e+00,
           6.5262e-01, -1.1770e+00],
         [-9.5237e-02,  1.9776e+00,  1.4131e-01,  .

In [18]:
# Sanity check: the sum keeps the token-embedding shape (8, 4, 256).
input_embedding.shape

torch.Size([8, 4, 256])