In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
limited_lines = []
with open("../../Data/sherlock-holm.es_stories_plain-text_advs.txt", 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 1000:
            break
        limited_lines.append(line)

text = ''.join(limited_lines)


In [3]:
text

'\n\n\n\n                        THE ADVENTURES OF SHERLOCK HOLMES\n\n                               Arthur Conan Doyle\n\n\n\n                                Table of contents\n\n               A Scandal in Bohemia\n               The Red-Headed League\n               A Case of Identity\n               The Boscombe Valley Mystery\n               The Five Orange Pips\n               The Man with the Twisted Lip\n               The Adventure of the Blue Carbuncle\n               The Adventure of the Speckled Band\n               The Adventure of the Engineer\'s Thumb\n               The Adventure of the Noble Bachelor\n               The Adventure of the Beryl Coronet\n               The Adventure of the Copper Beeches\n\n\n\n\n\n\n\n\n\n\n                              A SCANDAL IN BOHEMIA\n\n\n\n\n\n                                Table of contents\n                                     Chapter 1\n                                     Chapter 2\n                                     Chapt

In [4]:
class Tokenization:
    class Dictionary:
        def __init__(self):
            self.vocab = {}
            
        def makeDictionary(self,text):
            raw_text = text.replace("\n"," ")
            raw_text = raw_text.lower()
            list_of_word = re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)
            list_of_word2 = [word.strip() for word in list_of_word if word.strip()]
            for word in list_of_word2:
                if word not in self.vocab:
                    self.vocab[word] = len(self.vocab)
                    
            self.vocab['|endoftext|'] = len(self.vocab)
                    
            return self.vocab
    
    def __init__(self,train_text):
        dic = Tokenization.Dictionary()
        self.str_to_int = dic.makeDictionary(train_text) 
        self.int_to_str = {y:x for x,y in self.str_to_int.items()}
        
    def encode(self,text):
        text = text.lower()
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        preprocessed = [word.strip() for word in preprocessed if word.strip()]
        
        ids = []
        for word in preprocessed:
            if word in self.str_to_int:
                ids.append(self.str_to_int[word])
            else:
                ids.append(self.str_to_int['|endoftext|'])
        
        return ids
    
    def decode(self, ids):
        text = (" ".join([self.int_to_str[id] for id in ids]))
        text = re.sub(r'\s+([,.:;?!"()\'])',r'\1',text)
        return text

In [5]:
tokenize = Tokenization(text)

In [6]:
vocab_size = len(tokenize.str_to_int)
vocab_size

1852

In [7]:
from torch.utils.data import Dataset, DataLoader
import torch

In [8]:
class CustomDataset(Dataset):
    def __init__(self,text,tokenizer,max_length,stride):
        self.input_ids = []
        self.target_ids = []
        
        token_ids = tokenizer.encode(text)
        
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i+max_length]
            self.input_ids.append(torch.tensor(input_chunk))     
            
            target_chunk = token_ids[i+1 : i+1+max_length]
            self.target_ids.append(torch.tensor(target_chunk))
            
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]
            

In [9]:
def create_dataloader(text, batch_size=4, max_length=256,stride=128,shuffle=True, drop_last=True,num_workers=0):
    tokenizer = Tokenization(text)
    dataset = CustomDataset(text,tokenizer, max_length,stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader

In [10]:
limited_lines = []
with open("../../Data/sherlock-holm.es_stories_plain-text_advs.txt", 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        if i >= 1000:
            break
        limited_lines.append(line)

raw_text = ''.join(limited_lines)

In [11]:
raw_text

'\n\n\n\n                        THE ADVENTURES OF SHERLOCK HOLMES\n\n                               Arthur Conan Doyle\n\n\n\n                                Table of contents\n\n               A Scandal in Bohemia\n               The Red-Headed League\n               A Case of Identity\n               The Boscombe Valley Mystery\n               The Five Orange Pips\n               The Man with the Twisted Lip\n               The Adventure of the Blue Carbuncle\n               The Adventure of the Speckled Band\n               The Adventure of the Engineer\'s Thumb\n               The Adventure of the Noble Bachelor\n               The Adventure of the Beryl Coronet\n               The Adventure of the Copper Beeches\n\n\n\n\n\n\n\n\n\n\n                              A SCANDAL IN BOHEMIA\n\n\n\n\n\n                                Table of contents\n                                     Chapter 1\n                                     Chapter 2\n                                     Chapt

In [12]:
dataloader = create_dataloader(raw_text,batch_size=8,max_length=4,stride=4, shuffle=False)

In [13]:
data_iter = iter(dataloader)
for i, batch in enumerate(data_iter):
    print(f"Batch {i}:", batch)

Batch 0: [tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  2,  9, 10],
        [11, 12, 13,  0],
        [14, 15, 10, 16],
        [ 2, 17,  0, 18],
        [19, 20,  0, 21],
        [22, 23,  0, 24]]), tensor([[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 2,  9, 10, 11],
        [12, 13,  0, 14],
        [15, 10, 16,  2],
        [17,  0, 18, 19],
        [20,  0, 21, 22],
        [23,  0, 24, 25]])]
Batch 1: [tensor([[25,  0, 26, 27],
        [ 0, 28,  2,  0],
        [29, 30,  0, 28],
        [ 2,  0, 31, 32],
        [ 0, 28,  2,  0],
        [33, 34, 35, 36],
        [ 0, 28,  2,  0],
        [37, 38,  0, 28]]), tensor([[ 0, 26, 27,  0],
        [28,  2,  0, 29],
        [30,  0, 28,  2],
        [ 0, 31, 32,  0],
        [28,  2,  0, 33],
        [34, 35, 36,  0],
        [28,  2,  0, 37],
        [38,  0, 28,  2]])]
Batch 2: [tensor([[ 2,  0, 39, 40],
        [ 0, 28,  2,  0],
        [41, 42, 10, 11],
        [12, 13,  8,  2],
        [ 9, 43, 44, 43],


In [14]:
inputs = None
targets = None
for inputs_fetch, targets_fetch in dataloader:
    inputs, targets = inputs_fetch, targets_fetch
    break

# Token Embedding

In [15]:
output_dim = 256
print(vocab_size, output_dim)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

1852 256


In [16]:
token_embedding = token_embedding_layer(inputs)
token_embedding

tensor([[[-5.4160e-01,  1.5227e-01,  1.8953e+00,  ..., -7.3827e-01,
           1.5396e+00,  4.6112e-01],
         [ 1.7441e+00, -5.2954e-01, -9.9403e-01,  ...,  1.1027e+00,
          -2.8868e-01,  2.0133e+00],
         [ 1.2371e+00, -6.8477e-01,  1.4138e+00,  ...,  8.1044e-01,
          -2.0520e-01,  4.1709e-01],
         [-6.8596e-01,  1.0924e+00, -1.3720e-01,  ...,  1.8942e-02,
           3.3738e-02,  5.7413e-01]],

        [[ 1.1217e+00,  3.3612e-01,  2.4014e+00,  ...,  6.5150e-01,
          -8.7590e-02, -2.4103e-01],
         [ 1.2510e+00, -7.1867e-02,  4.4077e-01,  ..., -4.8727e-01,
           4.3821e-01,  2.4663e+00],
         [-7.9440e-02,  1.0894e+00, -3.8160e-02,  ..., -1.1925e+00,
           6.7314e-01, -1.3849e-01],
         [-5.2463e-01,  1.3315e+00, -1.4502e+00,  ...,  1.0312e+00,
           5.2043e-01, -1.1604e+00]],

        [[-9.5223e-02, -1.5763e+00, -3.1799e+00,  ...,  1.8843e+00,
          -1.0625e+00, -8.2811e-01],
         [ 1.2371e+00, -6.8477e-01,  1.4138e+00,  .

In [17]:
token_embedding.shape

torch.Size([8, 4, 256])

# Positional Embedding

In [18]:
max_length = 4
context_legth = max_length
positional_embedding_layer = torch.nn.Embedding(context_legth, output_dim)
positional_embedding = positional_embedding_layer(torch.arange(context_legth))
print(positional_embedding)

tensor([[-3.3228e-01, -4.6054e-01, -5.2970e-02,  ..., -5.9691e-01,
          6.5223e-01,  1.1228e+00],
        [-5.4049e-01,  3.3337e-03,  3.8325e-02,  ..., -6.7094e-01,
         -3.2736e-01,  8.2044e-01],
        [ 6.3245e-01, -4.8011e-01,  5.8900e-01,  ...,  4.9161e-01,
          4.3299e-01, -2.2139e+00],
        [ 3.4356e-01, -4.0916e+00,  1.0125e-01,  ..., -5.1761e-01,
         -1.0536e+00,  9.8905e-01]], grad_fn=<EmbeddingBackward0>)


In [19]:
positional_embedding.shape

torch.Size([4, 256])

In [20]:
input_embedding = token_embedding + positional_embedding
input_embedding

tensor([[[-0.8739, -0.3083,  1.8423,  ..., -1.3352,  2.1918,  1.5839],
         [ 1.2036, -0.5262, -0.9557,  ...,  0.4318, -0.6160,  2.8337],
         [ 1.8695, -1.1649,  2.0028,  ...,  1.3020,  0.2278, -1.7968],
         [-0.3424, -2.9992, -0.0359,  ..., -0.4987, -1.0199,  1.5632]],

        [[ 0.7894, -0.1244,  2.3485,  ...,  0.0546,  0.5646,  0.8818],
         [ 0.7105, -0.0685,  0.4791,  ..., -1.1582,  0.1108,  3.2867],
         [ 0.5530,  0.6093,  0.5508,  ..., -0.7009,  1.1061, -2.3524],
         [-0.1811, -2.7601, -1.3489,  ...,  0.5136, -0.5332, -0.1714]],

        [[-0.4275, -2.0369, -3.2329,  ...,  1.2874, -0.4103,  0.2947],
         [ 0.6966, -0.6814,  1.4522,  ...,  0.1395, -0.5326,  1.2375],
         [ 0.1846,  0.1742, -0.5028,  ..., -0.6142, -0.8018, -2.5450],
         [ 2.3353, -5.9176,  0.1596,  ...,  1.4642, -0.4724,  1.1727]],

        ...,

        [[ 0.9048, -1.1453,  1.3609,  ...,  0.2135,  0.4470,  1.5399],
         [ 0.4468, -0.5349, -0.2399,  ..., -2.6478, -1.33

In [21]:
input_embedding.shape

torch.Size([8, 4, 256])