In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
limited_lines = []
with open("../../../Data/sherlock-holm.es_stories_plain-text_advs.txt", 'r', encoding='utf-8') as f:
    for i, line in enumerate(f):
        limited_lines.append(line)

raw_text = ''.join(limited_lines)


In [3]:
raw_text



In [4]:
class Tokenization:
    class Dictionary:
        def __init__(self):
            self.vocab = {}
            
        
            
        def makeDictionary(self,text):
            raw_text = text.replace("\n"," ")
            raw_text = raw_text.lower()
            list_of_word = re.split(r'([,.:;?_!"()\']|--|\s)',raw_text)
            list_of_word2 = [word.strip() for word in list_of_word if word.strip()]
            for word in list_of_word2:
                if word not in self.vocab:
                    self.vocab[word] = len(self.vocab)
                    
            self.vocab['|endoftext|'] = len(self.vocab)
                    
            return self.vocab
    
    def __init__(self,train_text):
        dic = Tokenization.Dictionary()
        self.str_to_int = dic.makeDictionary(train_text) 
        self.int_to_str = {y:x for x,y in self.str_to_int.items()}
        
    def encode(self,text):
        text = text.lower()
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)',text)
        preprocessed = [word.strip() for word in preprocessed if word.strip()]
        
        ids = []
        for word in preprocessed:
            if word in self.str_to_int:
                ids.append(self.str_to_int[word])
            else:
                ids.append(self.str_to_int['|endoftext|'])
        
        return ids
    
    def decode(self, ids):
        text = (" ".join([self.int_to_str[id] for id in ids]))
        text = re.sub(r'\s+([,.:;?!"()\'])',r'\1',text)
        return text

In [5]:
tokenize = Tokenization(raw_text)

In [6]:
vocab_size = len(tokenize.str_to_int)
vocab_size

8224

In [7]:
from torch.utils.data import Dataset, DataLoader
import torch

In [8]:
class CustomDataset(Dataset):
    def __init__(self,text,tokenizer,max_length,stride):
        self.input_ids = []
        self.target_ids = []
        
        token_ids = tokenizer.encode(text)
        
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i : i+max_length]
            self.input_ids.append(torch.tensor(input_chunk))     
            
            target_chunk = token_ids[i+1 : i+1+max_length]
            self.target_ids.append(torch.tensor(target_chunk))
            
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]
            

In [9]:
def create_dataloader(text, batch_size=4, max_length=256,stride=128,shuffle=True, drop_last=True,num_workers=0):
    tokenizer = Tokenization(text)
    dataset = CustomDataset(text,tokenizer, max_length,stride)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader

In [10]:
dataloader = create_dataloader(raw_text,batch_size=4,max_length=4,stride=4, shuffle=False)

In [11]:
data_iter = iter(dataloader)
for i, batch in enumerate(data_iter):
    print(f"Batch {i}:", batch)

Batch 0: [tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  2,  9, 10],
        [11, 12, 13,  0]]), tensor([[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 2,  9, 10, 11],
        [12, 13,  0, 14]])]
Batch 1: [tensor([[14, 15, 10, 16],
        [ 2, 17,  0, 18],
        [19, 20,  0, 21],
        [22, 23,  0, 24]]), tensor([[15, 10, 16,  2],
        [17,  0, 18, 19],
        [20,  0, 21, 22],
        [23,  0, 24, 25]])]
Batch 2: [tensor([[25,  0, 26, 27],
        [ 0, 28,  2,  0],
        [29, 30,  0, 28],
        [ 2,  0, 31, 32]]), tensor([[ 0, 26, 27,  0],
        [28,  2,  0, 29],
        [30,  0, 28,  2],
        [ 0, 31, 32,  0]])]
Batch 3: [tensor([[ 0, 28,  2,  0],
        [33, 34, 35, 36],
        [ 0, 28,  2,  0],
        [37, 38,  0, 28]]), tensor([[28,  2,  0, 33],
        [34, 35, 36,  0],
        [28,  2,  0, 37],
        [38,  0, 28,  2]])]
Batch 4: [tensor([[ 2,  0, 39, 40],
        [ 0, 28,  2,  0],
        [41, 42, 10, 11],
        [12, 13,  8,  2]

In [12]:
inputs = None
targets = None
for inputs_fetch, targets_fetch in dataloader:
    inputs, targets = inputs_fetch, targets_fetch
    break

# Token Embedding

In [13]:
output_dim = 256
print(vocab_size, output_dim)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

8224 256


In [14]:
token_embedding = token_embedding_layer(inputs)
token_embedding

tensor([[[-1.1249, -1.4407,  0.5253,  ...,  2.0414, -0.0495,  0.1640],
         [-0.9391,  0.5127, -0.3310,  ...,  0.6901, -0.0468, -0.0546],
         [-0.2000, -1.9973, -1.0564,  ..., -0.5074,  0.3478,  1.1216],
         [ 0.6997, -0.8606,  0.5228,  ...,  0.2489,  0.6698, -1.0389]],

        [[ 0.1321, -2.6245,  1.2223,  ..., -0.6684, -1.6200, -0.4125],
         [-0.3218, -0.1165, -0.3914,  ..., -0.3693, -0.1187, -1.0337],
         [ 1.0497,  1.2804,  1.4619,  ...,  1.8792,  0.1416, -0.5659],
         [ 2.0107, -1.7189, -0.3919,  ..., -0.0568, -1.1873,  0.0980]],

        [[-1.1530, -0.1432,  0.0414,  ..., -0.3528,  0.1549,  0.1201],
         [-0.2000, -1.9973, -1.0564,  ..., -0.5074,  0.3478,  1.1216],
         [-0.3808, -1.3081, -1.2883,  ...,  0.3175, -1.0637,  1.7206],
         [ 1.9916,  1.0475, -0.0915,  ...,  0.6726,  0.7325,  0.9202]],

        [[-0.7682,  0.3391, -1.3554,  ...,  2.4266,  0.0285,  1.1851],
         [-2.0578, -1.5660,  0.0647,  ...,  1.2724, -1.6902, -0.1505],


In [15]:
token_embedding.shape

torch.Size([4, 4, 256])

# Positional Embedding

In [16]:
max_length = 4
context_legth = max_length
positional_embedding_layer = torch.nn.Embedding(context_legth, output_dim)
positional_embedding = positional_embedding_layer(torch.arange(context_legth))
print(positional_embedding)

tensor([[-0.8367, -0.2494, -1.1368,  ..., -1.0188,  0.6450, -0.8334],
        [-0.9286,  1.5777,  0.9727,  ...,  1.9038,  1.3130, -0.4959],
        [ 0.9181,  0.1493, -0.9228,  ...,  0.2636, -0.1458, -0.3775],
        [-1.6666, -0.1617,  0.8095,  ...,  0.2481, -0.4450,  0.6339]],
       grad_fn=<EmbeddingBackward0>)


In [17]:
positional_embedding.shape

torch.Size([4, 256])

In [18]:
input_embedding = token_embedding + positional_embedding
input_embedding

tensor([[[-1.9616, -1.6901, -0.6115,  ...,  1.0226,  0.5955, -0.6694],
         [-1.8677,  2.0904,  0.6417,  ...,  2.5939,  1.2662, -0.5504],
         [ 0.7181, -1.8480, -1.9792,  ..., -0.2438,  0.2020,  0.7441],
         [-0.9669, -1.0223,  1.3323,  ...,  0.4970,  0.2248, -0.4049]],

        [[-0.7046, -2.8739,  0.0855,  ..., -1.6872, -0.9749, -1.2459],
         [-1.2504,  1.4612,  0.5813,  ...,  1.5345,  1.1943, -1.5296],
         [ 1.9678,  1.4298,  0.5392,  ...,  2.1428, -0.0042, -0.9434],
         [ 0.3441, -1.8806,  0.4176,  ...,  0.1913, -1.6323,  0.7319]],

        [[-1.9897, -0.3926, -1.0954,  ..., -1.3716,  0.7999, -0.7133],
         [-1.1286, -0.4196, -0.0837,  ...,  1.3964,  1.6609,  0.6257],
         [ 0.5373, -1.1588, -2.2110,  ...,  0.5811, -1.2095,  1.3431],
         [ 0.3250,  0.8857,  0.7181,  ...,  0.9207,  0.2875,  1.5541]],

        [[-1.6049,  0.0897, -2.4922,  ...,  1.4078,  0.6736,  0.3517],
         [-2.9864,  0.0116,  1.0373,  ...,  3.1762, -0.3772, -0.6464],


In [19]:
input_embedding.shape

torch.Size([4, 4, 256])

In [20]:
# Model Build

In [21]:
import torch.nn as nn

input_dim = 4
embedding_dim = 128
lstm_dim = 256


class NextWordPredictLSTM(nn.Module):
    def __init__(self):
        super(NextWordPredictLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_dim, batch_first=True)
        self.fc = nn.Linear(lstm_dim, vocab_size)

    def forward(self, x):
        embedded_x = self.embedding(x)
        intermediate_hidden_state, (final_hidden_state, final_cell_state) = self.lstm(embedded_x)
        output = self.fc(final_hidden_state.squeeze(0))
        return output

In [22]:
# Train Model

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [37]:
epochs = 5
learning_rate = 0.1
model = NextWordPredictLSTM()
model.to(device)

criterion = nn.CrossEntropyLoss()

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
for epoch in range(epochs):
    epoch_loss = 0
    for batch_id, (batch_x, batch_y) in enumerate(dataloader):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()

        y_pred = model.forward(batch_x)

        loss = criterion(y_pred, batch_y[:, -1])

        loss.backward()

        optimizer.step()

        epoch_loss += loss.item()

    print(f"Epoch {epoch}: Loss: {epoch_loss / len(dataloader)}")

Epoch 0: Loss: 5.742423404795184
Epoch 1: Loss: 4.996300324227968
Epoch 2: Loss: 4.566004809539323
Epoch 3: Loss: 4.1266695134926366
Epoch 4: Loss: 3.65284640428196


In [38]:
model.eval()

def prediction(model,vocab, text):
    tokenize_text = tokenize.encode(text)
    
    if(len(tokenize_text) < 4):
        tokenize_text = [tokenize.str_to_int['|endoftext|']] * (4 - len(tokenize_text)) + tokenize_text
    else:
        tokenize_text = tokenize_text[-4:]
    
    tokenize_text = torch.tensor(tokenize_text).unsqueeze(0).to(device)

    output = model(tokenize_text)
    
    value, index = torch.max(output, dim=1)
    print(value, index)
    
    return text + " " + list(tokenize.str_to_int)[index]

In [39]:
prediction(model,tokenize.str_to_int, "One night--it was on the twentieth of March")

tensor([8.9875], device='cuda:0', grad_fn=<MaxBackward0>) tensor([145], device='cuda:0')


'One night--it was on the twentieth of March which'

In [44]:

prediction(model,tokenize.str_to_int, "A Scandal in ")

tensor([8.1842], device='cuda:0', grad_fn=<MaxBackward0>) tensor([59], device='cuda:0')


'A Scandal in  her'

In [41]:
def makeNWord(n,text):
    for i in range(n):
        text = prediction(model,tokenize.str_to_int,text)
        print(text,end=" ")

In [43]:
makeNWord(5,"221 b baker")

tensor([10.9925], device='cuda:0', grad_fn=<MaxBackward0>) tensor([206], device='cuda:0')
221 b baker street tensor([9.7544], device='cuda:0', grad_fn=<MaxBackward0>) tensor([53], device='cuda:0')
221 b baker street . tensor([8.1171], device='cuda:0', grad_fn=<MaxBackward0>) tensor([92], device='cuda:0')
221 b baker street . but tensor([9.3004], device='cuda:0', grad_fn=<MaxBackward0>) tensor([393], device='cuda:0')
221 b baker street . but you tensor([11.7348], device='cuda:0', grad_fn=<MaxBackward0>) tensor([54], device='cuda:0')
221 b baker street . but you have 

In [48]:
makeNWord(5,"red headed league")

tensor([7.9646], device='cuda:0', grad_fn=<MaxBackward0>) tensor([85], device='cuda:0')
red headed league , tensor([11.4203], device='cuda:0', grad_fn=<MaxBackward0>) tensor([67], device='cuda:0')
red headed league , and tensor([8.9204], device='cuda:0', grad_fn=<MaxBackward0>) tensor([264], device='cuda:0')
red headed league , and so tensor([7.8540], device='cuda:0', grad_fn=<MaxBackward0>) tensor([74], device='cuda:0')
red headed league , and so that tensor([9.1202], device='cuda:0', grad_fn=<MaxBackward0>) tensor([71], device='cuda:0')
red headed league , and so that it 