In [1]:
!pip install nltk



In [2]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

In [3]:
# Load the book text into a DataFrame and show its (rows, columns) shape.
# NOTE(review): read_fwf is a fixed-width-column parser; presumably it is used here only
# to get one text line per row — confirm that no lines are split or truncated.
# NOTE(review): the absolute '/content/...' path looks Colab-specific; adjust when running elsewhere.
df = pd.read_fwf('/content/SherlockHolmesEbook.txt')
df.shape

(9633, 1)

In [5]:
# Work with the first 300 rows only.
# .copy() detaches the subset from the full frame: later cells overwrite cells of `df`
# in place, and those writes should hit an independent frame rather than a slice of
# the original (which can trigger pandas' SettingWithCopyWarning).
df = df[:300].copy()

In [6]:
# Confirm the row count after keeping only the first 300 rows.
df.shape

(300, 1)

In [7]:
# Download the NLTK tokenizer data packages 'punkt' and 'punkt_tab'.
# Presumably these are the resources word_tokenize (used in later cells) needs — confirm
# against the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [8]:
#tokenization
def tokenization(text):
  """Lower-case `text` and return the result of NLTK's word_tokenize on it."""
  return word_tokenize(text.lower())

# Replace every raw text line in the column with its token list.
df['Unnamed: 0'] = df['Unnamed: 0'].map(tokenization)

In [9]:
#build vocab
# Token -> integer id. Id 0 is taken by the unknown-word placeholder.
# NOTE(review): 0 is also the value used to left-pad sequences later in this notebook,
# so padding positions and unknown words share the same id.
vocab = {'<UNK>': 0}

def build_vocab(tokens, vocab):
  """Add every not-yet-seen token from `tokens` to `vocab`, mutating it in place.

  A new token is assigned the size of `vocab` at the moment it is inserted, so ids
  grow by one per new token. Tokens already present keep their id. Returns None.
  """
  for token in tokens:
    # len(vocab) is evaluated before the insert, i.e. it is the next free id.
    vocab.setdefault(token, len(vocab))

# Feed each row's token list to build_vocab, in row order, so ids follow first appearance.
for row_tokens in df['Unnamed: 0']:
  build_vocab(row_tokens, vocab)

In [10]:
# Number of vocabulary entries (this count includes the '<UNK>' entry).
len(vocab)

1119

In [11]:
# Peek at the first rows; the column now holds token lists instead of raw text.
df.head()

Unnamed: 0.1,Unnamed: 0
0,"[project, gutenberg, 's, the, adventures, of, ..."
1,"[this, ebook, is, for, the, use, of, anyone, a..."
2,"[almost, no, restrictions, whatsoever, ., you,..."
3,"[re-use, it, under, the, terms, of, the, proje..."
4,"[with, this, ebook, or, online, at, www.gutenb..."


In [12]:
# text to indices
def text_to_numbers(tokens, vocab):
  """Translate `tokens` into a list of their ids in `vocab`.

  A token that is missing from `vocab` is encoded as vocab['<UNK>']; that entry is
  only looked up when such a token is actually encountered.
  """
  return [
    vocab[token] if token in vocab else vocab['<UNK>']
    for token in tokens
  ]

# Replace every token list in the column with the matching list of vocabulary ids.
df['Unnamed: 0'] = df['Unnamed: 0'].map(lambda row_tokens: text_to_numbers(row_tokens, vocab))



In [13]:
# Peek again; the column now holds lists of vocabulary ids.
df.head()

Unnamed: 0.1,Unnamed: 0
0,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]"
1,"[14, 15, 16, 17, 4, 18, 6, 19, 20, 21, 22, 23,..."
2,"[26, 22, 27, 28, 29, 30, 31, 32, 33, 9, 34, 33..."
3,"[37, 33, 38, 4, 39, 6, 4, 1, 2, 40, 41]"
4,"[25, 14, 15, 36, 42, 21, 43]"


In [14]:
# The in-place conversions above replace cell values only; check the shape is unchanged.
df.shape

(300, 1)

In [15]:
# Collects every prefix sequence produced from all rows.
training_sequence = []

def training_data(indices,training_sequence):
  """Append each prefix of `indices` that has two or more items to `training_sequence`.

  For [a, b, c] this appends [a, b] and then [a, b, c]. Inputs with fewer than two
  items add nothing. The target list is mutated in place; returns None.
  """
  prefix_ends = range(2, len(indices) + 1)
  training_sequence.extend(indices[:end] for end in prefix_ends)

# Expand every row's id list into its prefix sequences, in row order.
for row_indices in df['Unnamed: 0']:
  training_data(row_indices, training_sequence)

In [16]:
# Total number of prefix sequences generated across all rows.
len(training_sequence)

3439

In [17]:
# Inspect the first 15 prefix sequences.
training_sequence[:15]

[[1, 2],
 [1, 2, 3],
 [1, 2, 3, 4],
 [1, 2, 3, 4, 5],
 [1, 2, 3, 4, 5, 6],
 [1, 2, 3, 4, 5, 6, 7],
 [1, 2, 3, 4, 5, 6, 7, 8],
 [1, 2, 3, 4, 5, 6, 7, 8, 9],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
 [14, 15],
 [14, 15, 16],
 [14, 15, 16, 17]]

In [18]:
# Padding, step 1: length of the longest training sequence
# (stays 0 when there are no sequences at all).
len_list = max((len(sequence) for sequence in training_sequence), default=0)

print(len_list)

29


In [19]:
# Padding, step 2: left-pad every sequence with zeros up to len_list items,
# so the real tokens sit at the end of each row.
padded_training_sequence = [
  [0] * (len_list - len(sequence)) + sequence
  for sequence in training_sequence
]

In [20]:
# Length of the first padded sequence; it should equal the longest-sequence length.
len(padded_training_sequence[0])

29

In [21]:
# Convert the nested list of padded ids into one int64 tensor.
# NOTE: this rebinds the same name from a Python list to a tensor.
padded_training_sequence = torch.tensor(padded_training_sequence, dtype=torch.long)

In [22]:
# Shape is (number of sequences, padded length).
padded_training_sequence.shape

torch.Size([3439, 29])

In [23]:
# Inputs: every position except the last one of each padded sequence.
X = padded_training_sequence[:,:-1]
# Targets: the last position of each padded sequence, i.e. the token to predict.
y = padded_training_sequence[:,-1]

In [24]:
#dataset class
class CustomDataset(Dataset):
  """Dataset that pairs the entry of `X` at a position with the entry of `y` at the same position."""

  def __init__(self, X, y):
    # Both containers are stored as given: no copy and no length check.
    self.X, self.y = X, y

  def __len__(self):
    # The sample count is the size of X's first dimension.
    sample_count = self.X.shape[0]
    return sample_count

  def __getitem__(self,index):
    # Return an (input, target) tuple for the requested position.
    features = self.X[index]
    target = self.y[index]
    return features, target

In [25]:
# Wrap the inputs and targets so they can be indexed as (input, target) pairs.
dataset = CustomDataset(X,y)

In [26]:
# Number of samples in the dataset (rows of X).
len(dataset)

3439

In [27]:
# Iterate over the dataset in shuffled mini-batches of up to 64 samples.
# NOTE(review): no random seed is set in this notebook, so batch order differs between runs.
dataloader = DataLoader(dataset, batch_size = 64, shuffle = True)

In [28]:
import torch.nn as nn
import torch.optim as optim


In [29]:
#Building Model(LSTM)
class Model(nn.Module):
  """Next-token classifier: embedding -> LSTM -> linear layer over the vocabulary.

  Args:
    vocab_size: number of distinct token ids; sets the number of embedding rows and
      the width of the output layer.
    embedding_dim: size of each token embedding (default 100, the original value).
    hidden_size: size of the LSTM hidden state (default 150, the original value).
  """
  def __init__(self, vocab_size, embedding_dim=100, hidden_size=150):
    super().__init__()
    self.embedding = nn.Embedding(vocab_size, embedding_dim)
    self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
    self.fc = nn.Linear(hidden_size, vocab_size)


  def forward(self,x):
    """Score every vocabulary entry as the token that follows the sequence `x`.

    Args:
      x: integer tensor of token ids, (batch, seq_len) because the LSTM is batch_first.

    Returns:
      Unnormalised scores of shape (batch, vocab_size).
    """
    embedded = self.embedding(x)
    # Only the final hidden state is used; per-step outputs and the cell state are ignored.
    _, (final_hidden_state, _) = self.lstm(embedded)
    # The final hidden state carries a leading num_layers dimension; [-1] selects the
    # last layer, which for this single-layer LSTM equals squeeze(0).
    output = self.fc(final_hidden_state[-1])
    return output

In [30]:
# Size the model by the vocabulary: one embedding row and one output score per entry.
model = Model(len(vocab))

In [31]:
# Training configuration.
epochs = 50             # number of full passes over the dataloader
learning_rate = 0.001   # step size handed to Adam

# Cross-entropy between the model's vocabulary scores and the target token id.
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters (optim is the torch.optim module imported above).
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [32]:
# training loop

# One pass over `dataloader` per epoch. The value printed per epoch is the SUM of the
# per-batch losses, not their mean.
for epoch in range(epochs):
  total_loss = 0

  for inputs, targets in dataloader:
    optimizer.zero_grad()                    # drop gradients left from the previous step
    logits = model(inputs)                   # vocabulary scores for each sample
    batch_loss = criterion(logits, targets)
    batch_loss.backward()
    optimizer.step()
    total_loss += batch_loss.item()

  print(f"Epoch: {epoch + 1}, Loss: {total_loss:.4f}")

Epoch: 1, Loss: 343.0201
Epoch: 2, Loss: 300.1971
Epoch: 3, Loss: 288.7651
Epoch: 4, Loss: 277.1355
Epoch: 5, Loss: 263.9846
Epoch: 6, Loss: 250.8265
Epoch: 7, Loss: 237.7982
Epoch: 8, Loss: 225.4852
Epoch: 9, Loss: 212.7389
Epoch: 10, Loss: 200.6502
Epoch: 11, Loss: 188.6383
Epoch: 12, Loss: 177.0069
Epoch: 13, Loss: 165.6079
Epoch: 14, Loss: 154.3322
Epoch: 15, Loss: 143.1672
Epoch: 16, Loss: 132.7607
Epoch: 17, Loss: 122.5205
Epoch: 18, Loss: 112.7588
Epoch: 19, Loss: 103.4459
Epoch: 20, Loss: 95.1526
Epoch: 21, Loss: 87.0300
Epoch: 22, Loss: 79.7367
Epoch: 23, Loss: 73.2311
Epoch: 24, Loss: 66.9789
Epoch: 25, Loss: 61.4709
Epoch: 26, Loss: 56.1712
Epoch: 27, Loss: 51.7611
Epoch: 28, Loss: 47.6760
Epoch: 29, Loss: 43.7424
Epoch: 30, Loss: 40.6110
Epoch: 31, Loss: 37.6099
Epoch: 32, Loss: 34.8171
Epoch: 33, Loss: 32.1496
Epoch: 34, Loss: 30.0136
Epoch: 35, Loss: 28.0800
Epoch: 36, Loss: 26.1506
Epoch: 37, Loss: 24.3939
Epoch: 38, Loss: 22.8867
Epoch: 39, Loss: 21.5354
Epoch: 40, Loss

In [38]:
# prediction

def prediction(model, vocab, text, seq_len=28):
  """Predict the word that follows `text` and return `text` with that word appended.

  Args:
    model: trained module mapping a (1, n) integer tensor of token ids to a
      (1, vocab_size) tensor of scores.
    vocab: dict mapping token -> id; words missing from it are encoded as vocab['<UNK>'].
    text: the prompt to extend.
    seq_len: length the encoded prompt is left-padded to with zeros. The default of 28
      matches the training inputs: the padded sequences are 29 ids long and X drops the
      last id (the target). Prompts longer than seq_len are passed through unpadded.

  Returns:
    `text`, a single space, and the highest-scoring vocabulary token.
  """
  # tokenize
  tokenized_text = word_tokenize(text.lower())

  # text -> numerical indices
  numerical_text = text_to_numbers(tokenized_text, vocab)

  # padding (a negative pad count yields an empty list, so long prompts stay as they are)
  left_pad = [0] * (seq_len - len(numerical_text))
  padded_text = torch.tensor(left_pad + numerical_text, dtype=torch.long).unsqueeze(0)

  # send to model: inference only, so switch to eval mode and skip gradient tracking;
  # the previous train/eval mode is restored afterwards
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      output = model(padded_text)
  finally:
    model.train(was_training)

  # predicted index
  _, index = torch.max(output, dim=1)
  predicted_id = index.item()

  # Look the word up by its id rather than by its position in the dict's key order.
  id_to_token = {token_id: token for token, token_id in vocab.items()}

  # merge with text
  return text + " " + id_to_token[predicted_id]

In [39]:
# Example: extend a short prompt with the model's highest-scoring next word.
prediction(model, vocab, "I carefully examined the")

'I carefully examined the writing'