In [1]:
# Use the %pip magic so the package is installed into the environment of the
# running kernel, pin the version this notebook was executed with, and keep
# the download progress bars out of the saved output.
%pip install -q nltk==3.9.2

Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.11.3-cp310-cp310-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 442.1 kB/s eta 0:00:03
   ------------- -------------------------- 0.5/1.5 MB 442.1 kB/s eta 0:00:03
   ------------- -------------------------- 0.5/1.5 MB 442.1 kB/s eta 0:00:03
   ------------- -------------------------- 0.

In [1]:
import torch
import torch.nn as nn 
import torch.optim as optim 
import numpy as np 
from collections import Counter
from torch.utils.data import Dataset,DataLoader
from nltk.tokenize import word_tokenize
import nltk

In [2]:
from pathlib import Path

# The corpus file is looked up relative to the current working directory.
# Fail early with the fully resolved path if it is missing, otherwise read
# the whole file into memory as UTF-8 text.
path = Path("wiki_00")
if path.exists():
    document = path.read_text(encoding="utf-8")
else:
    raise FileNotFoundError(f"File not found: {path.resolve()}")

In [3]:
len(document)

1030253

In [4]:
## Tokenization
# Fetch the NLTK 'punkt' and 'punkt_tab' resources. Presumably these back the
# word_tokenize / sent_tokenize calls made in later cells -- TODO confirm.
# nltk.download returns True here, which is what the cell displays.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\joshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\joshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [5]:
# Tokenize
# The whole document is lower-cased first, so the vocabulary built from
# `tokens` does not distinguish between upper- and lower-case spellings.
tokens = word_tokenize(document.lower())

In [6]:
## build vocab
# Id 0 is reserved for '<UNK>'. Every distinct token then gets the next free
# integer id, in the order in which Counter yields its keys.
vocab = {'<UNK>': 0}

for token in Counter(tokens):
    # setdefault leaves an existing entry alone and otherwise assigns the
    # size of the vocabulary before insertion as the new id
    vocab.setdefault(token, len(vocab))

In [7]:
vocab

{'<UNK>': 0,
 'april': 1,
 'is': 2,
 'the': 3,
 'fourth': 4,
 'month': 5,
 'of': 6,
 'year': 7,
 'in': 8,
 'julian': 9,
 'and': 10,
 'gregorian': 11,
 'calendars': 12,
 ',': 13,
 'comes': 14,
 'between': 15,
 'march': 16,
 'may': 17,
 '.': 18,
 'it': 19,
 'one': 20,
 'four': 21,
 'months': 22,
 'to': 23,
 'have': 24,
 '30': 25,
 'days': 26,
 'always': 27,
 'begins': 28,
 'on': 29,
 'same': 30,
 'day': 31,
 'week': 32,
 'as': 33,
 'july': 34,
 'additionally': 35,
 'january': 36,
 'leap': 37,
 'years': 38,
 'ends': 39,
 'december': 40,
 'making': 41,
 'also': 42,
 'first': 43,
 'out': 44,
 'that': 45,
 'june': 46,
 'september': 47,
 'november': 48,
 'are': 49,
 'later': 50,
 'every': 51,
 'each': 52,
 'other': 53,
 "'s": 54,
 'last': 55,
 'exactly': 56,
 '35': 57,
 'weeks': 58,
 '(': 59,
 '245': 60,
 ')': 61,
 'apart': 62,
 'common': 63,
 'starts': 64,
 'october': 65,
 'previous': 66,
 'finishes': 67,
 'february': 68,
 'immediately': 69,
 'after': 70,
 'before': 71,
 'following': 72,
 'a

In [8]:
len(vocab)

15184

In [9]:
# Sentence-split the raw document, trim surrounding whitespace from every
# sentence and discard the ones that are empty after trimming.
input_sentences = list(filter(None, map(str.strip, nltk.sent_tokenize(document))))
print(f"Number of sentences: {len(input_sentences)}")

Number of sentences: 11375


In [10]:
input_sentences

['April is the fourth month of the year in the Julian and Gregorian calendars, and comes between March and May.',
 'It is one of four months to have 30 days.',
 'April always begins on the same day of the week as July, and additionally, January in leap years.',
 'April always ends on the same day of the week as December.',
 'April comes between March and May, making it the fourth month of the year.',
 'It also comes first in the year out of the four months that have 30 days, as June, September and November are later in the year.',
 'April begins on the same day of the week as July every year and on the same day of the week as January in leap years.',
 "April ends on the same day of the week as December every year, as each other's last days are exactly 35 weeks (245 days) apart.",
 'In common years, April starts on the same day of the week as October of the previous year, and in leap years, May of the previous year.',
 'In common years, April finishes on the same day of the week as July

In [11]:
def text_to_indices(sentence, vocab):
    """Translate an iterable of tokens into a list of vocabulary ids.

    Args:
        sentence: iterable of tokens (e.g. the list returned by a tokenizer).
        vocab: mapping from token to integer id; it must contain '<UNK>'
            whenever `sentence` holds a token that is not a key of `vocab`.

    Returns:
        A list with one id per token, in order. Tokens missing from `vocab`
        are replaced by the id stored under '<UNK>'.
    """
    return [
        vocab[token] if token in vocab else vocab['<UNK>']
        for token in sentence
    ]

In [12]:
## tokenization of sentences
# Each sentence is lower-cased and word-tokenized, then its tokens are mapped
# to vocabulary ids; the result is one list of ids per sentence.
input_numerical_sentences = [
    text_to_indices(word_tokenize(sentence.lower()), vocab)
    for sentence in input_sentences
]

In [13]:
len(input_numerical_sentences)

11375

In [14]:
# Expand every sentence into all of its prefixes that contain at least two
# ids (first 2 ids, first 3 ids, ..., the whole sentence). Sentences with
# fewer than two ids contribute nothing.
training_sequence = [
    sentence[:end]
    for sentence in input_numerical_sentences
    for end in range(2, len(sentence) + 1)
]

In [15]:
len(training_sequence)

191730

In [16]:
# Length of every training prefix; the cell displays the longest one.
len_list = list(map(len, training_sequence))
max(len_list)

157

In [17]:
# Left-pad every prefix with zeros up to the length of the longest prefix so
# that all rows have the same length. Note that 0 is also the id of '<UNK>'
# in `vocab`, so padding and unknown tokens share one id.
#
# The target length is computed once up front: evaluating max(len_list)
# inside the loop rescans the whole list for every sequence, which makes the
# cell quadratic in the number of training prefixes.
max_len = max(len_list, default=0)
padded_training_sequence = [
    [0] * (max_len - len(sequence)) + sequence
    for sequence in training_sequence
]

In [18]:
# Turn the equal-length padded lists into a single integer (int64) tensor.
# This rebinds the same name, so the list version is no longer available
# after the cell has run.
padded_training_sequence = torch.tensor(padded_training_sequence,dtype=torch.long)

In [19]:
padded_training_sequence.shape

torch.Size([191730, 157])

In [20]:
## dataset created
# Inputs: every column except the last one of each padded prefix.
# Target: the last column, i.e. the final token id of each prefix, which the
# model has to predict from the ids that precede it.
X = padded_training_sequence[:,:-1]
y = padded_training_sequence[:,-1]

In [21]:
X

tensor([[   0,    0,    0,  ...,    0,    0,    1],
        [   0,    0,    0,  ...,    0,    1,    2],
        [   0,    0,    0,  ...,    1,    2,    3],
        ...,
        [   0,    0,    0,  ...,  100,  154,  157],
        [   0,    0,    0,  ...,  154,  157,    8],
        [   0,    0,    0,  ...,  157,    8, 7671]])

In [22]:
y

tensor([   2,    3,    4,  ...,    8, 7671,  157])

In [23]:
class CustomDataset(Dataset):
    """Pairs row i of an input container with entry i of a target container."""

    def __init__(self, X, y):
        # Both containers are stored as given; nothing is copied or checked.
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the first dimension of `X`."""
        num_rows = self.X.shape[0]
        return num_rows

    def __getitem__(self, index):
        """Return the `(input, target)` pair stored at position `index`."""
        sample = self.X[index]
        target = self.y[index]
        return sample, target

In [24]:
# Wrap the input/target tensors so they can be served by a DataLoader.
dataset = CustomDataset(X,y)

In [25]:
len(dataset)

191730

In [26]:
# Training loader: shuffled mini-batches of 32 samples.
# pin_memory=True asks for batches in page-locked host memory, which is meant
# to speed up host-to-GPU copies; presumably it brings no benefit when
# `device` resolves to the CPU -- TODO confirm on the target machine.
dataloader = DataLoader(dataset,batch_size=32,shuffle=True,pin_memory=True)

In [27]:
print(X.shape)

torch.Size([191730, 156])


In [28]:
## Creating LSTM model architecture
class LSTMModel(nn.Module):
    """Next-token classifier: embedding -> single-layer LSTM -> linear layer.

    Args:
        vocab_size: number of rows of the embedding table and number of
            output logits.
        embedding_dim: size of each token embedding (default 100).
        hidden_size: size of the LSTM hidden state (default 150).
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_size=150):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # batch_first=True: inputs are laid out as (batch, sequence, feature)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return one row of `vocab_size` logits per input sequence.

        Args:
            x: integer tensor of token ids laid out as (batch, sequence).
        """
        embedded = self.embedding(x)
        # Only the final hidden state is needed; the per-step outputs and the
        # final cell state are discarded.
        _, (final_hidden_state, _) = self.lstm(embedded)
        # [-1] selects the last entry along the first dimension of the final
        # hidden state, leaving one hidden vector per sequence.
        return self.fc(final_hidden_state[-1])

In [29]:
# One output logit (and one embedding row) per vocabulary entry.
model = LSTMModel(len(vocab))

In [30]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [31]:
# Move the model's parameters to the selected device. The call is the last
# expression of the cell, so its return value (the model) is displayed.
model.to(device)

LSTMModel(
  (embedding): Embedding(15184, 100)
  (lstm): LSTM(100, 150, batch_first=True)
  (fc): Linear(in_features=150, out_features=15184, bias=True)
)

In [32]:
# Training hyper-parameters.
epochs = 50
learning_rate = 0.001 

# Cross-entropy over the vocabulary logits; Adam updates all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [33]:
# training loop
# Put the model into training mode explicitly. calculate_accuracy() switches
# it to eval mode and never switches back, so re-running this cell afterwards
# would otherwise train in eval mode (cuDNN's RNN backward pass is only
# supported in training mode, which is expected to make that fail on a GPU).
model.train()

for epoch in range(epochs):
    # Sum of the per-batch mean losses of this epoch (not an average).
    total_loss = 0
    for batch_x , batch_y in dataloader:
        batch_x , batch_y = batch_x.to(device) , batch_y.to(device).long()

        # standard step: reset gradients, forward, loss, backward, update
        optimizer.zero_grad()
        y_pred = model(batch_x)
        loss = criterion(y_pred,batch_y)
        loss.backward()
        optimizer.step()

        total_loss = total_loss + loss.item()
    print(f'Epoch : {epoch+1} , Loss : {total_loss : .4f}')


Epoch : 1 , Loss :  36509.0038
Epoch : 2 , Loss :  31146.1933
Epoch : 3 , Loss :  28039.5378
Epoch : 4 , Loss :  25470.9827
Epoch : 5 , Loss :  23257.4384
Epoch : 6 , Loss :  21342.1749
Epoch : 7 , Loss :  19664.1132
Epoch : 8 , Loss :  18184.9161
Epoch : 9 , Loss :  16892.0239
Epoch : 10 , Loss :  15733.9539
Epoch : 11 , Loss :  14725.4605
Epoch : 12 , Loss :  13819.5730
Epoch : 13 , Loss :  13025.1975
Epoch : 14 , Loss :  12312.1462
Epoch : 15 , Loss :  11676.3660
Epoch : 16 , Loss :  11120.2301
Epoch : 17 , Loss :  10618.0629
Epoch : 18 , Loss :  10163.0530
Epoch : 19 , Loss :  9735.8727
Epoch : 20 , Loss :  9372.3738
Epoch : 21 , Loss :  9039.5495
Epoch : 22 , Loss :  8715.2030
Epoch : 23 , Loss :  8457.1584
Epoch : 24 , Loss :  8198.0479
Epoch : 25 , Loss :  7982.2118
Epoch : 26 , Loss :  7751.8909
Epoch : 27 , Loss :  7550.4584
Epoch : 28 , Loss :  7405.9881
Epoch : 29 , Loss :  7209.4705
Epoch : 30 , Loss :  7061.3995
Epoch : 31 , Loss :  6925.1541
Epoch : 32 , Loss :  6780.9541

In [36]:
# prediction

def prediction(model, vocab, text, max_len=None):
    """Append the model's most likely next token to `text`.

    Args:
        model: trained network mapping a (1, max_len) tensor of token ids to
            one row of vocabulary logits.
        vocab: token -> id mapping whose insertion order matches the ids
            (the i-th key has id i), as produced when the vocabulary was built.
        text: the prompt; it is lower-cased and word-tokenized before lookup.
        max_len: length the id sequence is left-padded (or truncated) to.
            Defaults to the width of the training inputs `X`, so that the
            model sees prompts padded the same way as its training data
            (the previous hard-coded value of 61 did not match it).

    Returns:
        `text` followed by a space and the predicted token.

    Raises:
        ValueError: if `max_len` is smaller than 1.
    """
    if max_len is None:
        max_len = X.shape[1]
    if max_len < 1:
        raise ValueError("max_len must be a positive integer")

    # tokenize
    tokenized_text = word_tokenize(text.lower())

    # text -> numerical indices
    numerical_text = text_to_indices(tokenized_text, vocab)

    # keep only the most recent max_len ids, then left-pad with 0 up to max_len
    numerical_text = numerical_text[-max_len:]
    padding = [0] * (max_len - len(numerical_text))
    padded_text = torch.tensor(padding + numerical_text, dtype=torch.long).unsqueeze(0).to(device)

    # send to model; no gradients are needed for inference
    with torch.no_grad():
        output = model(padded_text)

    # predicted index, converted to a plain int before it is used as a list index
    _, index = torch.max(output, dim=1)
    predicted_id = index.item()

    # merge with text
    return text + " " + list(vocab.keys())[predicted_id]



In [37]:
import time

# Greedy generation: feed the growing text back into the model, one predicted
# token per step, printing the text after every step.
num_tokens = 10
input_text = 'April is the fourth'

for i in range(num_tokens):
    output_text = input_text = prediction(model, vocab, input_text)
    print(output_text)
    # short pause between steps
    time.sleep(0.5)


April is the fourth month
April is the fourth month of
April is the fourth month of the
April is the fourth month of the year
April is the fourth month of the year in
April is the fourth month of the year in the
April is the fourth month of the year in the julian
April is the fourth month of the year in the julian and
April is the fourth month of the year in the julian and gregorian
April is the fourth month of the year in the julian and gregorian calendars


In [40]:
# Unshuffled loader over the same `dataset` that is used for training.
dataloader1 = DataLoader(dataset, batch_size=32, shuffle=False)

In [41]:
# Function to calculate accuracy
def calculate_accuracy(model, dataloader, device):
    """Return the top-1 accuracy of `model` over `dataloader`, in percent.

    Args:
        model: network returning one row of class logits per input row.
        dataloader: iterable of `(batch_x, batch_y)` tensor pairs.
        device: device the batches are moved to before the forward pass.

    Returns:
        Percentage (0-100) of samples whose highest-scoring class equals the
        label, or 0.0 if `dataloader` yields no samples.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0

    with torch.no_grad():  # No need to compute gradients
        # Iterate over the loader that was passed in, not a notebook global.
        for batch_x, batch_y in dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)

            # Get model predictions
            outputs = model(batch_x)

            # Get the predicted word indices
            _, predicted = torch.max(outputs, dim=1)

            # Compare with actual labels
            correct += (predicted == batch_y).sum().item()
            total += batch_y.size(0)

    # An empty loader would otherwise end in a division by zero.
    if total == 0:
        return 0.0

    accuracy = correct / total * 100
    return accuracy

# Compute accuracy
# NOTE: `dataloader` serves `dataset`, i.e. the same samples the model was
# trained on, so this figure is training accuracy and says nothing about
# held-out text.
accuracy = calculate_accuracy(model, dataloader, device)
print(f"Model Accuracy: {accuracy:.2f}%")


Model Accuracy: 82.88%
