In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import re


In [3]:
with open("data.txt", "r") as file:
    text = file.read()


text = re.sub(r'[^a-zA-Z\s]', '', text).lower() 
words = text.split()


In [4]:

word_counts = Counter(words)
vocab = {word: idx for idx, (word, _) in enumerate(word_counts.items(), start=1)}
vocab["<unk>"] = 0  
idx_to_word = {idx: word for word, idx in vocab.items()}


In [5]:

sequences = [vocab.get(word, vocab["<unk>"]) for word in words]



In [6]:
sequence_length = 5
data = []
for i in range(len(sequences) - sequence_length):
    input_seq = sequences[i:i + sequence_length]
    target = sequences[i + sequence_length]
    data.append((input_seq, target))


In [7]:
class TextDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        input_seq, target = self.data[idx]
        return torch.tensor(input_seq, dtype=torch.long), torch.tensor(target, dtype=torch.long)


In [8]:

dataset = TextDataset(data)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)


In [9]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        embeds = self.embedding(x)
        out, _ = self.lstm(embeds)
        out = self.fc(out[:, -1, :])
        return out


In [10]:
vocab_size = len(vocab)
embed_size = 128
hidden_size = 256
num_layers = 2

model = LSTMModel(vocab_size, embed_size, hidden_size, num_layers).to("cuda")
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [11]:
epochs = 20
model.train()
for epoch in range(epochs):
    for inputs, targets in dataloader:
        inputs, targets = inputs.to("cuda"), targets.to("cuda")
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")


Epoch [1/20], Loss: 6.1191
Epoch [2/20], Loss: 5.9015
Epoch [3/20], Loss: 4.7091
Epoch [4/20], Loss: 5.3222
Epoch [5/20], Loss: 4.2424
Epoch [6/20], Loss: 3.6023
Epoch [7/20], Loss: 3.0741
Epoch [8/20], Loss: 1.9362
Epoch [9/20], Loss: 1.7768
Epoch [10/20], Loss: 1.7467
Epoch [11/20], Loss: 1.2791
Epoch [12/20], Loss: 0.8541
Epoch [13/20], Loss: 0.6664
Epoch [14/20], Loss: 0.4661
Epoch [15/20], Loss: 0.3827
Epoch [16/20], Loss: 0.6593
Epoch [17/20], Loss: 0.3038
Epoch [18/20], Loss: 0.3661
Epoch [19/20], Loss: 0.2209
Epoch [20/20], Loss: 0.4908


In [12]:
torch.save(model,"model.pt")


In [15]:
def predict_next_word(model, input_text, vocab, idx_to_word, sequence_length):
    model.eval()
    tokens = [vocab.get(word, vocab["<unk>"]) for word in input_text.split()]

    if len(tokens) < sequence_length:
        tokens = [0] * (sequence_length - len(tokens)) + tokens
    else:
        tokens = tokens[-sequence_length:]

    input_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to("cuda")  
    output = model(input_tensor)
    predicted_idx = torch.argmax(output, dim=1).item()
    return idx_to_word[predicted_idx]


input_text = "You must confine yourself to"
predicted_word = predict_next_word(model, input_text, vocab, idx_to_word, sequence_length)
print(f"Input: '{input_text}' | Predicted Next Word: '{predicted_word}'")


Input: 'You must confine yourself to' | Predicted Next Word: 'the'
