## Preparing the Dataset
tokenizing the text data and converting everything to lowercase:

In [20]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')
import torch

with open("sherlock-holm.es_stories_plain-text_advs.txt","r",encoding="utf-8") as f:
    text = f.read().lower()

#print(text)
tokens = word_tokenize(text)
print("Total tokens:",len(tokens))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/samyakjain/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/samyakjain/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Total tokens: 125731


## Creating a Vocabulary
converting words into numbers and creating word-to-index mappings

In [13]:
from collections import Counter

word_tokens = Counter(tokens)
vocab = sorted(word_tokens, key = word_tokens.get,reverse=True)

word2idx = {word: idx for idx ,word in enumerate(vocab)}
idx2word = {idx: word for word,idx in word2idx.items()}
vocab_size = len(vocab)

## Building Input-Output Sequences

In [22]:
sequence_length = 4
data = []

for i in range(len(tokens) - sequence_length):
    input_seq = tokens[i:i + sequence_length -1]
    target = tokens[i + sequence_length - 1]
    data.append((input_seq,target))

def encode(seq): return [word2idx[word] for word in seq]

encoded_data = [(torch.tensor(encode(inp)), torch.tensor(word2idx[target]))
                for inp, target in data]

## Lstm - based Predictive model

In [28]:
import torch.nn as nn

class PredictiveKeyboard(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
        super(PredictiveKeyboard, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        output, _ = self.lstm(x)
        output = self.fc(output[:, -1, :]) 
        return output

## Training the model

In [30]:
import torch
import torch.optim as optim
import random

model = PredictiveKeyboard(vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005)

epochs = 20
for epoch in range(epochs):
    total_loss = 0
    random.shuffle(encoded_data)
    for input_seq, target in encoded_data[:10000]:
        input_seq = input_seq.unsqueeze(0)
        output = model(input_seq)
        loss = criterion(output, target.unsqueeze(0))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 66642.7793
Epoch 2, Loss: 67274.0552
Epoch 3, Loss: 70471.1248
Epoch 4, Loss: 69492.3132
Epoch 5, Loss: 71459.2069
Epoch 6, Loss: 71859.0885
Epoch 7, Loss: 73984.6701
Epoch 8, Loss: 74208.9809
Epoch 9, Loss: 75186.8002
Epoch 10, Loss: 76560.5147
Epoch 11, Loss: 76268.7618
Epoch 12, Loss: 77134.1414
Epoch 13, Loss: 77554.3710
Epoch 14, Loss: 76841.4712
Epoch 15, Loss: 77987.6094
Epoch 16, Loss: 78736.2223
Epoch 17, Loss: 79407.7132
Epoch 18, Loss: 81233.4178
Epoch 19, Loss: 82753.6963
Epoch 20, Loss: 80690.2219


## Predicting the Next Words

In [None]:
import torch.nn.functional as F

def suggest_next_words(model, text_prompt, top_k=3):
    model.eval()
    tokens = word_tokenize(text_prompt.lower())
    if len(tokens) < sequence_length - 1:
        raise ValueError(f"Input should be at least {sequence_length - 1} words long.")

    input_seq = tokens[-(sequence_length - 1):]
    input_tensor = torch.tensor(encode(input_seq)).unsqueeze(0)

    with torch.no_grad():
        output = model(input_tensor)
        probs = F.softmax(output, dim=1).squeeze()
        top_indices = torch.topk(probs, top_k).indices.tolist()

    return [idx2word[idx] for idx in top_indices]

print("Suggestions:", suggest_next_words(model, "So, are we really at"))