In [6]:
import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
import nltk 
import string

In [7]:
# Load the tokenized sentences of two Gutenberg books as lists of word lists.
text_1 = list(gutenberg.sents('carroll-alice.txt'))
# NOTE(review): text_2 is loaded but not used in the cells shown here; the
# corpus is built from text_1 only because the combination step below is
# commented out.
text_2 = list(gutenberg.sents('austen-emma.txt'))
#combined_text = text_1 + text_2
# Flatten text_1's sentences into one long list of lower-cased tokens.
tokens = [word.lower() for sent in text_1 for word in sent]
print("Total tokens:", len(tokens))


Total tokens: 34113


In [9]:
# Keep only tokens made up entirely of letters; anything containing
# punctuation or digits is discarded.
clean_tokens = list(filter(str.isalpha, tokens))
print("Clean tokens:", len(clean_tokens))

Clean tokens: 27333


In [10]:
from collections import Counter

# Count word occurrences and derive an alphabetically sorted vocabulary.
freq = Counter(clean_tokens)
vocab = sorted(freq)

# Word ids start at 1 so that id 0 stays reserved for padding.
word_to_idx = {word: idx for idx, word in enumerate(vocab, start=1)}
idx_to_word = dict(enumerate(vocab, start=1))

# Replace every token of the corpus with its integer id.
encoded = list(map(word_to_idx.__getitem__, clean_tokens))

print("Vocab size:", len(word_to_idx) + 1)
print("Encoded length:", len(encoded))


Vocab size: 2570
Encoded length: 27333


In [11]:
# Slide a fixed-size window over the encoded corpus: every run of seq_len
# word ids is one input, and the id that immediately follows it is the target.
seq_len = 6

inputs = np.array(
    [encoded[start:start + seq_len] for start in range(len(encoded) - seq_len)]
)
# The target of window `start` is encoded[start + seq_len], i.e. the corpus
# shifted left by seq_len positions.
targets = np.array(encoded[seq_len:])

print("Inputs shape:", inputs.shape)
print("Targets shape:", targets.shape)

Inputs shape: (27327, 6)
Targets shape: (27327,)


In [12]:
def inspect(i):
    """Print the i-th training example as words instead of ids.

    Reads row ``i`` of the module-level ``inputs`` and ``targets``, maps each
    id through ``idx_to_word`` and prints the context window followed by the
    word that comes right after it.
    """
    context_ids, next_id = inputs[i], targets[i]
    context_words = list(map(idx_to_word.__getitem__, context_ids))
    print("IN :", context_words)
    print("OUT:", idx_to_word[next_id])

# Spot-check one example (index 100) to confirm inputs and targets line up.
inspect(100)


IN : ['a', 'daisy', 'chain', 'would', 'be', 'worth']
OUT: the


In [13]:
import torch
from torch.utils.data import Dataset, DataLoader

class NextWordDataset(Dataset):
    """Dataset of (context window, next word) pairs held as ``torch.long`` tensors.

    Args:
        X: Array-like of word ids, one row (context window) per example.
        y: Array-like of target word ids, one entry per row of ``X``.

    Raises:
        ValueError: If ``X`` and ``y`` do not hold the same number of examples.
    """

    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)
        # Fail at construction time instead of with an IndexError in the
        # middle of an epoch when a batch reaches past the shorter tensor.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of examples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of examples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(context, target)`` tensors stored at position ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap the arrays in a Dataset and serve them in shuffled mini-batches of 64.
dataset = NextWordDataset(inputs, targets)
loader = DataLoader(dataset, batch_size=64, shuffle=True)


In [14]:
import torch.nn as nn

class LSTMNextWord(nn.Module):
    """Embedding -> LSTM -> linear head that scores the next word.

    Args:
        vocab_size: Number of ids the model can embed and predict.
        embed_dim: Size of each word embedding.
        hidden_dim: Size of the LSTM hidden state.
        padding_idx: Id whose embedding is fixed to zeros and receives no
            gradient. Defaults to 0, the id this notebook reserves for
            padding and also uses for out-of-vocabulary seed words; without
            it that id would carry a random vector that is never trained.
            Pass ``None`` to embed every id normally.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128, padding_idx=0):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return next-word logits of shape ``(B, vocab)`` for ids ``x`` of shape ``(B, T)``."""
        x = self.embed(x)               # (B, T, E)
        out, _ = self.lstm(x)           # (B, T, H)
        out = out[:, -1, :]             # keep only the last time step
        logits = self.fc(out)           # (B, vocab)
        return logits


In [15]:
# +1 because word ids start at 1 and id 0 is reserved for padding
# (see word_to_idx), so the model needs len(word_to_idx) + 1 rows/outputs.
vocab_size = len(word_to_idx) + 1
print("Vocab size:", vocab_size)

Vocab size: 2570


In [16]:
# Seed PyTorch's RNG so weight initialisation and batch shuffling repeat on
# every "Restart & Run All" (some GPU kernels may still be non-deterministic).
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = LSTMNextWord(vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

epochs = 50

# Make the mode explicit: generate_text() switches the model to eval mode.
model.train()

for epoch in range(epochs):
    total_loss = 0

    for X_batch, y_batch in loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        logits = model(X_batch)
        loss = criterion(logits, y_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # total_loss is the sum of the per-batch mean losses over the epoch,
    # not a per-sample average.
    print(f"Epoch {epoch+1}/{epochs}  Loss: {total_loss:.4f}")


Epoch 1/50  Loss: 2617.0893
Epoch 2/50  Loss: 2309.8410
Epoch 3/50  Loss: 2143.1861
Epoch 4/50  Loss: 2010.4620
Epoch 5/50  Loss: 1894.5809
Epoch 6/50  Loss: 1786.3853
Epoch 7/50  Loss: 1684.0673
Epoch 8/50  Loss: 1583.5648
Epoch 9/50  Loss: 1486.9259
Epoch 10/50  Loss: 1392.6430
Epoch 11/50  Loss: 1300.9698
Epoch 12/50  Loss: 1213.6476
Epoch 13/50  Loss: 1130.8572
Epoch 14/50  Loss: 1050.8234
Epoch 15/50  Loss: 975.1526
Epoch 16/50  Loss: 903.2075
Epoch 17/50  Loss: 833.9479
Epoch 18/50  Loss: 769.4333
Epoch 19/50  Loss: 707.3406
Epoch 20/50  Loss: 649.6065
Epoch 21/50  Loss: 594.5398
Epoch 22/50  Loss: 542.4300
Epoch 23/50  Loss: 493.6371
Epoch 24/50  Loss: 447.7632
Epoch 25/50  Loss: 405.2798
Epoch 26/50  Loss: 365.2026
Epoch 27/50  Loss: 328.1147
Epoch 28/50  Loss: 293.4679
Epoch 29/50  Loss: 261.9541
Epoch 30/50  Loss: 233.0970
Epoch 31/50  Loss: 206.5107
Epoch 32/50  Loss: 181.7294
Epoch 33/50  Loss: 160.5583
Epoch 34/50  Loss: 140.3020
Epoch 35/50  Loss: 122.2740
Epoch 36/50  Lo

In [17]:
import torch.nn.functional as F

def generate_text(model, seed_words, num_words=20, temperature=1.0):
    """Extend ``seed_words`` by repeatedly sampling the next word from ``model``.

    Args:
        model: Module mapping a ``(1, T)`` tensor of word ids to ``(1, vocab)``
            logits. It is switched to eval mode and left in that mode.
        seed_words: Whitespace-separated prompt. It is lower-cased; words that
            are missing from ``word_to_idx`` are fed to the model as id 0.
        num_words: Number of words to append to the prompt.
        temperature: Softmax temperature. Values below 1 sharpen the
            distribution, values above 1 flatten it, and 0 picks the most
            likely word at every step (greedy decoding).

    Returns:
        The lower-cased prompt followed by the generated words, joined by spaces.

    Raises:
        ValueError: If ``seed_words`` contains no words or ``temperature`` is
            negative.
    """
    if temperature < 0:
        raise ValueError("temperature must be non-negative")

    words = seed_words.lower().split()
    if not words:
        # An empty prompt would reach the LSTM as a zero-length sequence.
        raise ValueError("seed_words must contain at least one word")

    model.eval()

    # Build inputs on the device the model actually lives on rather than
    # relying on a notebook-global `device` being in sync with it.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    for _ in range(num_words):
        # Only the last seq_len words are used as context, as in training.
        context = words[-seq_len:]
        context_ids = [word_to_idx.get(w, 0) for w in context]
        x = torch.tensor(context_ids, dtype=torch.long, device=run_device).unsqueeze(0)

        with torch.no_grad():
            logits = model(x)[0].clone()    # (vocab,)

        # Id 0 is the padding slot, never a real word; exclude it so the
        # output cannot contain a spurious "<unk>".
        if logits.numel() > 1:
            logits[0] = float("-inf")

        if temperature == 0:
            # Greedy decoding; dividing by zero would yield NaN probabilities.
            next_idx = int(torch.argmax(logits))
        else:
            probs = F.softmax(logits / temperature, dim=-1)
            next_idx = torch.multinomial(probs, 1).item()

        words.append(idx_to_word.get(next_idx, "<unk>"))

    return " ".join(words)


In [21]:
# The seed mimics the cleaned corpus: lower-case alphabetic tokens only, so a
# contraction is written as separate words ("let s"), matching the "it s" /
# "i m" forms that show up in the generated text.
print(generate_text(model, "let s play a game", num_words=35))

let s play a game and alice did not be very civil of swimming at school i slates in a nervous tone i m with a telescope i know it s always now with a great many teeth and at
