In [1]:
import requests
import re
from collections import Counter
import numpy as np

In [2]:
url = "https://www.gutenberg.org/files/1661/1661-0.txt"
text = requests.get(url).text

print("Length:", len(text))

Length: 593731


In [3]:
#filtering out just the story part
start_idx, end_idx = None, None

#finding where the first story begins and ends indexes
lines = text.splitlines()
for i, line in enumerate(lines):
    if " A SCANDAL IN BOHEMIA" in line:
        start_idx = i
        break

for i, line in enumerate(lines):
    if "END OF THE PROJECT GUTENBERG" in line.upper():
        end_idx = i
        break

# getting the text that is between the markers
if start_idx is not None and end_idx is not None:
    story_lines = lines[start_idx:end_idx]
elif start_idx is not None:
    story_lines = lines[start_idx:]
else:
    story_lines = lines

text = "\n".join(story_lines).strip()

print(f"Length: {len(text)}\n")
print(text[:800])


Length: 561653

I. A SCANDAL IN BOHEMIA


I.

To Sherlock Holmes she is always _the_ woman. I have seldom heard him
mention her under any other name. In his eyes she eclipses and
predominates the whole of her sex. It was not that he felt any emotion
akin to love for Irene Adler. All emotions, and that one particularly,
were abhorrent to his cold, precise but admirably balanced mind. He
was, I take it, the most perfect reasoning and observing machine that
the world has seen, but as a lover he would have placed himself in a
false position. He never spoke of the softer passions, save with a gibe
and a sneer. They were admirable things for the observer—excellent for
drawing the veil from men’s motives and actions. But for the trained
reasoner to admit such intrusions into his own delicate and finely
adjusted 


In [4]:
#filtering out everything other than letters numbers and .

cleaned_text = re.sub(r'[^a-zA-Z0-9 \.]', '', text)
cleaned_text = cleaned_text.lower() #converting everything to lower case
len(cleaned_text)

533032

In [5]:
#splitting into tokens(words)
cleaned = re.sub(r'[^a-zA-Z0-9\.]', ' ', text)
cleaned = re.sub(r'(\.)', r' \1 ', cleaned)  #space around every period
cleaned = re.sub(r'\s+', ' ', cleaned).strip().lower()
tokens = cleaned.split()#seperated .
print(len(tokens))
print(tokens[:50])

112095
['i', '.', 'a', 'scandal', 'in', 'bohemia', 'i', '.', 'to', 'sherlock', 'holmes', 'she', 'is', 'always', 'the', 'woman', '.', 'i', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other', 'name', '.', 'in', 'his', 'eyes', 'she', 'eclipses', 'and', 'predominates', 'the', 'whole', 'of', 'her', 'sex', '.', 'it', 'was', 'not', 'that', 'he', 'felt', 'any', 'emotion']


In [6]:
# Keep '.' in tokens
tokens_wo_period = tokens  # don't remove '.'

# Build vocab (include <unk>)
vocab = sorted(set(tokens_wo_period) | {"<unk>"})

# Frequency stats
from collections import Counter
freq = Counter(tokens_wo_period)
most_common_10 = freq.most_common(10)
least_common_10 = freq.most_common()[-10:]

print(f"Vocabulary size: {len(vocab)}\n")
print("10 most frequent words:")
for w, c in most_common_10:
    print(f"{w:>10} : {c}")

print("\n10 least frequent words:")
for w, c in least_common_10:
    print(f"{w:>10} : {c}")

# Create mappings
word_to_idx = {w: i for i, w in enumerate(vocab)}
idx_to_word = {i: w for w, i in word_to_idx.items()}

# Ensure <unk> present (safety)
if "<unk>" not in word_to_idx:
    unk_index = len(word_to_idx)
    word_to_idx["<unk>"] = unk_index
    idx_to_word[unk_index] = "<unk>"


Vocabulary size: 7885

10 most frequent words:
         . : 6196
       the : 5612
         i : 3037
       and : 3018
        to : 2744
        of : 2647
         a : 2640
        in : 1765
      that : 1752
        it : 1734

10 least frequent words:
    seaman : 1
 blockaded : 1
 arguments : 1
     locus : 1
    standi : 1
  survived : 1
    solely : 1
 mauritius : 1
manifested : 1
   walsall : 1


In [7]:
#parameters
context_size = 5#given in example
tokens_final = tokens
vocab = sorted(set(tokens_final))
word_to_idx = {w: i for i, w in enumerate(vocab)}
idx_to_word = {i: w for w, i in word_to_idx.items()}

#creating the sequences
X, y = [], []

for i in range(len(tokens_final) - context_size):
    context = tokens_final[i : i + context_size]
    target = tokens_final[i + context_size]
    X.append([word_to_idx[w] for w in context])
    y.append(word_to_idx[target])

X = np.array(X)
y = np.array(y)

print(f"Context size: {context_size}")
print(f"Total samples: {len(X)}\n")

#verification
for i in range(45,60):
    context_words = [idx_to_word[idx] for idx in X[i]]
    target_word = idx_to_word[y[i]]
    print(f"{' '.join(context_words)} ---> {target_word}")


Context size: 5
Total samples: 112090

that he felt any emotion ---> akin
he felt any emotion akin ---> to
felt any emotion akin to ---> love
any emotion akin to love ---> for
emotion akin to love for ---> irene
akin to love for irene ---> adler
to love for irene adler ---> .
love for irene adler . ---> all
for irene adler . all ---> emotions
irene adler . all emotions ---> and
adler . all emotions and ---> that
. all emotions and that ---> one
all emotions and that one ---> particularly
emotions and that one particularly ---> were
and that one particularly were ---> abhorrent


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import re, requests


In [9]:
#ensuring that the GPU is used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


In [10]:
#train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)
#converting to torch tensors
X_train = torch.tensor(X_train, dtype=torch.long).to(device)
y_train = torch.tensor(y_train, dtype=torch.long).to(device)
X_val = torch.tensor(X_val, dtype=torch.long).to(device)
y_val = torch.tensor(y_val, dtype=torch.long).to(device)


In [11]:
#defining the model
vocab_size=7885
class MLPTextGenerator(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, context_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim * context_size, hidden_dim)
        self.act1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)           #(batch, context_size, embed_dim)
        x = x.view(x.size(0), -1)       #flatten
        x = self.act1(self.fc1(x))
        x = self.fc2(x)
        return self.log_softmax(x)

In [12]:
#starting training
embed_dim = 64
hidden_dim = 1024
epochs = 120

model = MLPTextGenerator(vocab_size, embed_dim, hidden_dim, context_size).to(device)
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [14]:
class MLPTextGenerator(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, context_size, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim * context_size, hidden_dim)
        self.act1 = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, vocab_size)
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = self.embedding(x)
        x = x.view(x.size(0), -1)
        x = self.act1(self.fc1(x))
        x = self.drop(x)
        x = self.fc2(x)
        return self.log_softmax(x)


In [17]:
with open("vocab.pkl", "wb") as f:
    pickle.dump({
        "vocab": vocab,
        "word_to_idx": word_to_idx,
        "idx_to_word": idx_to_word,
        "context_size": context_size
    }, f)

In [19]:
import torch
import torch.nn as nn
import pickle

# Load vocab
with open("vocab.pkl", "rb") as f:
    data = pickle.load(f)
vocab = data["vocab"]
vocab_size = len(vocab)
context_size = data["context_size"]

# Define training variants
variants = [
    {"name": "small", "embed_dim": 32, "hidden_dim": 512, "epochs": 20},
    {"name": "medium", "embed_dim": 64, "hidden_dim": 1024, "epochs": 40},
    {"name": "large", "embed_dim": 128, "hidden_dim": 2048, "epochs": 60},
]


In [None]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 512
train_dataset = TensorDataset(X_train, y_train)
val_dataset = TensorDataset(X_val, y_val)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

for var in variants:
    print(f"\nTraining {var['name']} model...")

    model = MLPTextGenerator(vocab_size, var["embed_dim"], var["hidden_dim"], context_size).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.7, patience=3)
    criterion = nn.CrossEntropyLoss()

    epochs = var["epochs"]

    for epoch in range(epochs):
        model.train()
        train_loss, correct, total = 0.0, 0, 0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            output = model(xb)
            loss = criterion(output, yb)
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * xb.size(0)
            preds = output.argmax(dim=1)
            correct += (preds == yb).sum().item()
            total += yb.size(0)

        train_acc = correct / total
        train_loss /= total

        # --- Validation ---
        model.eval()
        val_loss, correct, total = 0.0, 0, 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                output = model(xb)
                loss = criterion(output, yb)
                val_loss += loss.item() * xb.size(0)
                preds = output.argmax(dim=1)
                correct += (preds == yb).sum().item()
                total += yb.size(0)

        val_acc = correct / total
        val_loss /= total
        scheduler.step(val_loss)

        if (epoch + 1) % 10 == 0 or epoch == 0:
            print(f"Epoch {epoch+1}/{epochs} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.3f}")

    save_path = f"holmes_{var['name']}.pt"
    torch.save(model.state_dict(), save_path)
    print(f" Saved {var['name']} model to {save_path}")



Training small model...
Epoch 1/20 | Train Loss: 6.3125 | Val Loss: 5.9317 | Val Acc: 0.100
Epoch 10/20 | Train Loss: 2.9009 | Val Loss: 6.7906 | Val Acc: 0.138
Epoch 20/20 | Train Loss: 2.4927 | Val Loss: 7.3668 | Val Acc: 0.142
 Saved small model to holmes_small.pt

Training medium model...
Epoch 1/40 | Train Loss: 6.1307 | Val Loss: 5.7630 | Val Acc: 0.118
Epoch 10/40 | Train Loss: 1.9245 | Val Loss: 7.2282 | Val Acc: 0.142
Epoch 20/40 | Train Loss: 1.3984 | Val Loss: 7.8972 | Val Acc: 0.141
Epoch 30/40 | Train Loss: 1.2417 | Val Loss: 8.1773 | Val Acc: 0.141
Epoch 40/40 | Train Loss: 1.1789 | Val Loss: 8.3149 | Val Acc: 0.141
 Saved medium model to holmes_medium.pt

Training large model...
Epoch 1/60 | Train Loss: 6.0037 | Val Loss: 5.6392 | Val Acc: 0.136
Epoch 10/60 | Train Loss: 0.6339 | Val Loss: 8.0381 | Val Acc: 0.134
Epoch 20/60 | Train Loss: 0.2091 | Val Loss: 8.8432 | Val Acc: 0.137
Epoch 30/60 | Train Loss: 0.1386 | Val Loss: 9.1154 | Val Acc: 0.136
Epoch 40/60 | Train L

In [None]:
import pickle

data = {
    "word_to_idx": word_to_idx,
    "idx_to_word": idx_to_word,
    "vocab": vocab,
    "context_size": context_size
}
with open("vocab.pkl", "wb") as f:
    pickle.dump(data, f)

print("Saved vocab.pkl")


Saved vocab.pkl


In [None]:
from google.colab import files
files.download("holmes_small.pt")
files.download("holmes_medium.pt")
files.download("holmes_large.pt")
files.download("vocab.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>