In [3]:
import nltk
# Download the NLTK 'gutenberg' data package; the notebook later reads
# 'carroll-alice.txt' through nltk.corpus.gutenberg.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [5]:
import nltk
# Download the NLTK 'punkt_tab' tokenizer data.
# NOTE(review): presumably required by sent_tokenize/word_tokenize used in the
# preprocessing cell — confirm against the installed NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize, word_tokenize
import re

# 1. Load & Preprocess Data
print("Loading and preprocessing text...")

alice = gutenberg.raw('carroll-alice.txt')

# For each sentence: lowercase it, drop every character that is neither an
# ASCII letter nor whitespace, then split it into word tokens. Sentences that
# end up with no tokens are skipped.
sentences = []
for sentence in sent_tokenize(alice):
    words = word_tokenize(re.sub(r'[^a-zA-Z\s]', '', sentence.lower()))
    if not words:
        continue
    sentences.append(words)

# Flatten into token list (sentence order and word order are preserved)
tokens = []
for sent in sentences:
    tokens.extend(sent)

# 2. Build Vocabulary
# Sort the unique tokens so every word receives the same index on every run.
# Iterating a bare set of strings can yield a different order in each
# interpreter session (string hashing is randomised), which would make saved
# weights and embeddings impossible to map back to words.
vocab = sorted(set(tokens))
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = dict(enumerate(vocab))
vocab_size = len(vocab)

print(f"Total tokens: {len(tokens)}")
print(f"Vocabulary size: {vocab_size}")

# 3. Create Training Data (CBOW)
def make_context_target(tokens, window_size=2):
    """Build CBOW (context, target) training pairs from a flat token sequence.

    For every position that has ``window_size`` tokens on both sides, the
    context is the ``window_size`` tokens before it followed by the
    ``window_size`` tokens after it, and the target is the token itself.
    Positions too close to either end of ``tokens`` are skipped, so every
    context has exactly ``2 * window_size`` entries.

    Args:
        tokens: sliceable sequence of tokens (e.g. a list of strings).
        window_size: number of tokens taken on each side of the target.

    Returns:
        A list of ``(context, target)`` tuples; empty when ``tokens`` is
        shorter than ``2 * window_size + 1``.

    Raises:
        ValueError: if ``window_size`` is smaller than 1. A zero window would
            produce empty contexts and a negative one would slice nonsense.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    return [
        (tokens[i - window_size:i] + tokens[i + 1:i + window_size + 1], tokens[i])
        for i in range(window_size, len(tokens) - window_size)
    ]

# Number of words taken on EACH side of the target word, so every training
# example carries 2 * window_size context words.
window_size = 2
# List of (context_words, target_word) pairs built from the flat token list.
data = make_context_target(tokens, window_size=window_size)

# 4. CBOW Model
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    Looks up an embedding for each context word, averages them, and projects
    the average to one score (logit) per vocabulary word.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output logits.
        embedding_dim: width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context_idxs):
        """Score every vocabulary word as the target for the given context(s).

        Args:
            context_idxs: integer tensor of word indices, either a single
                context of shape ``(context_len,)`` or a batch of contexts of
                shape ``(batch, context_len)``.

        Returns:
            Logits of shape ``(1, vocab_size)`` for a single context, or
            ``(batch, vocab_size)`` for a batch.
        """
        embeds = self.embeddings(context_idxs)
        # Average over the context-word axis (second to last), so the same
        # code handles a single context and a batch of contexts.
        mean_embeds = embeds.mean(dim=-2)
        if mean_embeds.dim() == 1:
            # Single context: add the batch axis the loss function expects.
            mean_embeds = mean_embeds.unsqueeze(0)
        return self.linear(mean_embeds)
#https://docs.pytorch.org/docs/stable/generated/torch.nn.Embedding.html
#https://docs.pytorch.org/docs/stable/generated/torch.nn.Linear.html

# 5. Training Setup
embedding_dim = 100
seed = 42
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's RNG before the model is constructed so the randomly
# initialised embedding and linear weights are the same on every run.
torch.manual_seed(seed)

model = CBOW(vocab_size, embedding_dim).to(device)
# CrossEntropyLoss is applied directly to the raw scores returned by the model.
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)
#https://docs.pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
#https://docs.pytorch.org/docs/stable/generated/torch.optim.SGD.html

print("Training on:", device)

# 6. Training Loop
epochs = 100

# The word -> index lookups and the host -> device copies do not change
# between epochs, so do them once here instead of once per sample per epoch.
# Every context produced by make_context_target has the same length, which is
# what lets them be stacked into one 2-D tensor.
context_batch = torch.tensor(
    [[word_to_ix[w] for w in context] for context, _ in data],
    dtype=torch.long,
    device=device,
)
target_batch = torch.tensor(
    [[word_to_ix[target]] for _, target in data],
    dtype=torch.long,
    device=device,
)

for epoch in range(epochs):
    # Sum (not mean) of the per-example losses over the whole epoch.
    total_loss = 0
    # Rows are visited in the original data order, one example per SGD step:
    # context_idxs has shape (context_len,), target_idx has shape (1,).
    for context_idxs, target_idx in zip(context_batch, target_batch):
        scores = model(context_idxs)
        loss = loss_fn(scores, target_idx)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")

# 7. Extract Word Vectors
# detach() is the supported way to get the weights without autograd tracking
# (unlike .data, it keeps autograd's in-place-modification checks). clone()
# makes this a snapshot: when the model already lives on the CPU, .cpu() alone
# would return a tensor sharing memory with the live parameter.
word_vectors = model.embeddings.weight.detach().cpu().clone()

print("\nVector for 'alice':")
print(word_vectors[word_to_ix["alice"]][:10])  # show first 10 dims

Loading and preprocessing text...
Total tokens: 26383
Vocabulary size: 2751
Training on: cuda
Epoch 1/100, Loss: 175599.8108
Epoch 2/100, Loss: 151633.2676
Epoch 3/100, Loss: 142667.9521
Epoch 4/100, Loss: 136857.3290
Epoch 5/100, Loss: 132352.6871
Epoch 6/100, Loss: 128564.9292
Epoch 7/100, Loss: 125237.9990
Epoch 8/100, Loss: 122236.7932
Epoch 9/100, Loss: 119480.5221
Epoch 10/100, Loss: 116917.1232
Epoch 11/100, Loss: 114511.2352
Epoch 12/100, Loss: 112237.6852
Epoch 13/100, Loss: 110077.7939
Epoch 14/100, Loss: 108017.3120
Epoch 15/100, Loss: 106045.1349
Epoch 16/100, Loss: 104152.4164
Epoch 17/100, Loss: 102331.9488
Epoch 18/100, Loss: 100577.7251
Epoch 19/100, Loss: 98884.6753
Epoch 20/100, Loss: 97248.4737
Epoch 21/100, Loss: 95665.4422
Epoch 22/100, Loss: 94132.4125
Epoch 23/100, Loss: 92646.6728
Epoch 24/100, Loss: 91205.8836
Epoch 25/100, Loss: 89808.0191
Epoch 26/100, Loss: 88451.3510
Epoch 27/100, Loss: 87134.3775
Epoch 28/100, Loss: 85855.8143
Epoch 29/100, Loss: 84614.536

In [12]:
import torch.nn.functional as F

def word_similarity(w1, w2):
    """Return the cosine similarity between the embeddings of two words.

    Both words are looked up in ``word_to_ix`` (w1 first, then w2); a word
    missing from that mapping raises ``KeyError``. The result is a Python
    float.
    """
    # Give each vector a leading batch axis of size 1 so cosine_similarity
    # compares them along its default dimension.
    first, second = (word_vectors[word_to_ix[w]].unsqueeze(0) for w in (w1, w2))
    return F.cosine_similarity(first, second).item()

# Report similarity for each pair, printing "N/A" when EITHER word is missing
# from the vocabulary (word_similarity raises KeyError for unknown words).
for w1, w2 in [("alice", "wonderland"), ("king", "queen")]:
    both_known = w1 in word_to_ix and w2 in word_to_ix
    print(f"Similarity({w1}, {w2}):", word_similarity(w1, w2) if both_known else "N/A")


Similarity(alice, wonderland): 0.008554024621844292
Similarity(king, queen): 0.20389409363269806


In [13]:
import json

model_save_path = "cbow_model.pth"
vocab_save_path = "cbow_vocab.json"

# The weights file keeps its original format (a bare state_dict).
torch.save(model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

# Row i of the embedding matrix belongs to the word with index i, so the
# weights are only interpretable together with the word -> index mapping.
# Store that mapping next to the weights.
with open(vocab_save_path, "w", encoding="utf-8") as vocab_file:
    json.dump(word_to_ix, vocab_file)
print(f"Vocabulary saved to {vocab_save_path}")

Model saved to cbow_model.pth


In [16]:
# Walk the model's parameters once: print each tensor's name, shape and
# trainable flag, and keep a running count of scalar weights.
total_params = 0
for name, param in model.named_parameters():
    print(name, param.shape, param.requires_grad)
    total_params += param.numel()

print("Total parameters:", total_params)

embeddings.weight torch.Size([2751, 100]) True
linear.weight torch.Size([2751, 100]) True
linear.bias torch.Size([2751]) True
Total parameters: 552951
