Implementing Continous Bag of Worlds Model

In [47]:
import torch
import numpy as np
import random
import torch.nn as nn
import torch.nn.functional as F


In [3]:
import nltk
import re

In [4]:
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/kaushikdwivedi/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [5]:
from nltk.corpus import brown

In [8]:
words = list(brown.words())

In [9]:
len(words)

1161192

In [10]:
words = words[:100000]

In [11]:
len(words)

100000

In [14]:
def tokenize(some_words):
    tokenized_words = []
    
    for word in some_words:
        word = word.lower()
        word = re.sub(r"'","", word)
        
        clean_parts = re.findall(r"\w+", word)
        tokenized_words.extend(clean_parts)
        
    return tokenized_words

In [15]:
words = tokenize(words)

In [16]:
len(words)

89563

In [18]:
"'" in words

False

In [31]:
vocab={}

In [32]:
i=0
while i<len(words):
    vocab[i] = words[i]
    i = i+1

In [33]:
vocab_size = len(vocab)
emb_dim = 10

In [48]:
#vocab

In [36]:
index_to_word = vocab

In [37]:
word_to_index = {word: idx for idx, word in vocab.items()}

In [43]:
#building the tokenized corpus
tokenized_corpus = []
for i in index_to_word:
    tokenized_corpus.append(i)

In [132]:
class CBOW(nn.Module):
    
    def __init__(self, vocab_size, emb_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, emb_dim)
        self.linear = nn.Linear(emb_dim, vocab_size)
        
    def forward(self, context_batch):
        context_embeds = self.embeddings(context_batch)
        v_ctx = context_embeds.mean(dim=1)
        logits  = self.linear(v_ctx)
        return logits
    

In [133]:
def generate_cbow_batches(tokenized_corpus, window_size=2, batch_size=128):
    context_batch = []
    target_batch = []
    
    for i in range(window_size, len(tokenized_corpus) - window_size):
        context = tokenized_corpus[i - window_size:i] + tokenized_corpus[i + 1:i + window_size + 1]
        target = tokenized_corpus[i]

        context_batch.append(context)
        target_batch.append(target)

        if len(context_batch) == batch_size:
            yield torch.tensor(context_batch, dtype=torch.long), torch.tensor(target_batch, dtype=torch.long)
            context_batch, target_batch = [], []

    if context_batch:
        yield torch.tensor(context_batch, dtype=torch.long), torch.tensor(target_batch, dtype=torch.long)


In [134]:
emb_dim = 50
num_epochs =1
batch_size = 128
lr = 0.005

In [135]:
model = CBOW(vocab_size, emb_dim)

In [141]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

for epoch in range(num_epochs):
    total_loss = 0
    step_count = 0  # 🧮 Track number of steps

    print(f"\n🔁 Epoch {epoch+1}/{num_epochs}")

    for step, (context_batch, target_batch) in enumerate(generate_cbow_batches(tokenized_corpus, window_size=2, batch_size=128)):
        logits = model(context_batch)
        loss = F.cross_entropy(logits, target_batch)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        step_count += 1

        # 🖨️ Print every 100 steps
        if step % 100 == 0:
            print(f"Step {step} || Batch Loss: {loss.item():.4f}")

    avg_loss = total_loss / step_count
    print(f"✅ Epoch {epoch+1} Finished || Avg Loss: {avg_loss:.4f}")



🔁 Epoch 1/1
Step 0 || Batch Loss: 9.3239
Step 100 || Batch Loss: 10.4122
Step 200 || Batch Loss: 11.5879
Step 300 || Batch Loss: 12.0860
Step 400 || Batch Loss: 12.5950
Step 500 || Batch Loss: 13.1339
Step 600 || Batch Loss: 13.6471
✅ Epoch 1 Finished || Avg Loss: 12.1607


In [139]:
def get_neighbors(word, word_to_ix, ix_to_word, model, top_k=5):
    with torch.no_grad():
        word_idx = word_to_ix[word]
        word_vec = model.embeddings(torch.tensor([word_idx])).squeeze()
        all_vecs = model.embeddings.weight
        sims = torch.matmul(all_vecs, word_vec)
        topk = torch.topk(sims, top_k + 1)  # +1 to skip the word itself

        for idx in topk.indices[1:]:
            print(ix_to_word[idx.item()])


In [140]:
get_neighbors("india", word_to_index, index_to_word, model)


and
and
was
to
said
