Implementing the Continuous Bag of Words (CBOW) Model

In [47]:
import torch
import numpy as np
import random
import torch.nn as nn
import torch.nn.functional as F


In [3]:
import nltk
import re

In [4]:
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     /Users/kaushikdwivedi/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [5]:
from nltk.corpus import brown

In [8]:
# Materialize the Brown corpus tokens as a plain Python list.
words = list(brown.words())

In [9]:
len(words)

1161192

In [10]:
# Keep only the first 100,000 tokens. This rebinds `words`, so re-running the
# cell is harmless but the full list is gone until the load cell is re-run.
words = words[:100000]

In [11]:
len(words)

100000

In [14]:
def tokenize(some_words):
    """Normalize raw tokens into lowercase alphanumeric word pieces.

    Each input token is lowercased and stripped of apostrophes, then split
    into its maximal runs of ``\\w`` characters. Tokens with no such run
    (pure punctuation) contribute nothing; tokens with internal punctuation
    contribute several pieces.

    Args:
        some_words: iterable of strings.

    Returns:
        A flat list of the resulting pieces, in input order.
    """
    return [
        piece
        for raw in some_words
        for piece in re.findall(r"\w+", re.sub(r"'", "", raw.lower()))
    ]

In [15]:
words = tokenize(words)

In [16]:
len(words)

89563

In [18]:
"'" in words

False

In [31]:
vocab={}

In [32]:
# Fill `vocab` with one entry per corpus position: position -> token at that
# position. NOTE(review): keys are positions, not ids of distinct words, so
# repeated words get several entries.
vocab.update(enumerate(words))
i = len(words)  # final value the manual counter would have reached

In [33]:
# Number of rows in the embedding table and of output logits. `vocab` holds one
# entry per token position, so this equals len(words), not the number of
# distinct words.
vocab_size = len(vocab)
# Dimensionality of each embedding vector.
emb_dim = 10

In [48]:
#vocab

In [36]:
# Alias for `vocab` (same dict object, not a copy): index -> word.
index_to_word = vocab

In [37]:
# Invert index -> word into word -> index. A word stored under several indices
# keeps only the last one visited, because later items overwrite earlier ones.
word_to_index = {word: idx for idx, word in vocab.items()}

In [43]:
# Build the tokenized corpus: one vocabulary id per token, in corpus order.
# Iterating `index_to_word` directly only yields its keys (0, 1, 2, ...), which
# would make the "corpus" a list of positions. Mapping every token through
# `word_to_index` gives each occurrence of the same word the same id.
tokenized_corpus = [word_to_index[word] for word in words]

In [80]:
emb_dim =10

In [81]:
class CBOW(nn.Module):
    """Continuous bag-of-words model with a fixed window of two tokens per side.

    The model averages the embeddings of the four tokens surrounding a corpus
    position and projects that average onto one logit per vocabulary entry.

    Args:
        vocab_size: number of embedding rows and of output logits.
        emb_dim: size of each embedding vector.
        tokenized_corpus: indexable sequence of integer token ids; every id
            must be a valid row index of the embedding table.
    """

    def __init__(self, vocab_size, emb_dim, tokenized_corpus):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, emb_dim)
        self.linear = nn.Linear(emb_dim, vocab_size)
        self.tokenized_corpus = tokenized_corpus

    def forward(self, center_idx):
        """Return logits for the token at corpus position ``center_idx``.

        Args:
            center_idx: position in ``tokenized_corpus`` (not a token id).
                Must have two neighbours on each side.

        Returns:
            Tensor of shape ``(1, vocab_size)`` with unnormalized scores.

        Raises:
            IndexError: if ``center_idx`` is closer than two positions to
                either end of the corpus.
        """
        corpus = self.tokenized_corpus
        # Explicit check: a negative list index would otherwise wrap around
        # silently and pull context from the far end of the corpus.
        if not 2 <= center_idx < len(corpus) - 2:
            raise IndexError(
                f"center_idx {center_idx} needs two neighbours on each side "
                f"of a corpus of length {len(corpus)}"
            )
        # Look up the token ids stored at the neighbouring positions; the
        # embedding table is indexed by token id, not by corpus position.
        context_ids = [corpus[center_idx + offset] for offset in (-2, -1, 1, 2)]
        context_indices = torch.tensor(context_ids, dtype=torch.long)
        context_emb = self.embeddings(context_indices)
        # Average the four context vectors; keepdim leaves a batch axis of 1.
        v_ctx = torch.mean(context_emb, dim=0, keepdim=True)
        logits = self.linear(v_ctx)
        return logits
            
            
        

In [82]:
# Instantiate the CBOW network; it keeps a reference to `tokenized_corpus`.
model = CBOW(vocab_size, emb_dim, tokenized_corpus)

In [86]:
# SGD over all model parameters with a fixed learning rate of 0.1.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

In [89]:
num_epochs = 1

In [90]:
# Train one example at a time over every corpus position that has two
# neighbours on each side; the target is the entry stored at that position.
for epoch in range(num_epochs):
    total_loss = 0
    print(f"Epoch {epoch+1}/{num_epochs}")

    for i, center_idx in enumerate(tokenized_corpus[2:-2], start=2):
        logits = model(i)
        target = torch.tensor([center_idx], dtype=torch.long)
        loss = F.cross_entropy(logits, target)

        # Single-example update; gradients are cleared right after the step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()

        # Report the current example's loss every 10,000 positions.
        if not i % 10000:
            print(f"step {i} || loss: {loss.item()}")

    print(f"Epoch {epoch+1} || Finished || Total Loss: {total_loss:.4f}")

Epoch 1/1
step 10000 || loss: 10.793659210205078
step 20000 || loss: 11.876947402954102
step 30000 || loss: 12.252073287963867
step 40000 || loss: 11.405335426330566


KeyboardInterrupt: 