# CBOW (Continuous Bag-of-Words)

As part of **#30DaysOfBasics**, I am going to implement CBOW using PyTorch.

CBOW: given the context words, the model predicts the target word. This is distinct from a language model (LM), since CBOW is not a sequential model and does not have to be probabilistic. Typically, CBOW is used to quickly train word embeddings, and these embeddings are used to initialize the embeddings of some more complicated model. Usually, this is referred to as pretraining embeddings.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
# Path to the Hindi sample text file; passed to create_vocab below to build the corpus.
HI_FILE_PATH = './data/data/hi/hi_sample.txt'

In [3]:
# Context size handed to the CBOW constructor.
# NOTE(review): presumably the number of context words on each side of the
# target word -- confirm it matches the window used to generate the data.
CONTEXT_SIZE = 2

# Dimensionality of each word embedding vector (passed to CBOW as embed_dim).
EMBD_DIM = 64

In [None]:
# Execute the n-gram LM notebook so that the helpers it defines (here,
# create_vocab) become available in this notebook's namespace.

%run 'n-gram LM.ipynb'
# NOTE(review): judging by the names, create_vocab returns the vocabulary, the
# word->index and index->word tables, and the token list -- its implementation
# lives in the other notebook, so confirm there.
hi_vocab, hi_word_to_idx, hi_idx_to_word, hi_content = create_vocab(HI_FILE_PATH)

In [5]:
# Sanity check: show the first ten entries of the loaded corpus.
print(hi_content[:10])

['अस्पताल', 'में', 'डिलीवरी', 'के', 'लिए', 'लेबर', 'रूम', 'बना', 'है,', 'लेकिन']


In [6]:
# Lookup tables between vocabulary words and their integer ids.
# The id of a word is its position when iterating over hi_vocab.
hi_ix_to_word = dict(enumerate(hi_vocab))
hi_word_to_ix = {token: position for position, token in hi_ix_to_word.items()}

In [7]:
def generate_cbow_data(list_of_tokens, window=2):
    """Build (context, target) training pairs for CBOW.

    Every position that has `window` tokens available on both sides becomes a
    target. Its context lists the `window` preceding tokens, nearest first,
    followed by the `window` following tokens, nearest first.

    Args:
        list_of_tokens: Sequence of tokens making up the corpus.
        window: Number of context tokens taken from each side of the target.

    Returns:
        A list of `(context_tokens, target_token)` tuples, where
        `context_tokens` is a list of `2 * window` tokens. The list is empty
        when the corpus is too short to hold a full window.
    """
    offsets = range(1, window + 1)
    pairs = []
    for center in range(window, len(list_of_tokens) - window):
        before = [list_of_tokens[center - step] for step in offsets]
        after = [list_of_tokens[center + step] for step in offsets]
        pairs.append((before + after, list_of_tokens[center]))
    return pairs

In [8]:
# Build the (context, target) pairs. The window is taken from CONTEXT_SIZE so
# the data and the model configuration cannot silently drift apart.
cbow_data = generate_cbow_data(hi_content, window=CONTEXT_SIZE)

In [9]:
class CBOW(nn.Module):
    """Continuous Bag-of-Words model.

    The embeddings of the context words are summed into a single vector,
    passed through one hidden ReLU layer and projected to log-probabilities
    over the vocabulary.

    Args:
        vocab_size: Number of rows in the embedding table and size of the
            output layer.
        embed_dim: Dimensionality of each word embedding.
        context_size: Accepted for interface compatibility but not used:
            because the context embeddings are summed, the model works for
            any number of context words.
    """

    def __init__(self, vocab_size, embed_dim, context_size):
        super(CBOW, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim, 128)
        self.fc2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for the given context.

        Args:
            inputs: Long tensor of word indices, either of shape `(context,)`
                for a single example or `(batch, context)` for a batch.

        Returns:
            Tensor of shape `(1, vocab_size)` for a single example, or
            `(batch, vocab_size)` for a batch, holding log-probabilities.
        """
        # Sum over the context axis with a tensor op (not the Python builtin),
        # so empty contexts and batched inputs are handled correctly.
        embeds = self.embeddings(inputs).sum(dim=-2)
        # Always present a 2-D (batch, embed_dim) input to the linear layers;
        # a single example becomes a batch of one.
        embeds = embeds.view(-1, self.embeddings.embedding_dim)

        hidden = F.relu(self.fc1(embeds))
        logits = self.fc2(hidden)

        return F.log_softmax(logits, dim=1)

In [10]:
# Instantiate the model with one output class per vocabulary word.
network = CBOW(
    vocab_size=len(hi_vocab),
    embed_dim=EMBD_DIM,
    context_size=CONTEXT_SIZE,
)

In [11]:
# Negative log-likelihood loss; it expects log-probabilities as input, which is
# what CBOW.forward returns (it ends with log_softmax).
loss_fn = nn.NLLLoss()
# Plain stochastic gradient descent over all model parameters.
optimizer = torch.optim.SGD(network.parameters(), lr=0.001)

In [None]:
# Train the model, performing one SGD update per (context, target) pair.

EPOCHS = 20

for epoch in range(EPOCHS):

    # Sum of the per-sample losses seen during this epoch.
    total_loss = 0

    for context, target in cbow_data:

        context_tensor = torch.tensor([hi_word_to_ix[word] for word in context], dtype=torch.long)
        target_tensor = torch.tensor([hi_word_to_ix[target]], dtype=torch.long)

        # Clear gradients left over from the previous sample.
        network.zero_grad()

        words_probs = network(context_tensor)

        # The log-probabilities have shape (1, vocab_size), so the target is a
        # one-element tensor holding the index of the true word.
        loss = loss_fn(words_probs, target_tensor)

        loss.backward()

        optimizer.step()

        # Accumulate inside the inner loop so the reported value covers every
        # sample of the epoch, not just the last one.
        total_loss += loss.item()

    print('Epoch {}/{}, loss: {}'.format(epoch, EPOCHS, total_loss))