# CBOW model

Input data 

In [24]:
import numpy as np
from collections import defaultdict
import pandas as pd

corpus = [
    "Movie was boring",
    "Movie actions were very good",
    "Movie was good",
    "Movie story was very bad"
]

# Preprocessing: 
##  tokenize the sentences
def tokenize_corpus(corpus):
    tokens = [sentence.lower().split() for sentence in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)

pd.DataFrame(tokenized_corpus)

Unnamed: 0,0,1,2,3,4
0,movie,was,boring,,
1,movie,actions,were,very,good
2,movie,was,good,,
3,movie,story,was,very,bad


Unique set of words in the vocabulary

In [4]:
# Build vocabulary and mappings
vocab = set()
for sentence in tokenized_corpus:
    vocab.update(sentence)

vocab

{'actions', 'bad', 'boring', 'good', 'movie', 'story', 'very', 'was', 'were'}

Assign a unique index to each word

In [26]:
word_to_index = {word: i for i, word in enumerate(vocab)}
index_to_word = {i: word for word, i in word_to_index.items()}
vocab_size = len(word_to_index)
word_to_index, index_to_word

pd.DataFrame([word_to_index])

Unnamed: 0,was,good,movie,actions,bad,very,story,boring,were
0,0,1,2,3,4,5,6,7,8


 Define context and target words for training

In [7]:
def generate_training_data(tokenized_corpus, window_size=1):
    training_data = []
    for sentence in tokenized_corpus:
        for i, word in enumerate(sentence):
            context = []
            for j in range(-window_size, window_size + 1):
                if j != 0 and 0 <= i + j < len(sentence):
                    context.append(sentence[i + j])
            target = word
            training_data.append((context, target))
    return training_data

training_data = generate_training_data(tokenized_corpus)

training_data

[(['was'], 'movie'),
 (['movie', 'boring'], 'was'),
 (['was'], 'boring'),
 (['actions'], 'movie'),
 (['movie', 'were'], 'actions'),
 (['actions', 'very'], 'were'),
 (['were', 'good'], 'very'),
 (['very'], 'good'),
 (['was'], 'movie'),
 (['movie', 'good'], 'was'),
 (['was'], 'good'),
 (['story'], 'movie'),
 (['movie', 'was'], 'story'),
 (['story', 'very'], 'was'),
 (['was', 'bad'], 'very'),
 (['very'], 'bad')]

Generating training matrix

In [29]:
# Hyperparameters
embedding_dim = 10
learning_rate = 0.001
epochs = 10000

# Weight initialization
W1 = np.random.uniform(-1, 1, (vocab_size, embedding_dim))  # Input to hidden weights
W2 = np.random.uniform(-1, 1, (embedding_dim, vocab_size))  # Hidden to output weights

pd.DataFrame(W1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.707264,0.871984,-0.866987,-0.621605,0.685625,0.671544,0.949171,-0.701265,0.163057,-0.856476
1,-0.175688,-0.912535,0.945804,-0.628869,0.434738,-0.392405,0.591259,-0.131291,-0.598987,0.923839
2,-0.950587,-0.877492,-0.004931,-0.498021,-0.51324,0.656964,-0.496648,0.607605,-0.355421,0.385362
3,-0.470238,0.821604,0.184242,-0.99373,0.590488,0.328958,0.760862,-0.281472,0.113409,0.278888
4,0.203614,-0.303613,-0.571385,-0.420684,-0.018658,0.416891,0.176079,0.936392,-0.979797,0.48866
5,-0.726533,0.775927,-0.178288,-0.383613,0.983372,-0.769543,0.597816,-0.966627,0.135263,0.852387
6,0.354482,-0.252654,0.376925,0.279722,-0.33979,-0.24594,-0.863511,-0.662389,0.741874,-0.237106
7,0.919164,0.239099,0.706737,0.787357,0.285884,0.443212,-0.468733,0.033076,-0.229903,-0.402908
8,-0.573819,-0.735416,-0.980552,-0.317224,0.56235,-0.193679,0.827143,-0.457183,-0.373474,-0.527231


Training and generating word embeddings

In [30]:
# One-hot encoding function
def one_hot_vector(word, word_to_index):
    one_hot = np.zeros(vocab_size)
    one_hot[word_to_index[word]] = 1
    return one_hot

# Training loop
for epoch in range(epochs):
    loss = 0
    for context, target in training_data:
        # Forward pass
        context_vectors = np.sum([one_hot_vector(word, word_to_index) for word in context], axis=0)
        h = np.dot(context_vectors, W1)  # Hidden layer
        u = np.dot(h, W2)  # Output layer
        y_pred = np.exp(u) / np.sum(np.exp(u)) # Softmax activation
        
        # Calculate loss (cross-entropy)
        target_one_hot = one_hot_vector(target, word_to_index)
        loss += -np.sum(target_one_hot * np.log(y_pred + 1e-8))

        # Backpropagation
        e = y_pred - target_one_hot
        dW2 = np.outer(h, e)
        dW1 = np.outer(context_vectors, np.dot(W2, e))

        # Update weights
        W1 -= learning_rate * dW1
        W2 -= learning_rate * dW2

    # Print loss every 1000 epochs
    if (epoch + 1) % 1000 == 0:
        print(f'Epoch {epoch + 1}, Loss: {loss:.4f}')

Epoch 1000, Loss: 9.5670
Epoch 2000, Loss: 7.0455
Epoch 3000, Loss: 6.3769
Epoch 4000, Loss: 6.1014
Epoch 5000, Loss: 5.9584
Epoch 6000, Loss: 5.8731
Epoch 7000, Loss: 5.8174
Epoch 8000, Loss: 5.7785
Epoch 9000, Loss: 5.7501
Epoch 10000, Loss: 5.7285


Display word embeddings

In [22]:
embed_dict = {}
for word, idx in word_to_index.items():
    embed_dict[word] = W1[idx]
    # print(f'Word: {word}, Embedding: {W1[idx]}')

import pandas as pd
pd.DataFrame(embed_dict)

Unnamed: 0,was,good,movie,actions,bad,very,story,boring,were
0,1.226012,-0.685008,-1.773884,-0.415146,0.301805,-0.378394,-1.6525,-1.027951,0.354144
1,0.988148,-0.441821,0.659642,-1.628279,-0.665754,1.481087,-0.004331,0.67566,-1.43428
2,-0.117835,0.515531,-0.136495,0.252435,-0.82143,-0.925796,0.63345,0.505873,-0.891902
3,0.969744,0.585738,0.767863,0.717832,-1.287798,-0.862781,-0.220741,0.720866,-0.094789
4,-1.372613,0.256785,1.127048,-0.139056,0.986007,-0.086398,0.734862,-0.355017,-0.495644
5,0.791518,0.146677,0.767615,1.383622,0.556307,-2.177009,1.599112,-0.932217,0.24002
6,1.06487,1.476541,-1.915478,0.530619,-0.003385,0.628305,0.805629,0.916441,-0.110245
7,0.716644,-1.571887,-0.699909,1.392141,-2.098957,-0.418843,0.329034,-0.657582,-1.036781
8,-0.309886,-0.35503,0.859751,-1.784142,-0.423709,0.120963,-1.60876,-0.427513,-0.55689
9,-1.01133,-0.704604,1.082053,1.043992,0.085216,1.130337,-0.908654,-0.124562,0.375912


Calculating the Euclidian distances b/w the word embeding to see the similarity b/w them

In [23]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word_to_index[target] 
    scores = Counter() 
    for word,index in word_to_index.items(): 
        raw_difference = W1[index] - (W1[target_index]) 
        squared_difference = raw_difference * raw_difference 
        scores[word] = -math.sqrt(sum(squared_difference)) 

    return scores.most_common(10)

similar('boring')

[('boring', -0.0),
 ('good', -2.1010490666222092),
 ('very', -3.043933532973709),
 ('were', -3.4409453522364952),
 ('was', -3.513749970353083),
 ('story', -3.5137792092515623),
 ('bad', -4.042996161186965),
 ('movie', -4.146321741354841),
 ('actions', -4.325949285795467)]