# CBOW model

Input data 

In [2]:
import numpy as np
from collections import defaultdict
import pandas as pd

corpus = [
    "Movie was boring",
    "Movie actions were very good",
    "Movie was good",
    "Movie story was very bad"
]

# Preprocessing: 
##  tokenize the sentences
def tokenize_corpus(corpus):
    tokens = [sentence.lower().split() for sentence in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)

pd.DataFrame(tokenized_corpus)

Unnamed: 0,0,1,2,3,4
0,movie,was,boring,,
1,movie,actions,were,very,good
2,movie,was,good,,
3,movie,story,was,very,bad


Unique set of words in the vocabulary

In [3]:
# Build vocabulary and mappings
vocab = set()
for sentence in tokenized_corpus:
    vocab.update(sentence)

vocab

{'actions', 'bad', 'boring', 'good', 'movie', 'story', 'very', 'was', 'were'}

Assign a unique index to each word

In [4]:
word_to_index = {word: i for i, word in enumerate(vocab)}
index_to_word = {i: word for word, i in word_to_index.items()}
vocab_size = len(word_to_index)
word_to_index, index_to_word

pd.DataFrame([word_to_index])

Unnamed: 0,bad,movie,good,very,were,boring,story,was,actions
0,0,1,2,3,4,5,6,7,8


 Define context and target words for training

In [None]:
def generate_training_data(tokenized_corpus, window_size=1):
    training_data = []
    for sentence in tokenized_corpus:
        for i, word in enumerate(sentence):
            context = []
            for j in range(-window_size, window_size + 1):
                if j != 0 and 0 <= i + j < len(sentence):
                    context.append(sentence[i + j])
            target = word
            training_data.append((context, target))
    return training_data

training_data = generate_training_data(tokenized_corpus)

# put the training data into a pandas dataframe
df = pd.DataFrame(training_data, columns=['context', 'target'])
df

Unnamed: 0,context,target
0,[was],movie
1,"[movie, boring]",was
2,[was],boring
3,[actions],movie
4,"[movie, were]",actions
5,"[actions, very]",were
6,"[were, good]",very
7,[very],good
8,[was],movie
9,"[movie, good]",was


Generating training matrix

In [6]:
# Hyperparameters
embedding_dim = 10
learning_rate = 0.001
epochs = 10000

# Weight initialization
W1 = np.random.uniform(-1, 1, (vocab_size, embedding_dim))  # Input to hidden weights
W2 = np.random.uniform(-1, 1, (embedding_dim, vocab_size))  # Hidden to output weights

pd.DataFrame(W1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.343629,0.895943,0.344396,-0.310865,0.616621,-0.781529,0.77838,0.527281,-0.347242,0.876641
1,0.46676,0.805717,-0.052154,-0.454725,0.037078,-0.747449,0.599849,0.079534,-0.164993,-0.702418
2,-0.984807,0.603134,0.56088,0.156324,-0.746102,-0.82535,0.420052,0.673051,0.040919,-0.424616
3,0.666363,-0.693173,-0.331068,-0.352913,0.276031,-0.7079,-0.41518,0.871119,-0.53281,-0.071285
4,0.898871,0.647435,0.817961,0.895987,0.600363,0.231629,-0.209346,0.483954,-0.489557,0.328599
5,0.106193,-0.458705,0.345401,-0.484469,-0.58143,0.289737,0.214768,0.03787,-0.961217,-0.222933
6,0.850184,-0.870605,0.631146,-0.291726,0.072465,0.425294,-0.005853,-0.875307,-0.424155,-0.423402
7,-0.786989,-0.174817,0.190667,0.511677,0.571319,0.113535,-0.988634,-0.289847,-0.91584,0.614044
8,0.587991,0.659895,0.420496,-0.71043,0.705954,-0.147209,-0.049564,0.401719,-0.269373,-0.458963


Training and generating word embeddings

In [7]:
# One-hot encoding function
def one_hot_vector(word, word_to_index):
    one_hot = np.zeros(vocab_size)
    one_hot[word_to_index[word]] = 1
    return one_hot

# Training loop
for epoch in range(epochs):
    loss = 0
    for context, target in training_data:
        # Forward pass
        context_vectors = np.sum([one_hot_vector(word, word_to_index) for word in context], axis=0)
        h = np.dot(context_vectors, W1)  # Hidden layer
        u = np.dot(h, W2)  # Output layer
        y_pred = np.exp(u) / np.sum(np.exp(u)) # Softmax activation
        
        # Calculate loss (cross-entropy)
        target_one_hot = one_hot_vector(target, word_to_index)
        loss += -np.sum(target_one_hot * np.log(y_pred + 1e-8))

        # Backpropagation
        e = y_pred - target_one_hot
        dW2 = np.outer(h, e)
        dW1 = np.outer(context_vectors, np.dot(W2, e))

        # Update weights
        W1 -= learning_rate * dW1
        W2 -= learning_rate * dW2

    # Print loss every 1000 epochs
    if (epoch + 1) % 1000 == 0:
        print(f'Epoch {epoch + 1}, Loss: {loss:.4f}')

Epoch 1000, Loss: 10.3566
Epoch 2000, Loss: 7.3014
Epoch 3000, Loss: 6.4818
Epoch 4000, Loss: 6.1558
Epoch 5000, Loss: 5.9913
Epoch 6000, Loss: 5.8951
Epoch 7000, Loss: 5.8330
Epoch 8000, Loss: 5.7903
Epoch 9000, Loss: 5.7592
Epoch 10000, Loss: 5.7358


Display word embeddings

In [8]:
embed_dict = {}
for word, idx in word_to_index.items():
    embed_dict[word] = W1[idx]
    # print(f'Word: {word}, Embedding: {W1[idx]}')

import pandas as pd
pd.DataFrame(embed_dict)

Unnamed: 0,bad,movie,good,very,were,boring,story,was,actions
0,0.296226,1.121783,-0.755393,1.646865,0.328901,0.27367,0.246467,-1.484911,-0.272933
1,1.551822,2.071923,0.68429,-0.917012,1.21694,-0.431677,-0.908383,-1.721983,0.348118
2,0.343731,0.116024,0.24825,-0.459827,0.381259,0.195228,0.42285,0.48918,-0.162599
3,-0.710891,-1.22118,-0.844552,-0.83161,0.709367,-0.771546,-0.205967,1.376502,0.472939
4,1.262123,-0.724961,-0.892323,0.606782,1.96338,-0.816449,-1.309273,0.412098,0.498871
5,-0.660001,-0.559801,-0.401471,-0.190436,0.87538,0.40364,-0.211005,-0.544246,-1.545547
6,1.099475,-1.173366,0.521397,-0.966333,-0.115644,0.050861,0.665625,-0.783145,1.409328
7,0.71799,-0.76611,1.515827,2.274953,0.133833,0.254954,-1.445132,0.046666,-0.983817
8,0.437736,-0.609603,0.41453,0.346453,0.011753,-1.050103,-2.390929,-0.360189,-1.368685
9,1.487259,-0.206481,0.492699,-0.447767,0.780829,0.003694,-0.115543,1.008114,-1.524842


Calculating the Euclidian distances b/w the word embeding to see the similarity b/w them

In [9]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word_to_index[target] 
    scores = Counter() 
    for word,index in word_to_index.items(): 
        raw_difference = W1[index] - (W1[target_index]) 
        squared_difference = raw_difference * raw_difference 
        scores[word] = -math.sqrt(sum(squared_difference)) 

    return scores.most_common(10)

similar('boring')

[('boring', -0.0),
 ('story', -2.5101003835316664),
 ('good', -2.6761792259001056),
 ('movie', -3.3033098569130446),
 ('very', -3.4930927582761115),
 ('actions', -3.732417046683652),
 ('was', -3.7536141876488642),
 ('were', -3.8304770751238855),
 ('bad', -3.8911340880488727)]