# Skip-gram model

In [35]:
import numpy as np
from collections import defaultdict
import pandas as pd

corpus = [
    "Movie was boring",
    "Movie actions were very good",
    "Movie was good",
    "Movie story was very bad"
]

# Preprocessing: 
##  tokenize the sentences
def tokenize_corpus(corpus):
    tokens = [sentence.lower().split() for sentence in corpus]
    return tokens

tokenized_corpus = tokenize_corpus(corpus)

pd.DataFrame(tokenized_corpus)

Unnamed: 0,0,1,2,3,4
0,movie,was,boring,,
1,movie,actions,were,very,good
2,movie,was,good,,
3,movie,story,was,very,bad


Unique set of words in the vocabulary

In [36]:
# Build vocabulary and mappings
vocab = set()
for sentence in tokenized_corpus:
    vocab.update(sentence)

vocab

{'actions', 'bad', 'boring', 'good', 'movie', 'story', 'very', 'was', 'were'}

Assign a unique index to each word

In [37]:
word_to_index = {word: i for i, word in enumerate(vocab)}
index_to_word = {i: word for word, i in word_to_index.items()}
vocab_size = len(word_to_index)

pd.DataFrame([word_to_index])

Unnamed: 0,bad,was,story,good,boring,movie,were,actions,very
0,0,1,2,3,4,5,6,7,8


 Define target (input) and surrounding words (output) for training

In [38]:
training_data = []

for sentence in tokenized_corpus:
    for index, word in enumerate(sentence):
        start = max(0,index -1)
        end = min(len(sentence),index + 2)
        for i in range( start,end):
            if i == index:
                continue
            training_data.append((word,sentence[i]))
training_data

[('movie', 'was'),
 ('was', 'movie'),
 ('was', 'boring'),
 ('boring', 'was'),
 ('movie', 'actions'),
 ('actions', 'movie'),
 ('actions', 'were'),
 ('were', 'actions'),
 ('were', 'very'),
 ('very', 'were'),
 ('very', 'good'),
 ('good', 'very'),
 ('movie', 'was'),
 ('was', 'movie'),
 ('was', 'good'),
 ('good', 'was'),
 ('movie', 'story'),
 ('story', 'movie'),
 ('story', 'was'),
 ('was', 'story'),
 ('was', 'very'),
 ('very', 'was'),
 ('very', 'bad'),
 ('bad', 'very')]

Generating training matrix

In [39]:
# Hyperparameters
embedding_dim = 10
learning_rate = 0.001
epochs = 10000

# Weight initialization
W1 = np.random.uniform(-1, 1, (vocab_size, embedding_dim))  # Input to hidden weights
W2 = np.random.uniform(-1, 1, (embedding_dim, vocab_size))  # Hidden to output weights

pd.DataFrame(W1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.558863,-0.450077,-0.482045,-0.210383,0.743571,0.972926,0.24669,-0.276175,0.6089,-0.107225
1,0.676004,0.35067,-0.296408,-0.220011,0.6638,-0.859058,0.622075,-0.979189,0.939077,0.909042
2,0.005383,0.323528,-0.420377,-0.590993,-0.62276,0.835336,-0.384069,0.281857,0.451137,0.923902
3,-0.501703,-0.681849,0.735308,0.875643,0.901984,0.179165,0.631896,0.649565,0.855368,0.282343
4,0.606698,0.967331,-0.703657,0.702238,0.823197,0.116379,-0.065489,-0.167417,-0.413385,0.51263
5,-0.367456,-0.202791,0.93599,0.072731,0.740566,0.886563,-0.16766,0.467634,-0.701546,0.039228
6,0.010358,0.037167,0.924954,-0.35281,0.400363,-0.734969,-0.260772,0.638162,-0.34111,-0.021667
7,0.985502,0.220966,0.842141,0.888686,0.498978,0.36657,0.019107,0.015352,-0.359787,0.912854
8,0.763074,-0.149559,-0.809394,0.162321,-0.348835,0.03497,-0.334681,-0.794619,-0.745083,0.911792


Training and generating word embeddings without negative sampling

In [41]:
# One-hot encoding function
def one_hot_vector(word, word_to_index):
    one_hot = np.zeros(vocab_size)
    one_hot[word_to_index[word]] = 1
    return one_hot

# Training loop
for epoch in range(epochs):
    loss = 0
    for target, context in training_data:
        # Forward pass
        context_vectors = np.sum([one_hot_vector(target, word_to_index)], axis=0)
        h = np.dot(context_vectors, W1)  # Hidden layer
        u = np.dot(h, W2)  # Output layer
        y_pred = np.exp(u) / np.sum(np.exp(u)) # Softmax activation
        
        # Calculate loss (cross-entropy)
        target_one_hot = one_hot_vector(target, word_to_index)
        loss += -np.sum(target_one_hot * np.log(y_pred + 1e-8))

        # Backpropagation
        e = y_pred - target_one_hot
        dW2 = np.outer(h, e)
        dW1 = np.outer(context_vectors, np.dot(W2, e))

        # Update weights
        W1 -= learning_rate * dW1
        W2 -= learning_rate * dW2

    # Print loss every 1000 epochs
    if (epoch + 1) % 1000 == 0:
        print(f'Epoch {epoch + 1}, Loss: {loss:.4f}')

Epoch 1000, Loss: 1.7347
Epoch 2000, Loss: 0.5940
Epoch 3000, Loss: 0.3327
Epoch 4000, Loss: 0.2250
Epoch 5000, Loss: 0.1676
Epoch 6000, Loss: 0.1325
Epoch 7000, Loss: 0.1089
Epoch 8000, Loss: 0.0921
Epoch 9000, Loss: 0.0795
Epoch 10000, Loss: 0.0698


Display word embeddings

In [42]:
embed_dict = {}
for word, idx in word_to_index.items():
    embed_dict[word] = W1[idx]
    # print(f'Word: {word}, Embedding: {W1[idx]}')

import pandas as pd
pd.DataFrame(embed_dict)

Unnamed: 0,bad,was,story,good,boring,movie,were,actions,very
0,0.735443,-0.824483,-0.585991,0.031183,1.202295,-1.61077,-0.690566,1.218512,1.376613
1,-0.145329,0.3408,-0.655562,-1.472839,0.556116,0.798795,-0.479428,1.018777,-0.124884
2,-1.167133,-0.927087,-0.513141,1.066024,-1.042191,1.571263,0.984312,0.698056,-0.33162
3,-0.021334,0.030539,-1.136659,1.215518,-0.762876,0.0604,-0.257658,0.630519,1.177853
4,0.977032,-0.159515,-1.660993,1.309154,1.258262,0.921352,-0.046623,-0.112564,0.196002
5,1.732744,-1.786758,1.27925,-0.665958,-0.739589,1.224908,-0.237851,0.652473,0.408416
6,0.803412,0.314957,0.227222,0.744522,0.608762,0.274491,-0.739552,-0.594229,-1.054363
7,0.621141,-1.553057,-0.501898,0.806874,-0.430438,0.020153,1.641897,0.638145,-2.071297
8,0.270751,1.468846,-0.075014,0.712767,0.115631,0.219112,-0.716025,-1.20614,-0.048304
9,-0.501684,0.268338,0.378241,0.669135,1.112098,0.65546,-1.350527,1.602655,0.15736


Calculating the Euclidian distances b/w the word embeding to see the similarity b/w them

In [43]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word_to_index[target] 
    scores = Counter() 
    for word,index in word_to_index.items(): 
        raw_difference = W1[index] - (W1[target_index]) 
        squared_difference = raw_difference * raw_difference 
        scores[word] = -math.sqrt(sum(squared_difference)) 

    return scores.most_common(10)

similar('boring')

[('boring', -0.0),
 ('bad', -3.352469543609894),
 ('was', -3.434140047111468),
 ('actions', -3.6848492192246494),
 ('very', -3.688984247207017),
 ('good', -3.9946150357586943),
 ('story', -4.290580058146138),
 ('movie', -4.470671911570958),
 ('were', -4.88266343835967)]