In [2]:
import numpy as np
from collections import defaultdict

# Define the training data
# corpus = [
#     "the quick brown fox jumped over the lazy dog",
#     "the dog barked at the fox",
#     "the lazy dog slept",
#     "the quick fox was very quick"
# ]
corpus = [
    "Movie was boring",
    "Movie actions were very good",
    "Movie was good",
    "Movie story was very bad"
]

# Preprocessing: tokenize the sentences and build the vocabulary
def tokenize_corpus(corpus):
    tokens = [sentence.lower().split() for sentence in corpus]
    return tokens

# Build vocabulary and mappings
def build_vocab(tokenized_corpus):
    vocab = set()
    for sentence in tokenized_corpus:
        vocab.update(sentence)
    word_to_index = {word: i for i, word in enumerate(vocab)}
    index_to_word = {i: word for word, i in word_to_index.items()}
    return word_to_index, index_to_word

tokenized_corpus = tokenize_corpus(corpus)
word_to_index, index_to_word = build_vocab(tokenized_corpus)
vocab_size = len(word_to_index)

print(tokenized_corpus)
print(f'Vocab size: {vocab_size}')


[['movie', 'was', 'boring'], ['movie', 'actions', 'were', 'very', 'good'], ['movie', 'was', 'good'], ['movie', 'story', 'was', 'very', 'bad']]
Vocab size: 9


In [3]:
# Define context and target creation
def generate_training_data(tokenized_corpus, window_size=1):
    training_data = []
    for sentence in tokenized_corpus:
        for i, word in enumerate(sentence):
            context = []
            for j in range(-window_size, window_size + 1):
                if j != 0 and 0 <= i + j < len(sentence):
                    context.append(sentence[i + j])
            target = word
            training_data.append((context, target))
    return training_data

training_data = generate_training_data(tokenized_corpus)

training_data

[(['was'], 'movie'),
 (['movie', 'boring'], 'was'),
 (['was'], 'boring'),
 (['actions'], 'movie'),
 (['movie', 'were'], 'actions'),
 (['actions', 'very'], 'were'),
 (['were', 'good'], 'very'),
 (['very'], 'good'),
 (['was'], 'movie'),
 (['movie', 'good'], 'was'),
 (['was'], 'good'),
 (['story'], 'movie'),
 (['movie', 'was'], 'story'),
 (['story', 'very'], 'was'),
 (['was', 'bad'], 'very'),
 (['very'], 'bad')]

In [4]:

# Hyperparameters
embedding_dim = 10
learning_rate = 0.01
epochs = 10000

# Weight initialization
W1 = np.random.uniform(-1, 1, (vocab_size, embedding_dim))  # Input to hidden weights
W2 = np.random.uniform(-1, 1, (embedding_dim, vocab_size))  # Hidden to output weights

# One-hot encoding function
def one_hot_vector(word, word_to_index):
    one_hot = np.zeros(vocab_size)
    one_hot[word_to_index[word]] = 1
    return one_hot

# Training loop
for epoch in range(epochs):
    loss = 0
    for context, target in training_data:
        # Forward pass
        context_vectors = np.sum([one_hot_vector(word, word_to_index) for word in context], axis=0)
        h = np.dot(context_vectors, W1)  # Hidden layer
        u = np.dot(h, W2)  # Output layer
        y_pred = np.exp(u) / np.sum(np.exp(u)) # Softmax activation
        
        # Calculate loss (cross-entropy)
        target_one_hot = one_hot_vector(target, word_to_index)
        loss += -np.sum(target_one_hot * np.log(y_pred + 1e-8))

        # Backpropagation
        e = y_pred - target_one_hot
        dW2 = np.outer(h, e)
        dW1 = np.outer(context_vectors, np.dot(W2, e))

        # Update weights
        W1 -= learning_rate * dW1
        W2 -= learning_rate * dW2

    # Print loss every 1000 epochs
    if (epoch + 1) % 1000 == 0:
        print(f'Epoch {epoch + 1}, Loss: {loss:.4f}')

# Display word embeddings
for word, idx in word_to_index.items():
    print(f'Word: {word}, Embedding: {W1[idx]}')


Epoch 1000, Loss: 6.0661
Epoch 2000, Loss: 5.9807
Epoch 3000, Loss: 5.9451
Epoch 4000, Loss: 5.9201
Epoch 5000, Loss: 5.9005
Epoch 6000, Loss: 5.8848
Epoch 7000, Loss: 5.8720
Epoch 8000, Loss: 5.8616
Epoch 9000, Loss: 5.8530
Epoch 10000, Loss: 5.8460
Word: story, Embedding: [ 0.61296695 -2.02968151  0.40999344 -1.80774536  0.71139503 -2.15146414
 -1.49340307 -0.17002308  0.84067125  1.03293581]
Word: bad, Embedding: [ 0.81443191 -0.16625505 -1.16117618  1.10794049  0.73504063  1.21059665
 -0.473284   -0.93869332  2.10504319  1.59647903]
Word: were, Embedding: [ 1.13467847  1.06144307  0.60877506 -0.13299855 -1.95872094  0.88250022
  0.05692184  0.52090016 -0.15463408  1.34342115]
Word: actions, Embedding: [ 2.11715297  0.31220676  1.67147372 -2.45587517  0.71064448 -0.94340327
 -0.89678103  0.21794937 -0.9732588   1.51023869]
Word: was, Embedding: [ 0.63498284 -0.64684698 -0.94248559 -0.95003585 -0.57050697 -0.05634887
  0.25353939 -0.53652473 -1.17099228 -0.6489178 ]
Word: very, Embed

Calculating the Euclidian distances b/w the word embeding to see the similarity b/w them

In [11]:
from collections import Counter
import math

def similar(target='beautiful'):
    target_index = word_to_index[target] 
    scores = Counter() 
    for word,index in word_to_index.items(): 
        raw_difference = W1[index] - (W1[target_index]) 
        squared_difference = raw_difference * raw_difference 
        scores[word] = -math.sqrt(sum(squared_difference)) 

    return scores.most_common(10)

similar('good')

[('good', -0.0),
 ('boring', -2.4396124005917725),
 ('bad', -3.461821912031705),
 ('were', -3.8365481664176646),
 ('was', -3.8466084848655866),
 ('very', -4.361414069388088),
 ('story', -4.4708995591898075),
 ('movie', -5.2517917415193605),
 ('actions', -5.521263175819463)]