# Word2Vec (CBOW): training and inference code

# In [52]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import OneHotEncoder
import random

# nltk.download('punkt')

# Provided text
text = ['Best way to success is through hardwork and persistence']

# Tokenize every sentence (lower-cased), then keep the tokens of the first
# sentence only. The corpus above holds a single sentence, so nothing is lost.
tokenized_text = [word_tokenize(sentence.lower()) for sentence in text]
tokenized_text = tokenized_text[0]

# Build the vocabulary (unique tokens)
vocab = set(tokenized_text)
vocab_size = len(vocab)

# Create word_to_index and index_to_word mappings.
# Indices are assigned over the *sorted* vocabulary: iterating a bare set of
# strings can yield a different order on every interpreter run (string hashes
# are randomized), which would make the word/index assignment irreproducible.
word_to_index = {word: idx for idx, word in enumerate(sorted(vocab))}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Prepare training data
def generate_training_data(tokenized_text, word_to_index, window_size=2):
    """Build one ``(target, context)`` training pair per token position.

    Args:
        tokenized_text: Sequence of tokens.
        word_to_index: Mapping from token to integer index.
        window_size: How many positions to look at on each side of the
            target position.

    Returns:
        A list with one tuple per token: the target token's index and the
        list of indices of the tokens found within ``window_size`` positions
        of it (the target position itself is excluded; positions outside the
        text are ignored).

    Raises:
        KeyError: If a token is missing from ``word_to_index``.
    """
    n_tokens = len(tokenized_text)
    pairs = []
    for pos, token in enumerate(tokenized_text):
        target_id = word_to_index[token]
        # Clamp the window to the text boundaries up front.
        first = max(0, pos - window_size)
        last = min(n_tokens, pos + window_size + 1)
        neighbour_ids = [
            word_to_index[tokenized_text[k]]
            for k in range(first, last)
            if k != pos
        ]
        pairs.append((target_id, neighbour_ids))
    return pairs

# (target, context) index pairs for the toy corpus, two neighbours per side
training_data = generate_training_data(
    tokenized_text,
    word_to_index,
    window_size=2,
)

class Word2VecCBOW:
    """Minimal CBOW word2vec model trained with a full softmax.

    The model holds two weight matrices:

    * ``W1`` with shape (vocab_size, embedding_dim); its rows are the input
      word embeddings.
    * ``W2`` with shape (embedding_dim, vocab_size); it maps the hidden
      vector to one score per vocabulary word.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the model with standard-normal initial weights."""
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.W1 = np.random.randn(vocab_size, embedding_dim)
        self.W2 = np.random.randn(embedding_dim, vocab_size)

    def softmax(self, x):
        """Return the softmax of ``x``, normalised along axis 0."""
        # Shifting by the maximum does not change the result but keeps
        # np.exp from overflowing on large scores.
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum(axis=0)

    def forward_cbow(self, context_indices):
        """Predict a distribution over the vocabulary from context words.

        Args:
            context_indices: Indices of the context words (rows of ``W1``).

        Returns:
            ``(y_pred, h)``: the softmax output over the vocabulary and the
            hidden vector, i.e. the mean of the context rows of ``W1``.

        Raises:
            ValueError: If ``context_indices`` is empty (the mean of zero
                rows would be NaN).
        """
        if np.size(context_indices) == 0:
            raise ValueError("context_indices must contain at least one word index")
        h = np.mean(self.W1[context_indices], axis=0)
        u = np.dot(h, self.W2)
        y_pred = self.softmax(u)
        return y_pred, h

    def backward_cbow(self, error, h, context_indices, learning_rate):
        """Apply one gradient-descent step to ``W1`` and ``W2`` in place.

        Args:
            error: Prediction error, ``y_pred`` minus the one-hot target.
            h: Hidden vector returned by ``forward_cbow``.
            context_indices: Indices of the context words used for ``h``.
            learning_rate: Step size.

        Raises:
            ValueError: If ``context_indices`` is empty.
        """
        n_context = len(context_indices)
        if n_context == 0:
            raise ValueError("context_indices must contain at least one word index")

        dW2 = np.outer(h, error)
        # Gradient w.r.t. the hidden vector; it must be computed from W2
        # *before* W2 itself is updated below.
        grad_h = np.dot(self.W2, error)

        # h is the mean of the context rows, so each occurrence of a context
        # word receives grad_h / n_context. The unbuffered ufunc.at form
        # accumulates correctly when an index occurs more than once and
        # avoids allocating a full vocab-sized gradient matrix per step.
        np.subtract.at(self.W1, context_indices, learning_rate * (grad_h / n_context))
        self.W2 -= learning_rate * dW2

    def train_cbow(self, training_data, epochs, learning_rate, log_every=None):
        """Train on ``(target, context)`` pairs with per-sample updates.

        Args:
            training_data: Iterable of ``(target_index, context_indices)``.
                Samples with an empty context are skipped because they
                carry no input for CBOW.
            epochs: Number of passes over ``training_data``. With 0 nothing
                is trained and nothing is printed.
            learning_rate: Step size for every update.
            log_every: If given, also print the loss every ``log_every``
                epochs. The loss of the final epoch is always printed once.
        """
        for epoch in range(epochs):
            loss = 0
            for target, context in training_data:
                if len(context) == 0:
                    continue
                y_pred, h = self.forward_cbow(context)
                # Softmax + cross-entropy gradient: prediction minus one-hot.
                error = y_pred.copy()
                error[target] -= 1
                self.backward_cbow(error, h, context, learning_rate)
                loss += -np.log(y_pred[target])

            is_last_epoch = epoch == epochs - 1
            if is_last_epoch or (log_every and (epoch + 1) % log_every == 0):
                print(f'Epoch {epoch + 1}, Loss: {loss}')

def get_embedding(word, word_to_index, weight_matrix):
    """Return the entry of ``weight_matrix`` that belongs to ``word``.

    Args:
        word: Word to look up.
        word_to_index: Mapping from word to its index in ``weight_matrix``.
        weight_matrix: Indexable collection of embeddings.

    Raises:
        ValueError: If ``word`` has no index in ``word_to_index``.
    """
    row = word_to_index.get(word)
    if row is None:
        raise ValueError(f"Word '{word}' not found in vocabulary.")
    return weight_matrix[row]

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms.
        If either vector has zero norm the cosine is undefined; 0.0 is
        returned in that case instead of the NaN (plus RuntimeWarning) that
        0/0 would produce, so callers can still sort the results.
    """
    denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denominator == 0:
        return 0.0
    return np.dot(vec1, vec2) / denominator

def find_similar_words(word, word_to_index, index_to_word, weight_matrix, top_n=5):
    """Rank the other vocabulary words by cosine similarity to ``word``.

    Args:
        word: Query word.
        word_to_index: Mapping from word to its row in ``weight_matrix``.
        index_to_word: Mapping from row index back to the word.
        weight_matrix: Embedding matrix, iterated row by row.
        top_n: Maximum number of results to return.

    Returns:
        Up to ``top_n`` ``(word, similarity)`` tuples, highest similarity
        first. The query word itself is excluded.

    Raises:
        ValueError: If ``word`` is not in ``word_to_index``.
    """
    query_vector = get_embedding(word, word_to_index, weight_matrix)
    scores = {
        index_to_word[row]: cosine_similarity(query_vector, vector)
        for row, vector in enumerate(weight_matrix)
        if index_to_word[row] != word
    }
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Example usage for CBOW: train a 10-dimensional model on the toy corpus
embedding_dim = 10
word2vec_cbow = Word2VecCBOW(vocab_size, embedding_dim)
word2vec_cbow.train_cbow(training_data, epochs=600, learning_rate=0.01)



# Rank the other vocabulary words by cosine similarity to 'success',
# using the rows of the input weight matrix W1 as embeddings
similar_words = find_similar_words(
    'success',
    word_to_index,
    index_to_word,
    word2vec_cbow.W1,
    top_n=5,
)
print(f"Words similar to 'success': {similar_words}")

# ---- prediction from multiple context words ----

context_words = ['best','way']
context_indices = [word_to_index[context_word] for context_word in context_words]
y_pred, _ = word2vec_cbow.forward_cbow(context_indices)
# The predicted word is the vocabulary entry with the highest probability
predicted_word_index = y_pred.argmax()
predicted_word = index_to_word[predicted_word_index]
print(f"Predicted word given context words {context_words}: {predicted_word}")



# Sample output from one run (the weights are initialised randomly, so the
# numbers and rankings differ between runs):
# Epoch 600, Loss: 0.514563838759905
# Words similar to 'success': [('way', 0.48595725752578506), ('best', 0.2952497345919592), ('is', 0.11750311183935276), ('and', 0.0855585172817024), ('to', 0.027338659163779158)]
# Predicted word given context words ['best', 'way']: to


# Word2Vec (Skip-Gram): training and inference code

# In [61]:
# Provided text
text = ['Best way to success is through hardwork and persistence']

# Tokenize every sentence (lower-cased), then keep the tokens of the first
# sentence only. The corpus above holds a single sentence, so nothing is lost.
tokenized_text = [word_tokenize(sentence.lower()) for sentence in text]
tokenized_text = tokenized_text[0]

# Build the vocabulary (unique tokens)
vocab = set(tokenized_text)
vocab_size = len(vocab)

# Create word_to_index and index_to_word mappings.
# Indices are assigned over the *sorted* vocabulary: iterating a bare set of
# strings can yield a different order on every interpreter run (string hashes
# are randomized), which would make the word/index assignment irreproducible.
word_to_index = {word: idx for idx, word in enumerate(sorted(vocab))}
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Prepare training data
def generate_training_data(tokenized_text, word_to_index, window_size=2):
    """Pair every token with the tokens surrounding it.

    Args:
        tokenized_text: Sequence of tokens.
        word_to_index: Mapping from token to integer index.
        window_size: Number of positions inspected on each side of a token.

    Returns:
        A list of ``(target_index, context_indices)`` tuples, one per token
        position. ``context_indices`` lists the indices of the tokens at
        most ``window_size`` positions away, skipping the token's own
        position and any position that falls outside the text.

    Raises:
        KeyError: If a token is missing from ``word_to_index``.
    """
    samples = []
    total = len(tokenized_text)
    for center, center_word in enumerate(tokenized_text):
        center_id = word_to_index[center_word]
        window = range(center - window_size, center + window_size + 1)
        context_ids = [
            word_to_index[tokenized_text[p]]
            for p in window
            if p != center and 0 <= p < total
        ]
        samples.append((center_id, context_ids))
    return samples

# (target, context) index pairs for the toy corpus, two neighbours per side
training_data = generate_training_data(
    tokenized_text,
    word_to_index,
    window_size=2,
)

class Word2VecSkipGram:
    """Minimal skip-gram word2vec model trained with a full softmax.

    The model holds two weight matrices:

    * ``W1`` with shape (vocab_size, embedding_dim); its rows are the input
      word embeddings.
    * ``W2`` with shape (embedding_dim, vocab_size); it maps the hidden
      vector to one score per vocabulary word.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the model with small (std 0.01) normal initial weights."""
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.W1 = np.random.randn(vocab_size, embedding_dim) * 0.01  # Smaller initial weights
        self.W2 = np.random.randn(embedding_dim, vocab_size) * 0.01  # Smaller initial weights

    def softmax(self, x):
        """Return the softmax of ``x``, normalised along axis 0."""
        e_x = np.exp(x - np.max(x))  # For numerical stability
        return e_x / e_x.sum(axis=0)

    def forward_skipgram(self, target_index):
        """Predict a distribution over context words for one target word.

        Args:
            target_index: Index of the target word (row of ``W1``).

        Returns:
            ``(y_pred, h)``: the softmax output over the vocabulary and the
            hidden vector, a copy of the target's row of ``W1``.
        """
        # Copy the row: indexing with a plain integer yields a view, which
        # would silently change under the caller when W1 is updated.
        h = self.W1[target_index].copy()
        u = np.dot(h, self.W2)
        y_pred = self.softmax(u)
        return y_pred, h

    def backward_skipgram(self, error, h, target_index, learning_rate):
        """Apply one gradient-descent step to ``W1`` and ``W2`` in place.

        Args:
            error: Prediction error for the sample (see ``train_skipgram``).
            h: Hidden vector returned by ``forward_skipgram``.
            target_index: Index of the target word whose row is updated.
            learning_rate: Step size.
        """
        # Both gradients are computed from the pre-update weights.
        dW2 = np.outer(h, error)
        dW1 = np.dot(self.W2, error)

        self.W1[target_index] -= learning_rate * dW1
        self.W2 -= learning_rate * dW2

    def train_skipgram(self, training_data, epochs, learning_rate, log_every=1000):
        """Train on ``(target, context)`` pairs, one update per target.

        For each sample the forward pass runs once and the errors of all its
        context words are summed into a single gradient step. Updating the
        weights after every context word while reusing the same ``y_pred``
        would mix a stale prediction with already-modified weights.

        Args:
            training_data: Sequence of ``(target_index, context_indices)``.
                Samples with an empty context are skipped.
            epochs: Number of passes over ``training_data``.
            learning_rate: Step size for every update.
            log_every: Print the average loss whenever the zero-based epoch
                number is a multiple of this value; a falsy value disables
                printing.
        """
        # Guard the average against an empty training set.
        n_samples = max(len(training_data), 1)
        for epoch in range(epochs):
            loss = 0
            for target, context in training_data:
                if len(context) == 0:
                    continue
                y_pred, h = self.forward_skipgram(target)
                # Sum over context words of (y_pred - one_hot(context_word)).
                error = len(context) * y_pred
                for context_word in context:
                    error[context_word] -= 1
                    loss += -np.log(y_pred[context_word] + 1e-9)  # Avoid log(0)
                self.backward_skipgram(error, h, target, learning_rate)
            if log_every and epoch % log_every == 0:
                print(f'Epoch {epoch + 1}, Loss: {loss / n_samples}')  # Average loss

def get_embedding(word, word_to_index, weight_matrix):
    """Look up the embedding stored for ``word``.

    Args:
        word: Word to look up.
        word_to_index: Mapping from word to its index in ``weight_matrix``.
        weight_matrix: Indexable collection of embeddings.

    Returns:
        ``weight_matrix`` indexed at the word's position.

    Raises:
        ValueError: If ``word`` has no index in ``word_to_index``.
    """
    position = word_to_index.get(word)
    if position is not None:
        return weight_matrix[position]
    raise ValueError(f"Word '{word}' not found in vocabulary.")

def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1: First vector.
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The dot product divided by the product of the Euclidean norms.
        If either vector has zero norm the cosine is undefined; 0.0 is
        returned in that case instead of the NaN (plus RuntimeWarning) that
        0/0 would produce, so callers can still sort the results.
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

def find_similar_words(word, word_to_index, index_to_word, weight_matrix, top_n=5):
    """Return the words whose embeddings are closest to that of ``word``.

    Args:
        word: Query word.
        word_to_index: Mapping from word to its row in ``weight_matrix``.
        index_to_word: Mapping from row index back to the word.
        weight_matrix: Embedding matrix, iterated row by row.
        top_n: Maximum number of results to return.

    Returns:
        Up to ``top_n`` ``(word, similarity)`` tuples ordered from most to
        least similar by cosine similarity; the query word is left out.

    Raises:
        ValueError: If ``word`` is not in ``word_to_index``.
    """
    reference = get_embedding(word, word_to_index, weight_matrix)

    score_by_word = {}
    for row, candidate in enumerate(weight_matrix):
        other = index_to_word[row]
        if other == word:
            continue  # never report the query word itself
        score_by_word[other] = cosine_similarity(reference, candidate)

    ranking = list(score_by_word.items())
    ranking.sort(key=lambda entry: entry[1], reverse=True)
    return ranking[:top_n]

# Initialize and train the model
embedding_dim = 10
word2vec = Word2VecSkipGram(vocab_size, embedding_dim)
word2vec.train_skipgram(training_data, epochs=2000, learning_rate=0.01)

# Find similar words for 'success'
similar_words = find_similar_words('success', word_to_index, index_to_word, word2vec.W1, top_n=5)
print(f"Words similar to 'success': {similar_words}")

# Find context words given 'success'
target_word = 'success'
# NOTE: despite its name, context_indices holds the single index of the
# target word; the name is kept because it is a module-level variable.
context_indices = word_to_index[target_word]
# Use the model trained above (`word2vec`); the previous code called an
# undefined name `word2vec_skipgram`, which raises NameError.
y_pred, _ = word2vec.forward_skipgram(context_indices)
# argsort is ascending, so the last five entries are the five most probable
# words, listed from least to most probable.
predicted_context_indices = np.argsort(y_pred)[-5:]  # Top 5 predicted context words
predicted_context_words = [index_to_word[idx] for idx in predicted_context_indices]
print(f"Context words for '{target_word}': {predicted_context_words}")


# Sample output from one run (the weights are initialised randomly, so the
# numbers and rankings differ between runs):
# Epoch 1, Loss: 7.32415973238885
# Epoch 1001, Loss: 4.154487689590306
# Words similar to 'success': [('best', 0.664994658832855), ('hardwork', 0.3644387527531994), ('is', 0.16391701713753604), ('to', 0.03118278838720984), ('through', -0.16216656057258505)]
# Context words for 'success': ['and', 'is', 'through', 'to', 'way']
