In [5]:
import numpy as np

np.random.seed(42)


class Word2Vec:
    def __init__(self, corpus, embed_size, window_size, learning_rate, epochs):
        self.corpus = corpus
        self.embed_size = embed_size
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        
        # Create vocabulary
        self.words = set(corpus)
        self.word2index = {word: i for i, word in enumerate(self.words)}
        self.index2word = {i: word for i, word in enumerate(self.words)}
        self.vocab_size = len(self.words)
        
        # Initialize weights
        self.input_word = np.random.randn(self.vocab_size, self.embed_size)

        self.output_cotext_words = np.random.randn(self.embed_size, self.vocab_size)
        
    def train(self):
        for epoch in range(self.epochs):
            self.loss = 0
            for word_index, word in enumerate(self.corpus):
                # We are going to predict the context words from the current word
                # The context words are the words that fall within the window size
                # around the current word
                start = max(0, word_index - self.window_size)
                end = min(word_index + self.window_size, len(self.corpus))
                context_indices = [self.word2index[w] for w in self.corpus[start:end] if w != word]
                
                for context_index in context_indices:
                    
                    # Forward pass
                    input_word_embedding = self.input_word[context_index]
          
                    # print(self.embedding_to_word(input_word_embedding))
                    dot_product = np.dot(input_word_embedding, self.output_cotext_words)

                    softmaxed_probability = self.softmax(dot_product)
                    
                    # Compute loss
                    self.loss += -np.log(softmaxed_probability[self.word2index[word]])
                    
                    # Backpropagation
                    e = softmaxed_probability.copy()
                    # Subtract 1 from the word that was actually the context word
                    e[self.word2index[word]] -= 1
                    
                    #This is the gradient that shows how much the output embedding should change to minimize the loss
                    gradient_for_output_embedding = np.outer(input_word_embedding, e)
                    #This is the gradient that shows how much the input embedding should change to minimize the loss
                    gradient_for_input_embedding = np.dot(self.output_cotext_words, e.T)
                    
                    # Update weights
                    #Updating output embedding to take a STEP TOWARDS OPPOSITE TO THE GRADIENT
                    self.output_cotext_words -= self.learning_rate * gradient_for_output_embedding

                    #Updating input embedding ONLY FOR THE CONTEXT WORD in the context matrix
                    self.input_word[context_index] -= self.learning_rate * gradient_for_input_embedding
            
            # print(f'Epoch: {epoch + 1}/{self.epochs}, Loss: {self.loss/len(self.corpus)}')
            
   
    def softmax(self, x):
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum(axis=0)
    
    def get_word_vector(self, word):
        return self.input_word[self.word2index[word]]
    


def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def euclidean_distance(vec1, vec2):
    return np.linalg.norm(vec1 - vec2)





# Small Corpus Training

In [6]:

corpus = "the quick brown fox jumped over the lazy dog".split()


model = Word2Vec(corpus, embed_size=20, window_size=2, learning_rate=0.01, epochs=1000)
model.train()



# Calculate for "quick" and "brown"
print("Cosine Similarity (quick, brown):", cosine_similarity(model.get_word_vector("quick"), model.get_word_vector("brown")))
print("Euclidean Distance (quick, brown):", euclidean_distance(model.get_word_vector("quick"), model.get_word_vector("brown")))
#compare above two cosine similarities with each other and print the smaller pair
if cosine_similarity(model.get_word_vector("quick"), model.get_word_vector("brown")) > cosine_similarity(model.get_word_vector("quick"), model.get_word_vector("dog")):
    print("Cosine Similarity (quick, brown) is larger than (quick, dog): TRUE")
else:
    print("Cosine Similarity (quick, brown) is smaller than (quick, dog): FALSE")

# Calculate for "quick" and "dog"
print("Cosine Similarity (quick, dog):", cosine_similarity(model.get_word_vector("quick"), model.get_word_vector("dog")))
print("Euclidean Distance (quick, dog):", euclidean_distance(model.get_word_vector("quick"), model.get_word_vector("dog")))
#compare above two euclidean distances with each other and print the smaller pair
if euclidean_distance(model.get_word_vector("quick"), model.get_word_vector("brown")) > euclidean_distance(model.get_word_vector("quick"), model.get_word_vector("dog")):
    print("Euclidean Distance (quick, brown) is larger than (quick, dog): FALSE")
else:
    print("Euclidean Distance (quick, brown) is smaller than (quick, dog): TRUE")

Cosine Similarity (quick, brown): 0.10410215449008678
Euclidean Distance (quick, brown): 5.608967870998298
Cosine Similarity (quick, brown) is larger than (quick, dog): TRUE
Cosine Similarity (quick, dog): -0.22225721139339435
Euclidean Distance (quick, dog): 6.449068007238908
Euclidean Distance (quick, brown) is smaller than (quick, dog): TRUE


# Slightly bigger Corpus Training

In [9]:

#read training_text.txt and split it into words
corpus = open("training_text.txt", "r").read().split()


model = Word2Vec(corpus, embed_size=19, window_size=2, learning_rate=0.01, epochs=1000)
model.train()




print("Cosine Similarity (data, science):", cosine_similarity(model.get_word_vector("data"), model.get_word_vector("science")))
print("Cosine Similarity (data, really):", cosine_similarity(model.get_word_vector("data"), model.get_word_vector("really")))

#compare above two cosine similarities with each other and print the smaller pair
if cosine_similarity(model.get_word_vector("data"), model.get_word_vector("science")) > cosine_similarity(model.get_word_vector("data"), model.get_word_vector("really")):
    print("Cosine Similarity (data, science) is larger than (data, really): TRUE")
else:
    print("Cosine Similarity (data, science) is smaller than (data, really): FALSE")


print("Euclidean Distance (data, science):", euclidean_distance(model.get_word_vector("data"), model.get_word_vector("science")))
print("Euclidean Distance (data, really):", euclidean_distance(model.get_word_vector("data"), model.get_word_vector("really")))

#compare above two euclidean distances with each other and print the smaller pair
if euclidean_distance(model.get_word_vector("data"), model.get_word_vector("science")) > euclidean_distance(model.get_word_vector("data"), model.get_word_vector("really")):
    print("Euclidean Distance (data, science) is larger than (data, really): FALSE")
else:
    print("Euclidean Distance (data, science) is smaller than (data, really): TRUE")

Cosine Similarity (data, science): 0.15936889830770104
Cosine Similarity (data, really): -0.2637223323415979
Cosine Similarity (data, science) is larger than (data, really): TRUE
Euclidean Distance (data, science): 4.445672355379358
Euclidean Distance (data, really): 6.493874619544722
Euclidean Distance (data, science) is smaller than (data, really): TRUE
