# Implementing Word2Vec Model using Skip-Gram 

## Importing the Necesssary Stuff

In [1]:
import numpy as np 
import string 
from nltk.corpus import stopwords
import re

<img src="https://media.geeksforgeeks.org/wp-content/uploads/Skip-gram-architecture-2.jpg">

Let's define some variables :

V    Number of unique words in our corpus of text ( Vocabulary )<br>
x    Input layer (One hot encoding of our input word ). <br>
N    Number of neurons in the hidden layer of neural network<br>
W    Weights between input layer and hidden layer<br>
W'   Weights between hidden layer and output layer<br>
y    A softmax output layer having probabilities of every word in our vocabulary

## Softmax Function

In [2]:
def softmax(x): 
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x)) 
    return e_x / e_x.sum() 
   

## Word2Vec Class

### Contains Functions for Forward Propogation, Backward Propogation, Training and Predicting the words.

In <b>Forward Propogation</b>, We multiply one hot encoding of centre word (denoted by x) with the first weight matrix W to get hidden layer matrix h (of size N x 1). We then multiply the hidden layer vector h with second weight matrix W’ to get a new matrix u. We Then obtain our loss function, Which comes out to be <br><img src="https://miro.medium.com/max/994/1*XPhzBnf1xEb0u67qazx9nA.png"><br>
E being our Loss Function.<br>
In <b>Backward Propogation</b>, We find the partial derivatives of our loss function with respect to W and W’ to apply gradient descent algorithm.


In [3]:
class word2vec(object): 
    def __init__(self): 
        self.N = 10
        self.X_train = [] 
        self.y_train = [] 
        self.window_size = 2
        self.alpha = 0.001
        self.words = [] 
        self.word_index = {} 
   
    def initialize(self,V,data): 
        self.V = V 
        self.W = np.random.uniform(-0.8, 0.8, (self.V, self.N)) 
        self.W1 = np.random.uniform(-0.8, 0.8, (self.N, self.V)) 
           
        self.words = data 
        for i in range(len(data)): 
            self.word_index[data[i]] = i 
   
       
    def feed_forward(self,X): 
        self.h = np.dot(self.W.T,X).reshape(self.N,1) 
        self.u = np.dot(self.W1.T,self.h) 
        #print(self.u) 
        self.y = softmax(self.u)   
        return self.y 
           
    def backpropagate(self,x,t): 
        e = self.y - np.asarray(t).reshape(self.V,1) 
        # e.shape is V x 1 
        dLdW1 = np.dot(self.h,e.T) 
        X = np.array(x).reshape(self.V,1) 
        dLdW = np.dot(X, np.dot(self.W1,e).T) 
        self.W1 = self.W1 - self.alpha*dLdW1 
        self.W = self.W - self.alpha*dLdW 
           
    def train(self,epochs): 
        for x in range(1,epochs):         
            self.loss = 0
            for j in range(len(self.X_train)): 
                self.feed_forward(self.X_train[j]) 
                self.backpropagate(self.X_train[j],self.y_train[j]) 
                C = 0
                for m in range(self.V): 
                    if(self.y_train[j][m]): 
                        self.loss += -1*self.u[m][0] 
                        C += 1
                self.loss += C*np.log(np.sum(np.exp(self.u))) 
            print("epoch ",x, " loss = ",self.loss) 
            self.alpha *= 1/( (1+self.alpha*x) ) 
              
    def predict(self,word,number_of_predictions): 
        if word in self.words: 
            index = self.word_index[word] 
            X = [0 for i in range(self.V)] 
            X[index] = 1
            prediction = self.feed_forward(X) 
            output = {} 
            for i in range(self.V): 
                output[prediction[i][0]] = i 
               
            top_context_words = [] 
            for k in sorted(output,reverse=True): 
                top_context_words.append(self.words[output[k]]) 
                if(len(top_context_words)>=number_of_predictions): 
                    break
       
            return top_context_words 
        else: 
            print("Word not found in dicitonary") 

## Functions for Preparing and Preprocessing Data

In [4]:
def preprocessing(corpus): 
    stop_words = set(stopwords.words('english'))     
    training_data = [] 
    sentences = corpus.split(".") 
    for i in range(len(sentences)): 
        sentences[i] = sentences[i].strip() 
        sentence = sentences[i].split() 
        x = [word.strip(string.punctuation) for word in sentence 
                                     if word not in stop_words] 
        x = [word.lower() for word in x] 
        training_data.append(x) 
    return training_data 
       
   
def prepare_data_for_training(sentences,w2v): 
    data = {} 
    for sentence in sentences: 
        for word in sentence: 
            if word not in data: 
                data[word] = 1
            else: 
                data[word] += 1
    V = len(data) 
    data = sorted(list(data.keys())) 
    vocab = {} 
    for i in range(len(data)): 
        vocab[data[i]] = i 
       
    #for i in range(len(words)): 
    for sentence in sentences: 
        for i in range(len(sentence)): 
            center_word = [0 for x in range(V)] 
            center_word[vocab[sentence[i]]] = 1
            context = [0 for x in range(V)] 
              
            for j in range(i-w2v.window_size,i+w2v.window_size): 
                if i!=j and j>=0 and j<len(sentence): 
                    context[vocab[sentence[j]]] += 1
            w2v.X_train.append(center_word) 
            w2v.y_train.append(context) 
    w2v.initialize(V,data) 
   
    return w2v.X_train,w2v.y_train 

## Source Code

In [5]:
corpus = "" 
corpus += "anarchism originated as a term of abuse first used against early working class radicals including the diggers of the english revolution and the sans culottes of the french revolution"
epochs = 1000
  
training_data = preprocessing(corpus) 
w2v = word2vec() 
  
prepare_data_for_training(training_data,w2v) 
w2v.train(epochs)  
  
print(w2v.predict("term",3))     

epoch  1  loss =  155.32711193049806
epoch  2  loss =  155.1349316035461
epoch  3  loss =  154.94353942346672
epoch  4  loss =  154.7531178148475
epoch  5  loss =  154.56384470573835
epoch  6  loss =  154.37589254337567
epoch  7  loss =  154.18942738072514
epoch  8  loss =  154.00460804544744
epoch  9  loss =  153.82158540087147
epoch  10  loss =  153.64050170636978
epoch  11  loss =  153.46149008222966
epoch  12  loss =  153.28467408181294
epoch  13  loss =  153.11016737154267
epoch  14  loss =  152.93807351715242
epoch  15  loss =  152.7684858727068
epoch  16  loss =  152.60148756722347
epoch  17  loss =  152.4371515823133
epoch  18  loss =  152.27554091313297
epoch  19  loss =  152.11670880411143
epoch  20  loss =  151.96069905037015
epoch  21  loss =  151.80754635547663
epoch  22  loss =  151.65727673614865
epoch  23  loss =  151.50990796470336
epoch  24  loss =  151.36545004041778
epoch  25  loss =  151.22390568147884
epoch  26  loss =  151.08527082982565
epoch  27  loss =  150.94

epoch  342  loss =  143.97148558928603
epoch  343  loss =  143.96874031519906
epoch  344  loss =  143.9660108771383
epoch  345  loss =  143.96329713922523
epoch  346  loss =  143.9605989671228
epoch  347  loss =  143.95791622801377
epoch  348  loss =  143.95524879057976
epoch  349  loss =  143.95259652497947
epoch  350  loss =  143.9499593028288
epoch  351  loss =  143.94733699718046
epoch  352  loss =  143.94472948250367
epoch  353  loss =  143.9421366346648
epoch  354  loss =  143.93955833090826
epoch  355  loss =  143.9369944498373
epoch  356  loss =  143.9344448713954
epoch  357  loss =  143.9319094768479
epoch  358  loss =  143.92938814876413
epoch  359  loss =  143.92688077099933
epoch  360  loss =  143.92438722867763
epoch  361  loss =  143.92190740817432
epoch  362  loss =  143.91944119709945
epoch  363  loss =  143.91698848428092
epoch  364  loss =  143.9145491597482
epoch  365  loss =  143.91212311471602
epoch  366  loss =  143.90971024156883
epoch  367  loss =  143.907310433

epoch  700  loss =  143.48866591598807
epoch  701  loss =  143.48800718715657
epoch  702  loss =  143.48735033562812
epoch  703  loss =  143.48669535339977
epoch  704  loss =  143.48604223251377
epoch  705  loss =  143.4853909650574
epoch  706  loss =  143.48474154316284
epoch  707  loss =  143.48409395900646
epoch  708  loss =  143.4834482048087
epoch  709  loss =  143.48280427283422
epoch  710  loss =  143.48216215539068
epoch  711  loss =  143.4815218448291
epoch  712  loss =  143.4808833335435
epoch  713  loss =  143.48024661397014
epoch  714  loss =  143.4796116785879
epoch  715  loss =  143.47897851991752
epoch  716  loss =  143.47834713052137
epoch  717  loss =  143.47771750300336
epoch  718  loss =  143.47708963000832
epoch  719  loss =  143.47646350422215
epoch  720  loss =  143.47583911837097
epoch  721  loss =  143.47521646522168
epoch  722  loss =  143.47459553758048
epoch  723  loss =  143.47397632829393
epoch  724  loss =  143.4733588302477
epoch  725  loss =  143.4727430

The Model Predicts 'anarchism', 'originated' and 'class' to have similar meaning to that of 'term' which is somewhat true.<br>
Hence we can say that this model is Functional.