### Importing necessary modules

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.constraints import nonneg
import numpy as np
import pandas as pd
import nltk
import operator

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Dataset
The Wikipedia Movie Plots dataset is used here.<br>
Dataset link: https://www.kaggle.com/jrobischon/wikipedia-movie-plots
* You can also use your own dataset.

In [None]:
#data
df = pd.read_csv('dataset/wiki_movie_plots_deduped.csv') #get it from Kaggle

# The stop-word list is a separate NLTK data package. Fetch it on first use so
# that a fresh environment does not stop here with a LookupError.
try:
    STOP_WORDS = set(nltk.corpus.stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    STOP_WORDS = set(nltk.corpus.stopwords.words('english'))

# preview of the plot column
df.Plot.head()

In [None]:
# Full plot text of the entry labelled 0, to see what the raw input looks like
df['Plot'][0]

### Necessary Functions

In [None]:
#filtering and tokenizing function
def sentence_to_wordlist(sentence, filters="!\"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n?,।!‍.'0123456789০১২৩৪৫৬৭৮৯‘\u200c–“”…‘"):
    '''Split a sentence into tokens and drop stop words.

    Every character listed in `filters` is replaced by a space, the result is
    split on whitespace, and tokens present in the module-level STOP_WORDS set
    are removed. Letter case is left untouched.

    Arguments:
        sentence -- the string to tokenize
        filters -- string of characters that act as token separators
    Returns:
        list of the remaining tokens, in their original order
    '''
    # map every filtered character to a blank so it separates tokens
    blank_out = str.maketrans(filters, ' ' * len(filters))
    tokens = sentence.translate(blank_out).split()
    return [token for token in tokens if token not in STOP_WORDS]

#function for calculating word similarity
def cosine_similarity(u, v):
    '''Return the cosine similarity between the vectors u and v.

    The value is dot(u, v) / (||u|| * ||v||), where ||.|| is the L2 norm.

    Arguments:
        u, v -- numeric vectors accepted by np.dot
    Returns:
        The similarity value. When either vector has zero norm the ratio is
        undefined; 0.0 is returned in that case instead of nan, so that the
        result can still be sorted and compared.
    '''
    dot = np.dot(u, v)
    norm_u = np.sqrt(np.sum(np.power(u, 2)))
    norm_v = np.sqrt(np.sum(np.power(v, 2)))

    denominator = norm_u * norm_v
    if denominator == 0:
        # avoid 0/0 -> nan (and the accompanying RuntimeWarning)
        return 0.0
    return np.divide(dot, denominator)

#determine similarity between two words
def word_similarity(word1,word2,weight,word_to_index,print_vec=False):
    '''Return the cosine similarity between the vectors of two words.

    Arguments:
        word1, word2 -- the words to compare; both must be keys of word_to_index
        weight -- indexable collection of word vectors, one row per word index
        word_to_index -- mapping from word to its row in `weight`
        print_vec -- when True, print each word followed by its vector
    '''
    vec_a = weight[word_to_index[word1]]
    vec_b = weight[word_to_index[word2]]
    if print_vec:
        for label, vec in ((word1, vec_a), (word2, vec_b)):
            print(label, vec)
    return cosine_similarity(vec_a, vec_b)

#returns most similar words
def most_similiar(word,weight,word_to_index,count=5,print_vec=False):
    '''Return the `count` words whose vectors are closest to that of `word`.

    Arguments:
        word -- the query word; must be a key of word_to_index
        weight -- indexable collection of word vectors, one row per word index
        word_to_index -- mapping from word to its row in `weight`; its keys are
                         the candidate words (no global vocabulary is needed)
        count -- maximum number of neighbours to return
        print_vec -- when True, print the query word followed by its vector
    Returns:
        list of (word, cosine similarity) tuples, best match first; the query
        word itself is never included.
    '''
    e1 = weight[word_to_index[word]]
    if print_vec:
        print(word, e1)

    # compare by value (!=): two equal strings are not always the same object
    similarity = {
        other: cosine_similarity(e1, weight[index])
        for other, index in word_to_index.items()
        if other != word
    }

    sorted_similarity = sorted(similarity.items(), key=operator.itemgetter(1), reverse=True)
    # the query word is already excluded, so the ranking starts at position 0
    return sorted_similarity[:count]

#function for vocab and indexing
def word_idx_generate(text):
    '''
    Build the vocabulary and the word <-> index lookup tables.

    Arguments:
        text -- iterable of tokenized sentences (each an iterable of words)
    Returns:
        vocab -- list of unique words in order of first appearance
        word2id -- dict mapping each word to its position in vocab
        id2word -- dict mapping each position back to its word
    '''
    vocab = []
    word2id = {}
    for sent in text:
        for word in sent:
            # dict membership is O(1); scanning the vocab list here would make
            # the whole pass quadratic in the vocabulary size
            if word not in word2id:
                word2id[word] = len(vocab)
                vocab.append(word)

    id2word = dict(enumerate(vocab))
    return vocab,word2id,id2word

#dataset: CBOW Representation
def cbow(text,window=4,word_to_index=None):
    '''Return dataset as encoded context (X) vs one-hot target (Y) words.

    A window of `window` consecutive words slides over every sentence. The
    first `window - 1` words of each window form the context and the last word
    is the target. Sentences shorter than `window` produce no samples.

    Arguments:
        text -- sequence of tokenized sentences (lists of words)
        window -- number of words per window (context words + 1 target)
        word_to_index -- mapping from word to column index. Optional, and
                         placed last so existing positional calls keep
                         working; when omitted, an index is built from `text`
                         in order of first appearance.
    Returns:
        X -- array of shape (samples, 1, V): sum of the context one-hot vectors
        Y -- array of shape (samples, 1, V): one-hot vector of the target word
        X_context -- list with the context words of every sample
        Y_target -- list with the target word of every sample
        X and Y are numpy arrays, like the ones returned by skip_gram.
    Raises:
        ValueError -- if window is smaller than 1
    '''
    if window < 1:
        raise ValueError('window must be at least 1')

    if word_to_index is None:
        text = list(text)  # iterated twice below
        word_to_index = {}
        for sent in text:
            for word in sent:
                word_to_index.setdefault(word, len(word_to_index))

    V = len(word_to_index)
    X = []
    Y = []
    X_context = []
    Y_target = []
    for sent in text:
        for start in range(len(sent) - window + 1):
            end = start + window - 1
            context_words = list(sent[start:end])
            target_word = sent[end]

            # summing one-hot vectors == counting each context word
            context_vec = np.zeros((1, V))
            for word in context_words:
                context_vec[0][word_to_index[word]] += 1

            target_vec = np.zeros((1, V))
            target_vec[0][word_to_index[target_word]] = 1

            X.append(context_vec)
            Y.append(target_vec)
            X_context.append(context_words)
            Y_target.append(target_word)

    return np.array(X),np.array(Y),X_context,Y_target

#dataset: Skip-gram Representation
def skip_gram(text,word_to_index,window=4):
    '''Return dataset as one-hot input word (X) vs encoded following words (Y).

    A window of `window` consecutive words slides over every sentence. The
    first word of each window is the input and the remaining `window - 1`
    words are what the model has to predict. Sentences shorter than `window`
    produce no samples.

    Arguments:
        text -- iterable of tokenized sentences (lists of words)
        word_to_index -- mapping from word to column index; its size sets the
                         width V of the encoded vectors
        window -- number of words per window (1 input word + the rest)
    Returns:
        X -- array of shape (samples, 1, V): one-hot vector of the input word
        Y -- array of shape (samples, 1, V): sum of the one-hot vectors of the
             following words (a repeated word counts more than once)
        X_context -- list with the input word of every sample
        Y_target -- list with the list of following words of every sample
    Raises:
        ValueError -- if window is smaller than 1
    '''
    if window < 1:
        raise ValueError('window must be at least 1')

    # take the width from the index actually used for encoding instead of
    # relying on a global `vocab` that may be missing or out of sync
    V = len(word_to_index)
    X = []
    Y = []
    X_context = []
    Y_target = []
    for sent in text:
        for start in range(len(sent) - window + 1):
            input_word = sent[start]
            following_words = list(sent[start + 1:start + window])

            input_vec = np.zeros((1, V))
            input_vec[0][word_to_index[input_word]] = 1

            # summing one-hot vectors == counting each following word
            following_vec = np.zeros((1, V))
            for word in following_words:
                following_vec[0][word_to_index[word]] += 1

            X.append(input_vec)
            Y.append(following_vec)
            X_context.append(input_word)
            Y_target.append(following_words)

    X = np.array(X)
    Y = np.array(Y)

    return X,Y,X_context,Y_target

def contex_target(X_context,Y_target):
    '''Print one line per sample: its position, its context entry and the
    matching target entry taken from Y_target at the same position.'''
    for position, context in enumerate(X_context):
        target = Y_target[position]
        print(position, "Context :", context, '---> Target:', target)

### Preparing Dataset for word vectors
We use only the first 200 plots to keep the processing and training time short.

In [None]:
# Take the first 200 entries, then lower-case, filter and tokenize each plot
text = [sentence_to_wordlist(plot.lower()) for plot in df[:200].Plot]

In [None]:
# Preview only: the first 20 tokens of the first plot (the full list is too long to display)
text[0][:20]

In [None]:
# Vocabulary plus the word -> index and index -> word lookup tables
vocab, word2idx, idx2word = word_idx_generate(text)

In [None]:
# Encode the corpus as skip-gram training pairs and report the array shapes
X, Y, X_context, Y_target = skip_gram(text, word2idx)
print(f'X: {X.shape} Y: {Y.shape}')

Here we follow the skip-gram model. The shapes of X and Y show that we have **18360 context words paired with 18360 sets of target words.** <br>
We can display each **context word** vs. its **target words** with the *contex_target()* function.

In [None]:
# Show only the first 20 pairs; printing every sample floods the notebook output
contex_target(X_context[:20], Y_target[:20])

In [None]:
#reshaping dataset to fit in neural network
# Drop the singleton middle axis: (samples, 1, V) -> (samples, V).
# Using the last axis (instead of axis 2) keeps this cell safe to re-run:
# once X is already 2-D the reshape is simply a no-op.
r = [X.shape[0], X.shape[-1]]
X = X.reshape(r)
Y = Y.reshape(r)

### Model Definition

In [None]:
# create model: a 100-unit hidden layer and a softmax output with one unit per
# input column (r[1]); both layers are constrained to non-negative weights
# `kernel_initializer` is the Keras 2 name of the old Keras 1 `init` argument
model = Sequential()
model.add(Dense(100, input_dim=r[1], kernel_initializer='uniform', kernel_constraint=nonneg(), activation='sigmoid'))
model.add(Dense(r[1], kernel_initializer='uniform', kernel_constraint=nonneg(), activation='softmax'))

# NOTE(review): the targets built by skip_gram can mark several words per
# sample, while the output is a softmax trained with binary cross-entropy —
# confirm this pairing is intended.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

### Training Word Vectors

In [None]:
# `epochs` is the Keras 2 name of the old Keras 1 `nb_epoch` argument
train = model.fit(X, Y, epochs=1)

### Word Similarity Analysis
Now we will use the weight matrix of our Neural Network.<br>
We have two weight matrices, one for each layer. The hidden layer has 100 units, so each word is represented by a 100-dimensional vector.
<br><br>
Weights from both layers need to be summed for final vector.
<br><br>
**An important point:** word vectors have far fewer dimensions than one-hot encoded vectors, which reduces the computational cost of NLP tasks.

In [None]:
# Kernel of each Dense layer (the bias vectors are read but not used)
weight1, biases = model.layers[0].get_weights()
weight2, biases = model.layers[1].get_weights()
# The second kernel is transposed so both have one row per word, then summed
weight = weight1 + weight2.transpose()

In [None]:
# Cosine similarity between the trained vectors of two words
print(word_similarity('afternoon', 'dead', weight=weight, word_to_index=word2idx))

In [None]:
# Nearest neighbours of 'man' according to the freshly trained vectors
print(most_similiar('man', weight=weight, word_to_index=word2idx))

### Reproducing Past Results 
In this section, the weights saved from the original training run are loaded to reproduce the results discussed in the Medium article.

In [None]:
# joblib is distributed as its own package; the copy bundled under
# sklearn.externals was deprecated and later removed from scikit-learn.
# Fall back to the bundled copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# File that holds the serialized model (presumably from the earlier training run — confirm)
filename = 'model.sav'
# Uncomment to write the model trained above to that file
#joblib.dump(model, filename)

In [None]:
# NOTE(review): joblib.load unpickles the file, and unpickling can execute
# arbitrary code — only load a model.sav that you created yourself or trust.
# The file must exist; nothing in this notebook creates it unless the dump
# line in the previous cell is uncommented.
loaded_model = joblib.load(filename)

In [None]:
# Same combination as for the freshly trained model, using the loaded one
W1, biases = loaded_model.layers[0].get_weights()
W2, biases = loaded_model.layers[1].get_weights()
# The second kernel is transposed so both have one row per word, then summed
W = W1 + W2.transpose()

In [None]:
# Nearest neighbours of 'man' according to the loaded weights
print(most_similiar('man', weight=W, word_to_index=word2idx))

In [None]:
# Vector stored in the first row of W
W[0, :]

In [None]:
# Combined weight matrix of the loaded model
W

In [None]:
# Number of distinct words in the index
len(word2idx)

In [None]:
# First component of the first row of W
W[0, 0]

In [None]:
# Shape of the input matrix after the reshape step
X.shape

In [None]:
# Preview only: the first 10 (word, index) pairs instead of the whole dictionary
list(word2idx.items())[:10]

In [None]:
# Encoded input matrix
X


In [None]:
# Preview only: the first 10 input words instead of the whole list
X_context[:10]