# Training Word2Vec Models

In [107]:
import pandas as pd
import numpy as np
import string
import os


In [108]:
# Load the IMDB review dataset. The visible cells below only read the
# 'review' column from it.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index — consider index_col=0 if it is not needed; left unchanged here.
data = pd.read_csv('imdb.csv', encoding='utf-8')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [109]:
# Turn every review into a list of lower-cased word tokens with punctuation
# and English stop words removed; review_list holds one token list per review.
#
# NOTE(review): RegexpTokenizer and stopwords are not imported in any visible
# cell — presumably nltk.tokenize.RegexpTokenizer and nltk.corpus.stopwords
# left over from earlier kernel state. Confirm and import them in the top
# import cell so the notebook survives Restart & Run All.

# Build the tokenizer and the stop-word set once: both are loop-invariant, so
# recreating them for every review only repeats identical work.
rem_punc = RegexpTokenizer(r'\w+')  # matches runs of word characters, so punctuation is dropped
stop_word_list = set(stopwords.words('english'))  # set gives fast membership tests

review_list = []

indv_lines = data['review'].values.tolist()
for line in indv_lines:
    # create word tokens and remove punctuation in one go
    tokens = rem_punc.tokenize(line)

    # lower-case each token, then keep only those that are not stop words
    words = [w for w in (t.lower() for t in tokens) if w not in stop_word_list]

    review_list.append(words)

# number of tokenised reviews (one entry per input row)
len(review_list)

50000

In [110]:
import gensim

# Length of each learned word vector.
Embedding_Dim = 100

# Fit a Word2Vec model on the tokenised reviews using four worker threads.
# min_count=1 keeps every word in the vocabulary, including words seen once.
model = gensim.models.Word2Vec(
    sentences=review_list,
    size=Embedding_Dim,
    workers=4,
    min_count=1,
)

In [111]:
# Collect the words the model learned and report how many there are.
words = [entry for entry in model.wv.vocab]
print('Vocabulary Size..%d' % (len(words),))

Vocabulary Size..101791


In [112]:
# Show the ten words the model considers most similar to 'bad'.
model.wv.most_similar(positive=['bad'], topn=10)

[('awful', 0.7533317804336548),
 ('terrible', 0.7517919540405273),
 ('horrible', 0.7437856793403625),
 ('sucks', 0.7067738771438599),
 ('lousy', 0.685297429561615),
 ('crappy', 0.667189359664917),
 ('good', 0.6625716686248779),
 ('suck', 0.6590425372123718),
 ('atrocious', 0.6514021158218384),
 ('lame', 0.6419229507446289)]

In [113]:
# Ask the model which of the three words fits least well with the others.
model.wv.doesnt_match(str.split('man horse lady'))

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'horse'

In [114]:
# Word-vector arithmetic: queen + man - woman = ?
# 'queen' and 'man' contribute positively, 'woman' negatively.
model.wv.most_similar_cosmul(
    negative=['woman'],
    positive=['queen', 'man'],
    topn=10,
)

[('vs', 0.884979248046875),
 ('iii', 0.874188244342804),
 ('paroxismus', 0.8653695583343506),
 ('slumberness', 0.864371657371521),
 ('aka', 0.8598666787147522),
 ('savage', 0.8572079539299011),
 ('eagle', 0.855822741985321),
 ('wizard', 0.8551560640335083),
 ('godfather', 0.8523043394088745),
 ('prowler', 0.8499919176101685)]

In [115]:
# Import libraries for showing how words of similar context are grouped together
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [116]:
# Project the first N_PLOT_WORDS word vectors to 2-D with t-SNE and draw them
# as an interactive bokeh scatter plot whose hover tooltip shows each word.
from sklearn.manifold import TSNE

N_PLOT_WORDS = 5000  # how many vocabulary entries to project and plot

# Defining the chart
output_notebook()
plot_chart = bp.figure(plot_width=700, plot_height=600,
                       title='Map of %d words vectors' % N_PLOT_WORDS,
                       tools='pan,wheel_zoom,box_zoom,reset,hover,previewsave',
                       x_axis_type=None, y_axis_type=None, min_border=1)

# Slice the vocabulary once so the vectors and their labels are guaranteed to
# line up row for row.
plot_words = list(model.wv.vocab.keys())[:N_PLOT_WORDS]

# Each vector has Embedding_Dim (100) dimensions, as set when the model was
# trained. Index through model.wv: indexing the model object directly
# (model[w]) is the deprecated form.
word_vectors = [model.wv[w] for w in plot_words]

# Reducing dimensionality by converting the vectors to 2-D vectors;
# random_state is fixed so the layout does not change between runs.
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# Storing the 2-D coordinates together with their words in a DataFrame
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['word'] = plot_words

plot_chart.scatter(x='x', y='y', source=tsne_df)
hover = plot_chart.select(dict(type=HoverTool))
# The tooltip field must match the data column name exactly: the column is
# 'word', so the previous '@words' did not refer to any column.
hover.tooltips = {'word': '@word'}
show(plot_chart)

  import sys


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 5000 samples in 0.049s...
[t-SNE] Computed neighbors for 5000 samples in 8.847s...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.274870
[t-SNE] KL divergence after 250 iterations with early exaggeration: 83.081047
[t-SNE] KL divergence after 1000 iterations: 2.247330


In [117]:
# Write the trained word vectors to disk in the word2vec format, as plain
# text rather than binary.
model_file = 'imdb_word2vec.txt'
model.wv.save_word2vec_format(fname=model_file, binary=False)