In [1]:
import pandas as pd
import gensim
from sklearn.manifold import TSNE

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import random

# [Word2Vec Algorithm](https://en.wikipedia.org/wiki/Word2vec)

### What is it?
It's a Natural Language Processing [NLP] algorithm that transforms words to vectors.

### When do I use it?
When I want to explore semantics of words. For example: find word's oposites, find context word etc.

### Why should I use it?
1] NLP

2] It's a general idea of mapping elements (eg. strings) onto vectors (and vectors are good to work with). I can use it for recommending next product or a song in playlist.

# How does it work?

* Word2Vec is actualy shallow Neural Network [NN] (1 hidden layer).

* **Starting point:** 
    * We have N words. 
    * Each word is represented by N-dimensional vector with 1 on index position and 0s elsewhere (one-hot encoding).


* We let the NN predict word's naighbours.


* We cut out only the guts of the trained NN - scored hidden leayer values for each word.


* **End point:** 
    * Each word is represented by only M-dimension vector (M << N), that carries some context information.   **:-)**

### Why is it important to have vectors instead of words? Because we have the Algebra!



## Word2Vec: one-hot encoding
<img src="one_hot.png" alt="one_hot" style="width: 600px;"/>

## Word2Vec: Word2Vec output 
<img src="w2v_output.png" alt="w2v_output" style="width: 600px;"/>

## Approaches:
* **Skipgram** [SG]
    * Uses neighbour words as an input to NN and central word as an output.

* **Continuous Bag Of Words** [CBOW]
    * Uses central word as an input to NN and neighbour words as an output.

## Word2Vec: Word neighborhood
<img src="word_neighbour.png" alt="word_neighbour" style="width: 600px;"/>

## Word2Vec: Skipgram method
<img src="skipgram.png" alt="skipgram" style="width: 600px;"/>

## Word2Vec: CBOW method
<img src="cbow.png" alt="skipgram" style="width: 600px;"/>

## Dataset - QUORA

In [3]:
# Load the dataset
df = pd.read_csv("data/quora.csv")
df.head()

Unnamed: 0,id,qid1,qid2,question1,question2,is_duplicate
0,0,1,2,What is the step by step guide to invest in sh...,What is the step by step guide to invest in sh...,0
1,1,3,4,What is the story of Kohinoor (Koh-i-Noor) Dia...,What would happen if the Indian government sto...,0
2,2,5,6,How can I increase the speed of my internet co...,How can Internet speed be increased by hacking...,0
3,3,7,8,Why am I mentally very lonely? How can I solve...,Find the remainder when [math]23^{24}[/math] i...,0
4,4,9,10,"Which one dissolve in water quikly sugar, salt...",Which fish would survive in salt water?,0


In [4]:
# Transfer the dataset into list of lists
def read_questions(row,column_name):
    return gensim.utils.simple_preprocess(str(row[column_name]).encode('utf-8'))
    
documents = []
for index, row in df.iterrows():
    documents.append(read_questions(row,"question1"))
    if row["is_duplicate"] == 0:
        documents.append(read_questions(row,"question2"))

In [5]:
# document example 
documents[:4]

[['what',
  'is',
  'the',
  'step',
  'by',
  'step',
  'guide',
  'to',
  'invest',
  'in',
  'share',
  'market',
  'in',
  'india'],
 ['what',
  'is',
  'the',
  'step',
  'by',
  'step',
  'guide',
  'to',
  'invest',
  'in',
  'share',
  'market'],
 ['what', 'is', 'the', 'story', 'of', 'kohinoor', 'koh', 'noor', 'diamond'],
 ['what',
  'would',
  'happen',
  'if',
  'the',
  'indian',
  'government',
  'stole',
  'the',
  'kohinoor',
  'koh',
  'noor',
  'diamond',
  'back']]

## Model build

In [6]:
# Lets train the word2vec model using skipgram/cbow
w2v_model = gensim.models.Word2Vec(size=150, window=10, min_count=5, sg=0, workers=10)
w2v_model.build_vocab(documents) 
w2v_model.train(sentences=documents, total_examples=len(documents), epochs=w2v_model.epochs)

(25175944, 35144510)

## Model exploration

In [7]:
# Model vocabulary
print(f"Our vocabulary hase {len(w2v_model.wv.vocab)} words.")
w2v_model.wv.vocab

Our vocabulary hase 27775 words.


{'what': <gensim.models.keyedvectors.Vocab at 0x10b28eef0>,
 'is': <gensim.models.keyedvectors.Vocab at 0x112c0e8d0>,
 'the': <gensim.models.keyedvectors.Vocab at 0x112c0e080>,
 'step': <gensim.models.keyedvectors.Vocab at 0x112c0e0f0>,
 'by': <gensim.models.keyedvectors.Vocab at 0x112c0e7f0>,
 'guide': <gensim.models.keyedvectors.Vocab at 0x112c0e1d0>,
 'to': <gensim.models.keyedvectors.Vocab at 0x112c0e240>,
 'invest': <gensim.models.keyedvectors.Vocab at 0x112c0e400>,
 'in': <gensim.models.keyedvectors.Vocab at 0x112c0e208>,
 'share': <gensim.models.keyedvectors.Vocab at 0x112c0e278>,
 'market': <gensim.models.keyedvectors.Vocab at 0x112c0e438>,
 'india': <gensim.models.keyedvectors.Vocab at 0x14aa76048>,
 'story': <gensim.models.keyedvectors.Vocab at 0x14aa76080>,
 'of': <gensim.models.keyedvectors.Vocab at 0x14aa760b8>,
 'kohinoor': <gensim.models.keyedvectors.Vocab at 0x14aa760f0>,
 'koh': <gensim.models.keyedvectors.Vocab at 0x14aa76128>,
 'noor': <gensim.models.keyedvectors.Voc

In [8]:
word = 'trump'
print(f"Word {word} is represented by {len(w2v_model.wv[word])}-dim vector:")
w2v_model.wv[word]

Word trump is represented by 150-dim vector:


array([-3.0021594e+00, -1.6013217e+00, -1.3477046e+00,  1.8107045e+00,
        1.0545655e+00,  2.8576498e+00,  3.1857543e+00, -1.5512133e-01,
        1.0026896e+00, -2.7203097e+00, -2.6428261e+00,  2.0584211e+00,
        7.5834483e-01, -8.0337667e-01,  2.7588370e+00,  2.2655666e+00,
        1.4655324e+00, -1.8966476e+00,  6.8301016e-01,  4.4246539e-02,
       -3.1087118e-01,  3.5403550e+00, -5.3326893e-01, -1.7692728e+00,
        5.1000398e-01,  1.6844894e+00,  7.4914271e-01, -2.8916543e+00,
       -1.0654932e+00, -5.8375257e-01, -1.1625142e+00, -1.8287467e+00,
        4.3621227e-01, -3.7072525e+00,  1.7972028e+00,  1.6560841e+00,
       -3.7426863e+00,  8.4443456e-01,  2.2283988e+00, -3.0649856e-01,
        2.4297147e+00, -3.0128231e+00, -1.7465433e+00,  3.3772628e+00,
        1.6061653e+00,  1.0635952e+00, -2.8212173e+00,  1.3719076e-01,
       -3.1639478e-01,  1.9403396e+00, -2.0502132e-01,  8.6858004e-01,
       -9.9654227e-01, -4.8301321e-01, -2.6555356e-01,  2.6888206e+00,
      

In [9]:
# TSNE representation of N words
N = 1000
wanted_vocab = random.sample(list(w2v_model.wv.vocab), N)
X = w2v_model[wanted_vocab] # X is an array of word vectors, each vector containing 150 tokens
tsne_model = TSNE(perplexity=40, n_components=2, init="pca", n_iter=5000, random_state=23)
Y = tsne_model.fit_transform(X)


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



In [10]:
# Plot
trace = go.Scatter(
    x = Y[:,0],
    y = Y[:,1],
    text = list(wanted_vocab),
    mode='text'
)

data = [trace]

# Plot and embed in ipython notebook!
iplot(data, filename='basic-scatter')

## Using the model

In [11]:
# Find similar word 1
words1 = ['trump']
w2v_model.wv.most_similar(positive=words1, topn=10)

[('trumps', 0.7408903241157532),
 ('knuth', 0.6836468577384949),
 ('hoffman', 0.6373307704925537),
 ('hillary', 0.6026067733764648),
 ('obama', 0.5921074151992798),
 ('bernie', 0.5784475803375244),
 ('president', 0.5665757656097412),
 ('democrats', 0.5629357099533081),
 ('election', 0.5411714315414429),
 ('presidential', 0.5351323485374451)]

In [12]:
# Find similar word 2
words1 = ['panda', 'rabbit', 'dog']
w2v_model.wv.most_similar(positive=words1, topn=10)

[('hamster', 0.78056800365448),
 ('shrimp', 0.7587584853172302),
 ('chow', 0.7555397748947144),
 ('tuna', 0.7444003820419312),
 ('canned', 0.7407806515693665),
 ('snake', 0.734947144985199),
 ('fleas', 0.7347173690795898),
 ('leopard', 0.7345868349075317),
 ('rat', 0.7278550863265991),
 ('pug', 0.7276909947395325)]

In [13]:
# Find similar word 4
words1 = ['husband', 'man']
words2 = ['woman']
w2v_model.wv.most_similar(positive=words1, negative=words2, topn=10)

[('wife', 0.7653980851173401),
 ('son', 0.6974135637283325),
 ('daughter', 0.6875126361846924),
 ('mother', 0.6809148788452148),
 ('brother', 0.6784870624542236),
 ('father', 0.6765068769454956),
 ('dad', 0.6480885148048401),
 ('sister', 0.6466807126998901),
 ('mom', 0.6303774118423462),
 ('boyfriend', 0.6156502366065979)]

In [None]:
# Find similar word 5
words1 = ['president', 'trump']
words2 = ['strength']
w2v_model.wv.most_similar(positive=words1, negative=words2, topn=10)

In [None]:
# Find similar word 4
words1 = ['programming','beginner']
words2 = []
w2v_model.wv.most_similar(positive=words1, negative=words2, topn=10)

In [None]:
# What should not be there? 1
w2v_model.wv.doesnt_match(['tesla', 'bmw', 'superman', 'mercedes'])

In [None]:
# What should not be there? 2
w2v_model.wv.doesnt_match(['trump', 'president', 'wall', 'business'])

In [None]:
# What should not be there? 3
w2v_model.wv.doesnt_match(['weed', 'beer', 'herion', 'cocain'])

## Sources
[Wiki](https://en.wikipedia.org/wiki/Word2vec)

[Good Article 1](https://blog.acolyer.org/2016/04/21/the-amazing-power-of-word-vectors/)

[Good Article 1](http://kavita-ganesan.com/gensim-word2vec-tutorial-starter-code/#.XJfruC1_HUo)

[Tensor Flow Article](https://www.tensorflow.org/tutorials/representation/word2vec)