# Gensim word vector visualization of various word vectors

In [7]:
import numpy as np

# Get the interactive Tools for Matplotlib
%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [2]:
# Mount Google Drive inside the Colab runtime at /content/drive so files
# stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


For looking at word vectors, I'll use Gensim. We also use it in hw1 for word vectors. Gensim isn't really a deep learning package. It's a package for word and text similarity modeling, which started with (LDA-style) topic models and grew into SVD and neural word representations. But it's efficient and scalable, and quite widely used.

Our homegrown Stanford offering is GloVe word vectors. Gensim doesn't give them first-class support, but allows you to convert a file of GloVe vectors into word2vec format. You can download the GloVe vectors from [the GloVe page](https://nlp.stanford.edu/projects/glove/). They're inside [this zip file](https://nlp.stanford.edu/data/glove.6B.zip).

(I use the 100d vectors below as a mix between speed and smallness vs. quality. If you try out the 50d vectors, they basically work for similarity but clearly aren't as good for analogy problems. If you load the 300d vectors, they're even better than the 100d vectors.)

In [3]:
#!wget https://nlp.stanford.edu/data/glove.6B.zip
#!cp glove.* /content/drive/MyDrive/dataScience/deepLearning/nlp/
! cp /content/drive/MyDrive/dataScience/deepLearning/nlp/glove.6B.zip . 
!unzip glove.6B.zip 

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [4]:
# List the working directory in long format, sorted by modification time with
# the oldest entries first and human-readable sizes, to check the extracted files.
!ls -ltrh

total 2.9G
-rw-rw-r-- 1 root root 332M Aug  4  2014 glove.6B.100d.txt
-rw-rw-r-- 1 root root 662M Aug  4  2014 glove.6B.200d.txt
-rw-rw-r-- 1 root root 164M Aug  4  2014 glove.6B.50d.txt
-rw-rw-r-- 1 root root 990M Aug 27  2014 glove.6B.300d.txt
drwxr-xr-x 1 root root 4.0K Jun 15 13:37 sample_data
drwx------ 5 root root 4.0K Jun 17 12:12 drive
-rw------- 1 root root 823M Jun 17 12:12 glove.6B.zip


In [5]:
!head glove.6B.100d.txt -n 3

the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062
, -0.10767 0.11053 0.59812 -0.54361 0.67396 0.10663 0.038867 0.35481 0.06351 -0.094189 0.15786 -0.81665 0.14172 0.21939 0.58505 -0.52158 

In [8]:
# Convert the extracted GloVe text file into word2vec text format so that
# KeyedVectors.load_word2vec_format can read it.
# The file sits in the working directory (see the unzip step), so it is referenced
# by a plain relative path; gensim's datapath() would instead resolve the name
# inside gensim's own bundled test-data directory.
glove_file = 'glove.6B.100d.txt'
word2vec_glove_file = 'glove.6B.100d.word2vec.txt'
glove2word2vec(glove_file, word2vec_glove_file)

(400000, 100)

In [9]:
# Load the converted vectors; `model` is the KeyedVectors object queried by the cells below.
model = KeyedVectors.load_word2vec_format(word2vec_glove_file)

### Similar words for given words

In [11]:
# Show the nearest neighbours of 'banana' as (word, similarity score) pairs.
model.most_similar('banana')

[('coconut', 0.7097253799438477),
 ('mango', 0.7054824233055115),
 ('bananas', 0.6887733936309814),
 ('potato', 0.6629636287689209),
 ('pineapple', 0.6534532904624939),
 ('fruit', 0.6519855260848999),
 ('peanut', 0.6420576572418213),
 ('pecan', 0.6349173188209534),
 ('cashew', 0.6294420957565308),
 ('papaya', 0.6246591210365295)]

### Similarity of two given words

In [12]:
# `model` is already a KeyedVectors instance, so call similarity on it directly;
# going through the deprecated `.wv` attribute triggers a deprecation warning
# and the attribute is not available in newer gensim releases.
model.similarity('apple', 'banana')

  """Entry point for launching an IPython kernel.


0.5054469

### Using similarity of two given words to solve "Odd Man Out" problem

In [13]:
def odd_man_out(question_list, similarity=None):
  """
  Pick the word that fits least well with the others.

  Every pair of distinct words is scored with `similarity`. The pairs are then
  visited from most to least similar and their members are crossed off the
  candidate list; the last word left standing is returned as the odd man out.

  Input:
    question_list: a list of words like ["apple", "banana", "guava", "cashew"].
      Words are lower-cased before scoring and repeated words are counted once.
    similarity: optional callable (word_a, word_b) -> number, higher meaning
      more similar. Defaults to `model.similarity` of the notebook-level `model`.
  Output:
    The odd word (lower-cased). If only one distinct word is given, that word
    is returned; an empty list gives None.

  Errors raised by `similarity` (for example for a word it does not know)
  propagate unchanged.
  """
  if similarity is None:
    similarity = model.similarity

  # Lower-case and drop repeats while keeping first-seen order. Without this,
  # an input such as ['apple', 'Apple'] produces no pairs at all and the
  # elimination below would never finish.
  words = list(dict.fromkeys(w.lower() for w in question_list))
  if not words:
    return None
  if len(words) == 1:
    return words[0]

  # Score each unordered pair exactly once.
  scores = {}
  for i, first in enumerate(words):
    for second in words[i + 1:]:
      scores[(first, second)] = similarity(first, second)

  # Walk the pairs from most to least similar, removing their members; the
  # sort is stable, so tied pairs keep their insertion order.
  remaining = list(words)
  for pair in sorted(scores, key=scores.get, reverse=True):
    for word in pair:
      if word in remaining:
        remaining.remove(word)
        if len(remaining) == 1:
          return remaining[0]

In [14]:
"""
Sample Question 
answers are  
"""

# Sample questions: each inner list is one "odd man out" puzzle.
question_list_list = [
  ['Banana', 'Mango', 'Cashew', 'Guava'],
  ['Gold', 'Silver', 'Carbon', 'Diamond'],
  ['Screw', 'Hammer', 'Needle', 'Pin'],
  ['Car', 'Bicycle', 'Motorcycle', 'Jeep'],
  ['Listen', 'Swim', 'Walk', 'Climb'],
]

# Solve every puzzle and report the word singled out for it.
for qlist in question_list_list:
  answer = odd_man_out(qlist)
  print("Odd man from %s is %s "%(qlist, answer))

Odd man from ['Banana', 'Mango', 'Cashew', 'Guava'] is cashew 
Odd man from ['Gold', 'Silver', 'Carbon', 'Diamond'] is carbon 
Odd man from ['Screw', 'Hammer', 'Needle', 'Pin'] is hammer 
Odd man from ['Car', 'Bicycle', 'Motorcycle', 'Jeep'] is jeep 
Odd man from ['Listen', 'Swim', 'Walk', 'Climb'] is listen 
