
**KeyedVectors** - This object holds the mapping between words and their embeddings. After training, it can be used directly to query those embeddings in various ways.

In [26]:
import gensim
import pandas as pd
from gensim.models import word2vec
from gensim.models import KeyedVectors
from sklearn.metrics.pairwise import cosine_similarity
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec


In [15]:
# Location of the word2vec-format binary file holding the pretrained vectors.
GOOGLE_NEWS_VECTORS_PATH = 'word2vec-applications-transfer-learning-nlp-master/GoogleNews-vectors-negative300.bin'

# Read the binary word2vec file into a KeyedVectors lookup object.
word_vectors = KeyedVectors.load_word2vec_format(
    GOOGLE_NEWS_VECTORS_PATH,
    binary=True,
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [16]:
# Look up the embedding of a word by indexing the KeyedVectors object.
v_apple = word_vectors["apple"] 
# NOTE(review): the variable is named "mango" but the key looked up is "india" —
# confirm which word was intended before relying on the similarity below.
v_mango = word_vectors["india"]

In [17]:
print(v_apple.shape)
print(v_mango.shape)

(300,)
(300,)


In [18]:
cosine_similarity([v_mango],[v_apple])

array([[0.17158598]], dtype=float32)

In [19]:
import numpy as np

## 1. Find the Odd One Out

In [38]:
def odd_one_out(words, vectors=None):
    """Return the word that fits least well with the rest of ``words``.

    Every word is scored by the cosine similarity between its embedding and
    the mean embedding of the whole list; the lowest-scoring word is returned.

    Args:
        words: Sequence of words to compare.
        vectors: Optional word -> 1-D embedding lookup (any object that
            supports ``vectors[word]``). When omitted, the notebook-level
            ``word_vectors`` is used.

    Returns:
        The word with the lowest similarity to the mean vector (the first
        such word on ties), or ``None`` when ``words`` is empty.

    Raises:
        Whatever ``vectors[word]`` raises for an unknown word (``KeyError``
        for a plain dict) is propagated to the caller.
    """
    if vectors is None:
        vectors = word_vectors

    words = list(words)
    # len() rather than truthiness so array-like rows are handled too.
    if len(words) == 0:
        return None

    # One lookup per word, stacked into an (n_words, dim) matrix.
    embeddings = np.asarray([vectors[w] for w in words], dtype=np.float64)
    centroid = embeddings.mean(axis=0)

    # Cosine similarity of every row against the centroid in one shot.
    # A zero-length vector gets similarity 0 instead of a division by zero.
    scale = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(centroid)
    scale[scale == 0] = 1.0
    similarities = embeddings.dot(centroid) / scale

    # argmin always selects a word, even when every similarity is 1.0
    # (e.g. a single-word list), and keeps the first minimum on ties.
    return words[int(np.argmin(similarities))]

In [21]:
input_1 = ["apple","mango","juice","party","orange"] 
input_2 = ["music","dance","sleep","dancer","food"]        
input_3  = ["match","player","football","cricket","dancer"]
input_4 = ["india","paris","russia","france","germany"]

In [22]:
odd_one_out(input_1) 

(300,)
Similairy btw apple and avg vector is 0.78
Similairy btw mango and avg vector is 0.76
Similairy btw juice and avg vector is 0.71
Similairy btw party and avg vector is 0.36
Similairy btw orange and avg vector is 0.65


'party'

In [23]:
odd_one_out(input_2) 

(300,)
Similairy btw music and avg vector is 0.66
Similairy btw dance and avg vector is 0.81
Similairy btw sleep and avg vector is 0.51
Similairy btw dancer and avg vector is 0.72
Similairy btw food and avg vector is 0.52


'sleep'

In [24]:
odd_one_out(input_3)

(300,)
Similairy btw match and avg vector is 0.58
Similairy btw player and avg vector is 0.68
Similairy btw football and avg vector is 0.72
Similairy btw cricket and avg vector is 0.70
Similairy btw dancer and avg vector is 0.53


'dancer'

In [25]:
odd_one_out(input_4)

(300,)
Similairy btw india and avg vector is 0.81
Similairy btw paris and avg vector is 0.75
Similairy btw russia and avg vector is 0.79
Similairy btw france and avg vector is 0.81
Similairy btw germany and avg vector is 0.84


'paris'

### Load Testing Dataset as `X_test`

In [45]:
def giveAllPredictions(X):
    """Run ``odd_one_out`` on every row of ``X`` and collect the results.

    Args:
        X: Indexable collection of word lists; ``X[i]`` is passed to
            ``odd_one_out`` for each ``i`` in ``range(len(X))``.

    Returns:
        A list with one ``odd_one_out`` result per row, in row order.
    """
    # Index the argument itself rather than a global, so the function works
    # on whatever dataset it is given.
    return [odd_one_out(X[i]) for i in range(len(X))]

In [48]:
Y_test = giveAllPredictions(X_test)