In [22]:
from gensim.models import KeyedVectors, Word2Vec

In [6]:
# Pretrained Google News word2vec vectors have to be downloaded manually first:
#   https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# The download is about 1.5G: 3 million words, trained on 100 billion words
# from the Google News dataset.
# Loader parameters are documented at:
#   https://radimrehurek.com/gensim/models/keyedvectors.html

# Relative path, so the .bin file is looked up in the current working directory.
pretrained_path = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(pretrained_path, binary=True)

In [12]:
# Look up the embedding vector of a single word.
preview_window = slice(4, 10)  # components 4..9 only; the full vector is too long to show
sample_word = model['icecream']
sample_word[preview_window]

array([-0.36914062,  0.171875  ,  0.30273438, -0.08056641, -0.20507812,
        0.3984375 ], dtype=float32)

In [13]:
# Find the words most similar to the combination woman + queen - man.
# NOTE(review): the textbook analogy is king - man + woman; using 'queen' here
# may be intentional -- confirm.
# This query was observed to take a very long time to run.
# NOTE(review): presumably the one-off cost of preparing all vectors for the
# first similarity query -- confirm against the gensim version in use.
analogy_hits = model.most_similar(positive=['woman', 'queen'], negative=['man'])
print(analogy_hits)

[(u'princess', 0.6431564688682556), (u'queens', 0.6387216448783875), (u'very_pampered_McElhatton', 0.5774043202400208), (u'Queen_Consort', 0.5504266619682312), (u'Queen', 0.5450494289398193), (u'princesses', 0.5421540141105652), (u'duchess', 0.5339502692222595), (u'empress', 0.5262109637260437), (u'monarch', 0.5216404795646667), (u'Princess', 0.5202960968017578)]


In [15]:
# Ask the model which token does not fit in with the others.
# The text is split on whitespace only, so punctuation stays attached to the
# tokens (e.g. "adorable!", "Emmanuel's").
# NOTE(review): such tokens are presumably not in the vocabulary and so may be
# ignored by doesnt_match -- confirm.
demo_text = "Emmanuel's cats are very adorable! In fact Emmanuel is an adorable cat! As lovely as Totoro!"
demo_tokens = demo_text.split()
print(model.doesnt_match(demo_tokens))

## Check that the pretrained model does have a vector for the word 'Emmanuel'
sample_word = model['Emmanuel']
sample_word[4:10]

Emmanuel


array([ 0.26757812,  0.02929688, -0.18359375, -0.15722656, -0.09179688,
        0.0168457 ], dtype=float32)

In [20]:
# Similarity between several word pairs, printed one score per line in the
# order listed.
word_pairs = [
    ('Totoro', 'Emmanuel'),
    ('Cat', 'Emmanuel'),
    ('Icecream', 'Emmanuel'),
    ('ice', 'fire'),
    ('Hanhan', 'Emmanuel'),  # ha, much higher!
]
for first_word, second_word in word_pairs:
    print(model.similarity(first_word, second_word))

0.0830172488544
0.0624687042684
0.0583377236346
0.105000703299
0.289815366273


In [44]:
# Train a toy model of my own on just 3 sentences.
## min_count - lowest word frequency for a word to be kept in the vocabulary
## size      - number of dimensions of each word vector
## workers   - number of worker threads used for training
## NOTE(review): newer gensim releases (4.x) rename `size` to `vector_size`;
## keep `size` only while running on gensim 3.x or older -- confirm the version.

s1 = "Emmanuel's cats are very adorable"
s2 = "In fact Emmanuel is an adorable cat"
s3 = "Emmanuel is as lovely as Totoro"

# Whitespace tokenization: one list of tokens per sentence.
sentences = [s.split() for s in (s1, s2, s3)]

my_model = Word2Vec(
    sentences,
    min_count=1,
    size=300,
    workers=4,
)

In [49]:
# Display the tokenized sentences (one list of tokens per sentence).
sentences

[["Emmanuel's", 'cats', 'are', 'very', 'adorable'],
 ['In', 'fact', 'Emmanuel', 'is', 'an', 'adorable', 'cat'],
 ['Emmanuel', 'is', 'as', 'lovely', 'as', 'Totoro']]

In [47]:
# When checking similarity, both words have to be in the vocabulary of your
# own model.
own_similarity = my_model.wv.similarity('Emmanuel', 'adorable')
print(own_similarity)

-0.0140343215701
