In [1]:
# Play with pre-trained word embeddings.
# Prerequisite: download GoogleNews-vectors-negative300.bin (or another word2vec-format embedding file) and copy it into the ./data folder.

import gensim

# Load the pre-trained vectors from the word2vec binary file; every later
# cell reuses `model`.
model = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)

# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`.
# Prefer the new attribute and fall back to the old one so this cell runs on
# either release line.
word_index = getattr(model, 'key_to_index', None)
if word_index is None:
    word_index = model.vocab

vocab = word_index.keys()
wordsInVocab = len(vocab)
print(f'Vocab length: {wordsInVocab}')



Vocab length: 3000000


In [3]:
# Score a few hand-picked word pairs with the model's pairwise similarity.

word_pairs = [("dog", "cat"), ("king", "queen"), ("car", "computer")]
for first, second in word_pairs:
    # One line per pair, printed in the order listed above.
    print(f'Similarity between ({first}, {second}): {model.similarity(first, second)}')

Similarity between (dog, cat): 0.760945737361908
Similarity between (king, queen): 0.6510956883430481
Similarity between (car, computer): 0.246127188205719


In [4]:
# Look up the six entries the model ranks as most similar to the lowercase token 'france'.
# The call is left as the cell's final expression so the notebook displays the returned
# (word, score) pairs.
model.most_similar('france', topn=6)

[('spain', 0.6375303268432617),
 ('french', 0.6326056718826294),
 ('germany', 0.6314354538917542),
 ('europe', 0.6264256238937378),
 ('italy', 0.6257959008216858),
 ('england', 0.6120775938034058)]

In [5]:
# Look up the six entries the model ranks as most similar to 'polite'.
# NOTE(review): the recorded output contains a noisy token ('everybody_Pendergrast')
# next to genuine synonyms, so neighbours should not be assumed to be clean words.
model.most_similar('polite', topn=6)

[('courteous', 0.7520973682403564),
 ('everybody_Pendergrast', 0.7189083099365234),
 ('respectful', 0.6748367547988892),
 ('mannerly', 0.6553859710693359),
 ('gracious', 0.6316325664520264),
 ('considerate', 0.6307362914085388)]

In [6]:
# Compute similarities between sentences

s1 = "Transportation services are great"
s2 = "Public transportation needs attention"


def _known_tokens(sentence):
    """Split `sentence` on whitespace and keep only words the model knows.

    A word is kept when its lowercase form is present in `model`, because the
    lowercase form is what is passed to the similarity computation below.

    Returns:
        A `(display, lookup)` pair: the kept words in their original casing
        joined by single spaces, and the same words lowercased as a list.
    """
    kept = [word for word in sentence.split() if word.lower() in model]
    return ' '.join(kept), [word.lower() for word in kept]


# Normalize sentences: remove words not in vocabulary
s1_final, s1_tokens = _known_tokens(s1)
s2_final, s2_tokens = _known_tokens(s2)

print(f'Sentence 1: {s1_final}')
print(f'Sentence 2: {s2_final}')

# Compare the filtered tokens rather than the raw sentences, so that a word
# dropped by the normalization above can never reach n_similarity.
if not s1_tokens or not s2_tokens:
    raise ValueError('Cannot compare sentences: no in-vocabulary words left after normalization')
sml = model.n_similarity(s1_tokens, s2_tokens)
print('Similarity = %.3f' % sml)

Sentence 1: Transportation services are great
Sentence 2: Public transportation needs attention
Similarity = 0.623


In [9]:
# The classic analogy: king - man + woman.
# Ask for the single closest word when 'woman' and 'king' contribute
# positively and 'man' contributes negatively.
result = model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)
print(result)

[('queen', 0.7118192911148071)]


In [11]:
# The same analogy pattern applied to places: France - Paris + Tokyo.
result = model.most_similar(
    positive=['Tokyo', 'France'],
    negative=['Paris'],
    topn=1,
)
print(result)

[('Japan', 0.8167769908905029)]


In [18]:
# Bias probe: doctor - he + she.
# The model learns what we feed it, so the top answer mirrors associations
# present in its training data.

result = model.most_similar(
    positive=['she', 'doctor'],
    negative=['he'],
    topn=1,
)
print(result)

[('nurse', 0.6588720679283142)]


In [28]:
# Second bias probe, same pattern: homemaker - he + she.
result = model.most_similar(
    positive=['she', 'homemaker'],
    negative=['he'],
    topn=1,
)
print(result)

[('housewife', 0.6646618843078613)]
