# Pre-trained Word2Vec embeddings

In [1]:
# Play with pre-trained word embeddings.
# Prerequisite: download GoogleNews-vectors-negative300.bin (or another
# word2vec-format binary) and copy it into the ./data folder.


# We will use the gensim library to import the word vectors
import gensim

# Load the word embeddings
# (this is just a simple structure: each word maps to a vector)
model = gensim.models.KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin', binary=True)

# Get the vocabulary (i.e. the unique words that were used to train these embeddings).
# gensim 4.0 removed KeyedVectors.vocab in favour of key_to_index, so prefer the
# new attribute and fall back to the old one when running on gensim 3.x.
if hasattr(model, 'key_to_index'):
    vocab = model.key_to_index.keys()
else:
    vocab = model.vocab.keys()

# Get the size of the vocabulary
wordsInVocab = len(vocab)
print(f'Vocab length: {wordsInVocab}')

Vocab length: 3000000


## Compute similarities

In [2]:
# Cosine similarity for a few hand-picked word pairs

word_pairs = [("dog", "cat"), ("king", "queen"), ("car", "computer")]

for word_a, word_b in word_pairs:
    print(f'Similarity between ({word_a}, {word_b}): {model.similarity(word_a, word_b)}')

Similarity between (dog, cat): 0.760945737361908
Similarity between (king, queen): 0.6510956883430481
Similarity between (car, computer): 0.246127188205719


In [3]:
# Nearest neighbours of a single word: the five entries closest to 'france'
model.most_similar(positive=['france'], topn=5)

[('spain', 0.6375303268432617),
 ('french', 0.6326056718826294),
 ('germany', 0.6314354538917542),
 ('europe', 0.6264256238937378),
 ('italy', 0.6257959008216858)]

In [9]:
# Compute similarities between sentences

s1 = "This is a sentence"
s2 = "This is also a sentence"

# gensim 4.0 replaced KeyedVectors.vocab with key_to_index; use whichever exists
known_words = model.key_to_index if hasattr(model, 'key_to_index') else model.vocab


def normalize_sentence(sentence, known_words):
    """Lower-case `sentence`, split it on whitespace and drop unknown words.

    Args:
        sentence: Raw sentence text.
        known_words: Container supporting `in` that holds the model vocabulary.

    Returns:
        The surviving lower-cased words joined by single spaces
        (an empty string when no word is in the vocabulary).
    """
    lowered = (token.lower() for token in sentence.split())
    return ' '.join(word for word in lowered if word in known_words)


# Normalize sentences: remove words not in vocabulary
s1_final = normalize_sentence(s1, known_words)
s2_final = normalize_sentence(s2, known_words)

print(f'Sentence 1: {s1_final}')
print(f'Sentence 2: {s2_final}')

# n_similarity cannot work with an empty word list, so fail with a clear
# message instead of an opaque error from inside gensim.
if not s1_final or not s2_final:
    raise ValueError('Cannot compare sentences: one of them has no words in the vocabulary')

sml = model.n_similarity(s1_final.split(), s2_final.split())
print('Similarity = %.3f' % sml)


Sentence 1: this is sentence
Sentence 2: this is also sentence
Similarity = 0.960


## Vector Arithmetic

In [10]:
# The classic analogy:
#
#   "Man is to Woman as King is to X" -- what is X? (Answer: Queen)
#
# Word2Vec vectors support basic arithmetic, so the question becomes:
#   King - Man + Woman = ?
#
# In words: take the notion of King, remove the notion of Man, add the
# notion of Woman, and what remains is the notion of Queen.

result = model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)
print(result)


[('queen', 0.7118192911148071)]


In [24]:
# More analogies, each given as (words to add, words to subtract)
analogies = [
    (['Tokyo', 'France'], ['Paris']),
    (['girl', 'prince'], ['boy']),
]

for position, (plus_words, minus_words) in enumerate(analogies):
    if position > 0:
        print('')  # blank line between consecutive answers
    result = model.most_similar(positive=plus_words, negative=minus_words, topn=1)
    print(result)

[('Japan', 0.8167769908905029)]

[('princess', 0.7421581745147705)]


In [55]:
# Results that are probably not what we would like to see --
# but the model only reflects the text it was trained on.
#
# Each entry is (words to add, words to subtract).
# NOTE(review): 'computer_programer' is spelled with a single 'm'. It is kept
# as-is because the recorded output was produced with this exact token --
# confirm whether 'computer_programmer' was intended.
biased_analogies = [
    (['she', 'doctor'], ['he']),
    (['woman', 'computer_programer'], ['man']),
    (['she', 'janitor'], ['he']),
]

for position, (plus_words, minus_words) in enumerate(biased_analogies):
    if position > 0:
        print('')  # blank line between consecutive answers
    result = model.most_similar(positive=plus_words, negative=minus_words, topn=1)
    print(result)

[('nurse', 0.6588720679283142)]

[('homemaker', 0.4312755763530731)]

[('manicurist', 0.5636395215988159)]


*Gender bias and other types of bias are something we must deal with if we want to move toward ethical and transparent AI solutions.*