In [23]:
import gensim.downloader as api
import os
from gensim.models import KeyedVectors, Word2Vec

In [24]:
# Directory (relative to the notebook) where the trained model is stored.
DATA = "data"

In [25]:
# Create the cache directory up front; exist_ok makes the cell safe to re-run.
os.makedirs(DATA, exist_ok=True)

# Corpus handle from gensim's downloader; kept defined on every run.
dataset = api.load('text8')

# Training is the expensive step, so reuse the saved model when one is
# already on disk instead of retraining on every Restart & Run All.
model_path = f'{DATA}/text8-word2vec.bin'
if os.path.exists(model_path):
    model = Word2Vec.load(model_path)
else:
    model = Word2Vec(dataset)

In [26]:
# Persist the model once; an existing file is never overwritten.
saved_model_file = f'{DATA}/text8-word2vec.bin'
already_saved = os.path.exists(saved_model_file)
if not already_saved:
    model.save(saved_model_file)

In [27]:
def print_most_similar(word_conf_pairs, k):
    """Print the first ``k`` (word, score) pairs, one per line.

    Each line is formatted as ``"<score to 3 decimals> <word>"``. When the
    input holds more than ``k`` pairs, a final ``"..."`` line signals that
    the listing was truncated.

    Args:
        word_conf_pairs: sequence of ``(word, score)`` tuples; it must
            support ``len()`` and slicing (e.g. a list).
        k: maximum number of pairs to print. Values <= 0 print no pairs.

    Returns:
        None. All output goes to stdout.
    """
    # Slice instead of break-after-print: the old loop always emitted the
    # first pair before checking the limit, so k <= 0 still printed one row.
    for word, conf in word_conf_pairs[:max(k, 0)]:
        print("{:.3f} {:s}".format(conf, word))
    if k < len(word_conf_pairs):
        print("...")

In [28]:
# The file was written by Word2Vec.save, so it holds a full model: load it
# with the matching class, and build the path from DATA rather than
# repeating the directory name as a literal.
model = Word2Vec.load(f'{DATA}/text8-word2vec.bin')
# Only the word vectors are needed for the queries below.
word_vectors = model.wv

Get the words in the vocabulary.

In [29]:
# Vocabulary keys, in the order the model indexes them.
words = list(word_vectors.index_to_key)
# Show only the first ten entries.
print(words[:10])
# Sanity check: the word queried in the next cells must be in the vocabulary.
assert "king" in words

['the', 'of', 'and', 'one', 'in', 'a', 'to', 'zero', 'nine', 'two']


In [30]:
# Nearest neighbours of "king", truncated to the top five.
print("# words similar to king")
king_neighbours = word_vectors.most_similar("king")
print_most_similar(king_neighbours, 5)

# words similar to king
0.750 prince
0.736 queen
0.714 throne
0.708 emperor
0.685 kings
...


In [31]:
# Analogy query: positive terms are added, the negative term is subtracted.
print("# vector arithmetic with words (cosine similarity)")
print("# france + berlin - paris = ?")
analogy_hits = word_vectors.most_similar(
    positive=["france", "berlin"],
    negative=["paris"],
)
# Only the best match is shown.
print_most_similar(analogy_hits, 1)

# vector arithmetic with words (cosine similarity)
# france + berlin - paris = ?
0.775 germany
...


In [32]:
# Same analogy query, scored with most_similar_cosmul instead of most_similar.
print("# vector arithmetic with words (Levy and Goldberg)")
print("# france + berlin - paris = ?")
cosmul_hits = word_vectors.most_similar_cosmul(
    positive=["france", "berlin"],
    negative=["paris"],
)
# Only the best match is shown.
print_most_similar(cosmul_hits, 1)

# vector arithmetic with words (Levy and Goldberg)
# france + berlin - paris = ?
0.947 germany
...


In [33]:
# Ask the model which word does not belong with the others.
print("# find odd one out")
print("# [hindus, parsis, singapore, christians]")
candidates = ["hindus", "parsis", "singapore", "christians"]
odd_one_out = word_vectors.doesnt_match(candidates)
print(odd_one_out)

# find odd one out
# [hindus, parsis, singapore, christians]
singapore


In [34]:
# Pairwise similarity of "man" against a handful of other words.
print("# similarity between words")
reference_word = "man"
line_template = "similarity({:s}, {:s}) = {:.3f}"
for word in ["woman", "dog", "whale", "tree"]:
    score = word_vectors.similarity(reference_word, word)
    print(line_template.format(reference_word, word, score))

# similarity between words
similarity(man, woman) = 0.759
similarity(man, dog) = 0.431
similarity(man, whale) = 0.281
similarity(man, tree) = 0.230


In [35]:
print("# similar by word")
# print_most_similar writes its own lines and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
print_most_similar(
    word_vectors.similar_by_word("singapore"), 5
)

# similar by word
0.868 malaysia
0.846 indonesia
0.806 thailand
0.797 philippines
0.797 shanghai
...
None


In [36]:
# Distance between the vectors of two words, printed to three decimals.
print("# distance between vectors")
singapore_malaysia_distance = word_vectors.distance("singapore", "malaysia")
print("distance(singapore, malaysia) = {:.3f}".format(singapore_malaysia_distance))

# distance between vectors
distance(singapore, malaysia) = 0.132


In [37]:
# Index the KeyedVectors object directly to get the vector for one word.
vec_song = word_vectors["song"]
print(
    "\n# output vector obtained directly, shape:",
    vec_song.shape,
)


# output vector obtained directly, shape: (100,)


In [38]:
# Fetch the same vector through get_vector; norm=True requests the
# normalised form (the older use_norm keyword is rejected with a TypeError).
vec_song_2 = word_vectors.get_vector("song", norm=True)
# The label names the method actually called (it previously said word_vec).
print("# output vector obtained using get_vector, shape:", vec_song_2.shape)

  vec_song_2 = word_vectors.word_vec("song", use_norm=True)


TypeError: get_vector() got an unexpected keyword argument 'use_norm'