In [None]:
import gensim.downloader as api
import os
from gensim.models import KeyedVectors, Word2Vec

In [None]:
# Directory (relative to the notebook) where the trained model is stored.
DATA = "data"

In [None]:
# Train a Word2Vec model on the text8 corpus and persist it under DATA.
dataset = api.load('text8')
model = Word2Vec(dataset)

# Build the path from DATA so the directory that is created and the directory
# that is written to can never drift apart; makedirs(exist_ok=True) also
# copes with nested paths and with the directory already being present.
os.makedirs(DATA, exist_ok=True)
model.save(os.path.join(DATA, 'text8-word2vec.bin'))

In [None]:
def print_most_similar(word_conf_pairs, k):
    """Print at most ``k`` (word, score) pairs, one per line.

    Args:
        word_conf_pairs: Sized iterable of ``(word, score)`` tuples, printed
            in the order given.
        k: Maximum number of pairs to print. Values <= 0 print no pairs.

    Each pair is printed as ``"<score to 3 decimals> <word>"``. A trailing
    ``"..."`` line is printed when ``k`` is smaller than the number of pairs,
    i.e. when some pairs were left out.
    """
    for i, (word, conf) in enumerate(word_conf_pairs):
        # Check the limit *before* printing so that k <= 0 prints nothing.
        if i >= k:
            break
        print("{:.3f} {:s}".format(conf, word))
    if k < len(word_conf_pairs):
        print("...")

In [None]:
# The file on disk is a full Word2Vec model (it was written by Word2Vec.save),
# so load it with Word2Vec.load rather than KeyedVectors.load, and build the
# path from DATA instead of repeating the directory name.
model = Word2Vec.load(os.path.join(DATA, "text8-word2vec.bin"))

# Only the word -> vector lookup is needed for the queries below.
word_vectors = model.wv

Get the words in the vocabulary.

In [None]:
# The vocabulary mapping is called `key_to_index` in gensim >= 4.0 and `vocab`
# in gensim 3.x (`vocab` raises AttributeError on 4.x), so try the new name
# first and fall back to the old one.
vocab = getattr(word_vectors, "key_to_index", None)
if vocab is None:
    vocab = word_vectors.vocab
words = vocab.keys()

# Show a small sample rather than the whole vocabulary.
print(list(words)[:10])
assert "king" in words

In [None]:
# Show the five nearest neighbours of "king".
print("# words similar to king")
king_neighbours = word_vectors.most_similar("king")
print_most_similar(king_neighbours, 5)

In [None]:
# Analogy query via most_similar: france + berlin - paris.
print("# vector arithmetic with words (cosine similarity)")
print("# france + berlin - paris = ?")
analogy_cosine = word_vectors.most_similar(
    positive=["france", "berlin"],
    negative=["paris"],
)
print_most_similar(analogy_cosine, 1)

In [None]:
# Same analogy query, this time through most_similar_cosmul.
print("# vector arithmetic with words (Levy and Goldberg)")
print("# france + berlin - paris = ?")
analogy_cosmul = word_vectors.most_similar_cosmul(
    positive=["france", "berlin"],
    negative=["paris"],
)
print_most_similar(analogy_cosmul, 1)

In [None]:
# Ask the model which word does not belong with the others.
print("# find odd one out")
print("# [hindus, parsis, singapore, christians]")
candidates = ["hindus", "parsis", "singapore", "christians"]
print(word_vectors.doesnt_match(candidates))

In [None]:
# Compare "man" against a handful of other words.
print("# similarity between words")
anchor = "man"
for other in ["woman", "dog", "whale", "tree"]:
    score = word_vectors.similarity(anchor, other)
    print("similarity({:s}, {:s}) = {:.3f}".format(anchor, other, score))

In [None]:
# print_most_similar prints its own output and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
print("# similar by word")
print_most_similar(word_vectors.similar_by_word("singapore"), 5)

In [None]:
# Distance between two word vectors, as reported by KeyedVectors.distance.
print("# distance between vectors")
singapore_malaysia = word_vectors.distance("singapore", "malaysia")
print("distance(singapore, malaysia) = {:.3f}".format(singapore_malaysia))

In [None]:
# Look the vector up by indexing word_vectors directly with the word.
vec_song = word_vectors["song"]
song_shape = vec_song.shape
print("\n# output vector obtained directly, shape:", song_shape)

In [None]:
# Fetch the unit-normalised vector for "song". gensim >= 4.0 spells this
# get_vector(..., norm=True); gensim 3.x spells it word_vec(..., use_norm=True)
# and its get_vector() rejects the `norm` keyword with a TypeError, which is
# what triggers the fallback.
try:
    vec_song_2 = word_vectors.get_vector("song", norm=True)
except TypeError:
    vec_song_2 = word_vectors.word_vec("song", use_norm=True)
print("# output vector obtained using word_vec, shape:", vec_song_2.shape)