In [34]:
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from future.utils import iteritems

In [35]:
def euclidean_distance(a, b):
    """Return the norm of the difference ``a - b``.

    The value comes from ``np.linalg.norm`` with its default settings, which
    for 1-D vectors is the ordinary Euclidean (L2) distance.
    """
    difference = a - b
    return np.linalg.norm(difference)
def cosine_distance(a, b):
    """Return the cosine distance ``1 - cos(a, b)`` between two vectors.

    The cosine similarity is the dot product of ``a`` and ``b`` divided by the
    product of their norms; the distance is one minus that similarity.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    cosine_similarity = a.dot(b) / norm_product
    return 1 - cosine_similarity

In [41]:
# Distance configuration shared by the search helpers below:
#   dist   - Python callable used when comparing vectors one pair at a time
#   metric - metric name handed to sklearn's pairwise_distances
# Keep the two in sync so both search paths measure the same distance.
dist = cosine_distance
metric = "cosine"

In [37]:
def find_analogies(w1, w2, w3):
    """Print the word that best completes the analogy ``w1 - w2 = ? - w3``.

    Builds the target vector ``word2vec[w1] - word2vec[w2] + word2vec[w3]``
    and scans every entry of the global ``word2vec`` mapping with the global
    ``dist`` callable, keeping the closest word that is not one of the three
    inputs.

    If any input word is missing from ``word2vec`` a message is printed and
    the function returns early. Nothing is returned; the result is printed.
    """
    query_words = (w1, w2, w3)
    for token in query_words:
        if token not in word2vec:
            print(f"{token} not in dictionary.")
            return

    target = word2vec[w1] - word2vec[w2] + word2vec[w3]

    closest_word, closest_dist = '', float('inf')
    for candidate, vector in iteritems(word2vec):
        # The inputs themselves are never valid answers.
        if candidate in query_words:
            continue
        gap = dist(target, vector)
        if gap < closest_dist:
            closest_word, closest_dist = candidate, gap
    print(w1, "-", w2, "=", closest_word, "-", w3)

In [38]:
def find_analogies_faster(w1, w2, w3):
    """Vectorized analogy search: print the word completing ``w1 - w2 = ? - w3``.

    Builds the target vector ``word2vec[w1] - word2vec[w2] + word2vec[w3]``,
    computes its distance to every row of the global ``embedding`` matrix in
    one ``pairwise_distances`` call (using the global ``metric``), and prints
    the nearest word that is not one of the three inputs.

    If any input word is missing from ``word2vec`` a message is printed and
    the function returns early. Nothing is returned; the result is printed.
    """
    for w in (w1, w2, w3):
        if w not in word2vec:
            print(f"{w} not in dictionary.")
            return

    v0 = word2vec[w1] - word2vec[w2] + word2vec[w3]

    # One (1, D) query row against the (V, D) embedding matrix, flattened to V distances.
    distances = pairwise_distances(v0.reshape(1, D), embedding, metric=metric).reshape(V)

    # A plain argmin can land on one of the query words, since their own
    # vectors can be the closest ones to v0. Walk the candidates from nearest
    # to farthest and skip the inputs, matching what find_analogies does.
    best_word = ''
    for idx in distances.argsort():
        candidate = idx2word[idx]
        if candidate not in (w1, w2, w3):
            best_word = candidate
            break
    print(w1, "-", w2, "=", best_word, "-", w3)

In [39]:
def nearest_neighbors(word, n=5):
    """Print the ``n`` words whose vectors are closest to ``word``.

    Distances from ``word2vec[word]`` to every row of the global ``embedding``
    matrix are computed with ``pairwise_distances`` using the global
    ``metric``. The query word itself is never listed.

    Args:
        word: query word; must be a key of ``word2vec``.
        n: number of neighbors to print (values below 0 are treated as 0).

    If ``word`` is missing from ``word2vec`` a message is printed and the
    function returns early. Nothing is returned; the result is printed.
    """
    if word not in word2vec:
        print(f"{word} not in dictionary.")
        return
    v = word2vec[word]
    distances = pairwise_distances(v.reshape(1, D), embedding, metric=metric).reshape(V)

    # Do not assume the query word sorts into position 0: with ties or
    # floating-point rounding another word can rank first. Take one extra
    # candidate and drop the query word by name instead.
    n = max(n, 0)
    ranked = distances.argsort()[:n + 1]
    idxs = [idx for idx in ranked if idx2word[idx] != word][:n]

    print(f"neighbors of {word}:")
    for idx in idxs:
        print(f"\t{idx2word[idx]}")

In [42]:
# Load the word vectors from the text file. On each line the first
# whitespace-separated token is taken as the word and the remaining tokens
# are parsed as float32 vector components.
print("Loading word vectors...")
word2vec, embedding, idx2word = {}, [], []
with open("glove.6B.50d.txt", mode="r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word, vec = values[0], np.asarray(values[1:], dtype="float32")
        word2vec[word] = vec        # word -> vector lookup
        idx2word.append(word)       # row index -> word
        embedding.append(vec)       # rows collected in file order
print(f"Found {len(word2vec)} word vectors.")

# Stack the collected rows into a single matrix; V rows, D columns.
embedding = np.array(embedding)
V, D = embedding.shape

# Analogy demos: each triple (w1, w2, w3) is passed straight to find_analogies.
analogy_queries = [
    ("king", "man", "woman"),
    ("france", "paris", "london"),
    ("france", "paris", "rome"),
    ("paris", "france", "italy"),
    ("france", "french", "english"),
    ("japan", "japanese", "chinese"),
]
for query in analogy_queries:
    find_analogies(*query)

# Nearest-neighbor demos, using the default neighbor count.
for query_word in ("woman", "nephew", "february", "rome", "king"):
    nearest_neighbors(query_word)

Loading word vectors...
Found 400000 word vectors.
king - man = queen - woman
france - paris = britain - london
france - paris = italy - rome
paris - france = rome - italy
france - french = england - english
japan - japanese = china - chinese
neighbors of woman:
	girl
	man
	mother
	her
	boy
neighbors of nephew:
	cousin
	brother
	grandson
	son
	uncle
neighbors of february:
	october
	december
	january
	august
	september
neighbors of rome:
	naples
	venice
	italy
	turin
	pope
neighbors of king:
	prince
	queen
	ii
	emperor
	son
