In [7]:
import numpy as np

In [5]:
def read_glove_vecs(glove_file):
    """Load word vectors from a whitespace-separated text file.

    Every non-blank line is expected to contain a word followed by the
    numeric components of its vector, all separated by whitespace.

    Args:
        glove_file: Path of the text file to read.

    Returns:
        A tuple ``(words, word_to_vec_map)`` where ``words`` is the set of
        all words seen and ``word_to_vec_map`` maps each word to a 1-D
        ``np.float64`` array holding its vector. If a word appears more
        than once, the last occurrence wins.
    """
    words = set()
    word_to_vec_map = {}

    # Decode explicitly as UTF-8 (the encoding of the published GloVe files)
    # instead of relying on the platform's default locale encoding.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Skip blank lines (e.g. a trailing empty line) rather than
                # failing with an IndexError on fields[0].
                continue
            curr_word = fields[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)

    return words, word_to_vec_map

In [6]:
# Load the vocabulary set and the word -> vector dictionary from the GloVe
# text file. The path is relative, so it is resolved against the kernel's
# current working directory.
words, word_to_vec_map = read_glove_vecs('glove.6B/glove.6B.50d.txt')

In [8]:

def cosine_similarity(u, v):
    """Return the cosine similarity of ``u`` and ``v``.

    Computed as ``dot(u, v) / (||u|| * ||v||)`` where ``||.||`` is the L2
    norm. Intended for 1-D NumPy vectors of equal length.

    Args:
        u: First vector (NumPy array).
        v: Second vector (NumPy array).

    Returns:
        The cosine similarity of the two vectors. There is no guard against
        a zero-norm input: the division is performed as-is.
    """
    # Numerator: inner product of the two vectors.
    numerator = np.dot(u, v)
    # Denominator: product of the two L2 norms.
    denominator = np.sqrt(np.sum(u ** 2)) * np.sqrt(np.sum(v ** 2))
    return numerator / denominator


def complete_analogy(word_a, word_b, word_c, word_to_vec_map):
    """Complete the analogy "word_a is to word_b as word_c is to ____".

    Scans every word ``w`` in ``word_to_vec_map`` (except the three input
    words) and returns the one whose offset ``e_w - e_c`` has the highest
    cosine similarity to the offset ``e_b - e_a``.

    Args:
        word_a: First word of the reference pair.
        word_b: Second word of the reference pair.
        word_c: First word of the pair to complete.
        word_to_vec_map: Dictionary mapping words to their vectors.

    Returns:
        The best-matching word, or ``None`` if the map holds no candidate
        other than the input words.

    Raises:
        KeyError: If any of the three (lowercased) input words is missing
            from ``word_to_vec_map``.
    """
    # Convert words to lowercase before lookup.
    # NOTE(review): presumably the map's keys are all lowercase -- confirm.
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()
    input_words = (word_a, word_b, word_c)

    # Fail early with a clear message instead of letting a None embedding
    # reach np.subtract and surface as an opaque TypeError.
    missing = [w for w in input_words if w not in word_to_vec_map]
    if missing:
        raise KeyError('words not found in word_to_vec_map: {}'.format(missing))

    # Get the word embeddings e_a, e_b and e_c.
    e_a, e_b, e_c = (word_to_vec_map[w] for w in input_words)

    # The reference offset does not depend on the candidate word, so compute
    # it once rather than on every loop iteration.
    target_offset = np.subtract(e_b, e_a)

    max_cosine_sim = -np.inf           # Lower than any similarity score
    best_word = None                   # Best candidate seen so far
    input_words_set = set(input_words)

    # Loop over the whole word vector set.
    for w, e_w in word_to_vec_map.items():
        # The input words themselves are never valid answers.
        if w in input_words_set:
            continue

        cosine_sim = cosine_similarity(target_offset, np.subtract(e_w, e_c))

        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w

    return best_word

In [9]:
# Each triad is (word_a, word_b, word_c); the loop prints the analogy
# "word_a -> word_b :: word_c -> <predicted word>" for every triad.
triads_to_try = [
    ('italy', 'italian', 'spain'),
    ('india', 'delhi', 'japan'),
    ('man', 'woman', 'boy'),
    ('small', 'smaller', 'large'),
]
for triad in triads_to_try:
    print('{} -> {} :: {} -> {}'.format(*triad, complete_analogy(*triad, word_to_vec_map)))

italy -> italian :: spain -> spanish
india -> delhi :: japan -> tokyo
man -> woman :: boy -> girl
small -> smaller :: large -> larger
