In [2]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm as tqdm
from annoy import AnnoyIndex

In [4]:
# Directory holding the GloVe files. Set the GLOVE_DIR environment variable to
# point somewhere else; without it the original machine-specific location is used.
path = os.environ.get('GLOVE_DIR', '/Users/ivoliv/data/Embeddings/glove')
# 100-dimensional vectors from the glove.6B release.
embedding_file = os.path.join(path, 'glove.6B.100d.txt')

In [5]:
class PretrainedEmbeddings(object):
    """Word vectors with an Annoy index on top for nearest-neighbour queries.

    Build an instance directly from a word->row mapping and a list of vectors,
    or from a text file of ``word v1 v2 ...`` lines via ``fromEmbeddingFile``.
    """

    # Number of trees handed to AnnoyIndex.build().
    N_TREES = 50

    def __init__(self, word_to_index, word_vectors):
        """Store the vocabulary and build the nearest-neighbour index.

        Args:
            word_to_index: dict mapping each word to its position in
                ``word_vectors``.
            word_vectors: sequence of equal-length numeric vectors, indexable
                by the integers stored in ``word_to_index``.
        """
        self.word_to_index = word_to_index
        self.word_vectors = word_vectors

        # Invert the mapping from its actual values. Enumerating the dict keys
        # would only be correct if the dict happened to be inserted in index
        # order, which callers are not required to guarantee.
        self.index_to_word = {idx: word for word, idx in self.word_to_index.items()}

        print('Building nearest neighbour index... ', flush=True)
        self.index = AnnoyIndex(len(word_vectors[0]), metric='euclidean')
        for idx in self.word_to_index.values():
            self.index.add_item(idx, self.word_vectors[idx])
        self.index.build(self.N_TREES)
        print('Done.')

    def get_closest_vector(self, wordvec, n=1):
        """Return the words whose vectors are nearest to ``wordvec``.

        Args:
            wordvec: query vector with the same length as the indexed vectors.
            n: number of neighbours to request from the index.

        Returns:
            List of words, in the order the index returned them.
        """
        nn_indices = self.index.get_nns_by_vector(wordvec, n)
        # Ids without a vocabulary entry (possible when the supplied mapping
        # has gaps) are dropped instead of raising KeyError.
        return [self.index_to_word[neighbor]
                for neighbor in nn_indices
                if neighbor in self.index_to_word]

    def get_and_print_analogy(self, word1, word2, word3, n=4):
        """Print completions of the analogy ``word1 : word2 :: word3 : ?``.

        The query point is ``vec(word3) + vec(word2) - vec(word1)``; its ``n``
        nearest words are printed, excluding the three input words.

        Args:
            word1, word2, word3: words that must be present in the vocabulary.
            n: number of neighbours to fetch before the inputs are filtered out.

        Raises:
            KeyError: if any of the three words is not in the vocabulary.
        """
        missing = [w for w in (word1, word2, word3) if w not in self.word_to_index]
        if missing:
            raise KeyError('word(s) not in vocabulary: {}'.format(', '.join(map(repr, missing))))

        # np.asarray keeps the arithmetic element-wise even when the vectors
        # were supplied as plain lists (where `+` would concatenate).
        vec1 = np.asarray(self.word_vectors[self.word_to_index[word1]])
        vec2 = np.asarray(self.word_vectors[self.word_to_index[word2]])
        vec3 = np.asarray(self.word_vectors[self.word_to_index[word3]])
        vec4 = vec3 + vec2 - vec1

        existing_words = {word1, word2, word3}
        closest_words = [word
                         for word in self.get_closest_vector(vec4, n=n)
                         if word not in existing_words]

        if len(closest_words) == 0:
            print("Could not find nearest neighbors for the vector!")
            return

        for word4 in closest_words:
            print("{} : {} :: {} : {}".format(word1, word2, word3, word4))

    @classmethod
    def fromEmbeddingFile(cls, embedding_file):
        """Create an instance from a text file of space-separated embeddings.

        Each non-blank line is ``word v1 v2 ... vk``. Blank lines are skipped.
        When a word occurs more than once, the first occurrence is kept so that
        every index in the vocabulary points at exactly one stored vector.

        Args:
            embedding_file: path to the embedding text file (read as UTF-8).

        Returns:
            A new instance of ``cls`` built from the parsed vocabulary.
        """
        word_to_index = {}
        word_vectors = []
        # Explicit UTF-8: the platform default encoding is not reliable for
        # vocabularies containing non-ASCII tokens.
        with open(embedding_file, encoding='utf-8') as fp:
            # Iterate the handle lazily instead of readlines(), so the raw text
            # is never held in memory next to the parsed vectors. The progress
            # bar therefore shows a running count rather than a percentage.
            for line in tqdm(fp, desc='Reading embedding file'):
                if not line.strip():
                    continue
                fields = line.rstrip().split(' ')
                word = fields[0]
                if word in word_to_index:
                    # Appending a second vector for a known word would shift
                    # every later index away from its vector.
                    continue
                vec = np.array([float(x) for x in fields[1:]])

                word_to_index[word] = len(word_vectors)
                word_vectors.append(vec)

        return cls(word_to_index, word_vectors)

In [6]:
# Parse the vectors in `embedding_file`; the constructor called by
# fromEmbeddingFile then builds the Annoy nearest-neighbour index over them.
emb = PretrainedEmbeddings.fromEmbeddingFile(embedding_file)

Reading embedding file: 100%|██████████| 400000/400000 [00:11<00:00, 33589.74it/s]


Building nearest neighbour index... 
Done.


In [7]:
# man : king :: woman : ?  -- looks up the words nearest to king - man + woman.
emb.get_and_print_analogy('man', 'king', 'woman')

man : king :: woman : queen
man : king :: woman : throne
man : king :: woman : elizabeth


In [8]:
# Same query shape applied to pronouns: man : he :: woman : ?
emb.get_and_print_analogy('man', 'he', 'woman')

man : he :: woman : she
man : he :: woman : never
man : he :: woman : her


In [9]:
# Occupation analogy: man : doctor :: woman : ?
# NOTE(review): the answers presumably reflect associations in the training
# corpus (including gender stereotypes) -- confirm before interpreting them.
emb.get_and_print_analogy('man', 'doctor', 'woman')

man : doctor :: woman : nurse
man : doctor :: woman : physician
