In [1]:
import torch
from torchtext.vocab import load_word_vectors
from annoy import AnnoyIndex
from tqdm import tqdm_notebook as tqdm

from __future__ import print_function

In [2]:
# Peek at the first line of the raw GloVe text file: a token followed by its space-separated vector components.
!head -n 1 data/glove/glove.6B.100d.txt

the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062


In [17]:
class PreTrainedEmbeddings(object):
    '''
    Pre-trained word vectors paired with an Annoy nearest-neighbour index.

    Attributes set by the constructor:
        word_to_index: mapping from word to its row number in `word_vectors`.
        index_to_word: the inverse mapping, row number to word.
        word_vectors: the vectors, indexable by row number.
        word_vector_size: vector length, used as the Annoy item size.
        index: AnnoyIndex (euclidean metric) holding every vector, keyed by
            row number.
    '''

    def __init__(self, path='data/glove', file='glove.6B', dimension=100, n_trees=50):
        '''
        Load the vectors and build the nearest-neighbour index.

        Args:
            path, file, dimension: forwarded unchanged to `load_word_vectors`.
            n_trees: number of trees handed to `AnnoyIndex.build`
                (defaults to 50).
        '''
        self.word_to_index, self.word_vectors, self.word_vector_size = load_word_vectors(path, file, dimension)
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}
        self.index = AnnoyIndex(self.word_vector_size, metric='euclidean')
        print('Building Index')
        # Only the row numbers are needed here; they double as Annoy item ids
        # so that query results map straight back through index_to_word.
        for i in tqdm(self.word_to_index.values()):
            self.index.add_item(i, self.word_vectors[i])
        self.index.build(n_trees)

    def get_embedding(self, word):
        '''
        Return the stored vector for `word`.

        A word missing from `word_to_index` makes the lookup fail
        (KeyError, assuming the mapping is a plain dict).
        '''
        return self.word_vectors[self.word_to_index[word]]

    def closest(self, word, n=1):
        '''
        Return the `n` words whose vectors are nearest to that of `word`.

        The query word is itself in the index, so it is normally the first
        entry of the result.
        '''
        return self.closest_v(self.get_embedding(word), n)

    def closest_v(self, vector, n=1):
        '''
        Return the `n` words whose vectors are nearest to `vector`, in the
        order reported by the Annoy index.
        '''
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def sim(self, w1, w2):
        '''
        Return the dot product of the vectors of `w1` and `w2`.

        This is a raw, unnormalized dot product (not a cosine similarity), so
        the value depends on vector magnitudes and is not bounded.
        '''
        return torch.dot(self.get_embedding(w1), self.get_embedding(w2))

In [18]:
# Load the default vectors (glove.6B, 100 dimensions, from data/glove) and build the Annoy index over them.
glove = PreTrainedEmbeddings()

loading word vectors from data/glove/glove.6B.100d.pt
Building Index





In [19]:
# Nearest neighbours of 'apple'; the query word itself comes back as the first hit.
glove.closest('apple', n=5)

['apple', 'microsoft', 'ibm', 'intel', 'pc']

In [20]:
# Same query for 'plane'; again the first result is the query word itself.
glove.closest('plane', n=5)

['plane', 'jet', 'flight', 'crashed', 'crash']

In [21]:
# sim() is a raw dot product of the two embeddings, so these scores are unnormalized; compare them to each other, not to a fixed scale.
glove.sim('beer', 'wine'), glove.sim('beer', 'gasoline')

(26.873451232910156, 16.50149154663086)

In [22]:
def SAT_analogy(w1, w2, w3, embeddings=None):
    '''
    Solves problems of the type:
    w1 : w2 :: w3 : __

    The answer is the nearest word to w3 + (w2 - w1) among the 5 closest
    candidates, skipping any candidate equal to one of the three input words.
    The result is printed rather than returned; ':-(' is printed when a word
    is not in the vocabulary or when every candidate was one of the inputs.

    Args:
        w1, w2, w3: the three known words of the analogy.
        embeddings: object providing get_embedding(word) and
            closest_v(vector, n). Defaults to the notebook-level `glove`.
    '''
    if embeddings is None:
        embeddings = glove
    given = (w1, w2, w3)
    candidates = []
    try:
        w1v = embeddings.get_embedding(w1)
        w2v = embeddings.get_embedding(w2)
        w3v = embeddings.get_embedding(w3)
        # Apply the w1 -> w2 offset to w3 to land near the missing word.
        w4v = w3v + (w2v - w1v)
        candidates = [w for w in embeddings.closest_v(w4v, n=5) if w not in given]
    except KeyError:
        # Out-of-vocabulary input: fall through to the sad face. Any other
        # error is a real bug and is deliberately left to propagate.
        candidates = []
    if len(candidates) == 0:
        print(':-(')
    else:
        print('{} : {} :: {} : {}'.format(w1, w2, w3, candidates[0]))

In [23]:
# man : superman :: woman : ?
SAT_analogy('man', 'superman', 'woman')

man : superman :: woman : supergirl


In [24]:
# fly : plane :: sail : ?
SAT_analogy('fly', 'plane', 'sail')

fly : plane :: sail : ship


In [25]:
# man : king :: woman : ?
SAT_analogy('man', 'king', 'woman')

man : king :: woman : queen


In [26]:
# eat : breakfast :: drink : ?  -- a weak case: the printed answer is just an inflection of the query word ('drinks').
SAT_analogy('eat', 'breakfast', 'drink')

eat : breakfast :: drink : drinks
