In [1]:
from argparse import Namespace
import os
os.environ['OMP_NUM_THREADS'] = '4' 

from annoy import AnnoyIndex
import numpy as np
import torch
from tqdm import tqdm_notebook


These pre-trained word embeddings come from the [GloVe project](https://nlp.stanford.edu/projects/glove/). For more details about how the embeddings were generated, see [this paper](https://nlp.stanford.edu/pubs/glove.pdf).

In [2]:
# Notebook-wide settings, collected in a single namespace object.
args = Namespace(glove_filename='../data/glove.6B.100d.txt')

In [3]:
def load_word_vectors(filename):
    """
    A helper function to load word vectors from a text file.

    Each non-blank line is expected to hold a token followed by the
    components of its vector, separated by single spaces.

    Args:
        filename (str): path to the embeddings text file
    Returns:
        tuple: (word_to_index, word_vectors). word_to_index maps a token to
            its position in word_vectors, a list of 1-D numpy float arrays,
            so word_vectors[word_to_index[w]] is the vector of w.
    Raises:
        ValueError: if a vector component cannot be parsed as a float
    """
    word_to_index = {}
    word_vectors = []

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(filename, encoding='utf-8') as fp:
        # Stream the file line by line instead of materialising every line
        # with readlines(); the progress bar then shows a running count
        # rather than a known total.
        for line in tqdm_notebook(fp, leave=False):
            line = line.rstrip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no vector.
                continue

            fields = line.split(" ")
            word = fields[0]
            if word in word_to_index:
                # Keep the first occurrence of a repeated token so indices
                # stay dense and aligned with word_vectors.
                continue

            # Index by the vector list's length so the mapping and the
            # vectors can never drift apart.
            word_to_index[word] = len(word_vectors)
            word_vectors.append(np.array([float(x) for x in fields[1:]]))

    return word_to_index, word_vectors

In [4]:
class PreTrainedEmbeddings(object):
    """
    A helper class to use standalone pre-trained embeddings.

    Loads the vectors with load_word_vectors and builds an Annoy index
    (euclidean metric) over them for nearest-neighbour lookups.
    """
    def __init__(self, glove_filename, n_trees=50):
        """
        Args:
            glove_filename (str): path to the embeddings file, handed to
                load_word_vectors
            n_trees (int): number of trees passed to AnnoyIndex.build;
                defaults to 50
        """
        self.word_to_index, self.word_vectors = load_word_vectors(glove_filename)
        self.word_vector_size = len(self.word_vectors[0])

        # Reverse mapping, used to turn Annoy item ids back into words.
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}
        self.index = AnnoyIndex(self.word_vector_size, metric='euclidean')
        print('Building Index')
        for i in tqdm_notebook(self.word_to_index.values(), leave=False):
            self.index.add_item(i, self.word_vectors[i])
        self.index.build(n_trees)
        print('Finished!')

    def get_embedding(self, word):
        """
        Return the stored vector for a word.

        Raises:
            KeyError: if the word is not in the vocabulary
        """
        return self.word_vectors[self.word_to_index[word]]

    def closest(self, word, n=1):
        """
        Find the top-n closest words (in the embedding space) to a given word.

        Args:
            word (str): query word; must be in the vocabulary
            n (int): number of neighbours to return
        Returns:
            list of str: the neighbours reported by the index for the
                word's own vector
        Raises:
            KeyError: if the word is not in the vocabulary
        """
        return self.closest_v(self.get_embedding(word), n)

    def closest_v(self, vector, n=1):
        """
        Find the top-n closest words (in the embedding space) to a vector.

        Args:
            vector: query vector, passed to the index's get_nns_by_vector
            n (int): number of neighbours to return
        Returns:
            list of str: words for the item ids returned by the index
        """
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def sim(self, w1, w2):
        """
        Score the similarity of two words as the dot product of their vectors.

        The vectors are not normalised, so the score is unbounded and can be
        negative; a larger value means the two vectors are more aligned.

        Raises:
            KeyError: if either word is not in the vocabulary
        """
        return np.dot(self.get_embedding(w1), self.get_embedding(w2))

In [5]:
# Constructing the object loads the vectors from disk and builds the Annoy index.
glove = PreTrainedEmbeddings(args.glove_filename)

HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))

Building Index


HBox(children=(IntProgress(value=0, max=400000), HTML(value='')))

Finished!


In [6]:
glove.closest('apple', n=5)

['apple', 'microsoft', 'dell', 'pc', 'compaq']

In [7]:
glove.closest('plane', n=5)

['plane', 'airplane', 'jet', 'flight', 'crashed']

In [8]:
# sim() returns the raw dot product of the two vectors (not cosine similarity).
glove.sim('beer', 'wine'), glove.sim('beer', 'gasoline')

(26.873448266652, 16.501491855324)

**A study of lexical relationships uncovered by word embeddings**

Traditionally many of these relationships were hand-coded. See, for example, [the WordNet project](https://wordnet.princeton.edu/).

In [9]:
def SAT_analogy(w1, w2, w3, embeddings=None):
    '''
    Solves problems of the type:
    w1 : w2 :: w3 : __

    The answer is looked up near w3 + (w2 - w1) among the five nearest
    neighbours, ignoring the three input words. The result is printed;
    ':-(' is printed when a word is out of vocabulary or no candidate
    remains.

    Args:
        w1, w2, w3 (str): the three known words of the analogy
        embeddings: object providing get_embedding(word) and
            closest_v(vector, n); defaults to the notebook-level `glove`
    '''
    if embeddings is None:
        embeddings = glove

    closest_words = []
    try:
        w1v = embeddings.get_embedding(w1)
        w2v = embeddings.get_embedding(w2)
        w3v = embeddings.get_embedding(w3)
    except KeyError:
        # Out-of-vocabulary word: fall through to the ':-(' branch below.
        # Any other error is a real bug and is allowed to propagate.
        pass
    else:
        w4v = w3v + (w2v - w1v)
        inputs = (w1, w2, w3)
        closest_words = [w for w in embeddings.closest_v(w4v, n=5)
                         if w not in inputs]

    if not closest_words:
        print(':-(')
    else:
        the_closest_word = closest_words[0]
        print('{} : {} :: {} : {}'.format(w1, w2, w3, the_closest_word))

**Pronouns**

In [10]:
SAT_analogy('man', 'he', 'woman')

man : he :: woman : she


**Verb-Noun relationships**

In [11]:
SAT_analogy('fly', 'plane', 'sail')

fly : plane :: sail : ship


**Noun-Noun relationships**

In [12]:
SAT_analogy('cat', 'kitten', 'dog')

cat : kitten :: dog : pug


In [13]:
SAT_analogy('human', 'baby', 'dog')

human : baby :: dog : puppy


In [14]:
SAT_analogy('human', 'babies', 'dog')

human : babies :: dog : puppies


**Hypernymy**

In [15]:
SAT_analogy('blue', 'color', 'dog')

blue : color :: dog : animal


**Meronymy**

In [16]:
SAT_analogy('leg', 'legs', 'hand')

leg : legs :: hand : hands


**Troponymy**

In [17]:
SAT_analogy('talk', 'communicate', 'read')

talk : communicate :: read : correctly


**Metonymy**

In [18]:
SAT_analogy('blue', 'democrat', 'red')

blue : democrat :: red : republican


**Misc**

In [19]:
SAT_analogy('man', 'doctor', 'woman')

man : doctor :: woman : nurse


In [20]:
SAT_analogy('man', 'leader', 'woman')

man : leader :: woman : opposition
