In [1]:
import numpy as np
from annoy import AnnoyIndex
import os
import pickle

In this example, we use the 100-dimensional GloVe embeddings from Stanford. The goal is to get comfortable using the class below to read in pre-trained word embeddings and query them.

In [2]:
class PreTrainedEmbeddings(object):
    """Word-to-vector lookup backed by an Annoy nearest-neighbour index.

    An instance is built either from in-memory word vectors (an AnnoyIndex is
    created and built from them) or from an AnnoyIndex that already exists.
    """

    def __init__(self, word_to_index, word_vectors=None, annoy_index=None):
        """
        Args:
            word_to_index (dict): mapping from word tokens to integers
            word_vectors (list): list of numpy arrays; when given, a new
                AnnoyIndex is built from them (takes precedence over
                annoy_index)
            annoy_index (AnnoyIndex): an already populated index, used only
                when word_vectors is None
        Raises:
            RuntimeError: if neither word_vectors nor annoy_index is given
        """
        if word_vectors is None and annoy_index is None:
            # failed to pass either manual word vectors, or an already made AnnoyIndex
            raise RuntimeError(
                "must specify either word_vectors or annoy_index")

        self.word_to_index = word_to_index
        self.index_to_word = {idx: word for word,
                              idx in self.word_to_index.items()}
        # None when the instance wraps a pre-made AnnoyIndex
        self.word_vectors = word_vectors

        if word_vectors is not None:
            # you've passed in a list of word vectors
            embedding_dimension = len(word_vectors[0])
            self.index = AnnoyIndex(embedding_dimension, metric="euclidean")
            for i in self.word_to_index.values():
                self.index.add_item(i, self.word_vectors[i])
            self.index.build(50)
        else:
            # loading word vectors and annoy index from file
            self.index = annoy_index

    @classmethod
    def from_embeddings_file(cls, embedding_file):
        """
        Instantiate PreTrainedEmbeddings instance from a pretrained vector file

        Assumes the pretrained vector file is of the format:
            word0 x0_0 x0_1 ... x0_N
            word1 x1_0 x1_1 ... x1_N
            ...

        Blank lines are skipped. If a token appears more than once, only its
        first vector is kept so that indices and vectors stay aligned.

        Args:
            embedding_file (str): location of the file (read as UTF-8)
        Returns:
            PreTrainedEmbeddings instance
        """
        # give a word and return the index
        word_to_index = {}
        word_vectors = []
        with open(embedding_file, encoding="utf-8") as fp:
            # iterate lazily instead of loading the whole file with readlines()
            for line in fp:
                line = line.rstrip()
                if not line:
                    continue
                columns = line.split(" ")
                # the token is always the first column
                word = columns[0]
                if word in word_to_index:
                    # appending a second vector would shift every later index
                    continue
                # the word vector is the rest of the columns
                vec = np.array([float(x) for x in columns[1:]])
                word_to_index[word] = len(word_vectors)
                word_vectors.append(vec)
        return cls(word_to_index, word_vectors)

    @classmethod
    def from_annoy_mmap(cls, mmap_path, word_to_index_path,
                        embedding_dimension=100):
        """
        Instantiate PreTrainedEmbeddings from a saved Annoy index and a
        pickled word_to_index dictionary

        Args:
            mmap_path (str): location of the saved Annoy index
            word_to_index_path (str): location of the pickled word_to_index
                dictionary
            embedding_dimension (int): size of the vectors stored in the
                saved index; defaults to 100
        Returns:
            PreTrainedEmbeddings instance
        """
        # NOTE: pickle.load can execute arbitrary code; only load files you
        # created yourself or otherwise trust.
        with open(word_to_index_path, 'rb') as word_to_index_file_obj:
            word_to_index_dict = pickle.load(word_to_index_file_obj)
        annoy_index = AnnoyIndex(embedding_dimension, metric="euclidean")
        annoy_index.load(mmap_path)
        return cls(word_to_index_dict, word_vectors=None,
                   annoy_index=annoy_index)

    def get_embedding(self, word):
        """
        Return the word vector corresponding to word

        Args:
            word (str)
        Returns:
            embedding (np.ndarray)
        Raises:
            KeyError: if word is not in the vocabulary
        """
        word_index = self.word_to_index[word]
        return np.array(self.index.get_item_vector(word_index))

    def get_closest_to_vector(self, target_word_vector, n=1):
        """
        Given a target word vector, return its n nearest neighbors in the vocabulary

        Args:
            target_word_vector (np.ndarray): needs to match the size of vectors in AnnoyIndex
            n (int): number of neighbors to return
        Returns:
            [str, str, ...]: list of word tokens nearest to target word vector
        """
        # get the indices for the nearest neighbors
        nearest_neighbor_indices = self.index.get_nns_by_vector(
            target_word_vector, n)
        # map the indices back to their word tokens
        return [self.index_to_word[neighbor] for neighbor in nearest_neighbor_indices]

    def compute_and_print_analogy(self, word1, word2, word3, relationship="is to"):
        """
        Prints solution for word4 in the analogy:
        word1 is to word2 as word3 is to ___

        Args:
            word1 (str)
            word2 (str)
            word3 (str)
            relationship (str): phrase printed between the words of each pair
        """
        vec1 = self.get_embedding(word1)
        vec2 = self.get_embedding(word2)
        vec3 = self.get_embedding(word3)

        # apply the word1 -> word2 offset to word3
        spatial_relationship = vec2 - vec1
        vec4 = vec3 + spatial_relationship

        # ask for 4 neighbors, then drop any that are one of the 3 input words
        closest_words = self.get_closest_to_vector(vec4, n=4)
        existing_words = {word1, word2, word3}
        closest_words = [
            word for word in closest_words if word not in existing_words]

        if len(closest_words) == 0:
            print("Could not find any nearest neighbors for target vector!")
            return
        for word4 in closest_words:
            print("{} {} {} as {} {} {}".format(
                word1, relationship, word2, word3, relationship, word4))

    def save_embeddings(self, pickle_out_path, mmap_out_path):
        """
        Save the Annoy index and the word_to_index dictionary to disk

        Args:
            pickle_out_path (str): output location for the pickled
                word_to_index dictionary
            mmap_out_path (str): output location for the Annoy index
        """
        # save the annoy index
        self.index.save(mmap_out_path)
        # save the word_to_index dictionary as pickle
        with open(pickle_out_path, 'wb') as word_to_index_outfile:
            pickle.dump(self.word_to_index, word_to_index_outfile)

In [3]:
# Directory holding the cached artifacts, plus the file name of each artifact
out_dir = "data/glove_wikipedia_embeddings/"
annoy_mmap_out_file_name = "glove.6B.100d.txt_annoy"
pickle_out_filename = "glove.6B.100d_word_to_index_dict.pkl"

# Full output paths: first the annoy mmap, then the pickled word_to_index dict
index_out_path, pickle_out_path = (
    os.path.join(out_dir, file_name)
    for file_name in (annoy_mmap_out_file_name, pickle_out_filename)
)

In [4]:
%%time
# Build from the raw GloVe text file unless both cached artifacts (the annoy
# index mmap and the word to index dictionary pickle) are already on disk
if not os.path.exists(index_out_path) or not os.path.exists(pickle_out_path):
    embeddings_file_path = "data/glove_wikipedia_embeddings/glove.6B.100d.txt"
    print("Creating new embeddings from file: {}".format(embeddings_file_path))
    embeddings = PreTrainedEmbeddings.from_embeddings_file(embeddings_file_path)
else:
    print("Annoy mmap and word to index file already exist. Loading from disk..")
    embeddings = PreTrainedEmbeddings.from_annoy_mmap(
        mmap_path=index_out_path,
        word_to_index_path=pickle_out_path,
    )

Annoy mmap and word to index file already exist. Loading from disk..
CPU times: user 131 ms, sys: 54.2 ms, total: 186 ms
Wall time: 179 ms


We save our annoy index object wrapper so we don't have to regenerate the annoy index

In [5]:
# Write the annoy index and the word_to_index pickle to their output paths
embeddings.save_embeddings(
    mmap_out_path=index_out_path,
    pickle_out_path=pickle_out_path,
)

Here is the relationship between gendered nouns and pronouns

In [6]:
# man uses pronoun he as woman uses pronoun ___
embeddings.compute_and_print_analogy(
    word1="man",
    word2="he",
    word3="woman",
    relationship="uses pronoun",
)

man uses pronoun he as woman uses pronoun she
man uses pronoun he as woman uses pronoun never


Relationships between verbs and nouns

In [7]:
# fly is to plane as sail is to ___ (default relationship phrase)
embeddings.compute_and_print_analogy(
    word1="fly",
    word2="plane",
    word3="sail",
)

fly is to plane as sail is to ship
fly is to plane as sail is to vessel


Relationships between nouns

In [8]:
# cat is an older kitten as dog is an older ___
embeddings.compute_and_print_analogy(
    word1="cat",
    word2="kitten",
    word3="dog",
    relationship="is an older",
)

cat is an older kitten as dog is an older puppy
cat is an older kitten as dog is an older toddler
cat is an older kitten as dog is an older sleds


Hypernymy relationship - a word is a member of a category

In [9]:
# blue is a kind of color as dog is a kind of ___
embeddings.compute_and_print_analogy(
    word1="blue",
    word2="color",
    word3="dog",
    relationship="is a kind of",
)

blue is a kind of color as dog is a kind of cat
blue is a kind of color as dog is a kind of animal
blue is a kind of color as dog is a kind of breed


Meronymy - a word names a part of a whole

In [10]:
# toe is a part of the foot as finger is a part of the ___
embeddings.compute_and_print_analogy(
    word1="toe",
    word2="foot",
    word3="finger",
    relationship="is a part of the",
)

toe is a part of the foot as finger is a part of the hand
toe is a part of the foot as finger is a part of the attached
toe is a part of the foot as finger is a part of the apart


Troponymy - difference in manner

In [11]:
# talk is how you communicate as read is how you ___
embeddings.compute_and_print_analogy(
    word1="talk",
    word2="communicate",
    word3="read",
    relationship="is how you",
)

talk is how you communicate as read is how you instructions
talk is how you communicate as read is how you communicating
talk is how you communicate as read is how you transmit


Metonymy - convention / figure of speech

In [12]:
# blue is the color for democrat as red is the color for ___
embeddings.compute_and_print_analogy(
    word1="blue",
    word2="democrat",
    word3="red",
    relationship="is the color for",
)

blue is the color for democrat as red is the color for republican
blue is the color for democrat as red is the color for congressman
blue is the color for democrat as red is the color for senator


Adjectival scales

In [13]:
# fast is the fastest as young is the ___
embeddings.compute_and_print_analogy(
    word1="fast",
    word2="fastest",
    word3="young",
    relationship="is the",
)

fast is the fastest as young is the youngest
fast is the fastest as young is the female
fast is the fastest as young is the younger
fast is the fastest as young is the sixth


And now for the canonical example of word embeddings! However, you must watch out for things such as gender being encoded in word embeddings as this can introduce unwanted biases in downstream models (remember! humans created the text data our models are trained on!!!!)

In [14]:
# man is to king as woman is to ___
embeddings.compute_and_print_analogy(
    word1="man",
    word2="king",
    word3="woman",
    relationship="is to",
)

man is to king as woman is to queen
man is to king as woman is to monarch
man is to king as woman is to throne


Differentiating between language regularities and codified cultural biases is difficult. For example, long-standing biases in culture lead our word embeddings to learn that men are most frequently associated with doctors, whereas women are most often associated with nurses.

In [15]:
# man is to doctor as woman is to ___
embeddings.compute_and_print_analogy(
    word1="man",
    word2="doctor",
    word3="woman",
    relationship="is to",
)

man is to doctor as woman is to nurse
man is to doctor as woman is to physician


Aside from any real sociological concerns, realize that these underlying biases lead our model to learn an objectively inaccurate model of the English language (i.e. the word doctor is not defined as being applicable only to men; in the text our model was trained on, doctors were simply most often associated with men).