In [1]:
import pandas as pd
import torch
import torch.nn as nn
from annoy import AnnoyIndex
import numpy as np
import random
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation notices from the libraries used below.
warnings.filterwarnings("ignore")

# Location of the 100-dimensional GloVe vectors; the path looks like a Kaggle
# dataset mount, so it presumably needs adjusting when run elsewhere.
embedding_file = '/kaggle/input/glove6b100dtxt/glove.6B.100d.txt'

# 1. PreTrainedEmbeddings class
### Details are in the following subsections 

In [2]:
class PreTrainedEmbeddings(object):
    """A wrapper around pre-trained word vectors and their use.

    Holds the word <-> index mappings, the raw vectors, and an Annoy index
    (euclidean metric) that is used for nearest-neighbor queries.
    """
    def __init__(self, word_to_index, word_vectors):
        """
        Args:
            word_to_index (dict): mapping from word to integers
            word_vectors (list of numpy arrays): word_vectors[i] is the
                vector of the word whose index is i
        """
        self.word_to_index = word_to_index
        self.word_vectors  = word_vectors
        self.index_to_word = {v: k for k, v in self.word_to_index.items()}

        # The index dimension is taken from the first vector; every vector
        # added below is expected to have that same length.
        self.index = AnnoyIndex(len(word_vectors[0]), metric='euclidean')
        print("Building Index!")
        for i in self.word_to_index.values():
            self.index.add_item(i, self.word_vectors[i])
        self.index.build(50)
        print("Finished!")

    @classmethod
    def from_embeddings_file(cls, embedding_file):
        """Instantiate from pre-trained vector file.

        Vector file should be of the format:
            word0 x0_0 x0_1 x0_2 x0_3 ... x0_N
            word1 x1_0 x1_1 x1_2 x1_3 ... x1_N

        Blank lines are skipped. If a word occurs more than once, only its
        first vector is kept so that indices and vectors stay aligned.

        Args:
            embedding_file (str): location of the file
        Returns:
            instance of PreTrainedEmbeddings
        """
        word_to_index = {}
        word_vectors = []

        # Read as UTF-8 explicitly (the vocabulary contains non-ASCII tokens)
        # and stream line by line instead of loading the whole file at once.
        with open(embedding_file, encoding="utf-8") as fp:
            for line in fp:
                # Strip only the right side: the newline / trailing blanks sit
                # after the last number, the word itself is left untouched.
                fields = line.rstrip().split(" ")
                if len(fields) < 2:
                    # blank line, or a token without any vector component
                    continue

                word = fields[0]
                if word in word_to_index:
                    # A repeated word would otherwise append a second vector
                    # without growing the vocabulary, shifting every later
                    # word off its vector.
                    continue

                vec = np.array([float(x) for x in fields[1:]])

                word_to_index[word] = len(word_vectors)
                word_vectors.append(vec)

        return cls(word_to_index, word_vectors)

    def get_embedding(self, word):
        """
        Args:
            word (str)
        Returns
            an embedding (numpy.ndarray)
        Raises:
            KeyError: if the word is not in the vocabulary
        """
        return self.word_vectors[self.word_to_index[word]]

    def get_closest_to_vector(self, vector, n=1):
        """Given a vector, return its n nearest neighbors

        Args:
            vector (np.ndarray): should match the size of the vectors
                in the Annoy index
            n (int): the number of neighbors to return
        Returns:
            [str, str, ...]: words that are nearest to the given vector,
                in the order Annoy returns them (closest first)
        """
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def compute_and_print_analogy(self, word1, word2, word3):
        """Prints the solutions to analogies using word embeddings

        Analogies are word1 is to word2 as word3 is to __
        This method will print: word1 : word2 :: word3 : word4

        Args:
            word1 (str)
            word2 (str)
            word3 (str)
        Raises:
            KeyError: if any of the three words is not in the vocabulary
        """
        vec1 = self.get_embedding(word1)
        vec2 = self.get_embedding(word2)
        vec3 = self.get_embedding(word3)

        # now compute the fourth word's embedding!
        spatial_relationship = vec2 - vec1
        vec4 = vec3 + spatial_relationship

        # Ask for 4 neighbors: even if all 3 input words are among them,
        # one candidate is left after filtering.
        closest_words = self.get_closest_to_vector(vec4, n=4)
        existing_words = set([word1, word2, word3])
        closest_words = [word for word in closest_words
                             if word not in existing_words]

        if len(closest_words) == 0:
            print("Could not find nearest neighbors for the computed vector!")
            return

        for word4 in closest_words:
            print("{} : {} :: {} : {}".format(word1, word2, word3, word4))

## 1.1 \_\_init\_\_(self, word_to_index, word_vectors)
### Read the pre-trained word embedding vectors from [Stanford’s GloVe](https://nlp.stanford.edu/projects/glove/). 
### - GloVe, developed by Stanford, is an algorithm used for generating word embeddings, based on the Global Vectors model. GloVe (Global Vectors for Word Representation) is a word embedding model that represents words as high-dimensional vectors and encodes the semantic information of words into vector space.
### - glove.6B.zip is a pre-trained version of the Stanford GloVe model. It is a word embedding model that has been trained on large-scale corpora and can be directly used for natural language processing (NLP) tasks such as word similarity calculation, text classification, sentiment analysis, etc. Pre-trained GloVe models typically include vector representations of millions of words, with each word corresponding to a high-dimensional vector. It has 6B tokens and a vocabulary of 400K words.
### - The glove.6B.zip archive contains the pre-trained vector files of the GloVe model. It includes the following files:
**1. glove.6B.50d.txt: contains 50-dimensional word vectors** 

**2. glove.6B.100d.txt: contains 100-dimensional word vectors**

**3. glove.6B.200d.txt: contains 200-dimensional word vectors**

**4. glove.6B.300d.txt: contains 300-dimensional word vectors**
### - Information
**1. Number of tokens: 6B**

**2. Number of words in the vocabulary: 400K**

**3. Dimension of the representation: 50, 100, 200, 300 (100 is used below)**



In [3]:
### Print the first 3 lines in "glove.6B.100d.txt"
### Each line is read as a plain str: "<word> <x_0> <x_1> ... <x_N>\n".
### The encoding is given explicitly so the result does not depend on the platform default.
with open(embedding_file, 'r', encoding='utf-8') as file:
    for i in range(3):
        line = file.readline()
        # the line keeps its trailing newline, so print() leaves a blank line after it
        print(line)
        print(type(line))

the -0.038194 -0.24487 0.72812 -0.39961 0.083172 0.043953 -0.39141 0.3344 -0.57545 0.087459 0.28787 -0.06731 0.30906 -0.26384 -0.13231 -0.20757 0.33395 -0.33848 -0.31743 -0.48336 0.1464 -0.37304 0.34577 0.052041 0.44946 -0.46971 0.02628 -0.54155 -0.15518 -0.14107 -0.039722 0.28277 0.14393 0.23464 -0.31021 0.086173 0.20397 0.52624 0.17164 -0.082378 -0.71787 -0.41531 0.20335 -0.12763 0.41367 0.55187 0.57908 -0.33477 -0.36559 -0.54857 -0.062892 0.26584 0.30205 0.99775 -0.80481 -3.0243 0.01254 -0.36942 2.2167 0.72201 -0.24978 0.92136 0.034514 0.46745 1.1079 -0.19358 -0.074575 0.23353 -0.052062 -0.22044 0.057162 -0.15806 -0.30798 -0.41625 0.37972 0.15006 -0.53212 -0.2055 -1.2526 0.071624 0.70565 0.49744 -0.42063 0.26148 -1.538 -0.30223 -0.073438 -0.28312 0.37104 -0.25217 0.016215 -0.017099 -0.38984 0.87424 -0.72569 -0.51058 -0.52028 -0.1459 0.8278 0.27062

<class 'str'>
, -0.10767 0.11053 0.59812 -0.54361 0.67396 0.10663 0.038867 0.35481 0.06351 -0.094189 0.15786 -0.81665 0.14172 0.21939 0.

### 1.1.1 Define "word_to_index" and "word_vectors"

In [4]:
##### For demonstration purposes, we'll only use the first 10 lines of glove.6B.100d.txt,
##### which correspond to the embeddings of the first 10 words.

### a dictionary that stores the mapping from a word to its index
word_to_index  = {}
### a list that stores the embedding vectors for all the words in the vocabulary
word_vectors   = []

with open(embedding_file, encoding='utf-8') as fp:
    ### Iterate over the file lazily: fp.readlines() would load every line
    ### of the file into memory although only the first 10 are used here.
    for i, line in enumerate(fp):
        ### For demonstration purposes, here only read the first 10 lines of glove.6B.100d.txt,
        if i >= 10:
            break

        ### Originally, each line is a str object, like "the -0.038194 -0.24487 ... 0.27062".
        ### Using the .split(" ") method, split the string into substrings based on the
        ### delimiter " ", and return a list containing the substrings.
        ### In particular, each line contains 101 elements: 1 word + 100-dimension vector
        line = line.split(" ")

        ### the first element in the list (i.e., the first word in the original line) is the word
        word = line[0]

        ### the remaining elements in the list are the 100 vector components,
        ### converted from str to float (float() tolerates the trailing newline)
        vec  = np.array([float(x) for x in line[1:]])

        ### store the index of the ith word in word_to_index
        word_to_index[word] = len(word_to_index)

        ### store the ith vector in word_vectors
        word_vectors.append(vec)

In [5]:
### Show the whole mapping, then the index of one word that is present
print('word_to_index', word_to_index, '-'*100, sep='\n')
print("word_to_index['to'] - the index of 'to'", word_to_index['to'], '-'*100, sep='\n')

### Looking up a word outside the 10-word demo vocabulary raises an exception;
### it is caught so that only its message (the missing key) is printed.
print("word_to_index['apple'] - the index of 'apple. Note 'apple' is not in the vocabulary")
try:
    print(word_to_index['apple'])
except Exception as lookup_error:
    print(lookup_error)

word_to_index
{'the': 0, ',': 1, '.': 2, 'of': 3, 'to': 4, 'and': 5, 'in': 6, 'a': 7, '"': 8, "'s": 9}
----------------------------------------------------------------------------------------------------
word_to_index['to'] - the index of 'to'
4
----------------------------------------------------------------------------------------------------
word_to_index['apple'] - the index of 'apple. Note 'apple' is not in the vocabulary
'apple'


In [6]:
### A dictionary that maps each index back to its word (the inverse of word_to_index)
index_to_word = dict(zip(word_to_index.values(), word_to_index.keys()))
index_to_word

{0: 'the',
 1: ',',
 2: '.',
 3: 'of',
 4: 'to',
 5: 'and',
 6: 'in',
 7: 'a',
 8: '"',
 9: "'s"}

In [7]:
### Display the first two entries of word_vectors; the slice is the cell's
### last expression, so the notebook renders it below the printed caption.
print('word_vectors (for the first two words)')
word_vectors[:2]

word_vectors (for the first two words)


[array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
        -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
         0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
        -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
         0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
        -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
         0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
         0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
        -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
        -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
        -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
        -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
        -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
        -1.2526  ,  0.071624,  0.70565

### 1.1.2 Define “index” using AnnoyIndex
### - Library for approximate nearest neighbors: [Annoy](https://pypi.org/project/annoy/). Annoy was built by [Erik Bernhardsson](https://erikbern.com/).

### - Annoy ([Approximate Nearest Neighbors](https://en.wikipedia.org/wiki/Nearest_neighbor_search#Approximate_nearest_neighbor) Oh Yeah) is a C++ library with Python bindings to search for points in space that are close to a given query point. It also creates large read-only file-based data structures that are [mmap](https://en.wikipedia.org/wiki/Mmap)ped into memory so that many processes may share the same data.

### - Useful functions:
1) **AnnoyIndex(f, metric)**: returns a new index that’s read-write and stores vector of f dimensions. Metric can be "angular", "euclidean", "manhattan", "hamming", or "dot".

2) **a.get_n_items()**: returns the number of items in the index.

3) **a.get_item_vector(i)**: returns the vector for item i that was previously added.

4) **a.add_item(i, v)**: adds item i (any nonnegative integer) with vector v. Note that it will allocate memory for max(i)+1 items - for example, if "a" has 2 existing items, numbered 0 and 1, a.add_item(4,...) will add one new item and allocate it to the (4+1)th item, and the items with skipped numbers will be assigned with vector of zeroes.

5) **a.build(n_trees, n_jobs=-1)**: builds a forest of n_trees trees. More trees gives higher precision when querying. After calling build, no more items can be added. n_jobs specifies the number of threads used to build the trees. n_jobs=-1 uses all available CPU cores.

6) **a.get_distance(i, j)**: returns the distance between items i and j. NOTE: this used to return the squared distance, but has been changed as of Aug 2016.

7) **a.get_nns_by_item(i, n, search_k=-1, include_distances=False)**: returns the n closest items. During the query it will inspect up to search_k nodes which defaults to n_trees * n if not provided. search_k gives you a run-time tradeoff between better accuracy and speed. If you set include_distances to True, it will return a 2 element tuple with two lists in it: the second one containing all corresponding distances.

8) **a.get_nns_by_vector(v, n, search_k=-1, include_distances=False)**: same but query by vector v.

### Initialize an index object

In [8]:
##### AnnoyIndex() takes the vector dimension as its first argument: every
##### vector written to and stored in the index must have that dimension.

index_temp = AnnoyIndex(10, metric='euclidean')
print('Type of “index”',
      type(index_temp),
      '-'*60,
      'All properties and methods of “index”',
      sep='\n')
dir(index_temp)

Type of “index”
<class 'annoy.Annoy'>
------------------------------------------------------------
All properties and methods of “index”


['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'add_item',
 'build',
 'f',
 'get_distance',
 'get_item_vector',
 'get_n_items',
 'get_n_trees',
 'get_nns_by_item',
 'get_nns_by_vector',
 'load',
 'on_disk_build',
 'save',
 'set_seed',
 'unbuild',
 'unload',
 'verbose']

### Add items to the index 

In [9]:
index_temp = AnnoyIndex(10, metric='euclidean')
item_count = index_temp.get_n_items()
print("Number of items in index_temp: {}".format(item_count))
##### No item has been added with add_item yet, so asking for item 0
##### raises an exception; its message is printed instead.
try:
    index_temp.get_item_vector(0)
except Exception as lookup_error:
    print(lookup_error)

Number of items in index_temp: 0
Item index larger than the largest item index


In [10]:
##### Add two items - a length-10 array of 0s (item 0) and a length-10 array of 1s (item 1)

new_items = [
    ("The first item (numbered 0) in index_temp", 0),
    ("The second item (numbered 1) in index_temp", 1),
]
for position, (caption, item_no) in enumerate(new_items):
    if position > 0:
        print('-'*60)
    # item k is filled with the value k
    index_temp.add_item(item_no, np.ones(10)*item_no)
    print(caption)
    print(index_temp.get_item_vector(item_no))
    print("Number of items in index_temp: {}".format(index_temp.get_n_items()))

The first item (numbered 0) in index_temp
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Number of items in index_temp: 1
------------------------------------------------------------
The second item (numbered 1) in index_temp
[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Number of items in index_temp: 2


In [11]:
##### Add one item - a length-10 array of 4s - under number 4, skipping numbers 2 and 3
index_temp.add_item(4,np.ones(10)*4)

##### Inspect the two skipped numbers and the newly added item
inspected_items = [
    ("The third item (numbered 2) in index_temp", 2),
    ("The fourth item (numbered 3) in index_temp", 3),
    ("The fifth item (numbered 4) in index_temp", 4),
]
for position, (caption, item_no) in enumerate(inspected_items):
    if position > 0:
        print('-'*60)
    print(caption)
    print(index_temp.get_item_vector(item_no))
    print("Number of items in index_temp: {}".format(index_temp.get_n_items()))

The third item (numbered 2) in index_temp
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Number of items in index_temp: 5
------------------------------------------------------------
The fourth item (numbered 3) in index_temp
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
Number of items in index_temp: 5
------------------------------------------------------------
The fifth item (numbered 4) in index_temp
[4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0]
Number of items in index_temp: 5


### Add all the vectors from word_vectors to index and build the index (tree)

In [12]:
### The index dimension is the length of the first word vector
index = AnnoyIndex(len(word_vectors[0]), metric='euclidean')
### Each word's vector is stored under that word's index
for item_id in word_to_index.values():
    index.add_item(item_id, word_vectors[item_id])
### Build the forest with 50 trees, then report how many items are stored
index.build(50)
print(index.get_n_items())

10


In [13]:
### Read item 0 back from the index; get_item_vector returns a plain list of floats
print("The first item (numbered 0) in index")
print(index.get_item_vector(0))

The first item (numbered 0) in index
[-0.03819400072097778, -0.24487000703811646, 0.7281200289726257, -0.3996100127696991, 0.08317200094461441, 0.043953001499176025, -0.3914099931716919, 0.3343999981880188, -0.5754500031471252, 0.08745899796485901, 0.28786998987197876, -0.06730999797582626, 0.3090600073337555, -0.263839989900589, -0.13231000304222107, -0.20757000148296356, 0.333950012922287, -0.33847999572753906, -0.3174299895763397, -0.4833599925041199, 0.14640000462532043, -0.37303999066352844, 0.345770001411438, 0.05204100161790848, 0.4494599997997284, -0.46970999240875244, 0.026280000805854797, -0.5415499806404114, -0.15518000721931458, -0.14106999337673187, -0.03972199931740761, 0.2827700078487396, 0.14393000304698944, 0.2346400022506714, -0.3102099895477295, 0.08617299795150757, 0.20397000014781952, 0.5262399911880493, 0.17163999378681183, -0.08237800002098083, -0.7178699970245361, -0.41530999541282654, 0.2033499926328659, -0.12762999534606934, 0.41367000341415405, 0.551869988441

### 1.1.3 Calculate the distance from the item (numbered 4)

In [14]:
##### Calculate the distance from item 4 to every item using a.get_distance()
##### The frame is built in one step from complete columns, so 'i' and
##### 'distance' get numeric dtypes instead of object columns filled row by row.
n_items = index.get_n_items()
distances_from_4 = pd.DataFrame({
    'i': list(range(n_items)),
    'distance': [index.get_distance(4, i) for i in range(n_items)],
})
distances_from_4

Unnamed: 0,i,distance
0,0,4.408458
1,1,4.915794
2,2,4.384933
3,3,5.519794
4,4,0.0
5,5,4.434967
6,6,4.824089
7,7,5.05785
8,8,6.302962
9,9,5.491484


In [15]:
##### Calculate the same distances with np.linalg.norm(); the results agree
##### with a.get_distance() up to floating-point rounding.
n_items = index.get_n_items()
# the reference vector (item 4) does not change, so fetch it once
reference_vec = np.array(index.get_item_vector(4))
_distances_from_4 = pd.DataFrame({
    'i': list(range(n_items)),
    'distance': [np.linalg.norm(np.array(index.get_item_vector(i)) - reference_vec)
                 for i in range(n_items)],
})
_distances_from_4

Unnamed: 0,i,distance
0,0,4.408458
1,1,4.915794
2,2,4.384933
3,3,5.519794
4,4,0.0
5,5,4.434967
6,6,4.824089
7,7,5.05785
8,8,6.302962
9,9,5.491485


In [16]:
### Order the items from nearest to farthest from item 4 (item 4 itself comes first with distance 0)
distances_from_4.sort_values('distance')

Unnamed: 0,i,distance
4,4,0.0
2,2,4.384933
0,0,4.408458
5,5,4.434967
6,6,4.824089
1,1,4.915794
7,7,5.05785
9,9,5.491484
3,3,5.519794
8,8,6.302962


In [17]:
### Return the 5 closest items to item 4 (i.e. word_vectors[4]) using a.get_nns_by_item();
### include_distances=True yields a tuple (item numbers, distances)
index.get_nns_by_item(4,5,include_distances=True)

([4, 2, 0, 5, 6],
 [0.0,
  4.384932518005371,
  4.408458232879639,
  4.434966564178467,
  4.8240885734558105])

In [18]:
### Return the 5 closest items to the vector word_vectors[4] using a.get_nns_by_vector();
### the query is a vector here rather than an item number
index.get_nns_by_vector(word_vectors[4],5,include_distances=True)

([4, 2, 0, 5, 6],
 [0.0,
  4.384932518005371,
  4.408458232879639,
  4.434966564178467,
  4.8240885734558105])

## 1.2 from_embeddings_file()
### Instantiate from pre-trained vector file

In [19]:
### Parse every line of the embedding file and build the Annoy index
### (the index is built inside PreTrainedEmbeddings.__init__, which prints its progress)
embeddings = PreTrainedEmbeddings.from_embeddings_file(embedding_file)

Building Index!
Finished!


In [20]:
### Confirm that the classmethod returned an instance of the wrapper class
type(embeddings)

__main__.PreTrainedEmbeddings

In [21]:
### List the attributes and methods available on the wrapper instance
dir(embeddings)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'compute_and_print_analogy',
 'from_embeddings_file',
 'get_closest_to_vector',
 'get_embedding',
 'index',
 'index_to_word',
 'word_to_index',
 'word_vectors']

In [22]:
### List the attributes and methods of the AnnoyIndex stored on the wrapper
dir(embeddings.index)

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'add_item',
 'build',
 'f',
 'get_distance',
 'get_item_vector',
 'get_n_items',
 'get_n_trees',
 'get_nns_by_item',
 'get_nns_by_vector',
 'load',
 'on_disk_build',
 'save',
 'set_seed',
 'unbuild',
 'unload',
 'verbose']

## 1.3 get_embedding(self, word)
### Get the embedding (vector representation) of a word

In [23]:
##### get_embedding only works for words that are present in the vocabulary (word_to_index)
word_now = 'apple'
vec_now  = embeddings.get_embedding(word_now)
print('Example word:',
      word_now,
      '-'*60,
      'Pre-trained vector of the example word:',
      'A vector with dimension = {}'.format(len(vec_now)),
      vec_now,
      sep='\n')

Example word:
apple
------------------------------------------------------------
Pre-trained vector of the example word:
A vector with dimension = 100
[-0.5985    -0.46321    0.13001   -0.019576   0.4603    -0.3018
  0.8977    -0.65634    0.66858   -0.49164    0.037557  -0.050889
  0.6451    -0.53882   -0.3765    -0.04312    0.51384    0.17783
  0.28596    0.92063   -0.49349   -0.48583    0.61321    0.78211
  0.19254    0.91228   -0.055596  -0.12512   -0.65688    0.068557
  0.55629    1.611     -0.0073642 -0.48879    0.45493    0.96105
 -0.063369   0.17432    0.9814    -1.3125    -0.15801   -0.54301
 -0.13888   -0.26146   -0.3691     0.26844   -0.24375   -0.19484
  0.62583   -0.7377     0.38351   -0.75004   -0.39053    0.091498
 -0.36591   -1.4715    -0.45228    0.2256     1.1412    -0.38526
 -0.06716    0.57288   -0.39191    0.31302   -0.29235   -0.96157
  0.15154   -0.21659    0.25103    0.096967   0.2843     1.4296
 -0.50565   -0.51374   -0.47218    0.32036    0.023149   0.22623
 -0

In [24]:
##### If the word is not in the vocabulary, the dictionary lookup inside
##### get_embedding raises an exception; printing it shows only the missing key.
try:
    embeddings.get_embedding('aaaaaaa')
except Exception as e:
    print(e)

'aaaaaaa'


## 1.4 get_closest_to_vector(self, vector, n=1)
### Given a vector, return its n nearest neighbors

In [25]:
### Query the Annoy index directly with the vector of 'friend';
### the result is a list of item numbers, not words.
word_now  = 'friend'
vec_now   = embeddings.get_embedding(word_now)
nearest_n = 3
nn_indices = embeddings.index.get_nns_by_vector(vec_now, nearest_n) 
nn_indices 

[1409, 1327, 629]

In [26]:
### Translate each neighbor's item number back into its word, numbering them from 1
i = 0  # stays 0 when there are no neighbors at all
for i, neighbor in enumerate(nn_indices, start=1):
    print('neighbor' + str(i))
    print(embeddings.index_to_word[neighbor])
    print('-'*60)

neighbor1
friend
------------------------------------------------------------
neighbor2
husband
------------------------------------------------------------
neighbor3
father
------------------------------------------------------------


In [27]:
##### Find the closest neighbors to random samples of the words in glove.6B.100d.txt
##### NOTE: the random generator is not seeded, so each run picks different words.
n_words     = 5
n_neighbors = 5
##### random.sample() needs a sequence: sampling straight from the dict keys view is
##### deprecated since Python 3.9 and raises TypeError from Python 3.11 on.
words = random.sample(list(embeddings.word_to_index), n_words)
for w in words:
    print(w)
    print(embeddings.get_closest_to_vector(embeddings.get_embedding(w),n_neighbors))
    print('-'*60)

prezzo
['prezzo', 'bilen', 'boom-boom', 'gnus', 'half-orcs']
------------------------------------------------------------
puhs
['puhs', 'preh', 'gooz', 'zeef', 'tooj']
------------------------------------------------------------
ccne
['ccne', 'lcme', 'accp', 'dtf', 'ncse']
------------------------------------------------------------
nakagami
['nakagami', 'spasov', 'emosi', 'kranenburg', 'kakizaki']
------------------------------------------------------------
roll-on
['roll-on', '4-seater', 'muzzleloader', 'torpedo-bomber', 'five-seater']
------------------------------------------------------------


In [28]:
##### For a few hand-picked words, list each word followed by its nearest neighbors
words = ['star','banana','boy','model','sky']
for w in words:
    print(w)
    query_vec = embeddings.get_embedding(w)
    neighbor_words = embeddings.get_closest_to_vector(query_vec, n_neighbors)
    print(neighbor_words)
    print('-'*60)

star
['star', 'stars', 'superstar', 'legend', 'hero']
------------------------------------------------------------
banana
['banana', 'mango', 'coconut', 'bananas', 'potato']
------------------------------------------------------------
boy
['boy', 'girl', 'kid', 'man', 'boys']
------------------------------------------------------------
model
['model', 'models', 'concept', 'design', 'introduced']
------------------------------------------------------------
sky
['sky', 'skies', 'horizon', 'bright', 'shadows']
------------------------------------------------------------


## 1.5 compute_and_print_analogy(self, word1, word2, word3)
### Prints the solutions to analogies using word embeddings

In [29]:
### The analogy to solve: word1 is to word2 as word3 is to ___
word1 = 'cat'
word2 = 'kitten'
word3 = 'dog'

In [30]:
### Simple hypothesis: an analogy is a spatial relationship between vectors
###     vec1 - vec2 = vec3 - vec4
### =>  vec4 = vec3 - (vec1 - vec2)
vec1, vec2, vec3 = (embeddings.get_embedding(w) for w in (word1, word2, word3))
vec4 = vec3 - (vec1 - vec2)

In [31]:
### Step 1: the 4 words nearest to the computed vector
closest_words = embeddings.get_closest_to_vector(vec4, n=4)
print('closest_words', closest_words, '-'*60, sep='\n')

### Step 2: the input words, which must not be offered as the answer
existing_words = set((word1, word2, word3))
print('existing_words', existing_words, '-'*60, sep='\n')

### Step 3: keep only the neighbors that are not input words
solution_words = []
for candidate in closest_words:
    if candidate not in existing_words:
        solution_words.append(candidate)
print('solution_words', solution_words, '-'*60, sep='\n')

### Step 4: print one analogy line per remaining candidate
print('Solution')
for word4 in solution_words:
    print("{} : {} :: {} : {}".format(word1, word2, word3, word4))

closest_words
['kitten', 'puppy', 'furry', 'mannequin']
------------------------------------------------------------
existing_words
{'dog', 'kitten', 'cat'}
------------------------------------------------------------
solution_words
['puppy', 'furry', 'mannequin']
------------------------------------------------------------
Solution
cat : kitten :: dog : puppy
cat : kitten :: dog : furry
cat : kitten :: dog : mannequin


In [32]:
### man is to he as woman is to ___
embeddings.compute_and_print_analogy('man', 'he', 'woman')

man : he :: woman : she
man : he :: woman : never


In [33]:
### man is to king as woman is to ___
embeddings.compute_and_print_analogy('man', 'king', 'woman')

man : king :: woman : queen
man : king :: woman : monarch
man : king :: woman : elizabeth


In [34]:
### fast is to fastest as small is to ___
embeddings.compute_and_print_analogy('fast', 'fastest', 'small')

fast : fastest :: small : smallest
fast : fastest :: small : largest
fast : fastest :: small : large


In [35]:
### blue is to democrat as red is to ___
embeddings.compute_and_print_analogy('blue', 'democrat', 'red')

blue : democrat :: red : republican
blue : democrat :: red : congressman
blue : democrat :: red : senator


In [36]:
### blue is to color as dog is to ___
embeddings.compute_and_print_analogy('blue', 'color', 'dog')

blue : color :: dog : animal
blue : color :: dog : breed
blue : color :: dog : dogs


In [37]:
### cat is to kitten as dog is to ___ (same analogy as the step-by-step walkthrough above)
embeddings.compute_and_print_analogy('cat', 'kitten', 'dog')

cat : kitten :: dog : puppy
cat : kitten :: dog : furry
cat : kitten :: dog : mannequin
