[![Dataflowr](https://raw.githubusercontent.com/dataflowr/website/master/_assets/dataflowr_logo.png)](https://dataflowr.github.io/website/)

# Finding Synonyms and Analogies

This notebook is adapted from a [PyTorch NLP tutorial](https://github.com/joosthub/pytorch-nlp-tutorial-ny2018/blob/master/day_1/0_Using_Pretrained_Embeddings.ipynb). Source: the [repository for the training tutorial at the 2018 O'Reilly AI Conference in NYC on April 29 and 30, 2018](https://github.com/joosthub/pytorch-nlp-tutorial-ny2018).

In [None]:
## Colab SETUP
#!pip install annoy

In [2]:
from annoy import AnnoyIndex
import numpy as np
import torch
from tqdm import notebook
import os
from pathlib import Path

GloVe embeddings can be downloaded from the [GloVe webpage](https://nlp.stanford.edu/projects/glove/).

Uncomment the part of the following cell that matches your setup (Colab or local), so that `ROOT_DIR` is defined before the rest of the notebook runs.

In [4]:
## Colab SETUP
## Uncomment the commands below to download and unzip GloVe. The ROOT_DIR value is
## one author's absolute path: replace it with a directory that exists on your machine.
## NOTE(review): these commands unzip into ROOT_DIR itself, while the next cell looks for
## the file under ROOT_DIR/data/ -- adjust one of the two so they agree.
# #!mkdir data
# #%cd data
# ROOT_DIR = '/home/andy/00_workspace/dl/data/08/content'
# %cd $ROOT_DIR
# !wget http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
# !unzip glove.6B.zip

## local SETUP download glove in ~/data/ with the commands wget http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
## and unzip glove.6B.zip
## Then uncomment the line below; ROOT_DIR must be defined for the next cell to run.
#ROOT_DIR = Path.home()

[Errno 2] No such file or directory: 'ROOT_DIR'
/spc/home/andy/00_workspace/dl/notebooks/Module8
--2023-05-09 20:29:41--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2023-05-09 20:32:27 (4.96 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [5]:
# Name of the 100-dimensional GloVe file and the directory (under ROOT_DIR) holding it.
file = 'glove.6B.100d.txt'
data_path = os.path.join(ROOT_DIR, 'data/')
# data_path already ends with a separator, so joining just appends the file name.
glove_filename = os.path.join(data_path, file)

In [6]:
def load_word_vectors(filename):
    """Parse a GloVe-style text file into a vocabulary and a list of vectors.

    Every non-blank line is expected to hold a token followed by its
    space-separated float components.

    Args:
        filename: Path of the embeddings text file.

    Returns:
        A ``(word_to_index, word_vectors)`` tuple. ``word_to_index`` maps each
        token to the position of its vector in ``word_vectors``, which is a
        list of 1-D numpy arrays. When a token appears more than once, only
        its first occurrence is kept.

    Raises:
        ValueError: If a component cannot be converted to ``float``.
    """
    word_to_index = {}
    word_vectors = []

    # Read as UTF-8 explicitly rather than relying on the platform's default encoding.
    with open(filename, encoding='utf-8') as fp:
        # readlines() gives tqdm a length, so the bar can show a total.
        for line in notebook.tqdm(fp.readlines(), leave=False):
            fields = line.rstrip('\n').split(" ")
            word = fields[0]

            # Skip blank lines and repeated tokens: every stored index must
            # point at the matching row of word_vectors.
            if len(fields) < 2 or word in word_to_index:
                continue

            word_to_index[word] = len(word_vectors)
            word_vectors.append(np.array([float(x) for x in fields[1:]]))

    return word_to_index, word_vectors

In [8]:
# Parse the GloVe file once so the vocabulary and vectors can be inspected in the next cells.
word_to_index, word_vectors = load_word_vectors(glove_filename)

  0%|          | 0/400000 [00:00<?, ?it/s]

In [9]:
len(word_vectors)

400000

In [10]:
word_vectors[0].shape

(100,)

In [11]:
word_to_index['beautiful']

3366

In [12]:
class PreTrainedEmbeddings(object):
    """Word vectors loaded from a GloVe text file, with an Annoy index over them.

    Attributes set by ``__init__``:
        word_to_index: dict mapping each token to its integer id.
        index_to_word: the inverse mapping, id to token.
        word_vectors: list of vectors, addressed by id.
        word_vector_size: length of the first vector.
        index: the built ``AnnoyIndex`` holding one item per id.
    """

    def __init__(self, glove_filename, metric='euclidean', n_trees=50):
        """Load the vectors and build the nearest-neighbour index.

        Args:
            glove_filename: Path handed to ``load_word_vectors``.
            metric: Distance metric passed to ``AnnoyIndex``.
            n_trees: Number of trees passed to ``AnnoyIndex.build``.
        """
        self.word_to_index, self.word_vectors = load_word_vectors(glove_filename)
        self.word_vector_size = len(self.word_vectors[0])

        self.index_to_word = {v: k for k, v in self.word_to_index.items()}
        # AnnoyIndex: an in-memory index used for fast similarity search.
        # By default it uses the Euclidean distance and is built with 50 trees.
        self.index = AnnoyIndex(self.word_vector_size, metric=metric)
        print('Building Index')
        for i in notebook.tqdm(self.word_to_index.values(), leave=False):
            self.index.add_item(i, self.word_vectors[i])
        self.index.build(n_trees)
        print('Finished!')

    def get_embedding(self, word):
        """Return the stored vector of ``word``; raises ``KeyError`` if it is unknown."""
        return self.word_vectors[self.word_to_index[word]]

    def closest(self, word, n=1):
        """Return the ``n`` tokens the index reports as nearest to ``word``'s vector.

        The query is the word's own vector, so the word itself can appear in
        the result.
        """
        return self.closest_v(self.get_embedding(word), n)

    def closest_v(self, vector, n=1):
        """Return the ``n`` tokens the index reports as nearest to ``vector``."""
        nn_indices = self.index.get_nns_by_vector(vector, n)
        return [self.index_to_word[neighbor] for neighbor in nn_indices]

    def sim(self, w1, w2):
        """Return the dot product of the vectors of ``w1`` and ``w2`` (not normalised)."""
        return np.dot(self.get_embedding(w1), self.get_embedding(w2))


In [13]:
# The class calls load_word_vectors itself, so the GloVe file is parsed again here
# before the Annoy index is built.
glove = PreTrainedEmbeddings(glove_filename)

  0%|          | 0/400000 [00:00<?, ?it/s]

Building Index


  0%|          | 0/400000 [00:00<?, ?it/s]

Finished!


In [14]:
glove.closest('apple', n=5)

['apple', 'microsoft', 'dell', 'ibm', 'intel']

In [15]:
glove.closest('chip', n=5)

['chip', 'chips', 'semiconductor', 'intel', 'tech']

In [16]:
glove.closest('baby', n=5)

['baby', 'babies', 'boy', 'girl', 'infant']

In [17]:
glove.closest('beautiful', n=5)

['beautiful', 'lovely', 'wonderful', 'charming', 'elegant']

In [18]:
def SAT_analogy(w1, w2, w3, embeddings=None):
    '''
    Solves problems of the type:
    w1 : w2 :: w3 : __

    The answer is searched near the vector w3 + (w2 - w1). The five nearest
    words are fetched, the three input words are dropped, and the first
    remaining word is printed. ':-(' is printed when an input word is not in
    the vocabulary or when no candidate is left.

    embeddings: object providing get_embedding(word) and closest_v(vector, n);
                defaults to the notebook-level `glove` instance.
    Returns None; the result is only printed.
    '''
    if embeddings is None:
        embeddings = glove

    closest_words = []
    try:
        w1v = embeddings.get_embedding(w1)
        w2v = embeddings.get_embedding(w2)
        w3v = embeddings.get_embedding(w3)
    except KeyError:
        # An input word is missing from the vocabulary: fall through to ':-('.
        # Any other error is a real failure and is allowed to propagate.
        pass
    else:
        w4v = w3v + (w2v - w1v)
        candidates = embeddings.closest_v(w4v, n=5)
        closest_words = [w for w in candidates if w not in [w1, w2, w3]]

    if len(closest_words) == 0:
        print(':-(')
    else:
        print('{} : {} :: {} : {}'.format(w1, w2, w3, closest_words[0]))

In [19]:
SAT_analogy('man', 'he', 'woman')

man : he :: woman : she


In [20]:
SAT_analogy('fly', 'plane', 'sail')

fly : plane :: sail : ship


In [21]:
SAT_analogy('beijing', 'china', 'tokyo')

beijing : china :: tokyo : japan


In [22]:
SAT_analogy('man', 'woman', 'son')

man : woman :: son : daughter


In [23]:
SAT_analogy('man', 'doctor', 'woman')

man : doctor :: woman : nurse


In [24]:
SAT_analogy('woman', 'leader', 'man')

woman : leader :: man : leaders


In [27]:
SAT_analogy('pen', 'write', 'book')

pen : write :: book : books


[![Dataflowr](https://raw.githubusercontent.com/dataflowr/website/master/_assets/dataflowr_logo.png)](https://dataflowr.github.io/website/)