<a href="https://colab.research.google.com/github/jotagectti/Natural-Language-Processing-PyTorch/blob/main/5_Performing_Sentiment_Analysis_Using_Word_Embeddings(Full).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Generating Analogies Using GloVe Embeddings


> *bitacora notepad*

A. Word embeddings:
* one-hot
* Frequency -> count, TF-IDF, Co-occurrence
* prediction-based -> ML-based algorithms => Word2vec | GloVe


In [1]:
import torch
import torchtext.vocab

In [2]:
# Load the pretrained GloVe '6B' vectors at 100 dimensions. Per the cell
# output, the first run downloads an ~862MB archive into .vector_cache/.
glove = torchtext.vocab.GloVe(name='6B', dim=100)
# `itos` is the index-to-string vocabulary list, so its length is the vocabulary size.
print(f'Is {len(glove.itos)} words in vocab')

.vector_cache/glove.6B.zip: 862MB [02:42, 5.30MB/s]                           
100%|█████████▉| 399999/400000 [00:17<00:00, 22930.82it/s]


Is 400000 words in vocab


In [4]:
# Embedding matrix shape: one row per vocabulary word, one column per dimension.
glove.vectors.shape

torch.Size([400000, 100])

In [5]:
# Peek at the first 15 entries of the index-to-string vocabulary list.
glove.itos[:15]

['the',
 ',',
 '.',
 'of',
 'to',
 'and',
 'in',
 'a',
 '"',
 "'s",
 'for',
 '-',
 'that',
 'on',
 'is']

In [6]:
# `stoi` maps a word to its index; 'the' is the first entry of `itos`.
glove.stoi['the']

0

In [7]:
# Index lookup for another vocabulary word.
glove.stoi['dazzle']

36623

In [9]:
# Index lookup for another vocabulary word.
glove.stoi['shenanigans']

43165

In [13]:
def get_vector(embeddings, word):
  """Look up the embedding vector stored for `word`.

  Args:
    embeddings: object exposing `stoi` (word -> row index mapping) and
      `vectors` (indexable collection of embedding rows).
    word: vocabulary token to look up.

  Returns:
    The entry of `embeddings.vectors` at the index `stoi` assigns to `word`.

  Raises:
    AssertionError: if `word` is not a key of `embeddings.stoi`.
  """
  word_to_index = embeddings.stoi
  # Fail with a readable message rather than a bare KeyError on unknown words.
  assert word in word_to_index, f'*{word}* not in vocab'
  row = word_to_index[word]
  return embeddings.vectors[row]

In [14]:
# Display the embedding vector stored for 'paper'.
get_vector(glove,'paper')

tensor([-0.8503,  0.3336, -0.6589, -0.4987,  0.3659, -0.1925,  0.2566, -0.0534,
         0.3147,  0.2443,  0.2934, -0.4492,  0.1517,  0.3931, -0.3179,  0.0605,
         0.8177, -0.3885,  0.7676, -1.1041, -0.1544,  0.3165, -0.3724, -0.1148,
         0.5163, -0.3929,  0.1630, -0.2532, -0.5098,  0.1520,  0.2781,  0.5252,
        -0.3882, -0.3472, -0.6182,  0.1702,  0.1225, -0.2419, -0.3888, -0.5318,
        -0.4699, -0.7050, -0.6213, -0.3869, -0.8564, -0.4100, -0.4749, -0.2108,
        -0.8134, -0.5240,  0.4989,  0.3791,  0.5543,  1.1230, -0.4212, -1.5674,
        -0.5689,  0.4082,  1.7949,  0.1686, -0.0029,  0.2879, -0.9009, -0.0942,
         0.7999, -0.3910,  0.7629,  0.7131,  0.1319, -0.4076, -0.1869,  0.8956,
         0.4687, -0.0029,  0.0253,  1.0084,  0.1714,  0.5974, -1.1003,  0.4931,
         0.4178,  0.1728, -0.4947,  0.0878, -0.9669, -1.0920,  0.3390, -0.5129,
         0.2464,  0.2714,  0.2421, -0.2171,  0.5504,  0.0082, -0.4557,  0.1353,
        -0.0431, -0.4141,  0.7005,  0.18

In [16]:
def closest(embeddings, vector, n=6):
  """Return the `n` vocabulary words whose embeddings lie nearest to `vector`.

  Distances are Euclidean (L2). They are computed for the whole embedding
  matrix in a single broadcasted tensor operation instead of one
  `torch.dist` call and one vocabulary lookup per word, which keeps the
  query fast on large vocabularies.

  Args:
    embeddings: object exposing `itos` (list of words) and `vectors`
      (2-D tensor). Row i of `vectors` is assumed to be the embedding of
      `itos[i]`.
    vector: 1-D query tensor with the same dimensionality as the embeddings.
    n: number of neighbours to return (applied as a `[:n]` slice, so a value
      larger than the vocabulary simply returns every word).

  Returns:
    A list of `(word, distance)` tuples sorted by ascending distance, where
    each `distance` is a 0-dim tensor. Words at equal distance keep their
    vocabulary order.
  """
  # Broadcasting subtracts the query from every row; the row-wise 2-norm is
  # the quantity torch.dist(vector, row) yields with its default p=2.
  distances = (embeddings.vectors - vector).norm(p=2, dim=1)

  # Rank on plain Python floats: sorted() is stable, so ties stay in
  # vocabulary order, and the slice behaves the same for any n.
  distance_values = distances.tolist()
  ranking = sorted(range(len(distance_values)), key=distance_values.__getitem__)[:n]

  return [(embeddings.itos[i], distances[i]) for i in ranking]

In [21]:
# Nearest neighbours of 'paper'; the query word itself ranks first at distance 0.
closest(glove, get_vector(glove,'paper'))

[('paper', tensor(0.)),
 ('papers', tensor(3.8442)),
 ('printed', tensor(4.1970)),
 ('print', tensor(4.2666)),
 ('sheet', tensor(4.3835)),
 ('printing', tensor(4.4179))]

In [22]:
# Nearest neighbours of 'comedy'; the query word itself ranks first at distance 0.
closest(glove, get_vector(glove,'comedy'))

[('comedy', tensor(0.)),
 ('drama', tensor(3.3022)),
 ('sitcom', tensor(3.7490)),
 ('movie', tensor(3.7593)),
 ('comedies', tensor(4.0037)),
 ('film', tensor(4.1829))]

In [24]:
def print_tuples(tuples):
  """Print each `(word, distance)` pair on its own line.

  Every entry is rendered as `(<distance to 4 decimals>) <word>`, reading the
  word from position 0 and the distance from position 1.

  Args:
    tuples: iterable of indexable pairs such as `(word, distance)`.
  """
  for entry in tuples:
    word, distance = entry[0], entry[1]
    print('(%.4f) %s' % (distance, word))

In [28]:
# Nearest neighbours of 'stupendous', printed one "(distance) word" pair per line.
print_tuples(closest(glove, get_vector(glove,'stupendous')))

(0.0000) stupendous
(2.5795) marvellous
(2.7539) frightful
(2.8506) stupefying
(2.8561) awe-inspiring
(2.9179) mind-blowing


In [27]:
def analogy(embeddings, w1, w2, w3, n=6):
  """Answer the analogy `w1 : w2 :: w3 : ?` with nearest-neighbour search.

  Prints the analogy being solved, builds the query vector
  `vec(w2) - vec(w1) + vec(w3)`, and asks `closest` for its neighbours.

  Args:
    embeddings: embeddings object passed through to `get_vector` and `closest`.
    w1, w2, w3: the three known words of the analogy.
    n: maximum number of candidate answers to return.

  Returns:
    Up to `n` `(word, distance)` tuples in the order `closest` returned them,
    with the three input words removed.
  """
  print('\n[%s :%s ::%s : ?]' % (w1,w2,w3))

  # Offset arithmetic, e.g. queen - king + man for (king, queen, man).
  vec_w2 = get_vector(embeddings, w2)
  vec_w1 = get_vector(embeddings, w1)
  vec_w3 = get_vector(embeddings, w3)
  query = vec_w2 - vec_w1 + vec_w3

  # Ask for three spare neighbours: at most the three input words are dropped
  # below, so n candidates can still remain afterwards.
  candidates = closest(embeddings, query, n+3)

  excluded = [w1,w2,w3]
  answers = []
  for candidate in candidates:
    if candidate[0] not in excluded:
      answers.append(candidate)

  return answers[:n]

In [29]:
# Solve moon : night :: sun : ? and print the ranked candidates.
print_tuples(analogy(glove, 'moon', 'night','sun'))


[moon :night ::sun : ?]
(5.7069) morning
(5.7276) afternoon
(5.8023) evening
(6.1410) hours
(6.2797) saturday
(6.3056) sunday


In [30]:
# Solve earth : moon :: sun : ?. NOTE(review): the candidates printed below
# ('lee', 'kang', ...) do not look like meaningful completions of this analogy.
print_tuples(analogy(glove, 'earth', 'moon','sun'))


[earth :moon ::sun : ?]
(6.2294) lee
(6.4125) kang
(6.4644) tan
(6.4757) yang
(6.4853) lin
(6.5220) chong
