In [1]:
# Adapted from Chapter 13 in LDL
# Note that, technically, this is NOT part of deep learning, but it is still very relevant from an NLP perspective (a field that is largely influenced by neural networks).

In [4]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2022-08-27 00:54:51--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-08-27 00:54:51--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-08-27 00:54:51--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [6]:
import numpy as np
import scipy.spatial

# Read embeddings from file.
def read_embeddings(file_name='glove.6B.100d.txt'):
  """Load word embeddings from a whitespace-separated text file.

  Each non-blank line is split on whitespace; the first field is the word
  and the remaining fields are parsed as the components of its vector.

  Args:
    file_name: Path of the embeddings file to read. Defaults to
      'glove.6B.100d.txt', the file this function always used before the
      parameter was introduced.

  Returns:
    A dict mapping each word (str) to a float32 numpy array.

  Raises:
    OSError: If the file cannot be opened.
    ValueError: If a vector component cannot be parsed as a float.
  """
  embeddings = {}
  # The context manager closes the file even when a line fails to parse.
  with open(file_name, 'r', encoding='utf-8') as file:
    for line in file:
      values = line.split()
      if not values:
        # A blank line has no word field; skip it rather than raise IndexError.
        continue
      embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
  print('Read %s embeddings. ' % len(embeddings))
  return embeddings

In [7]:
def print_n_closest(embeddings, vec0, n):
  """Print the n words whose vectors are closest to vec0 by cosine distance.

  Every entry of `embeddings` is compared against vec0, so a word whose
  vector was used to build vec0 can appear in the output.

  Args:
    embeddings: dict mapping word -> vector.
    vec0: Query vector to compare every stored vector against.
    n: Maximum number of words to print.

  Prints one line per word, closest first, in the form '<word>: <distance>'
  with the distance formatted as %6.3f. Returns None.
  """
  # Collect (distance, word) pairs instead of a dict keyed by distance: with a
  # distance-keyed dict, words at exactly the same distance overwrite one
  # another and silently disappear from the ranking.
  word_distances = [
      (scipy.spatial.distance.cosine(vec1, vec0), word)
      for (word, vec1) in embeddings.items()]
  # Sort on the distance only; list.sort is stable, so tied words keep the
  # iteration order of `embeddings`.
  word_distances.sort(key=lambda pair: pair[0])
  for (distance, word) in word_distances[:n]:
    print(word + ': %6.3f' % distance)

In [8]:
# Load the embeddings into memory once; the cells below all reuse this dict.
embeddings = read_embeddings()

Read 400000 embeddings. 


In [9]:
# Three nearest neighbours of 'hello'; every stored word is searched, so the
# query word itself is among the candidates.
lookup_word = 'hello'
print('\nWords closest to ', lookup_word, sep='')
print_n_closest(embeddings, vec0=embeddings[lookup_word], n=3)


Words closest to hello
hello:  0.000
goodbye:  0.209
hey:  0.283


In [10]:
# Display the raw vector stored for whatever lookup_word currently holds.
embeddings[lookup_word]

array([ 0.26688  ,  0.39632  ,  0.6169   , -0.77451  , -0.1039   ,
        0.26697  ,  0.2788   ,  0.30992  ,  0.0054685, -0.085256 ,
        0.73602  , -0.098432 ,  0.5479   , -0.030305 ,  0.33479  ,
        0.14094  , -0.0070003,  0.32569  ,  0.22902  ,  0.46557  ,
       -0.19531  ,  0.37491  , -0.7139   , -0.51775  ,  0.77039  ,
        1.0881   , -0.66011  , -0.16234  ,  0.9119   ,  0.21046  ,
        0.047494 ,  1.0019   ,  1.1133   ,  0.70094  , -0.08696  ,
        0.47571  ,  0.1636   , -0.44469  ,  0.4469   , -0.93817  ,
        0.013101 ,  0.085964 , -0.67456  ,  0.49662  , -0.037827 ,
       -0.11038  , -0.28612  ,  0.074606 , -0.31527  , -0.093774 ,
       -0.57069  ,  0.66865  ,  0.45307  , -0.34154  , -0.7166   ,
       -0.75273  ,  0.075212 ,  0.57903  , -0.1191   , -0.11379  ,
       -0.10026  ,  0.71341  , -1.1574   , -0.74026  ,  0.40452  ,
        0.18023  ,  0.21449  ,  0.37638  ,  0.11239  , -0.53639  ,
       -0.025092 ,  0.31886  , -0.25013  , -0.63283  , -0.0118

In [11]:
# Number of components in a single embedding vector.
len(embeddings[lookup_word])

100

In [12]:
# Three nearest neighbours of 'precisely'.
lookup_word = 'precisely'
print('\nWords closest to ', lookup_word, sep='')
print_n_closest(embeddings, vec0=embeddings[lookup_word], n=3)


Words closest to precisely
precisely:  0.000
exactly:  0.147
accurately:  0.293


In [13]:
# Three nearest neighbours of 'dog'.
lookup_word = 'dog'
print('\nWords closest to ', lookup_word, sep='')
print_n_closest(embeddings, vec0=embeddings[lookup_word], n=3)


Words closest to dog
dog:  0.000
cat:  0.120
dogs:  0.166


In [14]:
# Ten nearest neighbours of 'sad' (a longer list than the earlier lookups).
lookup_word = 'sad'
print('\nWords closest to ', lookup_word, sep='')
print_n_closest(embeddings, vec0=embeddings[lookup_word], n=10)


Words closest to sad
sad:  0.000
sorry:  0.245
awful:  0.272
tragic:  0.276
horrible:  0.295
happy:  0.320
heartbreaking:  0.324
poignant:  0.328
scary:  0.330
terrible:  0.333


In [17]:
# Five nearest neighbours of 'hamburger'.
lookup_word = 'hamburger'
print('\nWords closest to ', lookup_word, sep='')
print_n_closest(embeddings, vec0=embeddings[lookup_word], n=5)


Words closest to hamburger
hamburger:  0.000
burger:  0.350
pizza:  0.417
burgers:  0.425
meat:  0.426


In [16]:
# We can also do word vector arithmetic.
# First, the plain neighbours of 'king' for reference.
lookup_word = 'king'
print('\nWords closest to ' + lookup_word)
print_n_closest(embeddings, embeddings[lookup_word],3)

# Then the neighbours of the combined vector king - man + woman.
# The operand words are not removed from the search, so they can show up
# among the closest matches of the result.
lookup_word = '(king - man + woman)'
print('\nWords closest to ' + lookup_word)
vec = embeddings['king'] - embeddings['man'] + embeddings['woman']
print_n_closest(embeddings, vec,3)


Words closest to king
king:  0.000
prince:  0.232
queen:  0.249

Words closest to (king - man + woman
king:  0.145
queen:  0.217
monarch:  0.307


In [20]:
# Let's try another example: neighbours of hunger - lethargy + sleep.
# The operand words are not removed from the search, so they can show up
# among the closest matches of the result.
lookup_word = '(hunger - lethargy + sleep)'
print('\nWords closest to ' + lookup_word)
vec = embeddings['hunger'] - embeddings['lethargy'] + embeddings['sleep']
print_n_closest(embeddings, vec,3)


Words closest to (hunger - lethargy + sleep
hunger:  0.222
sleep:  0.237
dying:  0.313


In [22]:
# Back to the example from the book, but printing more words
# (7 instead of the default 3).
lookup_word = 'sweden'
print('\nWords closest to ' + lookup_word)
print_n_closest(embeddings, embeddings[lookup_word],7)

lookup_word = 'madrid'
print('\nWords closest to ' + lookup_word)
print_n_closest(embeddings, embeddings[lookup_word],7)

# Neighbours of the combined vector madrid - spain + sweden.
lookup_word = '(madrid - spain + sweden)'
print('\nWords closest to ' + lookup_word)
vec = embeddings['madrid'] - embeddings['spain'] + embeddings['sweden']
print_n_closest(embeddings, vec,7)


Words closest to sweden
sweden:  0.000
denmark:  0.138
norway:  0.193
finland:  0.209
netherlands:  0.253
austria:  0.253
switzerland:  0.277

Words closest to madrid
madrid:  0.000
barcelona:  0.157
valencia:  0.197
milan:  0.276
spain:  0.292
bilbao:  0.292
sevilla:  0.317

Words closest to (madrid - spain + sweden
stockholm:  0.271
sweden:  0.300
copenhagen:  0.305
munich:  0.336
hamburg:  0.355
oslo:  0.358
dortmund:  0.369
