In [16]:
from gensim.models import KeyedVectors
import numpy as np

In [17]:
# Load the pre-trained 300-dimensional GloVe vectors through gensim's word2vec
# text loader. no_header=True is passed because the file is read without the
# "<vocab_size> <dimensions>" first line that the word2vec text format carries.
fname = "../data/glove/glove.6B.300d.txt"
glove = KeyedVectors.load_word2vec_format(fname=fname,
                                          no_header=True)

In [18]:
# glove.vectors is the embedding matrix: one row per word, one column per dimension
print(f"Embedding size-> words: {glove.vectors.shape[0]}, dimensions: {glove.vectors.shape[1]}")

Embedding size-> words: 400000, dimensions: 300


---

### Word Similarity
We can retrieve the 10 most similar words with ``most_similar``, which ranks every word in the vocabulary by the cosine similarity between its embedding and the query word's embedding.

In [19]:
# Nearest neighbours of "cactus" as (word, cosine similarity) pairs, best first
glove.most_similar("cactus")

[('cacti', 0.6634564399719238),
 ('saguaro', 0.6195855140686035),
 ('pear', 0.5233486890792847),
 ('cactuses', 0.5178281664848328),
 ('prickly', 0.515631914138794),
 ('mesquite', 0.4844855070114136),
 ('opuntia', 0.4540084898471832),
 ('shrubs', 0.45362064242362976),
 ('peyote', 0.45344963669776917),
 ('succulents', 0.4512787461280823)]

In [20]:
# "fall" has several meanings and a single embedding has to cover all of them:
# the neighbours mix the "drop" sense (falling, decline) with the season sense (spring)
glove.most_similar("fall")

[('falling', 0.6513392925262451),
 ('rise', 0.6301450729370117),
 ('drop', 0.6298140287399292),
 ('decline', 0.6145920157432556),
 ('beginning', 0.6086390614509583),
 ('spring', 0.5864909887313843),
 ('year', 0.5789673328399658),
 ('coming', 0.5778051018714905),
 ('fallen', 0.5676990747451782),
 ('fell', 0.5675972104072571)]

---

### Word Analogies

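We will check how semantic information is encoded by word embeddings: if a relation between two words corresponds to a roughly constant vector offset, an analogy can be solved with simple vector arithmetic.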

In [21]:
# out = king - man + woman
# i.e. "man is to king as woman is to ...?"
glove.most_similar(positive=["king", "woman"],
                   negative=["man"])


[('queen', 0.6713276505470276),
 ('princess', 0.5432624220848083),
 ('throne', 0.5386104583740234),
 ('monarch', 0.5347574949264526),
 ('daughter', 0.498025119304657),
 ('mother', 0.4956442713737488),
 ('elizabeth', 0.483265221118927),
 ('kingdom', 0.47747090458869934),
 ('prince', 0.4668239951133728),
 ('wife', 0.46473270654678345)]

In [22]:
# out = japan - yen + peso
# i.e. "yen is to japan as peso is to ...?"
glove.most_similar(positive=["japan", "peso"],
                   negative=["yen"])

[('mexico', 0.5726832151412964),
 ('philippines', 0.5445368885993958),
 ('peru', 0.4838225543498993),
 ('venezuela', 0.4816672205924988),
 ('brazil', 0.4664309620857239),
 ('argentina', 0.45490506291389465),
 ('philippine', 0.4417841136455536),
 ('chile', 0.4396097660064697),
 ('colombia', 0.4386259913444519),
 ('thailand', 0.43396785855293274)]

In [23]:
# out = spain - madrid + cuba
# NOTE(review): this subtracts a capital from a country and then adds another
# country, so it does not spell out a "madrid is to spain as ... is to cuba"
# analogy. A capital-of query would presumably be
# positive=["madrid", "cuba"], negative=["spain"] -- TODO confirm intent.
glove.most_similar(positive=["spain", "cuba"],
                   negative=["madrid"])

[('venezuela', 0.5744216442108154),
 ('nicaragua', 0.54659104347229),
 ('cuban', 0.5447268486022949),
 ('mexico', 0.5030182600021362),
 ('dominican', 0.4905185103416443),
 ('castro', 0.47028154134750366),
 ('argentina', 0.4679957926273346),
 ('panama', 0.45990291237831116),
 ('honduras', 0.4594337046146393),
 ('cubans', 0.45838162302970886)]

In [24]:
# out = best - good + tall
# i.e. "good is to best as tall is to ...?" (superlative form)
glove.most_similar(positive=["best", "tall"],
                   negative=["good"])

[('tallest', 0.5077418684959412),
 ('taller', 0.47616496682167053),
 ('height', 0.46000051498413086),
 ('metres', 0.4584786593914032),
 ('cm', 0.45212721824645996),
 ('meters', 0.44067245721817017),
 ('towering', 0.42784255743026733),
 ('centimeters', 0.42345431447029114),
 ('inches', 0.4174586832523346),
 ('erect', 0.4087314009666443)]

In [25]:
# out = worst - bad + small
# i.e. "bad is to worst as small is to ...?" (superlative form)
# The analogy is only partly recovered in the stored output: the antonym
# 'largest' ranks first and 'smallest' comes fourth.
glove.most_similar(positive=["worst", "small"],
                   negative=["bad"])

[('largest', 0.5376060605049133),
 ('tiny', 0.5351578593254089),
 ('large', 0.5282967686653137),
 ('smallest', 0.50852370262146),
 ('smaller', 0.5056758522987366),
 ('larger', 0.4700247049331665),
 ('scale', 0.43181347846984863),
 ('sized', 0.4149516820907593),
 ('in', 0.40775397419929504),
 ('biggest', 0.406604140996933)]

---

### Custom Word Similarity

In [26]:
def most_similar_words(word, vectors, index2key, key2index, topn=10):
    """Return the ``topn`` vocabulary entries that score highest against ``word``.

    The score of a candidate is the dot product between its row of
    ``vectors`` and the row belonging to ``word``. This equals the cosine
    similarity when the rows of ``vectors`` have unit length.

    Parameters
    ----------
    word
        Query key; looked up in ``key2index``.
    vectors
        2-D array holding one embedding per row.
    index2key
        Sequence mapping a row index to its word.
    key2index
        Mapping from a word to its row index.
    topn
        Maximum number of results to return.

    Returns
    -------
    list of (word, score) tuples
        Ordered from highest to lowest score; the query word is excluded.

    Raises
    ------
    Lookup errors from ``key2index[word]`` (e.g. ``KeyError`` for a dict)
    propagate unchanged.
    """
    query_id = key2index[word]
    # score every vocabulary row against the query embedding in one product
    scores = vectors @ vectors[query_id]
    # argsort is ascending, so reverse it to rank the best matches first
    ranking = scores.argsort()[::-1]
    # drop the query itself, which would otherwise be its own nearest neighbour
    ranking = ranking[ranking != query_id]
    return [(index2key[i], scores[i]) for i in ranking[:topn]]


In [27]:
# Use gensim's normed vectors so that the plain dot product computed inside
# most_similar_words can be compared with most_similar's cosine scores
vectors = glove.get_normed_vectors()
# index_to_key: row index -> word, key_to_index: word -> row index
index_to_key = glove.index_to_key
key_to_index = glove.key_to_index
most_similar_words("cactus", vectors, index_to_key, key_to_index)

[('cacti', 0.66345644),
 ('saguaro', 0.6195854),
 ('pear', 0.5233487),
 ('cactuses', 0.5178282),
 ('prickly', 0.5156319),
 ('mesquite', 0.4844855),
 ('opuntia', 0.4540084),
 ('shrubs', 0.45362067),
 ('peyote', 0.4534496),
 ('succulents', 0.45127875)]

In [28]:
# Check against gensim's built-in most_similar: the words should match and the
# scores should agree up to floating-point rounding
glove.most_similar("cactus")

[('cacti', 0.6634564399719238),
 ('saguaro', 0.6195855140686035),
 ('pear', 0.5233486890792847),
 ('cactuses', 0.5178281664848328),
 ('prickly', 0.515631914138794),
 ('mesquite', 0.4844855070114136),
 ('opuntia', 0.4540084898471832),
 ('shrubs', 0.45362064242362976),
 ('peyote', 0.45344963669776917),
 ('succulents', 0.4512787461280823)]

---
### Custom Analogy

In [29]:
from numpy.linalg import norm

def analogy(positive, negative, vectors, index_to_key, key_to_index, topn=10):
    """Solve a word analogy by vector arithmetic over ``vectors``.

    The rows of the ``positive`` words are summed, the rows of the
    ``negative`` words are subtracted, and the result is scaled to unit
    length. Every vocabulary row is then scored by its dot product with
    that query vector; this equals the cosine similarity when the rows of
    ``vectors`` have unit length.

    Parameters
    ----------
    positive
        Words whose embeddings are added to the query.
    negative
        Words whose embeddings are subtracted from the query.
    vectors
        2-D array holding one embedding per row.
    index_to_key
        Sequence mapping a row index to its word.
    key_to_index
        Mapping from a word to its row index.
    topn
        Maximum number of results to return.

    Returns
    -------
    list of (word, score) tuples
        Ordered from highest to lowest score; none of the words given in
        ``positive`` or ``negative`` is included.

    Raises
    ------
    ValueError
        If the combined query vector has zero length (for example when the
        positive and negative words cancel out, or both lists are empty),
        because no direction is left to compare against.
    Lookup errors from ``key_to_index[w]`` (e.g. ``KeyError`` for a dict)
    propagate unchanged.
    """
    # find ids for positive and negative words
    pos_ids = [key_to_index[w] for w in positive]
    neg_ids = [key_to_index[w] for w in negative]
    given_word_ids = pos_ids + neg_ids
    # embedding for the analogy: sum of positives minus sum of negatives
    emb = vectors[pos_ids].sum(axis=0) - vectors[neg_ids].sum(axis=0)
    # normalize embedding; a zero vector cannot be normalized and would turn
    # every similarity into NaN, so fail loudly instead
    length = norm(emb)
    if length == 0:
        raise ValueError(
            "positive and negative words cancel out: the analogy vector has zero length"
        )
    emb = emb / length
    # calculate similarities to all words in our vocabulary
    similarities = vectors @ emb
    # argsort is ascending, so reverse it to rank the best matches first
    ids_descending = similarities.argsort()[::-1]
    # drop the words that were given as input; they tend to dominate the ranking
    given_words_mask = np.isin(ids_descending, given_word_ids, invert=True)
    ids_descending = ids_descending[given_words_mask]
    # get topn word_ids
    top_ids = ids_descending[:topn]
    # retrieve topn words with their corresponding similarity score
    return [(index_to_key[i], similarities[i]) for i in top_ids]

In [30]:
# Same query as glove.most_similar(positive=["king", "woman"], negative=["man"]),
# so the result can be compared with gensim's output for that call
positive = ["king", "woman"]
negative = ["man"]
# normed vectors make the dot products inside analogy() cosine similarities
vectors = glove.get_normed_vectors()
# index_to_key: row index -> word, key_to_index: word -> row index
index_to_key = glove.index_to_key
key_to_index = glove.key_to_index
analogy(positive, negative, vectors, index_to_key, key_to_index)

[('queen', 0.6713277),
 ('princess', 0.5432624),
 ('throne', 0.5386105),
 ('monarch', 0.5347575),
 ('daughter', 0.49802512),
 ('mother', 0.49564424),
 ('elizabeth', 0.48326525),
 ('kingdom', 0.47747087),
 ('prince', 0.466824),
 ('wife', 0.46473265)]