In [1]:
from gensim.models import KeyedVectors

In [4]:
# Pre-trained GloVe vectors in plain-text form. no_header=True tells gensim
# that the file lacks the word2vec header line (vocabulary size and vector
# dimensionality), so it must be inferred from the file contents.
fname = "../data/glove.6B.300d.txt"
glove = KeyedVectors.load_word2vec_format(fname, no_header=True)

(400000, 300)

In [11]:
# Report the two axes of the embedding matrix: rows are words, columns are
# embedding dimensions.
print(
    f"Embedding size-> words: {glove.vectors.shape[0]}, "
    f"dimensions: {glove.vectors.shape[1]}"
)

Embedding size-> words: 400000, dimensions: 300


---

### Word Similarity
We can retrieve the 10 most similar words with ``most_similar``, which ranks the other embeddings by their cosine similarity to the query word.

In [23]:
# Nearest neighbours of "cactus"; topn=10 is the library default, spelled
# out here so the number of results is explicit.
glove.most_similar("cactus", topn=10)

[('cacti', 0.6634564399719238),
 ('saguaro', 0.6195855140686035),
 ('pear', 0.5233486890792847),
 ('cactuses', 0.5178281664848328),
 ('prickly', 0.515631914138794),
 ('mesquite', 0.4844855070114136),
 ('opuntia', 0.4540084898471832),
 ('shrubs', 0.45362064242362976),
 ('peyote', 0.45344963669776917),
 ('succulents', 0.4512787461280823)]

In [20]:
# "fall" is ambiguous, and a single vector has to cover every sense: the
# neighbours mix the motion/decrease sense (falling, drop, decline) with the
# season sense (spring).
glove.most_similar("fall", topn=10)

[('falling', 0.6513392925262451),
 ('rise', 0.6301450729370117),
 ('drop', 0.6298140287399292),
 ('decline', 0.6145920157432556),
 ('beginning', 0.6086390614509583),
 ('spring', 0.5864909887313843),
 ('year', 0.5789673328399658),
 ('coming', 0.5778051018714905),
 ('fallen', 0.5676990747451782),
 ('fell', 0.5675972104072571)]

---

### Word Analogies

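We will check how semantic information is encoded by word embeddings by solving analogies with vector arithmetic: words in ``positive`` are added and words in ``negative`` are subtracted.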

In [27]:
# Gender analogy by vector arithmetic: king - man + woman
glove.most_similar(
    positive=["king", "woman"],
    negative=["man"],
)


[('queen', 0.6713276505470276),
 ('princess', 0.5432624220848083),
 ('throne', 0.5386104583740234),
 ('monarch', 0.5347574949264526),
 ('daughter', 0.498025119304657),
 ('mother', 0.4956442713737488),
 ('elizabeth', 0.483265221118927),
 ('kingdom', 0.47747090458869934),
 ('prince', 0.4668239951133728),
 ('wife', 0.46473270654678345)]

In [28]:
# Country/currency analogy: japan - yen + peso
glove.most_similar(
    positive=["japan", "peso"],
    negative=["yen"],
)

[('mexico', 0.5726832151412964),
 ('philippines', 0.5445368885993958),
 ('peru', 0.4838225543498993),
 ('venezuela', 0.4816672205924988),
 ('brazil', 0.4664309620857239),
 ('argentina', 0.45490506291389465),
 ('philippine', 0.4417841136455536),
 ('chile', 0.4396097660064697),
 ('colombia', 0.4386259913444519),
 ('thailand', 0.43396785855293274)]

In [34]:
# Vector arithmetic: spain - madrid + cuba
# NOTE(review): "cuba" is a country rather than a capital, so this does not
# mirror the country/capital pair (spain, madrid); "havana" may have been
# the intended word -- confirm with the author.
glove.most_similar(
    positive=["spain", "cuba"],
    negative=["madrid"],
)

[('venezuela', 0.5744216442108154),
 ('nicaragua', 0.54659104347229),
 ('cuban', 0.5447268486022949),
 ('mexico', 0.5030182600021362),
 ('dominican', 0.4905185103416443),
 ('castro', 0.47028154134750366),
 ('argentina', 0.4679957926273346),
 ('panama', 0.45990291237831116),
 ('honduras', 0.4594337046146393),
 ('cubans', 0.45838162302970886)]

In [35]:
# Superlative analogy: best - good + tall
glove.most_similar(
    positive=["best", "tall"],
    negative=["good"],
)

[('tallest', 0.5077418684959412),
 ('taller', 0.47616496682167053),
 ('height', 0.46000051498413086),
 ('metres', 0.4584786593914032),
 ('cm', 0.45212721824645996),
 ('meters', 0.44067245721817017),
 ('towering', 0.42784255743026733),
 ('centimeters', 0.42345431447029114),
 ('inches', 0.4174586832523346),
 ('erect', 0.4087314009666443)]

In [37]:
# Superlative analogy: worst - bad + small
# In the saved output "smallest" only ranks fourth, behind "largest", "tiny"
# and "large", so this analogy is resolved less cleanly than the others.
glove.most_similar(
    positive=["worst", "small"],
    negative=["bad"],
)

[('largest', 0.5376060605049133),
 ('tiny', 0.5351578593254089),
 ('large', 0.5282967686653137),
 ('smallest', 0.50852370262146),
 ('smaller', 0.5056758522987366),
 ('larger', 0.4700247049331665),
 ('scale', 0.43181347846984863),
 ('sized', 0.4149516820907593),
 ('in', 0.40775397419929504),
 ('biggest', 0.406604140996933)]