The goal is to check that the vector resulting from *king - man + woman* is close to the *queen* vector.

## Try with a spaCy pretrained embedding

In [1]:
import spacy
import spacy.cli
from scipy import spatial
# Load the medium English model, which ships with a pre-trained 300-dimension embedding.
# Download it only when it is not installed yet, so re-running this cell
# (e.g. after a kernel restart) does not fetch the package again.
if not spacy.util.is_package("en_core_web_md"):
    spacy.cli.download("en_core_web_md")
nlp = spacy.load('en_core_web_md')

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


spaCy gives direct access to a pre-trained 300-dimensional embedding for every word in its vocabulary.


In [None]:
# Look up the vocabulary entry for "king" and display its embedding vector.
king = nlp.vocab['king']
king.vector

array([ 3.1542e-01, -3.5068e-01,  4.2923e-01, -5.3825e-01, -1.8480e-01,
       -3.1082e-01,  2.9196e-01, -7.1030e-01, -2.3867e-01,  1.8471e+00,
       -3.6446e-01, -5.1282e-01,  1.2210e-01,  3.8909e-01, -7.3204e-02,
        3.5462e-02,  3.3289e-01,  6.6466e-01,  2.7175e-02,  4.2021e-01,
       -1.4520e-01,  3.7991e-01, -6.0520e-01,  1.0695e-01, -6.4716e-01,
       -1.0739e-02, -3.9754e-01,  3.8857e-01, -2.0134e-01,  6.9813e-01,
       -3.2411e-01,  7.3085e-01, -1.0930e-01, -2.3511e-01,  1.8482e-01,
       -1.1595e-01, -7.1003e-01, -2.2974e-01, -4.1979e-01,  8.1004e-03,
       -1.0504e-01, -4.4802e-01, -7.3928e-02, -4.2380e-01,  2.8482e-01,
       -7.4517e-02,  9.8161e-02,  6.4602e-01, -2.5832e-01, -2.0452e-02,
       -6.6863e-02,  5.1501e-01,  1.6758e-01,  1.2329e-01,  1.9636e-01,
        1.1958e-01, -1.8296e-01, -1.4325e-01, -2.7758e-01,  5.0597e-02,
       -6.6122e-02, -1.8920e-01,  3.3300e-01,  2.5319e-01,  6.6355e-01,
        6.6735e-01,  4.9969e-01,  1.5481e-01, -8.4247e-02, -2.29

In [None]:
# Shape of the embedding vector, i.e. the number of dimensions.
king.vector.shape

(300,)

In [None]:
def cosine_similarity(x, y):
    """Return the cosine similarity of vectors x and y (1 minus SciPy's cosine distance)."""
    return 1 - spatial.distance.cosine(x, y)


king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

# Find the vocabulary words whose vectors are closest to "king" - "man" + "woman".
new_vector = king - man + woman

# Score every candidate word, ignoring words without vectors, mixed-case words
# and non-alphabetic tokens. The query words themselves are NOT excluded, so
# "king" may show up in the ranking.
# NOTE(review): this relies on iterating nlp.vocab yielding the full vocabulary;
# newer spaCy releases may populate it lazily -- confirm against the installed version.
computed_similarities = [
    (word, cosine_similarity(new_vector, word.vector))
    for word in nlp.vocab
    if word.has_vector and word.is_lower and word.is_alpha
]

# Most similar first.
computed_similarities.sort(key=lambda item: item[1], reverse=True)

print([w[0].text for w in computed_similarities[:10]])

['king', 'queen', 'commoner', 'highness', 'prince', 'sultan', 'maharajas', 'princes', 'kumbia', 'kings']


## Try with a pretrained GloVe embedding model (loaded with gensim)

**Important:** To prevent a RAM crash in the execution environment, please restart the runtime before running the cells below (in Colab: Runtime -> Restart runtime).

In [10]:
!pip install  gensim
import gensim# Load pretrained vectors from Google
from gensim.models import KeyedVectors

--2022-07-04 18:59:14--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.110.45
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.110.45|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


We load the pre-trained GloVe vectors `glove-wiki-gigaword-100`, trained on Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 100-dimension embedding)

In [11]:
import gensim.downloader as api
# Load the "glove-wiki-gigaword-100" pre-trained vectors through gensim's downloader API.
word_vectors = api.load("glove-wiki-gigaword-100")


In [7]:
# Embedding vector of "king" in the loaded model.
king = word_vectors['king']

print(king)

[-0.32307  -0.87616   0.21977   0.25268   0.22976   0.7388   -0.37954
 -0.35307  -0.84369  -1.1113   -0.30266   0.33178  -0.25113   0.30448
 -0.077491 -0.89815   0.092496 -1.1407   -0.58324   0.66869  -0.23122
 -0.95855   0.28262  -0.078848  0.75315   0.26584   0.3422   -0.33949
  0.95608   0.065641  0.45747   0.39835   0.57965   0.39267  -0.21851
  0.58795  -0.55999   0.63368  -0.043983 -0.68731  -0.37841   0.38026
  0.61641  -0.88269  -0.12346  -0.37928  -0.38318   0.23868   0.6685
 -0.43321  -0.11065   0.081723  1.1569    0.78958  -0.21223  -2.3211
 -0.67806   0.44561   0.65707   0.1045    0.46217   0.19912   0.25802
  0.057194  0.53443  -0.43133  -0.34311   0.59789  -0.58417   0.068995
  0.23944  -0.85181   0.30379  -0.34177  -0.25746  -0.031101 -0.16285
  0.45169  -0.91627   0.64521   0.73281  -0.22752   0.30226   0.044801
 -0.83741   0.55006  -0.52506  -1.7357    0.4751   -0.70487   0.056939
 -0.7132    0.089623  0.41394  -1.3363   -0.61915  -0.33089  -0.52881
  0.16483  -0.98878

In [8]:
# Shape of the vector, i.e. the number of embedding dimensions.
king.shape

(100,)

In [9]:
# Analogy query: king - man + woman should be close to queen.
# Candidates are ranked by cosine similarity to the mean of the query vectors
# (positive words added, negative words subtracted).
print(word_vectors.most_similar(positive=['woman', 'king'], negative=['man']))

[('queen', 0.7698541283607483), ('monarch', 0.6843380928039551), ('throne', 0.6755735874176025), ('daughter', 0.6594556570053101), ('princess', 0.6520534753799438), ('prince', 0.6517034769058228), ('elizabeth', 0.6464517712593079), ('mother', 0.6311717629432678), ('emperor', 0.6106470823287964), ('wife', 0.6098655462265015)]


## Try with fastText embedding

**Important:** To prevent a RAM crash in the execution environment, please restart the runtime before running the cells below (in Colab: Runtime -> Restart runtime).

In [1]:
#Download, extract and load Fasttext word embedding model
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gunzip /content/cc.en.300.bin.gz
!pip install fasttext

--2022-07-04 19:33:26--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2022-07-04 19:37:16 (18.7 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 5.7 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.2-py2.py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp37-cp37m-

Load the English fastText model

In [2]:
import fasttext

# Load the extracted binary fastText model from disk.
model = fasttext.load_model("/content/cc.en.300.bin")



In [3]:
# Display the embedding vector of "king".
model.get_word_vector("king")

array([-2.63642855e-02, -4.38338369e-02, -5.22461310e-02,  2.49765869e-02,
        1.59946546e-01,  4.98980191e-03,  2.51637166e-03, -1.62712112e-02,
       -6.62135556e-02, -1.67888845e-03, -1.39499649e-01, -5.72493225e-02,
       -1.45975351e-01, -1.56568401e-02,  3.75731173e-03,  8.14326331e-02,
        9.02080238e-02, -6.22668210e-03, -1.21208653e-01,  8.42568502e-02,
        6.83858395e-02,  1.01658493e-01, -5.07243127e-02,  9.16049480e-02,
        5.08386921e-03,  6.28780201e-02,  5.67676872e-02,  1.91132650e-01,
        4.35085818e-02,  1.80901110e-01, -1.74744725e-02,  7.06654340e-02,
       -6.06337450e-02,  3.89074199e-02,  1.44602428e-03, -1.25214964e-01,
        8.63592885e-03, -7.98915625e-02, -1.00960366e-01,  4.66771051e-02,
        5.39167747e-02,  4.82006092e-03, -2.03307956e-01, -1.17739499e-01,
       -1.37199834e-01, -4.92817685e-02, -1.87217459e-01, -7.17959851e-02,
       -1.86646730e-02, -9.93231237e-02, -5.15213236e-02, -1.93316743e-01,
       -8.94939303e-02, -

It is possible to directly get the nearest neighbors of a specific word (or even of an n-gram)

In [4]:
# Nearest neighbors of "king", returned as (score, word) pairs.
model.get_nearest_neighbors("king")

[(0.7550359964370728, 'kings'),
 (0.7068519592285156, 'queen'),
 (0.7060439586639404, 'king-'),
 (0.6811205148696899, 'king.'),
 (0.660710871219635, 'king.The'),
 (0.6591265797615051, 'King'),
 (0.6495252251625061, 'prince'),
 (0.6278106570243835, '-king'),
 (0.6183920502662659, 'monarch'),
 (0.6070184707641602, 'queen-mother')]

In [5]:
# Analogy query for king - man + woman; results are (score, word) pairs.
model.get_analogies( "king", "man", "woman")

[(0.7554811835289001, 'queen'),
 (0.6141632199287415, 'queen-mother'),
 (0.5755330920219421, 'princess'),
 (0.5741076469421387, 'monarch'),
 (0.5688967704772949, 'kings'),
 (0.5649929046630859, 'queenship'),
 (0.5638618469238281, 'Queen'),
 (0.5544734597206116, 'empress'),
 (0.5524800419807434, 'consort'),
 (0.5497491955757141, 'queen.The')]

Reference:

https://towardsdatascience.com/word-embeddings-in-2020-review-with-code-examples-11eb39a1ee6d

 https://www.udemy.com/course/nlp-natural-language-processing-with-python