In [1]:
import spacy
nlp = spacy.load("en_core_web_md")

In [2]:
# 300 dimensional vector
nlp(u"lion").vector.shape

(300,)

In [3]:
nlp(u"lion").vector

array([ 1.8963e-01, -4.0309e-01,  3.5350e-01, -4.7907e-01, -4.3311e-01,
        2.3857e-01,  2.6962e-01,  6.4332e-02,  3.0767e-01,  1.3712e+00,
       -3.7582e-01, -2.2713e-01, -3.5657e-01, -2.5355e-01,  1.7543e-02,
        3.3962e-01,  7.4723e-02,  5.1226e-01, -3.9759e-01,  5.1333e-03,
       -3.0929e-01,  4.8911e-02, -1.8610e-01, -4.1702e-01, -8.1639e-01,
       -1.6908e-01, -2.6246e-01, -1.5983e-02,  1.2479e-01, -3.7276e-02,
       -5.7125e-01, -1.6296e-01,  1.2376e-01, -5.5464e-02,  1.3244e-01,
        2.7519e-02,  1.2592e-01, -3.2722e-01, -4.9165e-01, -3.5559e-01,
       -3.0630e-01,  6.1185e-02, -1.6932e-01, -6.2405e-02,  6.5763e-01,
       -2.7925e-01, -3.0450e-03, -2.2400e-02, -2.8015e-01, -2.1975e-01,
       -4.3188e-01,  3.9864e-02, -2.2102e-01, -4.2693e-02,  5.2748e-02,
        2.8726e-01,  1.2315e-01, -2.8662e-02,  7.8294e-02,  4.6754e-01,
       -2.4589e-01, -1.1064e-01,  7.2250e-02, -9.4980e-02, -2.7548e-01,
       -5.4097e-01,  1.2823e-01, -8.2408e-02,  3.1035e-01, -6.33

In [4]:
# document take average of all word vectors
nlp(u"The quick brown fox jumped").vector

array([-2.09217995e-01, -2.78227981e-02, -3.57064009e-02,  1.55218393e-01,
       -1.28050027e-02,  1.31627038e-01, -1.99465990e-01,  4.75811996e-02,
        1.26798794e-01,  1.64792800e+00, -3.57592016e-01, -1.39875397e-01,
       -1.26122087e-02, -2.02728346e-01, -2.25237608e-01,  2.15431936e-02,
        7.78958052e-02,  9.29676056e-01, -2.75549982e-02, -3.71005982e-01,
       -1.42800003e-01, -3.66641544e-02, -1.07376035e-02, -1.84352830e-01,
        2.29006782e-02, -5.17717972e-02, -2.78652012e-01, -1.19738199e-01,
        5.10960072e-03, -2.85990000e-01, -1.58261746e-01,  2.96241999e-01,
        1.09597601e-01, -4.18331996e-02,  1.87256075e-02, -1.03439607e-01,
       -5.10879979e-02, -3.51091917e-03, -6.81461841e-02, -2.05657601e-01,
        1.66347414e-01, -9.31599736e-03, -4.61134054e-02, -1.05457589e-01,
        2.31313989e-01,  1.80005193e-01, -2.06444815e-01, -1.37050152e-02,
        1.70106202e-01, -2.19812002e-02, -2.14003205e-01,  1.07415602e-01,
       -2.80592032e-02, -

In [5]:
tokens = nlp(u"lion cat pet")
# similarity is between 0 and 1 : 1 means 100% similar
# lion.similarity(lion) = 1 or 100%
for token1 in tokens:
    for token2 in tokens:
        print(f"{token1.text:{10}} {token2.text:{10}} {token1.similarity(token2)}")

lion       lion       1.0
lion       cat        0.5265437364578247
lion       pet        0.39923766255378723
cat        lion       0.5265437364578247
cat        cat        1.0
cat        pet        0.7505456209182739
pet        lion       0.39923766255378723
pet        cat        0.7505456209182739
pet        pet        1.0


In [6]:
tokens = nlp(u"like love hate")
for token1 in tokens:
    for token2 in tokens:
        print(f"{token1.text:{10}} {token2.text:{10}} {token1.similarity(token2)}")

like       like       1.0
like       love       0.6579039692878723
like       hate       0.6574652194976807
love       like       0.6579039692878723
love       love       1.0
love       hate       0.6393098831176758
hate       like       0.6574652194976807
hate       love       0.6393098831176758
hate       hate       1.0


In [7]:
nlp.vocab.vectors.shape # 20,000 words each with 300 dimensions 

(20000, 300)

In [8]:
# check the token in vocab
# oov : out of vocab
# vector norm : sum of squares of 300 dimensions
tokens = nlp(u"dog cat kavinda")

for token in tokens:
    print(f"{token.text:{10}} {token.has_vector:<{10}} {token.vector_norm:{10}} {token.is_oov:{10}}")

dog        1          7.033673286437988          0
cat        1          6.680818557739258          0
kavinda    0                 0.0          1


In [9]:
# some common names are in vocabulary
tokens = nlp(u"charls henry john")

for token in tokens:
    print(f"{token.text:{10}} {token.has_vector:<{10}} {token.vector_norm:{10}} {token.is_oov:{10}}")

charls     1          7.46746301651001          0
henry      1          6.386934280395508          0
john       1          6.533577919006348          0


## Arithmetic operations in word vectors

In [10]:
from scipy import spatial

In [11]:
cosine_similarity = lambda vec1, vec2: 1 - spatial.distance.cosine(vec1, vec2)

In [12]:
king = nlp.vocab["king"].vector
man = nlp.vocab["man"].vector
woman = nlp.vocab["woman"].vector

In [13]:
# king - man + woman ---> NEW_VECTOR similar to queen, princess, highness
new_vector = king - man + woman

In [14]:
computed_similarities = []

# for all words in nlp vocab
for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word, similarity))

In [15]:
computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])
#                                                                      |     |___ index 1 : similarity
#                                                                      |_________ descending order

In [16]:
print([w[0].text for w in computed_similarities[:10]])

['king', 'woman', 'she', 'lion', 'who', 'john', 'fox', 'henry', 'brown', 'when']
