### Word Vectors and Operations

#### Acknowledgements

- https://spacy.io/
- https://spacy.io/usage/spacy-101

#### Packages

In [16]:
import spacy as pkg_spacy
import scipy.spatial as pkg_spatial

#### Work: Small Size Model

In [17]:
# Load English tokenizer, tagger, parser and NER
nlp_sm = pkg_spacy.load("en_core_web_sm")

In [18]:
# Process whole documents
text = ("When Sebastian Thrun started working on self-driving cars at "
        "Google in 2007, few people outside of the company took him "
        "seriously. “I can tell you very senior CEOs of major American "
        "car companies would shake my hand and turn away because I wasn’t "
        "worth talking to,” said Thrun, in an interview with Recode earlier "
        "this week.")
doc = nlp_sm(text)

# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
for entity in doc.ents:
    print(entity.text, entity.label_)

Noun phrases: ['Sebastian Thrun', 'self-driving cars', 'Google', 'few people', 'the company', 'him', 'I', 'you', 'very senior CEOs', 'major American car companies', 'my hand', 'I', 'Thrun', 'an interview', 'Recode']
Verbs: ['start', 'work', 'drive', 'take', 'tell', 'shake', 'turn', 'talk', 'say']
Sebastian Thrun PERSON
2007 DATE
American NORP
Thrun GPE
Recode ORG
earlier this week DATE


In [19]:
doc = nlp_sm("Apple is looking at buying U.K. startup for $1 billion")

for token in doc:
    print(token.text, token.pos_, token.dep_)


for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            token.shape_, token.is_alpha, token.is_stop)

Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup NOUN dobj
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj
Apple Apple PROPN NNP nsubj Xxxxx True False
is be AUX VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. U.K. PROPN NNP dobj X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False


#### Work: Medium Size Model

In [20]:
# Load Medium size package
nlp_md = pkg_spacy.load("en_core_web_md")

#### Work: Large Size Model

In [21]:
# Load Large Package
nlp_lg = pkg_spacy.load("en_core_web_lg")

In [22]:
tokens = nlp_lg("dog cat banana afskfsd")

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

# for token in tokens:
#     print(token.vector)

dog True 75.254234 False
cat True 63.188496 False
banana True 31.620354 False
afskfsd False 0.0 True


In [23]:
doc1 = nlp_lg("I like salty fries and hamburgers.")
doc2 = nlp_lg("Fast food tastes very good.")

# Similarity of two documents
print(doc1, "<->", doc2, doc1.similarity(doc2))

# Similarity of tokens and spans
french_fries = doc1[2:4]
burgers = doc1[5]

print(french_fries, "<->", burgers, french_fries.similarity(burgers))

I like salty fries and hamburgers. <-> Fast food tastes very good. 0.6871286202797843
salty fries <-> hamburgers 0.6901010870933533


In [24]:
king_doc = nlp_lg("king")
man_doc = nlp_lg("man")

print ("king_vector_norm = {}, man_vector_norm = {}, diff_vector_norm = {}, similarity = {}".format( \
    king_doc.vector_norm, man_doc.vector_norm, abs(king_doc.vector_norm - man_doc.vector_norm), king_doc.similarity(man_doc)))

king_vector_norm = 69.68691461050098, man_vector_norm = 68.7250111232666, diff_vector_norm = 0.9619034872343803, similarity = 0.4166158683466932


**Notes**:
Vector Norm (vector_norm) values do not reflect the similary. For example, "king" and "man" have _similar_ vector_norm values (esp. diff < 1), but similarity is _not_ high. So, for doing vector arithmetic, use the full vector than vector norm value.

In [25]:
cosine_similarity = lambda x, y: 1 - pkg_spatial.distance.cosine(x, y)

In [26]:
words = ["king", "queen", "ruler", "man", "woman", "human"]
docs = []

for word in words:
    doc = nlp_lg(word)
    docs.append(doc)

#print(words)
#print(vectors)

# Similarities
for i in range(len(docs)):
    for j in range(len(docs)-i):
        print ("Similarity ({}, {}) = ({}, {}):\n\t(diff={}, similarity={}, cosine_similarity={})".format(\
            words[i], words[i+j], docs[i].vector_norm, docs[i+j].vector_norm, \
            abs(docs[i].vector_norm - docs[i+j].vector_norm), docs[i].similarity(docs[i+j]), \
            cosine_similarity(docs[i].vector, docs[i+j].vector)))

Similarity (king, king) = (69.68691461050098, 69.68691461050098):
	(diff=0.0, similarity=1.0, cosine_similarity=1)
Similarity (king, queen) = (69.68691461050098, 44.440847813672285):
	(diff=25.246066796828693, similarity=0.6108841234425123, cosine_similarity=0.6108841300010681)
Similarity (king, ruler) = (69.68691461050098, 52.792574722108114):
	(diff=16.894339888392864, similarity=0.7641486322987932, cosine_similarity=0.7641486525535583)
Similarity (king, man) = (69.68691461050098, 68.7250111232666):
	(diff=0.9619034872343803, similarity=0.4166158683466932, cosine_similarity=0.41661590337753296)
Similarity (king, woman) = (69.68691461050098, 53.99287730357096):
	(diff=15.69403730693002, similarity=0.3572872976296661, cosine_similarity=0.3572872281074524)
Similarity (king, human) = (69.68691461050098, 63.08981987699371):
	(diff=6.597094733507269, similarity=0.2968272202890541, cosine_similarity=0.2968272268772125)
Similarity (queen, queen) = (44.440847813672285, 44.440847813672285):
	(

In [27]:
king_vec = nlp_lg.vocab.get_vector("king")
man_vec = nlp_lg.vocab.get_vector("man")
woman_vec = nlp_lg.vocab.get_vector("woman")
queen_vec = nlp_lg.vocab.get_vector("queen")

k2q_vec = king_vec - man_vec + woman_vec
queen_similiarity = cosine_similarity(queen_vec, k2q_vec)

q2k_vec = queen_vec - woman_vec + man_vec
king_similiarity = cosine_similarity(king_vec, q2k_vec)

# Find the first ten closest vectors in the vocabulary to the computed vectors
k2q_similarities = []
q2k_similarities = []

for word in nlp_lg.vocab:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:
                k2q_similarity = cosine_similarity(k2q_vec, word.vector)
                k2q_similarities.append({ "word" : word, "similarity" : k2q_similarity})

                q2k_similarity = cosine_similarity(q2k_vec, word.vector)
                q2k_similarities.append({ "word" : word, "similarity" : q2k_similarity})

k2q_similarities = sorted(k2q_similarities, key=lambda entry: entry["similarity"])
q2k_similarities = sorted(q2k_similarities, key=lambda entry: entry["similarity"])

print ("Words near to K2Q")
print ("Similarity (queen, king-man+woman) = {}".format(queen_similiarity))
for i in range(10):
    entry = k2q_similarities[i]
    print("Word = {}, Similiarity = {}, Diff Simialrity = {}".format(\
        entry["word"].text, entry["similarity"], abs(queen_similiarity - entry["similarity"])))

print ("Words near to Q2K")
print ("Similarity (king, queen-woman+man) = {}".format(king_similiarity))
for i in range(10):
    entry = q2k_similarities[i]
    print("Word = {}, Similiarity = {}, Diff Simialrity = {}".format(\
        entry["word"].text, entry["similarity"], abs(king_similiarity - entry["similarity"])))



Words near to K2Q
Similarity (queen, king-man+woman) = 0.6178014278411865
Word = ta, Similiarity = -0.2223987877368927, Diff Simialrity = 0.8402002155780792
Word = cos, Similiarity = -0.21016256511211395, Diff Simialrity = 0.8279639929533005
Word = ai, Similiarity = -0.20889312028884888, Diff Simialrity = 0.8266945481300354
Word = cuz, Similiarity = -0.20390625298023224, Diff Simialrity = 0.8217076808214188
Word = em, Similiarity = -0.14783941209316254, Diff Simialrity = 0.7656408399343491
Word = nuff, Similiarity = -0.14137345552444458, Diff Simialrity = 0.7591748833656311
Word = coz, Similiarity = -0.1318208873271942, Diff Simialrity = 0.7496223151683807
Word = got, Similiarity = -0.12722618877887726, Diff Simialrity = 0.7450276166200638
Word = doin, Similiarity = -0.1028217151761055, Diff Simialrity = 0.720623143017292
Word = ca, Similiarity = -0.10074420273303986, Diff Simialrity = 0.7185456305742264
Words near to Q2K
Similarity (king, queen-woman+man) = 0.6300731301307678
Word = v

**Puzzles**:
- Why is cosine value > 1? If that's not the case, how come (1-cosine) is negative?
- The model version that has been downloaded when this exercise ran seems to be very badly trained based on above outputs
- For example, for q2k vector, nearest words are "v", "m", "x", "e", "c", "g" (which are single letter and not meaningful)
- Similarly, for k2q vector, nearest words are "ta", "ai", "ca" which are not meaningful words