# Imports

In [4]:
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from scipy import spatial

from spacy.tokens import Doc
from nltk.sentiment.vader import SentimentIntensityAnalyzer

  return f(*args, **kwds)
  return f(*args, **kwds)


In [7]:
# Load the large English pipeline used by every cell below.
# NOTE(review): the word-vector sections read `.vector` and call
# `.similarity()`, which is presumably why the large model was picked over
# the smaller ones — confirm before switching.
# Other models tried while developing: 'en', 'en_core_web_sm', 'en_core_web_md'.
nlp = spacy.load('en_core_web_lg')

  return f(*args, **kwds)
  return f(*args, **kwds)


In [10]:
# Shared VADER analyzer; polarity_scores() below reads this module-level name.
# NOTE(review): presumably requires the NLTK 'vader_lexicon' resource to be
# available locally — confirm on a fresh environment.
sentiment_analyzer = SentimentIntensityAnalyzer()

# User-defined functions (UDFs)

In [12]:
def cosine_similarity(x, y):
    """Return the cosine similarity between two vectors.

    Computed as ``1 - spatial.distance.cosine(x, y)``, i.e. the complement of
    SciPy's cosine distance.

    Args:
        x: First vector (1-D array-like).
        y: Second vector (1-D array-like, same length as ``x``).

    Returns:
        The cosine similarity: 1 for vectors pointing the same way, 0 for
        orthogonal vectors, -1 for opposite vectors.
    """
    return 1 - spatial.distance.cosine(x, y)

In [13]:
def polarity_scores(doc):
    """Score the sentiment of a document's text with the shared analyzer.

    Args:
        doc: Any object exposing a ``text`` attribute (used here as the getter
            for the ``polarity_scores`` extension on spaCy ``Doc`` objects).

    Returns:
        Whatever ``sentiment_analyzer.polarity_scores`` returns for
        ``doc.text``; in this notebook's recorded output that is a dict with
        the keys 'neg', 'neu', 'pos' and 'compound'.
    """
    raw_text = doc.text
    scores = sentiment_analyzer.polarity_scores(raw_text)
    return scores

# Data init

In [30]:
# Review sentence analysed throughout the notebook; later cells read `doc`.
# A simpler alternative that was tried:
#   "Great flavors and slow burning, but high pricing"
doc = nlp(u"The burning was rather slow for me, although the flavors were nothing less than great")

In [31]:
# Empty result tables with a fixed column layout: one for the noun chunks,
# one for the per-token dependency parse.
df_noun_chunks, df_parse_tree = (
    pd.DataFrame(columns=column_names)
    for column_names in (['text', 'root.text'], ['text', 'dep_', 'head.text'])
)

# Noun chunks

In [32]:
# Build the noun-chunk table in a single constructor call: one record per
# chunk holding the chunk text and the text of its root token.
# This replaces row-by-row `DataFrame.append`, which is gone in pandas 2.0,
# and makes the cell idempotent: re-running it rebuilds the table instead of
# appending the same rows again.
# Passing `columns` keeps the layout fixed even when `doc` has no noun chunks.
df_noun_chunks = pd.DataFrame(
    [
        {'text': chunk.text, 'root.text': chunk.root.text}
        for chunk in doc.noun_chunks
    ],
    columns=['text', 'root.text'],
)

df_noun_chunks

Unnamed: 0,root.text,text
0,burning,The burning
1,me,me
2,flavors,the flavors
3,nothing,nothing


# Dependencies

In [33]:
# Build the dependency table in a single constructor call: one record per
# token with its text, its dependency label and the text of its head token.
# This replaces row-by-row `DataFrame.append`, which is gone in pandas 2.0,
# and makes the cell idempotent: re-running it rebuilds the table instead of
# appending the same rows again.
# Passing `columns` keeps the layout fixed even when `doc` is empty.
df_parse_tree = pd.DataFrame(
    [
        {'text': token.text, 'dep_': token.dep_, 'head.text': token.head.text}
        for token in doc
    ],
    columns=['text', 'dep_', 'head.text'],
)

df_parse_tree

Unnamed: 0,dep_,head.text,text
0,det,burning,The
1,nsubj,was,burning
2,ROOT,was,was
3,advmod,slow,rather
4,acomp,was,slow
5,prep,was,for
6,pobj,for,me
7,punct,was,","
8,mark,were,although
9,det,flavors,the


In [34]:
# Draw the dependency parse of `doc` inline in the notebook.
# NOTE(review): 'distance' presumably sets the spacing between words in the
# rendered diagram — confirm against the displaCy options.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 100})

# Word vectors

In [200]:
#print(nlp.vocab['great'].vector)

## Context similarity

In [18]:
# Look up the vocabulary entries once; the vector-arithmetic cell below reuses
# these names.
man, woman, lion, lioness, animal = (
    nlp.vocab[word] for word in ('man', 'woman', 'lion', 'lioness', 'animal')
)

# Pairwise similarities, printed on one line in the order the pairs are listed.
print(*(
    left.similarity(right)
    for left, right in (
        (man, woman),
        (lion, lioness),
        (man, lion),
        (man, animal),
        (lion, animal),
    )
))

0.7401745 0.6547742 0.38981822 0.36653492 0.5395695


In [19]:
# Analogy by vector arithmetic: woman - man + lion, then see how close the
# result lands to the actual 'lioness' vector.
maybe_lioness_vec = woman.vector - man.vector + lion.vector
print(cosine_similarity(maybe_lioness_vec, lioness.vector))

# For comparison, how close the synthetic vector is to each ingredient word
# and to 'animal' (one line, in the order man, woman, lion, animal).
print(*(
    cosine_similarity(lexeme.vector, maybe_lioness_vec)
    for lexeme in (man, woman, lion, animal)
))

0.6412914395332336
0.16503818333148956 0.5402908325195312 0.7906613349914551 0.46803441643714905


## Sentiment similarity

In [20]:
# Vocabulary entries for a handful of sentiment-bearing words.
great, worst, bad, good, ok = (
    nlp.vocab[word] for word in ('great', 'worst', 'bad', 'good', 'ok')
)

# Pairwise similarities, printed on one line in the order the pairs are listed.
print(*(
    left.similarity(right)
    for left, right in (
        (great, worst),
        (bad, good),
        (great, ok),
        (worst, good),
        (ok, bad),
    )
))

0.4199868 0.73550904 0.3975385 0.49711248 0.60613424


In [21]:
# A second review with negative wording but overlapping vocabulary
# (flavors, burning), used as a comparison target for `doc`.
target_doc = nlp(u"Bad flavors and fast burning, and low pricing")

# The recorded score is very high even though the two reviews disagree in
# sentiment, so this similarity does not separate positive from negative text.
print(target_doc.similarity(doc))

0.9684656445297347


# Extensions

In [22]:
# Register polarity_scores() as the getter of a custom Doc attribute, read
# as `some_doc._.polarity_scores`.
# NOTE(review): force=True presumably allows re-running this cell when the
# extension is already registered — confirm against the spaCy docs.
Doc.set_extension('polarity_scores', getter=polarity_scores, force=True)

In [23]:
# Read the custom attribute on both reviews. In the recorded output the
# 'compound' score is positive for `doc` and negative for `target_doc`.
print(doc._.polarity_scores)
print(target_doc._.polarity_scores)

{'neg': 0.0, 'neu': 0.733, 'pos': 0.267, 'compound': 0.3716}
{'neg': 0.483, 'neu': 0.517, 'pos': 0.0, 'compound': -0.6808}


# References

https://spacy.io/usage/linguistic-features

https://nlpforhackers.io/complete-guide-to-spacy/

https://www.analyticsvidhya.com/blog/2017/04/natural-language-processing-made-easy-using-spacy-%E2%80%8Bin-python/