# Imports

In [191]:
import numpy as np
import pandas as pd
import spacy
from spacy import displacy
from scipy import spatial

from spacy.tokens import Doc
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [192]:
# Alternative models, kept commented out for quick switching while experimenting:
#nlp = spacy.load('en')
#nlp = spacy.load('en_core_web_sm')
#nlp = spacy.load('en_core_web_md')
# The large English model is the one actually loaded; the word-vector and
# similarity cells further down read lexemes from its vocabulary.
nlp = spacy.load('en_core_web_lg')

In [193]:
# One shared VADER analyzer instance; the polarity_scores() helper reads this global.
# NOTE(review): presumably requires the NLTK `vader_lexicon` resource to be
# downloaded beforehand — confirm on a fresh environment.
sentiment_analyzer = SentimentIntensityAnalyzer()

# User-defined functions (UDFs)

In [194]:
def cosine_similarity(x, y):
    """Return the cosine similarity of two 1-D vectors.

    SciPy exposes the cosine *distance* (``1 - similarity``), so the value is
    flipped back here.

    Parameters
    ----------
    x, y : array-like
        The two vectors to compare; they are passed straight to
        ``scipy.spatial.distance.cosine``.

    Returns
    -------
    float
        ``1 - spatial.distance.cosine(x, y)``.
    """
    return 1 - spatial.distance.cosine(x, y)

In [195]:
def polarity_scores(doc):
    """Score the sentiment of ``doc`` with the shared VADER analyzer.

    The raw text of ``doc`` (``doc.text``) is handed to the module-level
    ``sentiment_analyzer`` and its ``polarity_scores`` result is returned
    unchanged.
    """
    raw_text = doc.text
    scores = sentiment_analyzer.polarity_scores(raw_text)
    return scores

# Data init

In [196]:
# Sample review sentence shared by the parsing, similarity and sentiment cells.
doc = nlp("Great flavors and slow burning, but high pricing")

# Empty result tables with their columns declared up front.
df_noun_chunks = pd.DataFrame(columns=['text', 'root.text'])
df_parse_tree = pd.DataFrame(columns=['text', 'dep_', 'head.text'])

# Noun chunks

In [197]:
# Collect one record per noun chunk, then build the frame in a single call.
# DataFrame.append() was removed in pandas 2.0, and rebuilding the frame from
# scratch means re-running this cell cannot duplicate rows.
noun_chunk_records = [
    {'text': chunk.text, 'root.text': chunk.root.text}
    for chunk in doc.noun_chunks
]
df_noun_chunks = pd.DataFrame(noun_chunk_records, columns=['text', 'root.text'])

df_noun_chunks

Unnamed: 0,root.text,text
0,flavors,Great flavors
1,burning,slow burning
2,pricing,high pricing


# Dependencies

In [198]:
# One record per token: its text, its dependency label, and the text of its head.
# Built in a single DataFrame call because DataFrame.append() was removed in
# pandas 2.0; rebuilding from scratch also keeps the cell safe to re-run.
parse_tree_records = [
    {'text': token.text, 'dep_': token.dep_, 'head.text': token.head.text}
    for token in doc
]
df_parse_tree = pd.DataFrame(parse_tree_records, columns=['text', 'dep_', 'head.text'])

df_parse_tree

Unnamed: 0,dep_,head.text,text
0,amod,flavors,Great
1,ROOT,flavors,flavors
2,cc,flavors,and
3,amod,burning,slow
4,conj,flavors,burning
5,punct,flavors,","
6,cc,flavors,but
7,amod,pricing,high
8,conj,flavors,pricing


In [199]:
# Render the dependency parse of `doc` inline in the notebook.
displacy.render(
    doc,
    style='dep',
    options={'distance': 100},
    jupyter=True,
)

# Word vectors

In [200]:
#print(nlp.vocab['great'].vector)

## Context similarity

In [201]:
# Look up the lexemes compared here; the analogy cell reuses the same names.
man, woman, lion, lioness, animal = (
    nlp.vocab[word] for word in ('man', 'woman', 'lion', 'lioness', 'animal')
)

# Pairwise similarities, printed space-separated on one line.
print(*(
    left.similarity(right)
    for left, right in (
        (man, woman),
        (lion, lioness),
        (man, lion),
        (man, animal),
        (lion, animal),
    )
))

0.740174 0.654774 0.389818 0.366535 0.539569


In [202]:
# Vector analogy: woman - man + lion, then see which known word it lands near.
maybe_lioness_vec = woman.vector - man.vector + lion.vector

# First line: similarity of the analogy vector to the actual `lioness` vector.
print(cosine_similarity(maybe_lioness_vec, lioness.vector))

# Second line: similarity of each reference word to the analogy vector.
print(*(
    cosine_similarity(lexeme.vector, maybe_lioness_vec)
    for lexeme in (man, woman, lion, animal)
))

0.641291439533
0.165038183331 0.54029083252 0.790661334991 0.468034416437


## Sentiment similarity

In [203]:
# Lexemes for a handful of sentiment-bearing words.
great, worst, bad, good, ok = (
    nlp.vocab[word] for word in ('great', 'worst', 'bad', 'good', 'ok')
)

# Pairwise similarities, printed space-separated on one line.
print(*(
    left.similarity(right)
    for left, right in (
        (great, worst),
        (bad, good),
        (great, ok),
        (worst, good),
        (ok, bad),
    )
))

0.419987 0.735509 0.397538 0.497112 0.606134


In [204]:
# Same sentence shape as `doc`, with the descriptive words swapped
# (Great -> Bad, slow -> fast, high -> low).
target_doc = nlp("Bad flavors and fast burning, and low pricing")

# Document-level similarity between the two sentences.
print(target_doc.similarity(doc))

0.968465573635


# Extensions

In [206]:
# Register polarity_scores() as the getter behind the custom attribute
# `doc._.polarity_scores`, which the next cell reads.
# NOTE(review): force=True presumably overwrites an existing registration so
# the cell can be re-run without error — confirm against the spaCy docs.
Doc.set_extension('polarity_scores', getter=polarity_scores, force=True)

In [207]:
# Polarity scores of the original sentence, then of the swapped-words sentence,
# one per line.
print(doc._.polarity_scores, target_doc._.polarity_scores, sep='\n')

{'neg': 0.0, 'neu': 0.733, 'pos': 0.267, 'compound': 0.3716}
{'neg': 0.483, 'neu': 0.517, 'pos': 0.0, 'compound': -0.6808}


# References

https://spacy.io/usage/linguistic-features

https://nlpforhackers.io/complete-guide-to-spacy/

https://www.analyticsvidhya.com/blog/2017/04/natural-language-processing-made-easy-using-spacy-%E2%80%8Bin-python/