In [1]:
from nltk.parse import CoreNLPParser, CoreNLPDependencyParser
import nltk
import re
import pickle
import pandas as pd
%matplotlib inline

In [2]:
# Sample review used by the dependency-parsing and sentiment cells below.
text = 'Hahahaha, really good. No odor, very very soft, very large absorption, and can not afford the ball, big brands are trustworthy, logistics is fast, hard work!'

#### StanfordNLP

In [3]:
# Both NLTK wrappers talk to the same locally running CoreNLP server.
CORENLP_URL = "http://localhost:9000"
dep_parser = CoreNLPDependencyParser(url=CORENLP_URL)
con_parser = CoreNLPParser(url=CORENLP_URL)

In [4]:
# Dependency-parse the sample review and print every
# (governor, relation, dependent) triple, with a blank line after each graph.
parsed = dep_parser.parse_text(text)
for graph in parsed:
    for head, relation, dependent in graph.triples():
        print(head, relation, dependent)
    print()

('Hahahaha', 'NNP') punct (',', ',')
('Hahahaha', 'NNP') amod ('good', 'JJ')
('good', 'JJ') advmod ('really', 'RB')
('Hahahaha', 'NNP') punct ('.', '.')

('work', 'NN') dep ('odor', 'NN')
('odor', 'NN') neg ('No', 'DT')
('odor', 'NN') punct (',', ',')
('odor', 'NN') amod ('soft', 'JJ')
('soft', 'JJ') advmod ('very', 'RB')
('soft', 'JJ') advmod ('very', 'RB')
('odor', 'NN') punct (',', ',')
('odor', 'NN') appos ('absorption', 'NN')
('absorption', 'NN') amod ('large', 'JJ')
('large', 'JJ') advmod ('very', 'RB')
('odor', 'NN') punct (',', ',')
('odor', 'NN') cc ('and', 'CC')
('odor', 'NN') conj ('afford', 'VB')
('afford', 'VB') aux ('can', 'MD')
('afford', 'VB') neg ('not', 'RB')
('afford', 'VB') dobj ('ball', 'NN')
('ball', 'NN') det ('the', 'DT')
('work', 'NN') parataxis ('trustworthy', 'JJ')
('trustworthy', 'JJ') punct (',', ',')
('trustworthy', 'JJ') nsubj ('brands', 'NNS')
('brands', 'NNS') amod ('big', 'JJ')
('trustworthy', 'JJ') cop ('are', 'VBP')
('trustworthy', 'JJ') punct (',', 

#### Sentiment

In [5]:
# pycorenlp client pointed at the same local CoreNLP server URL as the NLTK
# parsers above; get_sentiment() below sends its requests through it.
from pycorenlp import StanfordCoreNLP
stanford = StanfordCoreNLP('http://localhost:9000')

In [6]:
def get_sentiment(text):
    """Print each sentence of ``text`` followed by its CoreNLP sentiment label.

    The text is sent to the CoreNLP server through the module-level
    ``stanford`` client with the ``sentiment`` annotator. For every sentence
    in the response, the space-joined tokens are printed on one line and the
    sentence's ``sentiment`` value on the next.

    Args:
        text: The string to annotate.

    Returns:
        None. Results are printed, not returned.

    Raises:
        RuntimeError: If the server reply could not be decoded as JSON (for
            example when the request exceeds the 5000 ms timeout); the raw
            reply is included in the message.
    """
    results = stanford.annotate(
        text,
        properties={'annotators': 'sentiment',
                    'outputFormat': 'json', 'timeout': '5000'})
    # pycorenlp returns the raw response text instead of a dict when the body
    # is not valid JSON; indexing that string would fail with an opaque
    # TypeError, so surface the server's message instead.
    if not isinstance(results, dict):
        raise RuntimeError(
            "CoreNLP did not return a JSON document: {!r}".format(results))
    for sentence in results["sentences"]:
        print(" ".join(token["word"] for token in sentence["tokens"]))
        print(sentence["sentiment"])

In [7]:
# Sentence-level sentiment for the full sample review.
get_sentiment(text)

Hahahaha , really good .
Positive
No odor , very very soft , very large absorption , and can not afford the ball , big brands are trustworthy , logistics is fast , hard work !
Verynegative


In [8]:
# Spot check on a short phrase taken from the review's vocabulary.
get_sentiment('good absorption')

good absorption
Positive


In [9]:
# Spot check on another short phrase.
get_sentiment('cheap price')

cheap price
Negative


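<font color="blue">The sentiment results look unreliable: the second sentence of the review is mostly praise yet is labelled "Verynegative", and "cheap price" is labelled "Negative".</font>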

#### SpaCy

In [17]:
# Load the spaCy `en_core_web_lg` pipeline; `nlp` is used by the spaCy cells below.
import spacy
nlp = spacy.load("en_core_web_lg")

In [27]:
# Run the spaCy pipeline on the sample review and print its sentence split.
# `doc` was not defined by any cell in the notebook, so it is created here to
# keep the cell runnable from a fresh kernel.
doc = nlp(text)
for sent in doc.sents:
    print(sent)

Hahahaha, really good.
No odor, very very soft, very large absorption, and can not afford the ball, big brands are trustworthy, logistics is fast, hard work!


In [119]:
# Parse each clause of the sample review separately with spaCy and print
# token/POS, head/POS and the dependency label, one blank line per clause.
# `sentences` was not defined by any cell in the notebook, so the cell failed
# on a fresh kernel.
# NOTE(review): the split below is reconstructed from the recorded output
# (clauses start with a space token and carry no ',', '.' or '!') -- confirm
# it matches the original, deleted cell.
sentences = re.split(r"[,.!]", text)
for sentence in sentences:
    annotation = nlp(sentence)
    for word in annotation:
        print(str(word)+"/"+word.pos_, str(word.head)+"/"+word.head.pos_, word.dep_)
    print()

Hahahaha/INTJ Hahahaha/INTJ ROOT

 /SPACE really/ADV 
really/ADV good/ADJ advmod
good/ADJ good/ADJ ROOT

 /SPACE No/DET 
No/DET odor/NOUN det
odor/NOUN odor/NOUN ROOT

 /SPACE very/ADV 
very/ADV soft/ADJ advmod
very/ADV soft/ADJ advmod
soft/ADJ soft/ADJ ROOT

 /SPACE very/ADV 
very/ADV large/ADJ advmod
large/ADJ absorption/NOUN amod
absorption/NOUN absorption/NOUN ROOT

 /SPACE and/CCONJ 
and/CCONJ afford/VERB cc
can/VERB afford/VERB aux
not/ADV afford/VERB neg
afford/VERB afford/VERB ROOT
the/DET ball/NOUN det
ball/NOUN afford/VERB dobj

 /SPACE big/ADJ 
big/ADJ brands/NOUN amod
brands/NOUN are/VERB nsubj
are/VERB are/VERB ROOT
trustworthy/ADJ are/VERB acomp

 /SPACE logistics/NOUN 
logistics/NOUN is/VERB nsubj
is/VERB is/VERB ROOT
fast/ADJ is/VERB acomp

 /SPACE hard/ADJ 
hard/ADJ work/NOUN amod
work/NOUN work/NOUN ROOT




#### Parse all English texts using the Stanford parser. It does a better job of collapsing indirect dependencies.

In [17]:
# Load the review dataset, report its row count, and preview the first rows.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
REVIEWS_CSV = r"C:\Dropbox\_projects\PG\ds-nlp-interview-question_v2.csv"
df_data = pd.read_csv(REVIEWS_CSV)
print(len(df_data))
df_data.head()

36000


Unnamed: 0,REVIEW_TEXT_CN,REVIEW_TEXT_EN,ONLINE_STORE,BRAND,YEAR,MONTH
0,花王的确实不错，一直用这个牌子,"Kao is really good, always use this brand",tmall,Merries,2016,11
1,哈哈哈哈，真心不错的啊。没有异味，非常非常的柔软，吸收量很大，并且不起球，大品牌值得信赖，物...,"Hahahaha, really good. No odor, very very soft...",suning,Huggies,2017,1
2,还不错，吸水性很不错，凑单买的价格还是比较实惠,"Not bad, water absorption is very good, the pr...",JINGDONG,Merries,2018,2
3,囤货中，东西很好！先买了NB和S的，出月子会继续买大号的！包装非常好！两包分开包装！一点都没...,"In the goods, things are very good! First boug...",tmall,Merries,2014,8
4,感觉跟实体店买的不一样，打开有一股很大的味道，贴的地方做的不平有时会印到小孩，其他还好,It feels different from the one bought in t...,jingdong,Pampers,2016,3


In [39]:
# Give every review a sequential integer id (0 .. n_rows - 1); the parse
# results below are keyed by this column.
df_data["id"] = list(range(df_data.shape[0]))

In [42]:
%%time
all_parsed = {}
dep_parser = CoreNLPDependencyParser(url="http://localhost:9000")
for i, row in df_data.iterrows():
    parsed = dep_parser.parse_text(row["REVIEW_TEXT_EN"])
    record = []
    for triple in parsed:
        record.append(list(triple.triples()))
    all_parsed[row["id"]] = record

Wall time: 15min 6s


In [43]:
# Sanity check: number of parsed reviews (the dataset loaded above has 36000 rows).
len(all_parsed)

36000

In [44]:
# Persist the raw parses. The context manager closes the file handle, which
# the bare open() call left dangling.
with open("deps_en.pkl", "wb") as deps_file:
    pickle.dump(all_parsed, deps_file)

In [45]:
# Consolidate tags: fine-grained POS tags grouped under one-letter labels.
A = {"JJ", "JJR", "JJS"}
R = {"RB", "RBR", "RBS"}
V = {"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"}
N = {"NN", "NNS"}
def replace_tag(tag):
    """Return the consolidated label for ``tag``.

    Tags listed in ``A``, ``R``, ``V`` or ``N`` map to "A", "R", "V" or "N"
    respectively; any other tag is returned unchanged.
    """
    for label, members in (("A", A), ("R", R), ("V", V), ("N", N)):
        if tag in members:
            return label
    return tag

In [46]:
# Normalised copy of all_parsed with the same doc -> sentences -> triples
# nesting: every word is lower-cased and every POS tag is passed through
# replace_tag; the relation label is kept as is.
all_parsed2 = {
    doc_id: [
        [
            ((head_word.lower(), replace_tag(head_tag)),
             rel,
             (child_word.lower(), replace_tag(child_tag)))
            for (head_word, head_tag), rel, (child_word, child_tag) in sentence
        ]
        for sentence in sentences
    ]
    for doc_id, sentences in all_parsed.items()
}

In [47]:
# Inspect the normalised triples of the first review (id 0).
all_parsed2[0]

[[(('good', 'A'), 'nsubj', ('kao', 'NNP')),
  (('good', 'A'), 'cop', ('is', 'V')),
  (('good', 'A'), 'advmod', ('really', 'R')),
  (('good', 'A'), 'punct', (',', ',')),
  (('good', 'A'), 'parataxis', ('use', 'V')),
  (('use', 'V'), 'advmod', ('always', 'R')),
  (('use', 'V'), 'dobj', ('brand', 'N')),
  (('brand', 'N'), 'det', ('this', 'DT'))]]

In [48]:
# Persist the normalised parses. The context manager closes the file handle,
# which the bare open() call left dangling.
with open("deps_en2.pkl", "wb") as deps_file:
    pickle.dump(all_parsed2, deps_file)