In [1]:
from pycorenlp import StanfordCoreNLP

# Client for a CoreNLP server; assumes one is already listening on
# localhost:9000 -- TODO confirm the server is started before running this cell.
nlp = StanfordCoreNLP('http://localhost:9000')
# Small hand-written sample (three sentences) used to eyeball the parser output.
text = 'The picture quality is great, but the value was bad. The poor battery life was disappointing. I hate the crappy battery life.'
# Ask for POS tags and a dependency parse, requested in JSON format.
output = nlp.annotate(text, properties = {
    'annotators': 'pos,depparse',
    'outputFormat': 'json'
})
# Uncomment either line to inspect the third sentence's dependencies / tokens.
#output['sentences'][2]['basicDependencies']
#output['sentences'][2]['tokens']

In [78]:
# Blacklist of modifier words that parse_corenlp_deps refuses to record as
# adjectives (it is consulted for both the amod and the nsubj relations).
# A whitelist of known sentiment-bearing terms may turn out to be a better fit.
# 'sound' is listed because CoreNLP doesn't parse 'sound quality' correctly,
# so it is excluded for now.
adj_exclude = set(
    'first second new extra previous spare other same died outside '
    'ambient external sound left right similar wireless'.split()
)

In [72]:
from collections import defaultdict

def _compound_phrases(dep_list):
    """Map each compound head's token index to its full phrase, in sentence order."""
    members = defaultdict(dict)
    for dep in dep_list:
        if dep['dep'] == 'compound':
            tokens = members[dep['governor']]
            tokens[dep['governor']] = dep['governorGloss']
            tokens[dep['dependent']] = dep['dependentGloss']
    # Sorting by token index puts every modifier and the head in reading order.
    return {head_idx: ' '.join(gloss for _, gloss in sorted(tokens.items()))
            for head_idx, tokens in members.items()}


def parse_corenlp_deps(sentence_json, exclude=None):
    """Collect the modifier words attached to each singular noun in a review.

    Two dependency relations are harvested from every sentence:
    ``amod`` (noun is the governor, modifier the dependent) and ``nsubj``
    (noun is the dependent, predicate the governor).

    Args:
        sentence_json: iterable of sentence dicts, each with a
            'basicDependencies' list and a 'tokens' list.  Token 'index'
            values start at 1, so token ``k`` lives at ``tokens[k - 1]``.
        exclude: optional collection of lowercase words that must never be
            recorded as a modifier.  Defaults to the module-level
            ``adj_exclude``.

    Returns:
        A list of ``(noun_or_compound_phrase, [modifiers])`` tuples in
        first-seen order.  Modifiers keep their original casing.
    """
    if exclude is None:
        exclude = adj_exclude
    adj_dict = defaultdict(list)
    for sentence in sentence_json:
        dep_list = sentence['basicDependencies']
        pos_list = sentence['tokens']
        # Keyed by token index (not by word text) so that a second, unrelated
        # occurrence of the same word is not mistaken for the compound head.
        phrases = _compound_phrases(dep_list)
        for dep in dep_list:
            relation = dep['dep']
            if relation == 'nsubj':
                noun_role, adj_role = 'dependent', 'governor'
            elif relation == 'amod':
                noun_role, adj_role = 'governor', 'dependent'
            else:
                continue
            adj = dep[adj_role + 'Gloss']
            # Case-insensitive so that 'First' / 'FIRST' are excluded like 'first';
            # non-alphabetic tokens such as '$' or '3rd' are dropped as well.
            if adj.lower() in exclude or not adj.isalpha():
                continue
            noun_idx = dep[noun_role]
            # Only singular common nouns ('NN') are kept as features.
            if pos_list[noun_idx - 1]['pos'] != 'NN':
                continue
            # NOTE(review): the governor of an nsubj relation is not checked to
            # be an adjective, so verbs such as 'is' can end up in the list.
            noun = phrases.get(noun_idx, dep[noun_role + 'Gloss'])
            adj_dict[noun].append(adj)
    return list(adj_dict.items())

In [7]:
import pickle
import csv

# Pickle format: (list(output from nlp.annotate), list(review text))
def parse_product_csv(input_fp, output_fp, text_col=4):
    """Run CoreNLP over every review in a CSV file and cache the results.

    Each non-blank row's ``text_col`` field is sent to the module-level
    ``nlp`` client with the 'pos,depparse' annotators.  The annotations and
    the raw texts are pickled together to ``output_fp`` as the tuple
    ``(depparse_output, corpus)``.

    Args:
        input_fp: path of the comma-delimited review file.
        output_fp: path the pickle is written to (overwritten if present).
        text_col: zero-based column holding the review text (default 4).

    Returns:
        ``(depparse_output, corpus)`` -- two parallel lists: the value
        returned by ``nlp.annotate`` for each review, and the review text.

    Raises:
        IndexError: if a non-blank row has no ``text_col`` column.
    """
    depparse_output = []
    corpus = []
    # newline='' hands line-ending handling to the csv module, which is what
    # keeps quoted review text with embedded newlines intact.
    with open(input_fp, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if not row:
                # Blank lines come back as empty lists; there is no review to parse.
                continue
            review_text = row[text_col]
            corpus.append(review_text)
            # NOTE(review): nothing here checks that the server call succeeded;
            # whatever annotate returns is stored as-is -- confirm error handling.
            depparse_output.append(nlp.annotate(review_text, properties={
                'annotators': 'pos,depparse',
                'outputFormat': 'json'
            }))
    with open(output_fp, 'wb') as output_file:
        pickle.dump((depparse_output, corpus), output_file)
    return depparse_output, corpus

In [73]:
from collections import Counter

def extract_features(depparse_output, include_adjs):
    """Aggregate noun-phrase features over all annotated reviews.

    Args:
        depparse_output: list of annotation dicts, one per review; each must
            have a 'sentences' entry accepted by ``parse_corenlp_deps``.
        include_adjs: selects the return shape (see below).

    Returns:
        If ``include_adjs`` is false, a ``Counter`` mapping each phrase to the
        number of reviews in which it received at least one modifier.
        If ``include_adjs`` is true, a list of ``(phrase, modifiers)`` tuples
        ordered from most to least frequent phrase, where ``modifiers``
        concatenates the modifiers from every review.
    """
    df_cnt = Counter()
    cum_adj_dict = defaultdict(list)
    for output in depparse_output:
        # parse_corenlp_deps yields each phrase at most once per review, so the
        # increment below is a document frequency, not a raw occurrence count.
        for phrase, adjs in parse_corenlp_deps(output['sentences']):
            if not adjs:
                continue
            df_cnt[phrase] += 1
            if include_adjs:
                cum_adj_dict[phrase].extend(adjs)
    if not include_adjs:
        return df_cnt
    return [(phrase, cum_adj_dict[phrase]) for phrase, _ in df_cnt.most_common()]

In [86]:
# Actual work done here

import os

#INPUT_FP = '../samples/earbuds_B000I68BD4_(N=1018_Stdev=1.34810039761).csv'
#INPUT_FP = '../samples/mouse_B000TG4BA0_(N=306_Stdev=1.38151831291).csv'
#INPUT_FP = '../samples/router_B000BTL0OA_(N=585_Stdev=1.15157611458).csv'
INPUT_FP = '../samples/headphones_B0001FTVEK_N950_Stdev1.31976200322.csv'
OUTPUT_FP = 'headphones.pkl'

# Annotating every review needs a round trip to the CoreNLP server per row, so
# the result is cached in OUTPUT_FP and only rebuilt when the cache is missing.
# Delete the pickle to force a fresh parse of INPUT_FP.
if os.path.exists(OUTPUT_FP):
    # NOTE: pickle.load can execute arbitrary code -- only load a cache file
    # that this notebook produced itself.
    with open(OUTPUT_FP, 'rb') as file:
        depparse_output, corpus = pickle.load(file)
else:
    depparse_output, corpus = parse_product_csv(INPUT_FP, OUTPUT_FP)
df_cnt = extract_features(depparse_output, False)
feat_adjs = extract_features(depparse_output, True)

# Show the ten most frequent features together with their collected modifiers.
for feat, adjs in feat_adjs[:10]:
    print(feat)
    print(adjs)

sound
['loud', 'great', 'Good', 'clear', 'transmitted', 'good', 'rich', 'good', 'clean', 'clear', 'comes', 'cut', 'good', 'fine', 'noradio', 'is', 'excellent', 'awesome', 'great', 'coming', 'good', 'better', 'great', 'excellent', 'clear', 'great', 'pure', 'Great', 'good', 'great', 'is', 'loud', 'enough', 'better', 'nice', 'Clear', 'tried', 'awesome', 'great', 'clear', 'uncomfortable', 'carries', 'great', 'good', 'TERRIBLE', 'muted', 'quality', 'better', 'vocal', 'output', 'acceptable', 'static', 'great', 'excellent', 'clear', 'clear', 'OK', 'Good', 'Fit', 'Compatible', 'hissing', 'Constant', 'whirring', 'reasonable', 'great', 'starts', 'better', 'better', 'Muted', 'rich', 'good', 'lacks', 'tinny', 'better', 'low', 'clear', 'great', 'good', 'loud', 'hissing', 'was', 'Excellent', 'feature', 'steady', 'terrific', 'clear', 'definite', 'hissing', 'is', 'crappy', 'surround', 'remarkable', 'natural', 'overall', 'surround', 'great', 'has', 'clear', 'good', 'good', 'surround', 'superb', 'good',

In [13]:
# Exploration: print every word attached through an 'amod' relation to the
# noun 'pair', across all reviews.  Presumably used to spot modifiers that
# carry no sentiment (candidates for adj_exclude) -- confirm with the author.
# Note that this loop rebinds the kernel-level name `output`.
for i, output in enumerate(depparse_output):
    for sentence in output['sentences']:
        dep_list = sentence['basicDependencies']
        for dep in dep_list:
            if dep['dep'] == 'amod':
                # For amod the governor is the noun being modified.
                noun = dep['governorGloss']
                if noun == 'pair':
                    print(dep['dependentGloss'])
                    # Uncomment to see the full review the match came from.
                    #print(corpus[i])

new
first
cheap
comfortable
first
extra
previous
first
second
more
$
FOURTH
nice
first
oblong
inexpensive
different
former
expensive
last
original
bad
inexpensive
decent
$
better
new
second
first
GOOD
cheap
third
broken
second
second
better
longer-lived
better
Great
last
old
new
3rd
cheap
expensive
previous
extra
first
second
first
second
second
several
extra
new
different
absolute
good
third
blue
pink
cheap
new
cheap
second
good
awful
second
same
exact
free
suspect
cheap
First
defective
higher
nice
dollar
extra
latest
first
comfortable
second
first
second
second
several
first
newer
everyday
other
last
first
second
3rd
other
new
yellow
expensive
3rd
free
of
last
new
new
second
spare
first
simple
knock-around
decent
second
incase
expensive
back-up
more
new
new
low-cost
comfortable
second
2nd
first
dreaded
first
new
extra
new
$
$
good
second
great
different
extra
second
second
great
Good
second
second
comfortable
inexpensive
decent
other
first
nice
ok
better
first
new
other
previous
hi-f