In [5]:
from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')
text = 'The picture quality is great, but the value was bad. The poor battery life was disappointing. I hate the crappy battery life.'
output = nlp.annotate(text, properties = {
    'annotators': 'pos,depparse',
    'outputFormat': 'json'
})
#output['sentences'][2]['basicDependencies']
#output['sentences'][2]['tokens']

In [6]:
# Adding some rules to exclude amod relationships
# Maybe try using a whitelist (known sentiment-bearing terms) instead of a blacklist
# CoreNLP doesn't parse 'sound quality' correctly, so exclude it for now...
adj_exclude = {'first', 'second', 'new', 'extra', 'previous', 'spare', 'other', 'same', 'died', 'outside',
               'ambient', 'external', 'sound', 'left', 'right', 'similar', 'wireless'}

In [7]:
from collections import defaultdict

def parse_corenlp_deps(sentence_json):
    adj_dict = defaultdict(list)
    for sentence in sentence_json:
        dep_list = sentence['basicDependencies']
        pos_list = sentence['tokens'] #Format is a list of objects with 'index' starting at 1
        phrase_dict = defaultdict()
        for dep in dep_list:
            if dep['dep'] == 'compound':
                phrase = dep['dependentGloss'] + ' ' + dep['governorGloss']
                phrase_dict[dep['governorGloss']] = phrase
        for dep in dep_list:
            if dep['dep'] == 'nsubj':
                noun = dep['dependentGloss']
                adj = dep['governorGloss']
                if adj in adj_exclude or not adj.isalpha():
                    continue
                pos_idx = dep['dependent']
                if pos_list[pos_idx-1]['pos'] == 'NN':
                    if noun in phrase_dict:
                        noun = phrase_dict[noun]
                    adj_dict[noun].append(adj)
            elif dep['dep'] == 'amod':
                noun = dep['governorGloss']
                adj = dep['dependentGloss']
                if adj in adj_exclude or not adj.isalpha():
                    continue
                pos_idx = dep['governor']
                if pos_list[pos_idx-1]['pos'] == 'NN':
                    if noun in phrase_dict:
                        noun = phrase_dict[noun]
                    adj_dict[noun].append(adj)
    return [tup for tup in adj_dict.items()]

In [44]:
import pickle
import csv

# Pickle format: (list(output from nlp.annotate), list(review text))
def parse_product_csv(input_fp, output_fp):
    depparse_output = []
    corpus = []
    with open(input_fp, 'r') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter = ',')
        for row in csvreader:
            review_text = row['reviewText']
            corpus.append(review_text)
            output = nlp.annotate(review_text, properties = {
                'annotators': 'pos,depparse',
                'outputFormat': 'json'
            })
            depparse_output.append(output)
    with open(output_fp, 'wb') as output_file:
        pickle.dump((depparse_output, corpus), output_file)
    return depparse_output, corpus

In [45]:
input_fp = '../samples/headphones_B0001FTVEK_N=950_Stdev=1.31976200322.csv'
with open(input_fp, 'r') as csvfile:
    csvreader = csv.DictReader(csvfile, delimiter = ',')
    print(next(csvreader))

{'summary': 'My children all work odd hours so these are a ...', 'reviewerName': '', 'overall': '5.0', 'asin': 'B0001FTVEK', 'unixReviewTime': '1405123200', 'helpful': '[0, 0]', 'reviewerID': 'APAEK95R7T8RD', 'reviewTime': '07 12, 2014', 'reviewText': 'My children all work odd hours so these are a lifesaver in letting anyone watch TV and not disturb the rest of the family'}


In [46]:
from collections import Counter

def extract_features(depparse_output, include_adjs=True):
    vocab = set()
    df_cnt = Counter()
    cum_adj_dict = defaultdict(list)
    for output in depparse_output:
        deps = parse_corenlp_deps(output['sentences'])
        for phrase, adjs in deps:
            if len(adjs) > 0:
                vocab.add(phrase)
                df_cnt[phrase] += 1
                if include_adjs:
                    cum_adj_dict[phrase] += adjs
    if include_adjs:
        feat_adjs = [x for x, _ in df_cnt.most_common()]
        feat_adjs = list(map(lambda phrase: (phrase, cum_adj_dict[phrase]), feat_adjs))
        return feat_adjs
    else:
        return df_cnt

In [67]:
import json

def extract_features_with_weight_data(depparse_output, reviews_fp, include_adjs=True):
    df_cnt = Counter()
    cum_adj_dict = defaultdict(list)
    
    # Open reviews file to access each review's helpful rating and overall rating
    with open(reviews_fp, 'r') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter = ',')

        for output in depparse_output:
            deps = parse_corenlp_deps(output['sentences'])
            review = next(csvreader)
            for phrase, adjs in deps:
                if len(adjs) > 0:
                    df_cnt[phrase] += 1
                    if include_adjs:
                        cum_adj_dict[phrase].append((adjs, json.loads(review['helpful']), float(review['overall'])))

        if include_adjs:
            feat_adjs = [x for x, _ in df_cnt.most_common()]
            feat_adjs = list(map(lambda phrase: (phrase, cum_adj_dict[phrase]), feat_adjs))
            return feat_adjs
        else:
            return df_cnt

In [68]:
# Actual work done here
#INPUT_FP = '../samples/earbuds_B000I68BD4_(N=1018_Stdev=1.34810039761).csv'
#INPUT_FP = '../samples/mouse_B000TG4BA0_(N=306_Stdev=1.38151831291).csv'
#INPUT_FP = '../samples/router_B000BTL0OA_(N=585_Stdev=1.15157611458).csv'
INPUT_FP = '../samples/headphones_B0001FTVEK_N=950_Stdev=1.31976200322.csv'
OUTPUT_FP = 'headphones.pkl'

depparse_output, corpus = parse_product_csv(INPUT_FP, OUTPUT_FP)


KeyboardInterrupt: 

In [69]:
feat_adjs = extract_features_with_weight_data(depparse_output, INPUT_FP, True)
for feat, adjs in feat_adjs[:10]:
    print(feat)
    print(adjs)

sound
[(['loud'], [1, 1], 4.0), (['great'], [1, 1], 5.0), (['Good'], [0, 0], 3.0), (['clear'], [0, 1], 5.0), (['transmitted'], [0, 0], 2.0), (['good', 'rich'], [1, 1], 3.0), (['good'], [0, 0], 2.0), (['clean'], [0, 0], 3.0), (['clear'], [0, 0], 4.0), (['comes', 'cut'], [0, 0], 4.0), (['good'], [0, 0], 4.0), (['fine'], [2, 2], 4.0), (['noradio', 'is', 'excellent'], [3, 3], 5.0), (['awesome'], [1, 2], 5.0), (['great'], [0, 0], 4.0), (['coming'], [0, 0], 5.0), (['good'], [0, 0], 4.0), (['better'], [33, 36], 3.0), (['great'], [1, 1], 2.0), (['excellent'], [4, 4], 5.0), (['clear'], [0, 0], 5.0), (['great'], [0, 0], 5.0), (['pure'], [9, 9], 5.0), (['Great'], [0, 0], 5.0), (['good'], [0, 0], 1.0), (['great'], [1, 1], 5.0), (['is'], [2, 2], 2.0), (['loud', 'enough'], [0, 0], 2.0), (['better', 'nice'], [0, 2], 2.0), (['Clear', 'tried'], [0, 0], 5.0), (['awesome', 'great'], [0, 0], 5.0), (['clear', 'uncomfortable'], [1, 1], 5.0), (['carries', 'great'], [0, 1], 4.0), (['good'], [0, 0], 2.0), (['T

In [105]:
import math
import statistics
import sentiment

# Get sentiment for ^ output.
"""
for each product quality and adj list:
    for each (adj, helpful score, review score):
        1. find sentiment valence [-1, 1] for each adjective.
            - if sentiment valence differs significantly from review score, print out
        2. weight by helpful score
            * Initial pass (11/27): if helpful ratio > 0.5, add (# helpful) - 0.5 (# unhelpful)
                TODO(ryin): improve this.
"""
WEIGHT_VOTES_THRESHOLD = 10
def weight_score(score, num_helpful, num_unhelpful, threshold=WEIGHT_VOTES_THRESHOLD, ):
    """
    Return 1 if total num votes <= threshold.
    
    Return sqrt(num_helpful) - sqrt(num_unhelpful)
    
    Log weighting idea from reddit ranking algorithm:
    https://medium.com/hacking-and-gonzo/how-reddit-ranking-algorithms-work-ef111e33d0d9
    """
    # if num_helpful + num_unhelpful <= threshold:
    #    return 1
    # TODO(ryin): try better heuristics

    net = num_helpful - num_unhelpful
    order = math.log(max(net, 2), 2)
    return order


def get_weighted_sentiment(product_feature_adjs):
    """
    Returns weighted sentiment scores for each product feature of the product.
    product_feature_adjs: list of (feature, [([adjectives...], [# helpful, # unhelpful], review score)])
    
    output: [(feature, score in [-1, 1])] sorted in descending score
    """
    wc = Counter()
    total_num = total_denom = 0
    feature_scores = []
    for product_quality, adj_data in feat_adjs:
        for adjectives, (num_helpful, num_total), review_score in adj_data:
            scores = [sentiment.adjective.get_score(adjective) for adjective in adjectives]
            score = statistics.mean(scores)
            # TODO: print out weird ones that differ from review score, or have weird varying scores, etc
            weight = weight_score(score, num_helpful, num_total - num_helpful)
            wc[weight] += 1
            total_num += score * weight
            total_denom += weight
        final_score = float(total_num) / total_denom
        feature_scores.append((product_quality, final_score))
    print(wc)
    return sorted(feature_scores, key=lambda item: item[1], reverse=True)

In [107]:
weighted_sentiment = get_weighted_sentiment(feat_adjs)
with open('sentiment_features_11.27.17_headphones_B0001FTVEK.csv', 'w') as f:
    writer = csv.writer(f)
    for row in weighted_sentiment:
        writer.writerow(*row)


Counter({1.0: 4084, 2.0: 119, 2.321928094887362: 115, 1.5849625007211563: 106, 3.1699250014423126: 82, 2.584962500721156: 50, 6.375039431346925: 32, 5.491853096329675: 18, 7.912889336229962: 18, 5.321928094887363: 17, 3.8073549220576037: 15, 6.882643049361842: 15, 3.5849625007211565: 14, 3.3219280948873626: 14, 2.807354922057604: 14, 5.169925001442312: 14, 9.643856189774725: 13, 5.129283016944966: 12, 3.700439718141092: 12, 4.906890595608519: 11, 4.247927513443585: 11, 3.0: 10, 5.285402218862249: 5, 5.930737337562887: 5, 5.614709844115208: 4, 4.459431618637297: 3})


TypeError: writerow() takes exactly one argument (2 given)

In [48]:
with open(OUTPUT_FP, 'rb') as file:
    depparse_output, corpus = pickle.load(file)
df_cnt = extract_features(depparse_output, False)
feat_adjs = extract_features_with_weight_data(depparse_output, True)

df_cnt.most_common()
for feat, adjs in feat_adjs[:10]:
    print(feat)
    print(adjs)

KeyboardInterrupt: 

In [30]:
depparse_output[0]['sentences']

[{'basicDependencies': [{'dep': 'ROOT',
    'dependent': 1,
    'dependentGloss': 'asin',
    'governor': 0,
    'governorGloss': 'ROOT'}],
  'enhancedDependencies': [{'dep': 'ROOT',
    'dependent': 1,
    'dependentGloss': 'asin',
    'governor': 0,
    'governorGloss': 'ROOT'}],
  'enhancedPlusPlusDependencies': [{'dep': 'ROOT',
    'dependent': 1,
    'dependentGloss': 'asin',
    'governor': 0,
    'governorGloss': 'ROOT'}],
  'index': 0,
  'tokens': [{'after': '',
    'before': '',
    'characterOffsetBegin': 0,
    'characterOffsetEnd': 4,
    'index': 1,
    'originalText': 'asin',
    'pos': 'NN',
    'word': 'asin'}]}]

In [13]:
for i, output in enumerate(depparse_output):
    for sentence in output['sentences']:
        dep_list = sentence['basicDependencies']
        for dep in dep_list:
            if dep['dep'] == 'amod':
                noun = dep['governorGloss']
                if noun == 'pair':
                    print(dep['dependentGloss'])
                    #print(corpus[i])

new
first
cheap
comfortable
first
extra
previous
first
second
more
$
FOURTH
nice
first
oblong
inexpensive
different
former
expensive
last
original
bad
inexpensive
decent
$
better
new
second
first
GOOD
cheap
third
broken
second
second
better
longer-lived
better
Great
last
old
new
3rd
cheap
expensive
previous
extra
first
second
first
second
second
several
extra
new
different
absolute
good
third
blue
pink
cheap
new
cheap
second
good
awful
second
same
exact
free
suspect
cheap
First
defective
higher
nice
dollar
extra
latest
first
comfortable
second
first
second
second
several
first
newer
everyday
other
last
first
second
3rd
other
new
yellow
expensive
3rd
free
of
last
new
new
second
spare
first
simple
knock-around
decent
second
incase
expensive
back-up
more
new
new
low-cost
comfortable
second
2nd
first
dreaded
first
new
extra
new
$
$
good
second
great
different
extra
second
second
great
Good
second
second
comfortable
inexpensive
decent
other
first
nice
ok
better
first
new
other
previous
hi-f