In [2]:
# Start CoreNLP Server

from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')
text = 'The picture quality is great, but the value was bad. The poor battery life was disappointing. I hate the crappy battery life.'
output = nlp.annotate(text, properties = {
    'annotators': 'pos,depparse',
    'outputFormat': 'json'
})
#output['sentences'][2]['basicDependencies']
#output['sentences'][2]['tokens']

Exception: Check whether you have started the CoreNLP server e.g.
$ cd stanford-corenlp-full-2015-12-09/ 
$ java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer

In [3]:
# Adding some rules to exclude amod relationships

# Maybe try using a whitelist (known sentiment-bearing terms) instead of a blacklist
# CoreNLP doesn't parse 'sound quality' correctly, so exclude it for now...
adj_exclude = {'first', 'second', 'new', 'extra', 'previous', 'spare', 'other', 'same', 'died', 'outside',
               'ambient', 'external', 'sound', 'left', 'right', 'similar', 'wireless'}

In [4]:
# Extract dependency info from CoreNLP parse

from collections import defaultdict

def parse_corenlp_deps(sentence_json):
    adj_dict = defaultdict(list)
    for sentence in sentence_json:
        dep_list = sentence['basicDependencies']
        pos_list = sentence['tokens'] #Format is a list of objects with 'index' starting at 1
        phrase_dict = defaultdict()
        for dep in dep_list:
            if dep['dep'] == 'compound':
                phrase = dep['dependentGloss'] + ' ' + dep['governorGloss']
                phrase_dict[dep['governorGloss']] = phrase
        for dep in dep_list:
            if dep['dep'] == 'nsubj':
                noun = dep['dependentGloss']
                adj = dep['governorGloss']
                if adj in adj_exclude or not adj.isalpha():
                    continue
                pos_idx = dep['dependent']
                if pos_list[pos_idx-1]['pos'] == 'NN':
                    if noun in phrase_dict:
                        noun = phrase_dict[noun]
                    adj_dict[noun].append(adj)
            elif dep['dep'] == 'amod':
                noun = dep['governorGloss']
                adj = dep['dependentGloss']
                if adj in adj_exclude or not adj.isalpha():
                    continue
                pos_idx = dep['governor']
                if pos_list[pos_idx-1]['pos'] == 'NN':
                    if noun in phrase_dict:
                        noun = phrase_dict[noun]
                    adj_dict[noun].append(adj)
    return [tup for tup in adj_dict.items()]

In [5]:
# Parse entire product CSV

import pickle
import csv

# Pickle format: (list(output from nlp.annotate), list(review text))
def parse_product_csv(input_fp, output_fp):
    depparse_output = []
    corpus = []
    with open(input_fp, 'r') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter = ',')
        for row in csvreader:
            review_text = row['reviewText']
            corpus.append(review_text)
            output = nlp.annotate(review_text, properties = {
                'annotators': 'pos,depparse',
                'outputFormat': 'json'
            })
            depparse_output.append(output)
    with open(output_fp, 'wb') as output_file:
        pickle.dump((depparse_output, corpus), output_file)
    return depparse_output, corpus

In [6]:
# Sample CSV line

input_fp = '../samples/headphones_B0001FTVEK_N=950_Stdev=1.31976200322.csv'
with open(input_fp, 'r') as csvfile:
    csvreader = csv.DictReader(csvfile, delimiter = ',')
    print(next(csvreader))

{'unixReviewTime': '1405123200', 'overall': '5.0', 'helpful': '[0, 0]', 'reviewTime': '07 12, 2014', 'reviewerID': 'APAEK95R7T8RD', 'reviewerName': '', 'asin': 'B0001FTVEK', 'reviewText': 'My children all work odd hours so these are a lifesaver in letting anyone watch TV and not disturb the rest of the family', 'summary': 'My children all work odd hours so these are a ...'}


In [7]:
# Extract features

from collections import Counter

def extract_features(depparse_output, include_adjs=True):
    vocab = set()
    df_cnt = Counter()
    cum_adj_dict = defaultdict(list)
    for output in depparse_output:
        deps = parse_corenlp_deps(output['sentences'])
        for phrase, adjs in deps:
            if len(adjs) > 0:
                vocab.add(phrase)
                df_cnt[phrase] += 1
                if include_adjs:
                    cum_adj_dict[phrase] += adjs
    if include_adjs:
        feat_adjs = [x for x, _ in df_cnt.most_common()]
        feat_adjs = list(map(lambda phrase: (phrase, cum_adj_dict[phrase]), feat_adjs))
        return feat_adjs
    else:
        return df_cnt

In [8]:
# Build review data structures

import json

def get_review_id(review_json):
    fields = [
        'asin',
        'reviewTime',
        'reviewerID'
    ]
    return '_'.join([review_json[field] for field in fields])
   
    
def build_review_id_to_review_map(input_fp):
    with open(input_fp, 'r') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter = ',')
        return {get_review_id(row): row for row in csvreader}


def extract_features_with_weight_data(depparse_output, reviews_fp, include_adjs=True):
    df_cnt = Counter()
    cum_adj_dict = defaultdict(list)
    
    # Open reviews file to access each review's helpful rating and overall rating
    with open(reviews_fp, 'r') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter = ',')

        for output in depparse_output:
            deps = parse_corenlp_deps(output['sentences'])
            review_id = get_review_id(next(csvreader))
            for phrase, adjs in deps:
                if len(adjs) > 0:
                    df_cnt[phrase] += 1
                    if include_adjs:
                        cum_adj_dict[phrase].append((adjs, review_id))

        if include_adjs:
            feat_adjs = [x for x, _ in df_cnt.most_common()]
            feat_adjs = list(map(lambda phrase: (phrase, cum_adj_dict[phrase]), feat_adjs))
            return feat_adjs
        else:
            return df_cnt

In [9]:
# Declare input & output files

# Actual work done here
#INPUT_FP = '../samples/earbuds_B000I68BD4_(N=1018_Stdev=1.34810039761).csv'
#INPUT_FP = '../samples/mouse_B000TG4BA0_(N=306_Stdev=1.38151831291).csv'
#INPUT_FP = '../samples/router_B000BTL0OA_(N=585_Stdev=1.15157611458).csv'
INPUT_FP = '../samples/headphones_B0001FTVEK_N=950_Stdev=1.31976200322.csv'
OUTPUT_FP = 'headphones.pkl'

In [10]:
# Build data structures from scratch

# depparse_output, corpus = parse_product_csv(INPUT_FP, OUTPUT_FP)

In [21]:
# Build data structures from pre-made file

import pickle
with open(OUTPUT_FP, "rb") as f:
    depparse_output, corpus = pickle.load(f)

In [64]:
# Build ReviewID->ReviewInfo map

review_of_id = build_review_id_to_review_map(INPUT_FP)

In [65]:
# Build asin->reviews map

reviews_for_asin = defaultdict(list)
for review_id, review_info in review_of_id.items():
    reviews_for_asin[review_info['asin']].append(review_id)

In [14]:
# Get lists of product features and adjectives

feat_adjs = extract_features_with_weight_data(depparse_output, INPUT_FP, True)
for feat, adjs in feat_adjs[:10]:
    print(feat)
    print(adjs)

sound
[(['loud'], 'B0001FTVEK_01 21, 2012_ASVZC9DBNE92O'), (['great'], 'B0001FTVEK_12 31, 2011_A1EE9SJNPZ73IO'), (['Good'], 'B0001FTVEK_12 4, 2012_A3L7HPEMZWF5YD'), (['clear'], 'B0001FTVEK_01 3, 2007_A3ED9TZ1RPJ4F8'), (['transmitted'], 'B0001FTVEK_10 15, 2009_A37A72Y18II54B'), (['good', 'rich'], 'B0001FTVEK_10 29, 2012_ANB0OKIDLNTZ'), (['good'], 'B0001FTVEK_02 26, 2009_A84MZT0GMRKQW'), (['clean'], 'B0001FTVEK_04 2, 2013_A2AVKGSNUB5ES5'), (['clear'], 'B0001FTVEK_08 16, 2011_AFNG8O2DXRCUV'), (['comes', 'cut'], 'B0001FTVEK_07 24, 2008_A3SB2TK2WHUCBV'), (['good'], 'B0001FTVEK_01 7, 2008_AQMDSQNGGLM30'), (['fine'], 'B0001FTVEK_05 22, 2008_A31E13XZ2G6ZDF'), (['noradio', 'is', 'excellent'], 'B0001FTVEK_03 3, 2012_A33ECPWPO5YMBK'), (['awesome'], 'B0001FTVEK_01 29, 2012_A2UIZDVXD0HD6E'), (['great'], 'B0001FTVEK_06 19, 2011_AJ3QG85DERQW5'), (['coming'], 'B0001FTVEK_04 24, 2014_A3OLNEGDP5UGGL'), (['good'], 'B0001FTVEK_03 2, 2013_A2SPHDVRS63M08'), (['better'], 'B0001FTVEK_03 12, 2005_A2HIHB3PSTLSH

In [None]:
# Returns list of (adj, count) pairs, sorted by count

def sorted_feature_adjs(adjs):
    adj_count = defaultdict(int)
    for adj_list, _ in adjs:
        for adj in adj_list:
            adj_count[adj.lower()] += 1
    all_adjs_sorted = sorted(adj_count.keys(), key=lambda a:adj_count[a], reverse=True)
    adj_count_tuples = [(adj,adj_count[adj]) for adj in all_adjs_sorted]
    
    return adj_count_tuples

In [35]:
# Sentiment analysis

import math
import statistics
import sentiment

# Get sentiment for ^ output.
"""
for each product quality and adj list:
    for each (adj, helpful score, review score):
        1. find sentiment valence [-1, 1] for each adjective.
            - if sentiment valence differs significantly from review score, print out
        2. weight by helpful score
            * Initial pass (11/27): if helpful ratio > 0.5, add (# helpful) - 0.5 (# unhelpful)
                TODO(ryin): improve this.
"""
WEIGHT_VOTES_THRESHOLD = 10
def weight_score(score, num_helpful, num_unhelpful, threshold=WEIGHT_VOTES_THRESHOLD, ):
    """
    Return 1 if total num votes <= threshold.
    
    Return sqrt(num_helpful) - sqrt(num_unhelpful)
    
    Log weighting idea from reddit ranking algorithm:
    https://medium.com/hacking-and-gonzo/how-reddit-ranking-algorithms-work-ef111e33d0d9
    """
    # if num_helpful + num_unhelpful <= threshold:
    #    return 1
    # TODO(ryin): try better heuristics

    net = num_helpful - num_unhelpful
    order = math.log(max(net, 2), 2)
    return order


def get_review(review_id):
    return review_of_id[review_id]


def get_weighted_sentiment(product_feature_adjs):
    """
    Returns weighted sentiment scores for each product feature of the product.
    product_feature_adjs: list of (feature, [([adjectives...], review_id)])
    
    output: [(feature, score in [-1, 1])] sorted in descending score
    """
    wc = Counter()
    total_num = total_denom = 0
    feature_scores = []
    feature_scores_dict = {}
    for product_quality, adj_data in feat_adjs:
        for adjectives, review_id in adj_data:
            review = get_review(review_id)

            num_helpful, num_total = json.loads(review['helpful'])
            num_unhelpful = num_total - num_helpful

            scores = [sentiment.adjective.get_score(adjective) for adjective in adjectives]
            score = statistics.mean(scores)
            # TODO: print out weird ones that differ from review score, or have weird varying scores, etc
            weight = weight_score(score, num_helpful, num_unhelpful)
            wc[weight] += 1
            total_num += score * weight
            total_denom += weight
        final_score = float(total_num) / total_denom
        feature_scores.append((product_quality, final_score))
        feature_scores_dict[product_quality] = final_score
    return sorted(feature_scores, key=lambda item: item[1], reverse=True), feature_scores_dict

In [36]:
weighted_sentiment, weighted_sentiment_dict = get_weighted_sentiment(feat_adjs)
# with open('sentiment_features_11.27.17_headphones_B0001FTVEK.csv', 'w') as f:
#     writer = csv.writer(f)
#     for row in weighted_sentiment:
#         writer.writerow(row)


In [37]:
weighted_sentiment_dict['range']

0.32122089434372714

In [66]:
# Load dependency parse from pickle, extract features
# Assumes pickle contains dep_parse of a single product

with open(OUTPUT_FP, 'rb') as file:
    depparse_output, corpus = pickle.load(file)
feat_adjs = extract_features_with_weight_data(depparse_output, INPUT_FP, True)
weighted_sentiment = get_weighted_sentiment(feat_adjs)
review_of_id = build_review_id_to_review_map(INPUT_FP)

In [75]:
# Model Output
# [feature] [sentiment score] [adjs+counts]
for feat, adjs in feat_adjs[:10]:
    adj_count_tuples = sorted_feature_adjs(adjs)
    
    print(feat, weighted_sentiment_dict[feat], adj_count_tuples[:10])
    print('\n')

sound 0.3095560970154733 [('good', 49), ('great', 38), ('clear', 35), ('better', 14), ('excellent', 14), ('surround', 11), ('is', 9), ('loud', 8), ('hissing', 8), ('awesome', 5)]


quality 0.30942547302471535 [('good', 69), ('sound', 28), ('excellent', 22), ('great', 21), ('better', 17), ('audio', 15), ('poor', 7), ('superb', 6), ('high', 6), ('amazing', 5)]


range 0.32122089434372714 [('good', 25), ('great', 25), ('excellent', 9), ('dynamic', 5), ('limited', 4), ('amazing', 2), ('full', 2), ('seems', 2), ('wide', 2), ('high', 2)]


wife 0.29104484229442934 [('likes', 5), ('happy', 4), ('sleeping', 3), ('watch', 3), ('has', 2), ('sleep', 2), ('use', 2), ('gone', 2), ('sleeps', 2), ('listen', 2)]


product 0.2842444822232028 [('great', 15), ('good', 10), ('works', 5), ('excellent', 4), ('bad', 2), ('work', 2), ('have', 2), ('nice', 2), ('better', 1), ('lets', 1)]


noise 0.25460971642964136 [('static', 18), ('loud', 12), ('white', 11), ('hissing', 5), ('is', 4), ('clicking', 3), ('horr

In [79]:
# Sample of 10 random reviews

from pprint import pprint
from random import random
for review_id, review_info in sorted(review_of_id.items(), key=lambda k:random())[:10]:
    print(review_id)
    pprint(review_info)
    print('\n')

B0001FTVEK_10 3, 2013_A1JQXA6HYS47QR
{'asin': 'B0001FTVEK',
 'helpful': '[0, 0]',
 'overall': '5.0',
 'reviewText': 'This product works as advertised. The range is very good, '
               'sound quality excellent, battery life is very good as well. '
               'I like to watch tv and play video games in bed late into the '
               'night, this allows me to do this while my wife sleeps.',
 'reviewTime': '10 3, 2013',
 'reviewerID': 'A1JQXA6HYS47QR',
 'reviewerName': 'L. Smith',
 'summary': 'Good range',
 'unixReviewTime': '1380758400'}


B0001FTVEK_10 19, 2013_AF2WOW0XJR1F1
{'asin': 'B0001FTVEK',
 'helpful': '[1, 1]',
 'overall': '5.0',
 'reviewText': 'I can hear perfectly from my living room all the way to the '
               'back bedroom and out my back door to the lake 25 feet which '
               '75 feet from the transmitter, And the back bedroom has no '
               'line of sight. Easy to set up and works with both headsets,',
 'reviewTime': '10 19, 2013',
