In [55]:
from pycorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP('http://localhost:9000')
text = 'The picture quality is great, but the value was bad. The poor battery life was disappointing. I hate the crappy battery life.'
output = nlp.annotate(text, properties = {
    'annotators': 'pos,depparse',
    'outputFormat': 'json'
})
#output['sentences'][2]['basicDependencies']
#output['sentences'][2]['tokens']

In [1]:
# Adding some rules to exclude amod relationships
# Maybe try using a whitelist (known sentiment-bearing terms) instead of a blacklist
# CoreNLP doesn't parse 'sound quality' correctly, so exclude it for now...
adj_exclude = {'first', 'second', 'new', 'extra', 'previous', 'spare', 'other', 'same', 'died', 'outside',
               'ambient', 'external', 'sound', 'left', 'right', 'similar', 'wireless'}

In [2]:
from collections import defaultdict, Counter

def parse_corenlp_deps(sentence_json):
    adj_dict = defaultdict(list)
    for sentence in sentence_json:
        dep_list = sentence['basicDependencies']
        pos_list = sentence['tokens'] #Format is a list of objects with 'index' starting at 1
        phrase_dict = defaultdict()
        for dep in dep_list:
            if dep['dep'] == 'compound':
                phrase = dep['dependentGloss'] + ' ' + dep['governorGloss']
                phrase_dict[dep['governorGloss']] = phrase
        for dep in dep_list:
            if dep['dep'] == 'nsubj':
                noun = dep['dependentGloss']
                adj = dep['governorGloss']
                if adj in adj_exclude or not adj.isalpha():
                    continue
                pos_idx = dep['dependent']
                if pos_list[pos_idx-1]['pos'] == 'NN':
                    if noun in phrase_dict:
                        noun = phrase_dict[noun]
                    adj_dict[noun].append(adj)
            elif dep['dep'] == 'amod':
                noun = dep['governorGloss']
                adj = dep['dependentGloss']
                if adj in adj_exclude or not adj.isalpha():
                    continue
                pos_idx = dep['governor']
                if pos_list[pos_idx-1]['pos'] == 'NN':
                    if noun in phrase_dict:
                        noun = phrase_dict[noun]
                    adj_dict[noun].append(adj)
    return [tup for tup in adj_dict.items()]

In [3]:
import pickle
import csv
import time

# Pickle format: (list(output from nlp.annotate), list(review text))
def parse_product_csv(input_fp, output_fp):
    start = time.time()
    depparse_output = []
    corpus = []
    with open(input_fp, 'r') as csvfile:
        csvreader = csv.DictReader(csvfile, delimiter = ',')
        for row in csvreader:
            review_text = row['reviewText']
            corpus.append(review_text)
            output = nlp.annotate(review_text, properties = {
                'annotators': 'pos,depparse',
                'outputFormat': 'json'
            })
            depparse_output.append(output)
    print('Finished in %s secs' % (time.time() - start))
    with open(output_fp, 'wb') as output_file:
        pickle.dump((depparse_output, corpus), output_file)
    return depparse_output, corpus

In [4]:
# Actual work done here
#INPUT_FP = '../samples/earbuds_B000I68BD4_(N=1018_Stdev=1.34810039761).csv'
#INPUT_FP = '../samples/mouse_B000TG4BA0_(N=306_Stdev=1.38151831291).csv'
#INPUT_FP = '../samples/router_B000BTL0OA_(N=585_Stdev=1.15157611458).csv'
INPUT_FP = 'samples/headphones_B0001FTVEK_N=950_Stdev=1.31976200322.csv'
OUTPUT_FP = 'headphones.pkl'

# depparse_output, corpus = parse_product_csv(INPUT_FP, OUTPUT_FP)

# with open(OUTPUT_FP, "rb") as f:
#     depparse_output, corpus = pickle.load(f)

In [5]:
import pickle
import csv
import time

# Pickle format: (list(output from nlp.annotate), list(review text))
def parse_products(reviews, output_fp):
    start = time.time()
    depparse_output = []
    corpus = []
    for row in reviews:
        review_text = row['reviewText']
        corpus.append(review_text)
        output = nlp.annotate(review_text, properties = {
            'annotators': 'pos,depparse',
            'outputFormat': 'json'
        })
        depparse_output.append(output)
    print('Finished in %s secs' % (time.time() - start))
    with open(output_fp, 'wb') as output_file:
        pickle.dump((depparse_output, corpus), output_file)
    return depparse_output, corpus


In [6]:
# load reviews of top 20 asins
with open("samples/reviews_of_top_20_asins.pickle", "rb") as f:
    reviews_of_asin = pickle.load(f)

with open("samples/reviews_of_bad_products_asins.pickle", "rb") as f:
    reviews_of_asin.update(pickle.load(f))

In [7]:
bad_products = [
    'B0007N55NM', # headphones
    'B000II6YEA', # remote
    'B0001FV36E', # antenna
    'B00007LTBA', # router
    'B000CS1TLE', # remote
    'B000MS3VGA', # earbuds
    'B000629GES', # headphones
]

# Run Dependency parser on desired asins.
desired_asins = [
    'B00004ZCJE',
    'B00004T8R2',
    'B00001P4ZH',
    'B00005LEN4',
    'B00001WRSJ',
    'B00004ZC8Y',
    'B00005N6KG',
    'B00004Z5M1',
    'B00005ATMB',
    'B00004WCID',
#     'B00005NIMJ',
#     'B00004SB92',
#     'B00004THCZ',
#     'B00004SABB',
#     '0972683275',
#     'B00002EQCW',
#     'B00004WCIC',
#     'B00005T39Y',
#     'B00001P4XH',
#     'B00004Z5D1'
] + bad_products


# for asin in desired_asins:
    # get reviews for asin
#     output_fp = '%s_parsed_reviews.pickle' % asin
#     print('Starting to parse', asin)
#     parse_products(reviews_of_asin[asin], output_fp)

    # todo: get into good format for sumit to run sentiment shit.

In [8]:
import json
import gzip
import pickle

def get_review_id(review_json):
    fields = [
        'asin',
        'reviewTime',
        'reviewerID'
    ]
    return '_'.join([review_json[field] for field in fields])
   

def extract_features_with_review_id(depparse_output, reviews=None, reviews_fp=None, include_adjs=True, include_review_id=True):
    assert (not include_review_id) or (reviews or reviews_fp), 'if including review_id, reviews or reviews_fp must be provided as an argument'
    df_cnt = Counter()
    cum_adj_dict = defaultdict(list)

    if reviews:
        reviews = iter(reviews)
    elif reviews_fp:
        with open(reviews_fp, 'r') as csvfile:
            reviews = csv.DictReader(csvfile, delimiter = ',')

    for output in depparse_output:
        try:
            sentences = output['sentences']
        except:
            sentences = eval(output)['sentences']
        deps = parse_corenlp_deps(sentences)
        if include_review_id:
            review_id = get_review_id(next(reviews))

        for phrase, adjs in deps:
            if len(adjs) > 0:
                df_cnt[phrase] += 1
                if include_adjs:
                    if include_review_id:
                        cum_adj_dict[phrase].append((adjs, review_id))
                    else:
                        cum_adj_dict[phrase] += adjs
    if include_adjs:
        feat_adjs = [x for x, _ in df_cnt.most_common()]
        feat_adjs = list(map(lambda phrase: (phrase, cum_adj_dict[phrase]), feat_adjs))
        return feat_adjs
    else:
        return df_cnt
    
# Create review id -> review dict, with all review ids in desired_asins

def build_review_id_to_review_map(input_fp, mode='pickle'):
    d = {}
    count = 0
    if mode == 'json':
        reader = gzip.open(input_fp, 'r')
    elif mode == 'csv':
        with open(input_fp, 'r') as f:
            reader = csv.DictReader(f, delimiter = ',')
    elif mode == 'pickle':
        with open(input_fp, 'rb') as f:
            input_d = pickle.load(f)
        for asin_reviews in input_d.values():
            for row in asin_reviews:
                count += 1
                if count % 1000 == 0:
                    print('Processed',count)
                d[get_review_id(row)] = row
        return d

    for row in reader:
        count += 1
        if count % 1000 == 0:
            print('Processed',count)
        d[get_review_id(row)] = row
    return d


review_of_id = build_review_id_to_review_map('samples/reviews_of_top_20_asins.pickle')

# From reviews_of_asin
for asin, reviews in reviews_of_asin.items():
    for row in reviews:
        review_of_id[get_review_id(row)] = row

Processed 1000
Processed 2000
Processed 3000
Processed 4000
Processed 5000
Processed 6000


In [9]:
# Extract improved features for a given asin.

# read improved features pickle
with open("src/features/results/improved_features.pickle", "rb") as f:
    improved_features_dict_pickle = pickle.load(f)
# convert (product_quality, [modifier list]) to dict
features_of_product = defaultdict(dict)
for asin, (features, _) in improved_features_dict_pickle.items():
    for feature, adjs in features:
        features_of_product[asin][feature] = adjs

        
def load_parsed_reviews(asin):
    depparse_pickle_file = 'samples/depparse_output/%s_parsed_reviews.pickle' % asin
    with open(depparse_pickle_file, 'rb') as file:
        depparse_output, _ = pickle.load(file)
    return depparse_output

def extract_improved_features(asin):
    depparse_output = load_parsed_reviews(asin)
    yes, no = 0,0
    reviews = reviews_of_asin[asin]
    unfiltered_features_with_review_id = extract_features_with_review_id(depparse_output, reviews=reviews, include_adjs=True)
    # Filter out adjectives not in improved_features
    filtered_features = defaultdict(list)
    for feature, modifier_reviews in unfiltered_features_with_review_id:
        for (adjectives, review_id) in modifier_reviews:
            filtered_adjectives = [adj for adj in adjectives if adj in features_of_product[asin].get(feature, [])]
            if filtered_adjectives:
                filtered_features[feature].append((filtered_adjectives, review_id))
    return filtered_features

# asin = desired_asins[0]
# feat_adjs = extract_improved_features(asin)

In [10]:
# Pipeline to analyze new products not in improved_features.pickle

# ryin: dump unfiltered features of bad products, pass into caro's code, read in new improved features pickle for bad prods.
# product_dict: asin -> list(phrase, [adjs]), count
product_dict = {}
for asin in bad_products:
    depparse_output= load_parsed_reviews(asin)
    product_dict[asin] = (extract_features_with_review_id(depparse_output, include_review_id=False), len(depparse_output))

# dump into pickle
with open("./bad_products_features.pickle", "wb") as file:
    pickle.dump(product_dict, file, 0)

# get improved features for bad products and update
with open("src/features/results/bad_products_improved_features.pickle", "rb") as f:
    improved_features_dict_pickle = pickle.load(f)
for asin, (features, _) in improved_features_dict_pickle.items():
    for feature, adjs in features:
        features_of_product[asin][feature] = adjs

In [43]:
import math
import statistics
from src import sentiment
from IPython.core.debugger import set_trace


# Get sentiment for ^ output.
"""
for each product quality and adj list:
    for each (adj, helpful score, review score):
        1. find sentiment valence [-1, 1] for each adjective.
        2. weight by helpful score?

TODO:
if sentiment valence differs significantly from review score, print out
"""
def weight_score(score, review_id, mode='default'):
    """
    Returns weight of the score.

    Log weighting idea from reddit ranking algorithm:
    https://medium.com/hacking-and-gonzo/how-reddit-ranking-algorithms-work-ef111e33d0d9
    """
    if mode == 'default':
        return 1
    elif mode == 'weight_helpful':
        # TRY 1: log net helpful
        review = get_review(review_id)
        helpful = review['helpful'] if isinstance(review['helpful'], list) else json.loads(review['helpful'])
        num_helpful, num_total = helpful
        num_unhelpful = num_total - num_helpful
        net = num_helpful - num_unhelpful
        order = math.log(max(net, 2), 2)
#         order = max(net, 1)
#         print('helpful: %s/%s ==> weight: %s; score: %s' % (num_helpful, num_total, order, score))
        return order
    elif mode == 'weight_0_lower':
        # TRY 2: if 0, give it half weight.
        if score == 0:
            return 0.5
        else:
            return 1
    
    # TODO(ryin): try better heuristics


def get_review(review_id):
    return review_of_id[review_id]


def get_weighted_sentiment(product_feature_adjs, weight_mode='default'):
    """
    Returns weighted sentiment scores for each product feature of the product.
    product_feature_adjs: list of (feature, [([adjectives...], [# helpful, # unhelpful], review score)])
    
    output: [(feature, score in [-1, 1])] sorted in descending score
    """
    wc = Counter()
    total_num = total_denom = 0
    feature_scores = []
    sent_scores_of_adj = {}
    for product_quality, adj_data in product_feature_adjs:
        all_scores = []
        for adjectives, review_id in adj_data:
            scores = [sentiment.adjective.get_score(adjective) for adjective in adjectives]
            score = statistics.mean(scores)
            # TODO: print out weird ones that differ from review score, or have weird varying scores, etc
            weight = weight_score(score, review_id, mode=weight_mode)
            wc[weight] += 1
            total_num += score * weight
            total_denom += weight
            all_scores += zip(adjectives, scores)
        final_score = float(total_num) / total_denom
#         if product_quality in ['range', 'fit', 'price']:
#             set_trace()
#         print('final score for %s from %s reviews: %s' % (product_quality, len(adj_data), final_score, ))
        feature_scores.append((product_quality, final_score))
    return sorted(feature_scores, key=lambda item: item[1], reverse=True)

In [17]:
# Create dictionary of asin -> average review score
scores_of_asin = {}
for asin, reviews in reviews_of_asin.items():
    scores_of_asin[asin] = statistics.mean(review['overall'] for review in reviews)

In [18]:
# Clickable asin links.

from IPython.core.display import display, HTML

def display_link(asin):
    url = 'https://www.amazon.com/dp/'+asin
    display(HTML("""<a href="%s">%s</a>""" % (url, url)))

# for asin in desired_asins:
#     display_link(asin)

In [30]:
# TESTING:

# Set names of products.

names = [
    'Tiffen 46mm UV Protection Filter',
    'Panasonic On-Ear Stereo Headphones RP-HT21 (Black & Silver) Lightweight and Comfortable, Powerful Bass',
    'Koss Porta Pro On Ear Headphones with Case, Black / Silver',
    'Nikon AF FX NIKKOR 50mm f/1.8D Lens with Auto Focus for Nikon DSLR Cameras',
    'Sony MDRV6 Studio Monitor Headphones with CCAW Voice Coil',
    'Tiffen 46mm Circular Polarizer',
    'Sony MDR-W08L Vertical In-The-Ear Headphones',
    'Belkin Hi-Speed USB A/B Cable, USB Type-A and USB Type-B (10 Feet)',
    'Case Logic CDW-32 32 Capacity Classic CD Wallet (Black)',
    'Canon Remote Switch RS60 E3',
    
    # bad products
    'Sony MDR-XD100 Stereo Headphones', 
    'Logitech Harmony 1000 Advanced Universal Remote (Silver)',
    'Terk Technology HDTVi VHF/UHF HDTV Indoor Antenna',
    'D-Link DI-624 Wireless Cable/DSL Router, 4-Port Switch, 802.11g, 108Mbps',
    'Logitech Harmony 890 Advanced Universal Remote Control',
    'V-MODA Vibe In-Ear Noise-Isolating Metal Headphone (Blush)',
    'Sony MDR-NC6 Noise Canceling Headphones ',
]

asin_to_name = {asin: name for asin, name in zip(desired_asins, names)}

# SET OF HEADPHONES ASINS FOR COMPARISON!
headphones_asins = []
for i,x in enumerate(names):
    if 'headphone' in x.lower():
        headphones_asins.append(desired_asins[i])
    
headphones_asins

['B00004T8R2',
 'B00001P4ZH',
 'B00001WRSJ',
 'B00005N6KG',
 'B0007N55NM',
 'B000MS3VGA',
 'B000629GES']

In [47]:
import importlib
importlib.reload(sentiment)


# Printing out results
ws_of_asin = {}
for asin in desired_asins:
    print('\n============\n')
    print('%s (ASIN: %s; overall: %.3f)\n\n' % (asin_to_name[asin], asin, scores_of_asin[asin]))
    # get review score
    feat_adjs = extract_improved_features(asin)
    weighted_sentiment = get_weighted_sentiment(feat_adjs.items(), weight_mode='weight_helpful')
    ws_of_asin[asin] = weighted_sentiment
    for feat, score in weighted_sentiment[:20]:
        print('%s: %.3f' % (feat, score))



Tiffen 46mm UV Protection Filter (ASIN: B00004ZCJE; overall: 4.282)


price: 0.275
protection: 0.233
uv filter: 0.225
quality: 0.225
glass: 0.218
lens: 0.203
filter: 0.197


Panasonic On-Ear Stereo Headphones RP-HT21 (Black & Silver) Lightweight and Comfortable, Powerful Bass (ASIN: B00004T8R2; overall: 4.379)


sound: 0.306
bass: 0.280
headphone: 0.276
value: 0.244
quality: 0.239
weight: 0.219
fit: 0.165
volume: 0.139
cord: 0.124
price: 0.104


Koss Porta Pro On Ear Headphones with Case, Black / Silver (ASIN: B00001P4ZH; overall: 4.465)


sound: 0.325
design: 0.294
setting: 0.278
value: 0.275
quality: 0.274
bass: 0.268
weight: 0.262
case: 0.254
headphone: 0.249
price: 0.168
range: 0.154
fit: 0.145


Nikon AF FX NIKKOR 50mm f/1.8D Lens with Auto Focus for Nikon DSLR Cameras (ASIN: B00005LEN4; overall: 4.752)


bokeh: 0.235
image quality: 0.220
depth: 0.215
photography: 0.213
lens: 0.212
picture: 0.183
value: 0.182
quality: 0.172
portrait lens: 0.171
focus: 0.171
light: 0.170
image: 0

In [23]:
# compare shared features for group.
shared_features = {feat for feat, _ in ws_of_asin[headphones_asins[0]]}
for asin in headphones_asins[1:]:
    shared_features = shared_features.intersection({feat for feat, _ in ws_of_asin[asin]})

# For each shared feature, show the sentiment of each product.

for asin in sorted(headphones_asins, key=lambda asin: scores_of_asin[asin], reverse=True):
    print('\n===== \n\n%s (ASIN: %s; overall: %.3f)\n\n' % (asin_to_name[asin], asin, scores_of_asin[asin]))
    weighted_sentiment = ws_of_asin[asin]
    for feat, score in sorted(ws_of_asin[asin]):
        if feat in shared_features:
            print('%s: %.3f' % (feat, score))
        


===== 

Sony MDRV6 Studio Monitor Headphones with CCAW Voice Coil (ASIN: B00001WRSJ; overall: 4.630)


bass: 0.235
price: 0.183
quality: 0.259
sound: 0.300

===== 

Koss Porta Pro On Ear Headphones with Case, Black / Silver (ASIN: B00001P4ZH; overall: 4.465)


bass: 0.275
price: 0.225
quality: 0.290
sound: 0.338

===== 

Panasonic On-Ear Stereo Headphones RP-HT21 (Black & Silver) Lightweight and Comfortable, Powerful Bass (ASIN: B00004T8R2; overall: 4.379)


bass: 0.263
price: 0.109
quality: 0.237
sound: 0.295

===== 

Sony MDR-W08L Vertical In-The-Ear Headphones (ASIN: B00005N6KG; overall: 4.072)


bass: 0.213
price: 0.070
quality: 0.251
sound: 0.262

===== 

Sony MDR-NC6 Noise Canceling Headphones  (ASIN: B000629GES; overall: 3.457)


bass: 0.161
price: 0.274
quality: 0.174
sound: 0.153

===== 

V-MODA Vibe In-Ear Noise-Isolating Metal Headphone (Blush) (ASIN: B000MS3VGA; overall: 3.457)


bass: 0.177
price: 0.519
quality: 0.257
sound: 0.254

===== 

Sony MDR-XD100 Stereo Headphones

In [24]:
# For each shared feature, show the sentiment of each product.

print('Cross-section of headphones products and shared features')
for feat in shared_features:
    print('\n=============\nFeature:', feat)
    
    l = []
    for asin in headphones_asins:
        for f, score in sorted(ws_of_asin[asin]):
            if f == feat:
                l.append('%.3f (star rating: %.3f)' % (score, scores_of_asin[asin]))

    for s in sorted(l, reverse=True):
        print(s)


Cross-section of headphones products and shared features

Feature: price
0.519 (star rating: 3.457)
0.299 (star rating: 3.275)
0.274 (star rating: 3.457)
0.225 (star rating: 4.465)
0.183 (star rating: 4.630)
0.109 (star rating: 4.379)
0.070 (star rating: 4.072)

Feature: bass
0.275 (star rating: 4.465)
0.263 (star rating: 4.379)
0.235 (star rating: 4.630)
0.218 (star rating: 3.275)
0.213 (star rating: 4.072)
0.177 (star rating: 3.457)
0.161 (star rating: 3.457)

Feature: sound
0.338 (star rating: 4.465)
0.300 (star rating: 4.630)
0.295 (star rating: 4.379)
0.262 (star rating: 4.072)
0.254 (star rating: 3.457)
0.233 (star rating: 3.275)
0.153 (star rating: 3.457)

Feature: quality
0.290 (star rating: 4.465)
0.259 (star rating: 4.630)
0.257 (star rating: 3.457)
0.251 (star rating: 4.072)
0.237 (star rating: 4.379)
0.202 (star rating: 3.275)
0.174 (star rating: 3.457)


In [54]:
# Finding review text snippets for particular adjectives of a review.

def find_review_text(feature, adjective, review_id):
    """Returns snippets of review text given feature, adjective and review_id (see get_review_id).
    
    """
    review = get_review(review_id)
    review_text_lines = review['reviewText'].split('.')
    for i, line in enumerate(review_text_lines):
        if adjective in line and feature in line:
            return line

# Sample review text snippets!
import random

asin = desired_asins[4]
for asin in desired_asins:
    improved_features = extract_improved_features(asin)
    # Randomly choose features and adjectives to show
    for feature, modifiers in improved_features.items():
        for adjs, review_id in modifiers:
            for adj in adjs:
                if sentiment.adjective.get_score(adj) < 0:
                    print('Evidence for %s %s: \n\n %s \n\n' % (adj, feature, find_review_text(feature, adj, review_id)))

Evidence for low price: 

 I wanted lense protection at a very low price 


Evidence for low price: 

  This one is a mid-grade quality filter at a very low price 


Evidence for low price: 

 5mm UV Filter I could find on Amazon at a low price 


Evidence for low price: 

 Such a low price for a UV filter 


Evidence for low price: 

   Not one of the classiest filters I've owned - but it does the job I want it to do and at a very low price 


Evidence for low price: 

 Right now 4 stars seems fine, mainly because it is good quality at a very low price 


Evidence for low price: 

  High quality at a low price 


Evidence for low price: 

   For some of my more expensive lenses I've gone with Hoya multi-coated or B&W filters but the bottom line is this Tiffen filter works good for a low price like $13 


Evidence for low price: 

   I was a little uncertain of the quality, because of the low price, ( 1/3 the cost of Vivtar or Nikon equivalents), but after receiving them and inspected 