In [13]:
import pandas as pd
import numpy as np

In [1]:
import os
# Show the kernel's current working directory; relative file paths resolve against it.
os.getcwd()

'C:\\Users\\Brad\\Desktop\\Keras - GPU'

## Overview

We experimented with several approaches to label our unlabeled data:

1. Group data by book and apply a percentile-based threshold on `annual_HVAR` values.<br><br>

2. Group data by book, min-max scale `annual_HVAR` values, ungroup and apply KBinsDiscretizer to perform 1-dimensional kmeans clustering of scaled values into 2 classes.<br><br>

3. Group data by book, transform `annual_HVAR` values using within-group z-score normalization, ungroup and apply KBinsDiscretizer to perform 1-dimensional kmeans clustering of scaled values into 2 classes.

In practice, option 3 proved to be the most conservative labeling approach and produced the most meaningful class labels.

This notebook examines each labeling strategy to uncover errors with the approach.

In [3]:
# Load the labeled training set. The path is relative to the working directory.
# The frame carries the three candidate label columns compared in this notebook:
# `most_helpful`, `class_2` and `group_z_class`.
df = pd.read_csv('w266_proj/data/labeled_training_set_clust_FINAL.csv')

In [34]:
# Distinct book identifiers (`asin`), in order of first appearance in the frame.
asin_list = df['asin'].unique().tolist()


In [27]:
# Number of distinct `asin` values (one per book).
len(asin_list)

197297

In [35]:
# Peek at the first five identifiers to confirm they are strings.
asin_list[:5]

['000100039X', '0001055178', '0001712772', '0001714538', '0002005395']

In [36]:
import pickle

# Persist the list of unique book identifiers so it can be reloaded
# without re-reading the full CSV.
with open('asin_list.pkl', mode='wb') as pkl_file:
    pickle.dump(asin_list, pkl_file)

In [23]:
# Preview the first rows to see the available columns, including the label columns.
df.head()

Unnamed: 0,asin,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,helpful_votes,review_age_days,...,book_num_reviews,std_HVAR,top_quartile_HVAR,most_helpful,min_max,mean_sd,scaled,group_z,class_2,group_z_class
0,000100039X,5,This is one my must have books. It is a master...,2003-12-11,A2S166WSCFIFP5,"adead_poet@hotmail.com ""adead_poet@hotmail.com""",close to god,1071100800,0,3877,...,86,2.930287,0.842702,0,"(0.0, 24.302026375040207)","(0.9683401157041025, 2.6272423654684296)",0.0,-0.368577,0.0,0.0
1,000100039X,5,A timeless classic. It is a very demanding an...,2002-10-07,A2XQ5LZHTD4AFT,Alaturka,A Modern Rumi,1033948800,7,4307,...,86,2.930287,0.842702,0,"(0.0, 24.302026375040207)","(0.9683401157041025, 2.6272423654684296)",0.02441,-0.142781,0.0,0.0
2,000100039X,5,I discovered The Prophet fifty years ago in co...,2013-01-23,A19N3FCQCLJYUA,Amazon Customer,A book everyone &#34;should&#34; read,1358899200,1,546,...,86,2.930287,0.842702,0,"(0.0, 24.302026375040207)","(0.9683401157041025, 2.6272423654684296)",0.027508,-0.114128,0.0,0.0
3,000100039X,5,Can't say enough about Kahlil Gibran's work am...,2012-06-27,A3FFNE1DR5SI1W,A. Morelli,phenomenal piece of literature!,1340755200,1,756,...,86,2.930287,0.842702,0,"(0.0, 24.302026375040207)","(0.9683401157041025, 2.6272423654684296)",0.019867,-0.184808,0.0,0.0
4,000100039X,5,"Cool book, I really like the quality of the pr...",2012-02-15,ARDQ9KNB8K22N,Anwar,"""The Prophet"" is cool",1329264000,1,889,...,86,2.930287,0.842702,0,"(0.0, 24.302026375040207)","(0.9683401157041025, 2.6272423654684296)",0.016895,-0.212301,0.0,0.0


In [39]:
# most_helpful is derived using a simple within-book threshold (top quartile of within-book HVAR).
# Class balance of that label across all reviews:
df['most_helpful'].value_counts()

0    1981162
1     678555
Name: most_helpful, dtype: int64

In [41]:
# Of the reviews the top-quartile approach labels "helpful", how many
# receive the same label from min-max scaling + clustering (`class_2`)?
df.loc[df['most_helpful'].eq(1) & df['class_2'].eq(1)].shape

(303773, 21)

In [42]:
# Of the reviews the top-quartile approach labels "helpful", how many
# receive the same label from z-score normalization + clustering (`group_z_class`)?
df.loc[df['most_helpful'].eq(1) & df['group_z_class'].eq(1)].shape

(302064, 21)

In [62]:
# Compare the two clustering-based labelings with each other.
minmax_helpful = df['class_2'].eq(1)
zscore_helpful = df['group_z_class'].eq(1)

# Summing a boolean mask counts its True entries, i.e. the rows with that label.
print('min-max and z-score agree on {} helpful labels'.format(int((minmax_helpful & zscore_helpful).sum())))
print('min-max has a total of {} helpful labels'.format(int(minmax_helpful.sum())))
print('z-score has a total of {} helpful labels'.format(int(zscore_helpful.sum())))

min-max and z-score agree on 273163 helpful labels
min-max has a total of 396720 helpful labels
z-score has a total of 311442 helpful labels


In [67]:
# The percentile threshold DOES leave some books with NO helpful labels at all:
# count the `most_helpful` labels per book and list the 20 smallest totals.
(
    df.groupby('asin')['most_helpful']
    .sum()
    .sort_values(ascending=True)
    .head(n=20)
)

asin
0615850987    0
B00BNPTUSO    0
B00CAJCC14    0
B00AMPXBWW    0
B00BH68UKI    0
B00BNM5UFE    0
B00DNCONGE    0
0451230930    0
1466252928    0
B00BNHQ9WM    0
1440213836    0
0192728644    0
B00CTN419W    0
B00BVEHHWS    0
031254667X    0
0373442610    0
B00A03D1WQ    0
B00B9P2M38    0
B00BND1C50    0
1433679647    0
Name: most_helpful, dtype: int64

In [65]:
# Look at every review of the first book in the listing above,
# restricted to the vote statistics and the three candidate labels.
df.loc[
    df['asin'] == '0615850987',
    ['asin', 'helpful_votes', 'review_age_days', 'annual_HVAR', 'std_HVAR',
     'top_quartile_HVAR', 'most_helpful', 'class_2', 'group_z_class'],
]

Unnamed: 0,asin,helpful_votes,review_age_days,annual_HVAR,std_HVAR,top_quartile_HVAR,most_helpful,class_2,group_z_class
568053,615850987,1,377,0.96817,0.488392,1.946667,0,0.0,0.0
568054,615850987,2,376,1.941489,0.488392,1.946667,0,1.0,0.0
568055,615850987,2,375,1.946667,0.488392,1.946667,0,1.0,0.0
1646804,615850987,2,375,1.946667,0.488392,1.946667,0,1.0,0.0


**`most_helpful` and `group_z_class` seem to have reasonable labels. `class_2` does not.**

In [53]:
# Same inspection for the second book with zero `most_helpful` labels
# (this view omits `std_HVAR`).
df.loc[
    df['asin'] == 'B00BNPTUSO',
    ['asin', 'helpful_votes', 'review_age_days', 'annual_HVAR',
     'top_quartile_HVAR', 'most_helpful', 'class_2', 'group_z_class'],
]

Unnamed: 0,asin,helpful_votes,review_age_days,annual_HVAR,top_quartile_HVAR,most_helpful,class_2,group_z_class
1350783,B00BNPTUSO,1,465,0.784946,0.784946,0,0.0,0.0
1350784,B00BNPTUSO,1,501,0.728543,0.784946,0,0.0,0.0
1350785,B00BNPTUSO,1,506,0.721344,0.784946,0,0.0,0.0
1350786,B00BNPTUSO,1,465,0.784946,0.784946,0,0.0,0.0
1350787,B00BNPTUSO,1,505,0.722772,0.784946,0,0.0,0.0
1350788,B00BNPTUSO,1,465,0.784946,0.784946,0,0.0,0.0
1350789,B00BNPTUSO,1,502,0.727092,0.784946,0,0.0,0.0
1350790,B00BNPTUSO,1,502,0.727092,0.784946,0,0.0,0.0


**Once again `most_helpful` and `group_z_class` seem to have reasonable labels.**

In [66]:
# Vote statistics and candidate labels for every review of book B00CAJCC14.
df.loc[
    df['asin'] == 'B00CAJCC14',
    ['asin', 'helpful_votes', 'review_age_days', 'annual_HVAR', 'std_HVAR',
     'top_quartile_HVAR', 'most_helpful', 'class_2', 'group_z_class'],
]



Unnamed: 0,asin,helpful_votes,review_age_days,annual_HVAR,std_HVAR,top_quartile_HVAR,most_helpful,class_2,group_z_class
1356919,B00CAJCC14,1,467,0.781585,1.036213,2.339744,0,0.0,0.0
1356920,B00CAJCC14,1,467,0.781585,1.036213,2.339744,0,0.0,0.0
1356921,B00CAJCC14,3,468,2.339744,1.036213,2.339744,0,1.0,1.0
1973869,B00CAJCC14,3,468,2.339744,1.036213,2.339744,0,1.0,1.0
1973870,B00CAJCC14,3,468,2.339744,1.036213,2.339744,0,1.0,1.0
2306135,B00CAJCC14,0,458,0.0,1.036213,2.339744,0,0.0,0.0


**`most_helpful` and `group_z_class` disagree, and `most_helpful` might make more sense.**

In [70]:
# Vote statistics and candidate labels for every review of book B00BH68UKI.
df.loc[
    df['asin'] == 'B00BH68UKI',
    ['asin', 'helpful_votes', 'review_age_days', 'annual_HVAR', 'std_HVAR',
     'top_quartile_HVAR', 'most_helpful', 'class_2', 'group_z_class'],
]


Unnamed: 0,asin,helpful_votes,review_age_days,annual_HVAR,std_HVAR,top_quartile_HVAR,most_helpful,class_2,group_z_class
1348537,B00BH68UKI,2,456,1.600877,1.043644,2.401316,0,1.0,0.0
1348538,B00BH68UKI,3,456,2.401316,1.043644,2.401316,0,1.0,1.0
1348539,B00BH68UKI,3,456,2.401316,1.043644,2.401316,0,1.0,1.0
1348540,B00BH68UKI,1,456,0.800439,1.043644,2.401316,0,0.0,0.0
2493183,B00BH68UKI,0,368,0.0,1.043644,2.401316,0,0.0,0.0


## Why min-max scaling breaks down

**We see here why we should NOT use the min-max approach:**

When a book's reviews have very low variance in `annual_HVAR`, min-max scaling followed by clustering produces labels that do not make sense. In the example below, `std_HVAR` is only about 0.002, yet every review is assigned `class_2 = 1`.

In [69]:
# Vote statistics and candidate labels for every review of book B00BNM5UFE.
df.loc[
    df['asin'] == 'B00BNM5UFE',
    ['asin', 'helpful_votes', 'review_age_days', 'annual_HVAR', 'std_HVAR',
     'top_quartile_HVAR', 'most_helpful', 'class_2', 'group_z_class'],
]



Unnamed: 0,asin,helpful_votes,review_age_days,annual_HVAR,std_HVAR,top_quartile_HVAR,most_helpful,class_2,group_z_class
1350754,B00BNM5UFE,1,490,0.744898,0.001851,0.744898,0,1.0,0.0
1350755,B00BNM5UFE,1,491,0.743381,0.001851,0.744898,0,1.0,0.0
1350756,B00BNM5UFE,1,493,0.740365,0.001851,0.744898,0,1.0,0.0
1350757,B00BNM5UFE,1,491,0.743381,0.001851,0.744898,0,1.0,0.0
1350758,B00BNM5UFE,1,490,0.744898,0.001851,0.744898,0,1.0,0.0


For a given book, the z-score-then-cluster labeling approach will sometimes label zero reviews as helpful. Occasionally, if a review's `annual_HVAR` is high enough compared to the mean value across all reviews for the same book, the z-score approach labels that review as helpful.

In [71]:
# Opposite direction: the 20 books with the MOST reviews labeled helpful
# by the percentile threshold.
(
    df.groupby('asin')['most_helpful']
    .sum()
    .sort_values(ascending=False)
    .head(n=20)
)

asin
030758836X    618
0345803485    569
0439023483    549
0002007770    386
0345803493    372
1469984202    365
0141039280    364
0151008116    341
031604461X    326
0439023513    313
0805096663    293
0307277674    277
0805093079    274
0002247399    268
144235948X    268
0849922070    257
0345803507    256
0316228532    255
0425263924    253
0143170090    249
Name: most_helpful, dtype: int64

## Why within-book z-score normalization and then cluster-labeling is good
For a book with many reviews and a wide distribution of votes, the percentile approach labels more reviews as helpful, while the z-score approach sets a more conservative criterion for class membership.

In [89]:
# Reviews of the book with the most `most_helpful` labels, ordered from the
# highest to the lowest annual helpful-vote rate.
test = (
    df.loc[
        df['asin'] == '030758836X',
        ['asin', 'helpful_votes', 'review_age_days', 'annual_HVAR', 'std_HVAR',
         'top_quartile_HVAR', 'most_helpful', 'class_2', 'group_z_class'],
    ]
    .sort_values(by='annual_HVAR', ascending=False)
)

# Class balance under the percentile label, then under the z-score label.
for label_col in ('most_helpful', 'group_z_class'):
    print(test[label_col].value_counts())

0    1854
1     618
Name: most_helpful, dtype: int64
0.0    2462
1.0      10
Name: group_z_class, dtype: int64


In [95]:
# `test` is sorted by `annual_HVAR` descending, so these are the book's highest-rate reviews.
test.head()

Unnamed: 0,asin,helpful_votes,review_age_days,annual_HVAR,std_HVAR,top_quartile_HVAR,most_helpful,class_2,group_z_class
200047,030758836X,835,798,381.923559,11.851664,0.632047,1,0.0,1.0
1461964,030758836X,602,836,262.834928,11.851664,0.632047,1,0.0,1.0
2519391,030758836X,485,725,244.172414,11.851664,0.632047,1,0.0,1.0
2519501,030758836X,429,747,209.618474,11.851664,0.632047,1,0.0,1.0
2519508,030758836X,214,765,102.104575,11.851664,0.632047,1,0.0,1.0


In [97]:
# Reviews that the percentile approach labels helpful despite having
# received only a single helpful vote.
test.loc[test['most_helpful'].eq(1) & test['helpful_votes'].eq(1)].head(20)

Unnamed: 0,asin,helpful_votes,review_age_days,annual_HVAR,std_HVAR,top_quartile_HVAR,most_helpful,class_2,group_z_class
2030698,030758836X,1,366,0.997268,11.851664,0.632047,1,0.0,0.0
2030566,030758836X,1,376,0.970745,11.851664,0.632047,1,0.0,0.0
2030671,030758836X,1,376,0.970745,11.851664,0.632047,1,0.0,0.0
2030595,030758836X,1,381,0.958005,11.851664,0.632047,1,0.0,0.0
200266,030758836X,1,381,0.958005,11.851664,0.632047,1,0.0,0.0
1461936,030758836X,1,382,0.955497,11.851664,0.632047,1,0.0,0.0
200424,030758836X,1,383,0.953003,11.851664,0.632047,1,0.0,0.0
2030760,030758836X,1,385,0.948052,11.851664,0.632047,1,0.0,0.0
200401,030758836X,1,387,0.943152,11.851664,0.632047,1,0.0,0.0
200295,030758836X,1,394,0.926396,11.851664,0.632047,1,0.0,0.0


Well that is certainly some odd behavior, and really makes the z-score approach look far too restrictive. But take a look at this...

In [93]:
# Split this book's reviews into those with and without any helpful votes.
has_votes = test['helpful_votes'] != 0
print('Number of reviews with no helpful votes: {}'.format(int((~has_votes).sum())))
print('Number of reviews with helpful votes: {}'.format(int(has_votes.sum())))
print('Total number of reviews: {}'.format(len(test)))

Number of reviews with no helpful votes: 1454
Number of reviews with helpful votes: 1018
Total number of reviews: 2472


It is not very reasonable to say that 618 of the 1018 reviews that received any helpful votes are helpful.

In [94]:
# How many of the reviews labeled helpful by the percentile approach
# have just one helpful vote? (rows, columns)
test.loc[test['most_helpful'].eq(1) & test['helpful_votes'].eq(1)].shape

(171, 9)