In [2]:
import datetime
import json
import numpy as np
import pandas as pd
# Let text columns display up to 500 characters so review text is readable in cell output
pd.set_option('display.max_colwidth', 500)

In [3]:
# Per-book cutoff table, indexed by ASIN.
book_cutoffs = "w266_proj/data/booklist.csv"
# Force ASINs to be parsed as strings: many are all-digit with leading zeros
# (e.g. 0001055178), and read_csv's chunked type inference could otherwise turn
# some of them into integers that would never match the string ASINs used as
# the join key for the review data.
cutoffs = pd.read_csv(book_cutoffs, index_col='asin', dtype={'asin': str})

In [31]:
# Peek at the first five rows of the per-book cutoff table
cutoffs.head(n=5)

Unnamed: 0_level_0,book_num_reviews,std_HVAR,top_quartile_HVAR
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000100039X,86,2.930287,0.842702
0001055178,12,0.332911,0.465856
0001712772,4,0.635544,1.148897
0001714538,4,0.099335,0.171246
0002005395,7,0.287376,0.782166


In [4]:
# Test-split reviews, read as line-delimited JSON (one record per line)
dataset = "w266_proj/data/test.json"
df = pd.read_json(path_or_buf=dataset, lines=True)

print('finished reading in data')

finished reading in data


In [5]:
# Peel out count of helpful votes into its own column: element 0 of each
# review's `helpful` value is taken as the helpful-vote count.
# Series.map touches only this one column, so no full row object has to be
# built for every review.
df['helpful_votes'] = df['helpful'].map(lambda votes: votes[0])
# To avoid confusion later, drop the 'helpful' column at this point
# (rebinding instead of inplace=True keeps the step a plain expression).
df = df.drop(columns='helpful')
# Convert reviewTime to datetime type info we can work with
df['reviewTime'] = pd.to_datetime(df['reviewTime'], infer_datetime_format=True)

print('finished creating helpful_votes column')

finished creating helpful_votes column


In [6]:
# Row count before any filtering
print("Our dataset initially contains {} reviews".format(len(df)))

Our dataset initially contains 475978 reviews


In [7]:
# The latest review date in our dataset snapshot is 2014-07-23
snapshotted_string = "20140723"
snapshotted = datetime.datetime.strptime(snapshotted_string, '%Y%m%d')

# Keep only reviews dated before (snapshot - 365 days).
# Younger reviews may not have accumulated enough evidence to support being helpful or not.
ourcutoff = snapshotted - datetime.timedelta(days=365)
is_old_enough = df['reviewTime'] < ourcutoff
df = df.loc[is_old_enough]
print("After removing reviews less than one year old, our dataset contains {} reviews".format(len(df)))

After removing reviews less than one year old, our dataset contains 360680 reviews


In [8]:
# Here we store the age of each review in days, measured back from the snapshot date.
# Subtracting the whole column at once yields a timedelta Series; `.dt.days`
# gives the whole-day count per review without a Python-level loop over rows.
df['review_age_days'] = (snapshotted - df['reviewTime']).dt.days

In [9]:
# We create the annual HVAR score: helpful votes scaled to a per-365-days rate.
# Column arithmetic computes every row at once instead of calling a lambda per review.
df['annual_HVAR'] = 365 * df['helpful_votes'] / df['review_age_days']

print('task complete')

task complete


In [38]:
# Summary statistics of the numeric columns, now including review_age_days and annual_HVAR
df.describe()

Unnamed: 0,overall,unixReviewTime,helpful_votes,review_age_days,annual_HVAR
count,360680.0,360680.0,360680.0,360680.0,360680.0
mean,4.069926,1251397000.0,6.704306,1790.239168,1.603826
std,1.209969,116477100.0,27.198241,1348.11448,6.830568
min,1.0,843004800.0,0.0,366.0,0.0
25%,3.0,1178842000.0,1.0,653.0,0.292468
50%,5.0,1291853000.0,2.0,1322.0,0.661232
75%,5.0,1349654000.0,5.0,2630.0,1.385725
max,5.0,1374451000.0,7136.0,6517.0,2088.72494


In [10]:
# Attach each book's cutoff values to its reviews: a left join from the review
# column 'asin' onto the index of the cutoff table.
df = pd.merge(
    df,
    cutoffs,
    how='left',
    left_on='asin',
    right_index=True,
    suffixes=('', ''),
)

In [12]:
# Summary statistics after the join; the cutoff columns have a lower count where a review's book had no match
df.describe()

Unnamed: 0,overall,unixReviewTime,helpful_votes,review_age_days,annual_HVAR,book_num_reviews,std_HVAR,top_quartile_HVAR
count,360680.0,360680.0,360680.0,360680.0,360680.0,313656.0,313656.0,313656.0
mean,4.069926,1251397000.0,6.704306,1790.239168,1.603826,75.411846,3.556318,1.539988
std,1.209969,116477100.0,27.198241,1348.11448,6.830568,195.637539,10.580872,1.877889
min,1.0,843004800.0,0.0,366.0,0.0,4.0,0.000186,0.0
25%,3.0,1178842000.0,1.0,653.0,0.292468,9.0,0.722466,0.692892
50%,5.0,1291853000.0,2.0,1322.0,0.661232,22.0,1.554675,1.042659
75%,5.0,1349654000.0,5.0,2630.0,1.385725,58.0,3.45475,1.778518
max,5.0,1374451000.0,7136.0,6517.0,2088.72494,2472.0,378.643335,113.758186


In [11]:
# Reviews whose book has no row in the cutoff table come out of the join with NaN
n_unmatched = df['book_num_reviews'].isna().sum()
print(f"{n_unmatched} reviews in the test set get dropped for books having inadequate review counts/variance")

47024 reviews in the test set get dropped for books having inadequate review counts/variance


In [12]:
# Keep only the reviews whose book matched a row of the cutoff table
has_cutoffs = df['book_num_reviews'].notna()
df = df.loc[has_cutoffs]
print(f"After removing off-limits books we have {len(df)} test reviews")

After removing off-limits books we have 313656 test reviews


In [17]:
# Gather and pickle hvar lists per book

#test_hvar_list = pd.DataFrame(df.groupby('asin')['annual_HVAR'].apply(list))

In [14]:
#test_hvar_list.head()

In [20]:
#test_hvar_list.to_pickle('test_hvar_list.pkl')

In [13]:
# Retain the most_helpful column for easy model comparisons using the same data set

# And now we label the reviews that are in the top quartile as 'most_helpful' = 1 (otherwise 0).
# A review gets 1 only when its annual_HVAR is strictly greater than the
# top_quartile_HVAR joined in for its book. Comparing the two columns directly
# labels every row at once instead of calling a lambda per review.
df['most_helpful'] = (df['annual_HVAR'] > df['top_quartile_HVAR']).astype('int64')

In [14]:
# Load global per-book min-max values
import pickle

# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load this file if it comes from a trusted source.
# The context manager closes the file even if unpickling raises.
with open('global_min_max.pkl', 'rb') as global_min_max_pkl:
    global_min_max = pickle.load(global_min_max_pkl)

In [15]:
# Attach the unpickled per-book values to each review, matching the review's
# 'asin' column against the index of global_min_max
# (presumably indexed by ASIN; verify against the code that produced the pickle).
df = df.join(other=global_min_max, on='asin')

In [16]:
# Count reviews left without a min_max value by the join (0 means every review matched)
df['min_max'].isna().sum()

0

In [90]:
# Peek at the first five labeled reviews
df.head(n=5)

Unnamed: 0,asin,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,helpful_votes,review_age_days,annual_HVAR,book_num_reviews,std_HVAR,top_quartile_HVAR,most_helpful,group_z,min_max,mean_sd
0,000100039X,5,"Anything I've read by Gibran is, in my mind, flawless. This, the most famous of his works, is no exception. It is simple, yet deep; honest and profound; moving and inspirational. Gibran's work is one of a kind, and can be far more encouraging and moving than any self-help program or therapy or anything like that. The poetic style, the aphorisms, the parables, the almost biblical feel, are all just what over-worked, over-stressed, modern and spiritually starved worldly people need.",2005-11-16,A2X4HE21JTAL98,Antiquarian,Flawless,1132099200,3,3171,0.345317,86.0,2.930287,0.842702,0,-0.082556,"(0.0, 24.302026375040207)","(1.0470776160076654, 2.7821006685613363)"
1,000100039X,5,"It's a thin book which has deep thoughts about topics like love, generosity, death, prayer, etc. It can be read again and again because it is thin while at the sametime it contains so many words of wisdon.",2004-10-03,A3L6UC8985ORUY,"Ayesha Riaz ""ashriaz""",Loved it,1096761600,1,3580,0.101955,86.0,2.930287,0.842702,0,-1.240527,"(0.0, 24.302026375040207)","(1.0470776160076654, 2.7821006685613363)"
2,000100039X,5,"A decade past, I knew very little about the writer/artist/poet/prophet, Kahlil Gibran. I am happy to say, that today, I am well acquainted with his work. The Prophet was the first book I read written by Gibran, and ever since that first reading I have immersed myself in his extraordinarily consummate skill of exquisitely crafted words. To say the Prophet or any other writings by Kahlil Gibran is simply fine literature is to say the mysterious of the soul are mediocre. I recommend that if you...",2012-02-08,A1JP8MMNY1EACY,"Christopher Covert ""Author of Hands On Fire""",Remarkable!,1328659200,1,896,0.407366,86.0,2.930287,0.842702,0,0.212688,"(0.0, 24.302026375040207)","(1.0470776160076654, 2.7821006685613363)"
3,000100039X,5,"I was given this book as a high school graduation gift and I have found it to be the book I look to in hard times as well as good times, for advise, inspiration, hope and words of wisdom. I highly recommend it.",2003-06-02,A31WDOV3Q22ANV,J. Eure,"Inspiring - full of hope, love and beautiful prose",1054512000,3,4069,0.269108,86.0,2.930287,0.842702,0,-0.445176,"(0.0, 24.302026375040207)","(1.0470776160076654, 2.7821006685613363)"
4,000100039X,5,"This book was recommended to me by a friend. What he told me of it is that, just as The Art of War, this is the book from which you take out whatever you want. What you find in it depends as much on the book as it does on you. It speaks of love, life and people in the most touching way ever. I have it in my handbag at all times. When I first read it, I cried. There is literally something mystical about it. I like to think I took a lot out of it. One of those books you should read again and a...",2005-02-03,ABFOAYZA2UHD3,J. Malnar,I cried reading this book,1107388800,3,3457,0.316749,86.0,2.930287,0.842702,0,-0.21849,"(0.0, 24.302026375040207)","(1.0470776160076654, 2.7821006685613363)"


In [17]:
# label by within-book z-score normalization (standardization)

#def z_calc(x):
    #return (x - np.mean(x))/np.std(x, ddof=0)
    
def z_calc(row):
    """Return the standardized (z-score) value of a row's annual_HVAR.

    `row` must support key lookup for 'annual_HVAR' and 'mean_sd'. Element 0
    of row['mean_sd'] is used as the mean and element 1 as the standard
    deviation, and the result is (annual_HVAR - mean) / standard deviation.
    """
    value = row['annual_HVAR']
    stats = row['mean_sd']
    return (value - stats[0]) / stats[1]

In [18]:
# Standardize each review's annual_HVAR against the (mean, sd) pair stored on its row
df['group_z'] = df.loc[:, ['annual_HVAR', 'mean_sd']].apply(z_calc, axis='columns')

In [19]:
# Positions of any NaN z-scores (an empty array means none)
np.where(np.isnan(df['group_z'].values))

(array([], dtype=int64),)

In [20]:
# Bucket the scaled values into two ordinal-encoded bins using the 'kmeans' binning strategy
from sklearn.preprocessing import KBinsDiscretizer

est2 = KBinsDiscretizer(
    strategy='kmeans',
    encode='ordinal',
    n_bins=2,
)

In [21]:
# Fit the discretizer on the z-scores and store each review's bin as its class label
z_scores = df[['group_z']]
df['group_z_class'] = est2.fit_transform(z_scores)

print('z-score normalization labeling task complete')

z-score normalization labeling task complete


In [22]:
# Class balance of the k-means-derived label
df.group_z_class.value_counts()

0.0    281075
1.0     32581
Name: group_z_class, dtype: int64

In [23]:
# Class balance of the top-quartile label, for comparison
df.most_helpful.value_counts()

0    226650
1     87006
Name: most_helpful, dtype: int64

In [25]:
# Report the size of the test split and the share of reviews in class 1 of
# group_z_class. The statistic is computed from group_z_class, not from the
# separately retained most_helpful column, so the message names that column.
# The sum is cast to int because the discretizer output is float and would
# otherwise print with a trailing ".0".
n_positive = int(df.group_z_class.sum())
print(f"There are {df.shape[0]} reviews in our test split")
print("{} are labeled 1".format(n_positive))
print(f"{n_positive/df.shape[0]:.2%} of reviews in the test set carry the group_z_class label of 1")

There are 313656 reviews in our test split
32581.0 are labeled 1
10.39% of reviews in the test set carry the most_helpful label of 1


In [1]:
# Diagnostic: show the kernel's current working directory, which the relative
# data paths used in this notebook are resolved against.
import os
os.getcwd()

'C:\\Users\\Brad\\Desktop\\Keras - GPU'

In [19]:
# Persist the labeled test set as CSV, leaving out the DataFrame index
df.to_csv(path_or_buf='w266_proj/data/labeled_test_set_clust.csv', index=False)

In [20]:
# Diagnostic: print the working directory via the shell
!pwd

/home/jend/fp/RealRelevantReviews
