In [1]:
import datetime
import json
import numpy as np
import pandas as pd
# Show up to 500 characters per cell when frames are displayed, so review text is readable
pd.set_option('max_colwidth', 500)

In [2]:
# Per-book table; its columns (seen in the preview below) are
# book_num_reviews, std_HVAR and top_quartile_HVAR.
book_cutoffs = "w266_proj/data/booklist.csv" 
# Index by ASIN so the table can be joined onto the review frame with on='asin'
cutoffs = pd.read_csv(book_cutoffs,index_col='asin')

In [3]:
# Preview the first rows of the per-book cutoff table
cutoffs.head()

Unnamed: 0_level_0,book_num_reviews,std_HVAR,top_quartile_HVAR
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
000100039X,86,2.930287,0.842702
0001055178,12,0.332911,0.465856
0001712772,4,0.635544,1.148897
0001714538,4,0.099335,0.171246
0002005395,7,0.287376,0.782166


In [4]:
# Summary statistics for the numeric per-book columns
cutoffs.describe()

Unnamed: 0,book_num_reviews,std_HVAR,top_quartile_HVAR
count,197298.0,197298.0,197298.0
mean,13.480745,1.777825,1.685347
std,28.154276,3.280041,2.001595
min,4.0,1.199178e-16,0.0
25%,5.0,0.4637533,0.70574
50%,7.0,0.9378294,1.154282
75%,13.0,1.950738,1.987549
max,2472.0,378.6433,128.641696


In [5]:
# Dev split of the reviews; lines=True reads the file as one JSON record per line
dataset = "w266_proj/data/dev.json"
df = pd.read_json(dataset,lines=True)

print('finished reading in data')

finished reading in data


In [6]:
# Peel out count of helpful votes into its own column.
# Element 0 of each review's `helpful` entry is taken as the vote count.
# A plain comprehension over the column avoids DataFrame.apply(axis=1), which
# builds a Series object for every row and is needlessly slow on a large frame.
df['helpful_votes'] = [votes[0] for votes in df['helpful']]
# To avoid confusion later, drop the 'helpful' column at this point
# (reassign instead of inplace=True so the step reads as an explicit transformation)
df = df.drop('helpful', axis=1)
# Convert reviewTime to datetime type info we can work with
df['reviewTime'] = pd.to_datetime(df['reviewTime'], infer_datetime_format=True)

print('finished creating helpful_votes column')

finished creating helpful_votes column


In [7]:
# Row count before any filtering is applied
print("Our dataset initially contains {} reviews".format(len(df)))

Our dataset initially contains 476793 reviews


In [8]:
# The most recent review date in this dataset snapshot is 2014-07-23
snapshotted_string = "20140723"
snapshotted = datetime.datetime.strptime(snapshotted_string, '%Y%m%d')

# Reviews less than one year (365 days) old are DROPPED:
# they may not have accumulated enough evidence to support being helpful or not
ourcutoff = snapshotted - datetime.timedelta(days=365)
old_enough = df['reviewTime'] < ourcutoff
df = df[old_enough]

print("After removing reviews less than one year old, our dataset contains {} reviews".format(len(df)))

After removing reviews less than one year old, our dataset contains 361268 reviews


In [9]:
# Here we store the age of each review in days, measured back from the snapshot date.
# Subtracting the whole datetime column at once yields a timedelta Series whose
# .dt.days gives the same whole-day counts as the former per-row apply, without
# constructing a Series for every row.
df['review_age_days'] = (snapshotted - df['reviewTime']).dt.days

print('task complete')

task complete


In [10]:
# We create the annual HVAR score: helpful votes divided by review age in days, scaled to 365 days.
# Column arithmetic replaces the per-row apply and produces the same float values.
# The earlier filter keeps only reviews dated before (snapshot - 365 days),
# so review_age_days is always positive here.
df['annual_HVAR'] = 365 * df['helpful_votes'] / df['review_age_days']

print('task complete')

task complete


In [11]:
# Attach the by-book cutoff columns to every dev review.
# This is a left join on 'asin' against the ASIN index of `cutoffs`: reviews whose
# book is absent from `cutoffs` are kept, with NaN in the joined columns.
df = df.join(cutoffs, on='asin', how='left')

In [12]:
# Number of distinct books (ASINs) among the remaining dev reviews
df['asin'].nunique()

154891

In [13]:
# Reviews whose ASIN had no row in `cutoffs` carry NaN in book_num_reviews after the join; count them
print(f"{df.book_num_reviews.isna().sum()} reviews in the dev set get dropped for books having inadequate review counts/variance")

47156 reviews in the dev set get dropped for books having inadequate review counts/variance


In [14]:
# Keep only reviews that matched a book in the cutoff table
# (rows with a missing book_num_reviews are discarded)
df = df.dropna(subset=['book_num_reviews'])
print(f"After removing off-limits books we have {df.shape[0]} dev reviews")

After removing off-limits books we have 314112 dev reviews


In [20]:
# Only do this step if trying to gather hvar lists for train/dev/test sets
# Gather and pickle hvar lists per book

#dev_hvar_list = pd.DataFrame(df.groupby('asin')['annual_HVAR'].apply(list))

In [16]:
#dev_hvar_list.head()

NameError: name 'dev_hvar_list' is not defined

In [22]:
#dev_hvar_list.to_pickle('dev_hvar_list.pkl')

In [15]:
# Retain the most_helpful column for easy model comparisons using the same data set

# Label reviews whose annual_HVAR is strictly above their book's top_quartile_HVAR
# as 'most_helpful' = 1 (otherwise 0).
# The comparison is done on whole columns instead of a per-row apply; the explicit
# int64 cast keeps the 0/1 dtype identical on every platform.
df['most_helpful'] = (df['annual_HVAR'] > df['top_quartile_HVAR']).astype('int64')

In [18]:
# Inspect the first rows, now including the joined cutoff columns and the most_helpful label
df.head()

Unnamed: 0,asin,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,helpful_votes,review_age_days,annual_HVAR,book_num_reviews,std_HVAR,top_quartile_HVAR,most_helpful
0,000100039X,5,"I would have to say that this is the best book I've ever read.. I could feel every word deep in my heart everytime, of the many times I've read it! I would never get enough of it! its a treasure..",2001-02-24,A26GKZPS079GFF,Areej,Touches my heart.. again and.. again...,982972800,2,4897,0.149071,86.0,2.930287,0.842702,0
1,000100039X,5,"This is Gibran's most celebrated work and it is so with goodreason. It is a profound insight and expose into the human characterand the emotions that work within the heart and soul. Humble words,seared with the heights and depths of love and of pain, this is a life-changing arrangement of words. If I had to sum the book up in one word, it would be &quot;SUBLIME.&quot; If you're picking up this title, be sure to order Gibran's 'The Beloved' to go along with it. These are books that deserve...",2000-05-03,A15ACUAJEJXCS3,Caz,Superb,957312000,1,5194,0.070273,86.0,2.930287,0.842702,0
2,000100039X,5,"Gibran Khalil Gibran was born in 1883 in what is now Northern Lebanon. In 1909 he went to Paris to study, but he did not like the strict education, and so he traveled, eventually moving to New York. Gibran became both an artist and a writer, and in 1923 he published ""The Prophet"", which is generally considered to be his greatest work. He died of cancer in a New York hospital at the very young age of 48.The Prophet is a story about Almustafa (The Prophet) who after living 12 years in Orpha...",2006-01-10,AWLFVCT9128JV,"Dave_42 ""Dave_42""",The Lessons Of Life,1136851200,8,3116,0.937099,86.0,2.930287,0.842702,1
3,000100039X,5,"_The Prophet_ is a short read (my copy checks in at just under 100 pages), but its berevity belies both the power and beauty of Gibran's words. At its simplest, it is a discourse on the human condition: love, work, joy and sorrow, crime and punishment, reason and passion, Gibran runs the gamut of emotion and being, laying bare the paradox of who we are as human beings. While the tone is somewhat mystical (which I didn't really care for), the sheer poetic beauty of his writing moved me.For ...",2012-08-12,A2NHD7LUXVGTD3,doc peterson,a beautiful poetic commentary on what it is to be human,1344729600,1,710,0.514085,86.0,2.930287,0.842702,0
4,000100039X,5,"The Prophet, for me, is a very vivid yet dense book. It speaks some sort of wisdom, and I delight in that wisdom when I can understand it.The illustrations are done by the author himself, which is nice.",2007-11-29,AAEP8YFERQ8FC,General Breadbasket,Speak to Us of the Prophet,1196294400,1,2428,0.150329,86.0,2.930287,0.842702,0


In [16]:
# Load global per-book min-max values
import pickle

# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that were produced by this project.
# The context manager closes the file even if unpickling raises, which the
# previous explicit open()/close() pair did not guarantee.
with open('global_min_max.pkl','rb') as global_min_max_pkl:
    global_min_max = pickle.load(global_min_max_pkl)

In [17]:
# Join on asin
# NOTE(review): a later cell defines a function that is also named `global_min_max`,
# which rebinds this name. Re-running this cell after that definition would pass the
# function (not the loaded table) to join; run the notebook top to bottom.
# Presumably the loaded table supplies the `min_max` and `mean_sd` columns used below -- verify.
df = df.join(global_min_max, on='asin')

In [18]:
# check for missing min_max values after the join
# (a non-zero count would mean some reviews' books had no entry in the joined table)
df.min_max.isnull().sum()

0

In [19]:
# Min max scale individual reviews using the global min-max values in the min_max column

def global_min_max(row):
    """Min-max scale one review's annual_HVAR with its book's (min, max) pair.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or dict)
        Must provide ``row['annual_HVAR']`` (the value to scale) and
        ``row['min_max']``, an indexable pair whose element 0 is the minimum
        and element 1 is the maximum.

    Returns
    -------
    float or None
        ``(x - minimum) / (maximum - minimum)``. ``None`` is returned (after
        printing the offending row) when ``row['min_max']`` cannot be indexed,
        for example when it is a NaN float rather than a pair.

    Raises
    ------
    ZeroDivisionError
        When minimum equals maximum, so the scaling range is zero.
    """
    x = row['annual_HVAR']
    bounds = row['min_max']

    try:
        minimum = bounds[0]
    except TypeError:
        print('type error on min calc: {}'.format(row))
        # Bail out here: continuing would leave `minimum` unbound and could
        # surface later as an UnboundLocalError instead of a clean None result.
        return None

    try:
        maximum = bounds[1]
    except TypeError:
        print('type error on max calc: {}'.format(row))
        return None

    span = maximum - minimum
    # Test the range explicitly instead of relying on `except ZeroDivisionError`:
    # numpy floats do not raise on division by zero (they yield nan/inf with a
    # warning), so a zero-range book would otherwise slip through silently.
    if span == 0:
        print('ZeroDivisionError on scaling calc for: {}'.format(row))
        raise ZeroDivisionError('Evaluate the error and try to catch it or remove the offending data')

    return (x - minimum) / span

In [20]:
# Apply the scaler row by row (axis=1), passing only the two columns it reads
df['scaled'] = df[['annual_HVAR', 'min_max']].apply(global_min_max, axis=1)

In [21]:
# label by within-book z-score normalization (standardization)

#def z_calc(x):
    #return (x - np.mean(x))/np.std(x, ddof=0)
    
def z_calc(row):
    """Standardize one review's annual_HVAR with its book's (mean, std) pair.

    ``row['mean_sd']`` is indexed as (mean, standard deviation); the result is
    ``(annual_HVAR - mean) / std``.
    """
    value = row['annual_HVAR']
    stats = row['mean_sd']
    centre, spread = stats[0], stats[1]
    return (value - centre) / spread

In [22]:
# Compute the within-book z-score row by row (axis=1), passing only the two columns z_calc reads
df['group_z'] = df[['annual_HVAR','mean_sd']].apply(z_calc, axis=1)

In [32]:
# Inspect the first rows with the newly added scaling/normalization columns
df.head()

Unnamed: 0,asin,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,helpful_votes,review_age_days,annual_HVAR,book_num_reviews,std_HVAR,top_quartile_HVAR,most_helpful,min_max,mean_sd,scaled,class_2,group_z
0,000100039X,5,"I would have to say that this is the best book I've ever read.. I could feel every word deep in my heart everytime, of the many times I've read it! I would never get enough of it! its a treasure..",2001-02-24,A26GKZPS079GFF,Areej,Touches my heart.. again and.. again...,982972800,2,4897,0.149071,86.0,2.930287,0.842702,0,"(0.0, 24.302026375040207)","(1.0470776160076654, 2.7821006685613363)",0.006134,0.0,-0.32278
1,000100039X,5,"This is Gibran's most celebrated work and it is so with goodreason. It is a profound insight and expose into the human characterand the emotions that work within the heart and soul. Humble words,seared with the heights and depths of love and of pain, this is a life-changing arrangement of words. If I had to sum the book up in one word, it would be &quot;SUBLIME.&quot; If you're picking up this title, be sure to order Gibran's 'The Beloved' to go along with it. These are books that deserve...",2000-05-03,A15ACUAJEJXCS3,Caz,Superb,957312000,1,5194,0.070273,86.0,2.930287,0.842702,0,"(0.0, 24.302026375040207)","(1.0470776160076654, 2.7821006685613363)",0.002892,0.0,-0.351103
2,000100039X,5,"Gibran Khalil Gibran was born in 1883 in what is now Northern Lebanon. In 1909 he went to Paris to study, but he did not like the strict education, and so he traveled, eventually moving to New York. Gibran became both an artist and a writer, and in 1923 he published ""The Prophet"", which is generally considered to be his greatest work. He died of cancer in a New York hospital at the very young age of 48.The Prophet is a story about Almustafa (The Prophet) who after living 12 years in Orpha...",2006-01-10,AWLFVCT9128JV,"Dave_42 ""Dave_42""",The Lessons Of Life,1136851200,8,3116,0.937099,86.0,2.930287,0.842702,1,"(0.0, 24.302026375040207)","(1.0470776160076654, 2.7821006685613363)",0.038561,0.0,-0.039531
3,000100039X,5,"_The Prophet_ is a short read (my copy checks in at just under 100 pages), but its berevity belies both the power and beauty of Gibran's words. At its simplest, it is a discourse on the human condition: love, work, joy and sorrow, crime and punishment, reason and passion, Gibran runs the gamut of emotion and being, laying bare the paradox of who we are as human beings. While the tone is somewhat mystical (which I didn't really care for), the sheer poetic beauty of his writing moved me.For ...",2012-08-12,A2NHD7LUXVGTD3,doc peterson,a beautiful poetic commentary on what it is to be human,1344729600,1,710,0.514085,86.0,2.930287,0.842702,0,"(0.0, 24.302026375040207)","(1.0470776160076654, 2.7821006685613363)",0.021154,0.0,-0.191579
4,000100039X,5,"The Prophet, for me, is a very vivid yet dense book. It speaks some sort of wisdom, and I delight in that wisdom when I can understand it.The illustrations are done by the author himself, which is nice.",2007-11-29,AAEP8YFERQ8FC,General Breadbasket,Speak to Us of the Prophet,1196294400,1,2428,0.150329,86.0,2.930287,0.842702,0,"(0.0, 24.302026375040207)","(1.0470776160076654, 2.7821006685613363)",0.006186,0.0,-0.322328


In [23]:
# Positions of rows whose scaled value is NaN; an empty array means none were found
np.where(np.isnan(df['scaled']))

(array([], dtype=int64),)

In [24]:
# Count rows whose scaled value came out as NaN (0 means the scaling had no gaps)
nan_mask = np.isnan(df['scaled'])
errors = np.where(nan_mask)
print(errors[0].size)

0


In [25]:
# Use kmeans clustering to bucket scaled values
from sklearn.preprocessing import KBinsDiscretizer

# Two bins, encoded as ordinal bin indices (0.0 / 1.0 in the outputs below);
# strategy='kmeans' derives the bin edge from 1-D k-means clustering of the data.
# This single estimator object is reused (and refit) for each column binned below.
est = KBinsDiscretizer(n_bins=2, encode='ordinal', strategy='kmeans')

In [26]:
# Fit the discretizer on the min-max scaled scores and store each review's bin index
scaled_bins = est.fit_transform(df[['scaled']])
df['class_2'] = scaled_bins

print('min-max labeling task complete')

min-max labeling task complete


In [27]:
# Refit the same discretizer on the within-book z-scores and store each review's bin index.
# After this cell `est` holds the bin edges learned from group_z.
z_bins = est.fit_transform(df[['group_z']])
df['group_z_class'] = z_bins

print('z-score normalization labeling task complete')

z-score normalization labeling task complete


In [28]:
# Class balance of the labels derived from min-max scaled scores
df['class_2'].value_counts()

0.0    271071
1.0     43041
Name: class_2, dtype: int64

In [29]:
# Class balance of the labels derived from within-book z-scores
df['group_z_class'].value_counts()

0.0    281325
1.0     32787
Name: group_z_class, dtype: int64

In [30]:
# Write it out to a file
# - This modified dev set contains new label column generated by kmeans clustering min-max scaled annual_HVAR values
# - index=False omits the row index from the CSV
# - NOTE(review): the tuple-valued columns (min_max, mean_sd) will presumably be written as
#   their string form and would need parsing when the file is read back -- confirm downstream use
df.to_csv('w266_proj/data/labeled_dev_set_clust_FINAL.csv',index=False)