In [1]:
# Import all the libraries
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn', Mutes warnings when copying a slice from a DataFrame.
pd.set_option('display.max_columns', None)
import glob
import datetime
import os
import json
from io import StringIO
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Notebook-wide settings (tune here, not inline in later cells)
DATA_FOLDER = "data/"       # path prefix of the input files; keeps its trailing slash
N_REVIEW_THRESHOLD = 20     # review-count cutoff applied when selecting products

In [3]:
# Load data
# glob returns matches in arbitrary (filesystem-dependent) order; sort the
# paths so the rows of the loaded frame come out in the same order on every run.
files = sorted(glob.glob(DATA_FOLDER + 'part-000*'))
def load_one_file(path):
    """Parse one JSON Lines file into a list of decoded objects.

    Each non-blank line of the file is decoded independently with
    ``json.loads``.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the file to read.

    Returns
    -------
    list
        One decoded JSON value per non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(path, encoding='utf-8') as f:
        # Skip whitespace-only lines (e.g. a trailing empty line), which
        # json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]

# Flatten the per-file record lists into a single list. The nested
# comprehension is linear in the number of records, whereas sum(lists, [])
# re-copies the accumulated list once per file.
data = [record for path in files for record in load_one_file(path)]

# One row per review record
df = pd.DataFrame(data)

In [4]:
# Preview the first rows of the raw review records to check the loaded columns
df.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,B005FYPK9C,"[0, 0]",5.0,I was sketchy at first about these but once yo...,"01 8, 2013",A000008615DZQRRI946FO,mj waldon,great buy,1357603200
1,0871714779,"[0, 0]",1.0,"Because I have not gotten the book yet, I may ...","05 22, 2013",A000096625CHSNKYTYGZN,Melody L Fearn,I Have Not Got It Yet.,1369180800
2,B00E7OIOVC,"[0, 0]",5.0,Great!,"07 3, 2014",A0001528BGUBOEVR6T5U,igozingo,Five Stars,1404345600
3,B00EXOZ9W8,"[0, 0]",5.0,"My man was more then happy with it, looking fo...","07 2, 2014",A00030342K9JCQO8Q7C3L,Joseph Dvorak,Five Stars,1404259200
4,B000G2OYOU,"[1, 1]",5.0,The spirit of Selassie is with us through the ...,"11 20, 2012",A00040103SIRXWSG7KCB6,Brian DelaCruz,Jah Jah Kingstone,1353369600


In [5]:
# Create dataframe with needed information:
# one row per review, indexed by product ID, with its rating, the review date
# and the calendar month (1-12) of that date.
product_reviews = (
    pd.DataFrame({
        'productID': df['asin'],
        'rating': df['overall'],
        # reviewTime strings look like "01 8, 2013" (month day, year)
        'time': pd.to_datetime(df['reviewTime'], format='%m %d, %Y'),
    })
    # Vectorised month extraction instead of a per-element Python lambda
    .assign(month=lambda reviews: reviews['time'].dt.month)
    # Returns a new frame rather than mutating in place; productID becomes
    # the (non-unique) index and is dropped from the columns.
    .set_index('productID')
)
product_reviews.head()

Unnamed: 0_level_0,rating,time,month
productID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
B005FYPK9C,5.0,2013-01-08,1
0871714779,1.0,2013-05-22,5
B00E7OIOVC,5.0,2014-07-03,7
B00EXOZ9W8,5.0,2014-07-02,7
B000G2OYOU,5.0,2012-11-20,11


In [8]:
# Number of reviews per product (each row of product_reviews is one review)
review_count = product_reviews.index.value_counts()

# Product-by-month table used to look for periodicity:
# each cell is the mean rating the product received in that calendar month,
# and (product, month) pairs without any review are filled with 0.
# NOTE(review): a 0 therefore means "no reviews that month", not a rating of 0
# (the previewed ratings are on a 1.0-5.0 scale) -- confirm that whatever
# measures periodicity downstream treats 0 as missing.
product_month = pd.pivot_table(
    product_reviews,
    values='rating',
    index=product_reviews.index,
    columns='month',
    fill_value=0,
)

# Keep only products with strictly more reviews than the threshold.
# The boolean Series is aligned to product_month's index by .loc.
has_enough_reviews = review_count > N_REVIEW_THRESHOLD
product_month = product_month.loc[has_enough_reviews]

product_month.head()

month,1,2,3,4,5,6,7,8,9,10,11,12
productID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
006202406X,3.5,3.0,3.2,4.0,0.0,3.333333,0.0,0.0,0.0,3.0,3.75,2.0
006228648X,3.0,0.0,1.0,2.333333,4.5,1.5,0.0,0.0,0.0,2.5,3.166667,3.0
030758836X,3.777778,4.0,2.0,3.0,3.0,3.666667,4.0,3.0,3.6,3.0,3.8,2.5
0307588378,4.666667,4.0,5.0,0.0,4.0,4.0,1.0,5.0,4.25,2.666667,2.5,3.0
038536315X,4.4,4.666667,3.0,4.5,5.0,0.0,0.0,0.0,0.0,0.0,3.5,4.0


## TODO: Automatically detect and measure the periodicity of the monthly ratings