In [102]:
import pandas as pd
import numpy as np
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

In [48]:
# Directory containing the raw JSON input files. It must end with '/' because
# the loading cell builds file names by plain string concatenation (path + name).
# NOTE(review): absolute, machine-specific location — adjust before running elsewhere.
path = '/Users/booranium/usf/622_dataviz/project/'

In [23]:
# functions from Prof. McCauley for reading JSON files 
def parse(path):
    """Lazily yield one Python object per non-blank line of the file at ``path``.

    The file is opened in binary mode and every line is evaluated as a Python
    expression. Whitespace-only lines (for example a trailing blank line) are
    skipped instead of raising ``SyntaxError``. The file handle is closed once
    the generator is exhausted or garbage-collected.

    Parameters
    ----------
    path : str
        Location of the line-delimited input file.

    Yields
    ------
    object
        The value each line evaluates to.
    """
    # NOTE(security): eval() executes whatever code the file contains. Only run
    # this on trusted input; presumably eval is used because the files are not
    # strict JSON — confirm before swapping in json.loads / ast.literal_eval.
    with open(path, 'rb') as g:
        for l in g:
            if not l.strip():
                continue
            yield eval(l)

def getDF(path):
    """Load the line-delimited file at ``path`` into a DataFrame.

    Every object produced by ``parse(path)`` becomes one row, labelled with its
    0-based position in the file.
    """
    # enumerate() supplies the running row number as the dictionary key.
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

In [53]:
# Load both input files (reviews first, then product metadata) from `path`.
reviews, metadata = (
    getDF(path + file_name)
    for file_name in ('reviews_Digital_Music_5.json', 'meta_Digital_Music.json')
)

# Data Prep

## Clean up Metadata

In [56]:
# Each `categories` entry is a list of lists: flatten it and join the names
# into a single ', '-separated string.
# NOTE(review): a category name that itself contains a comma will be split
# apart by the comma-based splitting done in later cells.
metadata['categories'] = metadata['categories'].apply(
    lambda nested: ', '.join(val for sublist in nested for val in sublist)
)

In [57]:
def _nth_field(string, position):
    """Return the stripped comma-separated field at 0-based ``position``, or None.

    ``None`` is returned when ``string`` has fewer than ``position + 1`` fields.
    """
    fields = string.split(',')
    # Explicit bounds check instead of a bare ``except`` so that unrelated
    # errors are never silently swallowed.
    if position < len(fields):
        return fields[position].strip()
    return None


def try_split_2(string):
    """Return the 2nd comma-separated field of ``string`` (stripped), or None if absent."""
    return _nth_field(string, 1)


def try_split_3(string):
    """Return the 3rd comma-separated field of ``string`` (stripped), or None if absent."""
    return _nth_field(string, 2)


def try_split_4(string):
    """Return the 4th comma-separated field of ``string`` (stripped), or None if absent."""
    return _nth_field(string, 3)


def try_split_5(string):
    """Return the 5th comma-separated field of ``string`` (stripped), or None if absent."""
    return _nth_field(string, 4)


def try_split_6(string):
    """Return the 6th comma-separated field of ``string`` (stripped), or None if absent."""
    return _nth_field(string, 5)


def try_split_7(string):
    """Return the 7th comma-separated field of ``string`` (stripped), or None if absent."""
    return _nth_field(string, 6)

In [59]:
# Split the joined category string into one column per position.
# cat1 is the raw first field; cat2..cat7 come from the try_split_N helpers.
metadata['cat1'] = metadata['categories'].apply(lambda joined: joined.split(',')[0])
for position, splitter in enumerate(
    (try_split_2, try_split_3, try_split_4, try_split_5, try_split_6, try_split_7),
    start=2,
):
    metadata['cat' + str(position)] = metadata['categories'].apply(splitter)

In [61]:
# List the distinct first-level categories (cat1). Note that some of them are
# not music-related products; they are filtered out in a later cell.
metadata.cat1.unique()

array(['CDs & Vinyl', 'Digital Music', 'Beauty', 'Arts', 'Baby Products'],
      dtype=object)

In [69]:
# Keep only products whose first-level category is one of the music categories.
music_categories = ['CDs & Vinyl', 'Digital Music']
metadata = metadata[metadata['cat1'].isin(music_categories)]

In [79]:
# Keep only the reviews whose product (asin) survived the metadata filter:
# merge against a one-column frame of the distinct remaining asin values.
music_asins = pd.DataFrame({'asin': metadata.asin.unique()})
reviews = reviews.merge(music_asins)

## Feature Engineering on Reviews

Remove reviews whose text is missing or empty

In [195]:
# Drop reviews that have no text. Indexing a DataFrame with a frame-shaped
# boolean mask (reviews[pd.isnull(reviews) == False]) only masks individual
# cells and never removes rows, so it left the data unchanged; dropna removes
# the rows themselves.
# NOTE(review): only reviewText is checked — extend `subset` if other columns
# must be non-null as well.
reviews = reviews.dropna(subset=['reviewText'])

In [193]:
# Discard reviews whose text is the empty string.
reviews = reviews[reviews['reviewText'] != '']

### Add review year

In [179]:
# The year is the piece of reviewTime that follows the comma (presumably the
# value looks like 'MM DD, YYYY' — confirm). Strip it so the stored year does
# not carry the whitespace that follows the comma.
reviews['reviewYear'] = reviews.reviewTime.apply(lambda x: x.split(',')[1].strip())

### Add review sentiment

VADER compound score, computed for the review text and for the summary

In [100]:
# One shared VADER analyzer instance, reused by the sentiment cells below.
analyser = SentimentIntensityAnalyzer()

In [101]:
# VADER 'compound' score of each review's full text.
reviews['sent_vader'] = reviews['reviewText'].map(
    lambda text: analyser.polarity_scores(text)['compound']
)

In [146]:
# VADER 'compound' score of each review's summary line.
reviews['sent_vader_summary'] = reviews['summary'].map(
    lambda text: analyser.polarity_scores(text)['compound']
)

TextBlob - using PatternAnalyzer from Pattern library

In [104]:
# TextBlob polarity (first element of the sentiment tuple) of each review's text.
reviews['sent_blob'] = reviews['reviewText'].map(
    lambda text: TextBlob(text).sentiment.polarity
)

In [147]:
# TextBlob polarity (first element of the sentiment tuple) of each review's summary.
reviews['sent_blob_summary'] = reviews['summary'].map(
    lambda text: TextBlob(text).sentiment.polarity
)

Average the four sentiment scores (VADER and TextBlob, each on review text and summary)

In [157]:
# Row-wise mean of the four sentiment scores. DataFrame.mean(axis=1) is the
# vectorised equivalent of applying np.mean to every row one at a time.
sentiment_columns = ['sent_vader', 'sent_blob', 'sent_vader_summary', 'sent_blob_summary']
reviews['sent_mean'] = reviews[sentiment_columns].mean(axis=1)

In [159]:
def get_sign(x):
    """Return 1 when ``x`` is greater than or equal to zero, otherwise 0.

    Any value for which ``x >= 0`` is false (negative numbers, NaN) maps to 0.
    """
    return 1 if x >= 0 else 0

In [160]:
# Binary sentiment flag: 1 for a non-negative mean score, 0 otherwise.
reviews['sent_sign'] = reviews['sent_mean'].map(get_sign)

### Add review helpfulness score

In [163]:
# Expose positions 0 and 1 of each `helpful` entry as their own columns.
for column, position in (('helpful_1', 0), ('helpful_2', 1)):
    reviews[column] = reviews.helpful.apply(
        lambda votes, position=position: votes[position]
    )

In [174]:
def get_prop(x):
    """Return ``x[0] / x[1]``, or 0 when ``x[1]`` equals zero.

    ``x`` is any object indexable at positions 0 and 1.
    """
    # The zero check guards the division against a zero denominator.
    return 0 if x[1] == 0 else x[0] / x[1]

In [177]:
# Proportion of helpful votes per review (0 when position 1 of `helpful` is 0).
# get_prop indexes its argument with the integers 0 and 1; applying it to the
# original `helpful` entries (the source of helpful_1 / helpful_2) avoids
# integer-key lookups on a label-indexed row Series, which pandas has
# deprecated, and avoids a slow row-wise DataFrame.apply.
reviews['helpful_prop'] = reviews.helpful.apply(get_prop)

Write the cleaned review data to a pipe-delimited file for analysis in R

In [207]:
# Export only the columns needed downstream, as a '|'-separated file.
reviews.to_csv(
    '../data/reviews_clean',
    sep='|',
    columns=['reviewText', 'sent_sign', 'helpful_prop', 'overall', 'reviewYear'],
)

In [222]:
# Preview the first rows of three columns under reader-friendly names.
(
    reviews
    .head()[['reviewText', 'overall', 'reviewYear']]
    .rename(columns={
        'reviewText': 'review_text',
        'overall': 'product_rating',
        'reviewYear': 'review_year',
    })
)

Unnamed: 0,review_text,product_rating,review_year
0,"It's hard to believe ""Memory of Trees"" came ou...",5.0,2006
1,"A clasically-styled and introverted album, Mem...",5.0,2001
2,I never thought Enya would reach the sublime h...,5.0,2003
3,This is the third review of an irish album I w...,5.0,2000
4,"Enya, despite being a successful recording art...",4.0,2008
