In [1]:
import pandas as pd
import numpy as np 
import collections
import re

import spacy
from spacy import displacy
import en_core_web_sm

import warnings
warnings.filterwarnings('ignore')

In [2]:
# functions from McCauley to read Amazon data
# needed for metadata files which have funny json formatting
def parse(path):
    g = open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

In [4]:
path = '/Users/booranium/practicum/amazon/data/'

### Load Amazon Data

In [4]:
# load health and groceries data 
amz_data_health = getDF(path + 'reviews_Health_and_Personal_Care.json')
amz_meta_health = getDF(path + 'meta_Health_and_Personal_Care.json')
amz_data_groc = getDF(path + 'reviews_Grocery_and_Gourmet_Food.json')
amz_meta_groc = getDF(path + 'meta_Grocery_and_Gourmet_Food.json')

In [1]:
data.head()

NameError: name 'data' is not defined

Notes on data:
* More ASINs in metadata than in data; this doesn't matter because...
* All ASINs in data have corresponding metadata record 
* 1 duplicate ASIN: 'B00A2AWKZG'

In [35]:
# check 'categories' variable in health data 
health_cats = amz_meta_health.categories.values.tolist()
flat_health_cats = [val for sublist in health_cats for val in sublist]
flatter_health_cats = [val for sublist in flat_health_cats for val in sublist]
unique_health_cats = list(set(flatter_health_cats))
len(unique_health_cats)

1490

In [34]:
# check 'categories' variable in grocery data 
groc_cats = amz_meta_groc.categories.values.tolist()
flat_groc_cats = [val for sublist in groc_cats for val in sublist]
flatter_groc_cats = [val for sublist in flat_groc_cats for val in sublist]
unique_groc_cats = list(set(flatter_groc_cats))
len(unique_groc_cats)

537

In [242]:
unique_health_cats[:5]

['Passions',
 'Vitamins',
 'Paper & Plastic',
 'Piercing Supplies',
 'Poison Ivy Relief']

In [287]:
# filter metadata to 'categories' of interest 
# currently hardcodings matches; can later do a match on all terms listed on examine.com 
patterns_list = ['vitamin', 'supplement', 'mineral', 'complex',\
                 'iron', 'calcium', 'omega', 'herb', 'creatine',\
                 'enzyme', 'spiru', 'carnit', 'green', 'curc',\
                 'turme', 'fish oil', 'ashw'] 
patterns = re.compile('|'.join(patterns_list))
amz_meta_health.categories_lower.str.contains(patterns, regex = True, \
                                              flags = re.IGNORECASE, na = False)

In [306]:
amz_meta_health['cat_list'] = amz_meta_health['categories'].apply(lambda x: [val for sublist in x for val in sublist])
amz_meta_health['categories_clean'] = amz_meta_health['cat_list'].apply(lambda x: ', '.join(x))
amz_meta_health['categories_lower'] = amz_meta_health['categories_clean'].apply(lambda x: x.lower())
# lower manually as flags = re.IGNORECASE cannot be processed with a compiled pattern
amz_meta_health.drop(['categories', 'cat_list'], axis = 1, inplace = True)
amz_meta_health[:2]

Unnamed: 0,asin,description,title,imUrl,related,salesRank,price,brand,categories_clean,categories_lower
0,77614992,This is an example product description.,Principles of Mgmt + Oper-CSUF Custom C,http://ecx.images-amazon.com/images/I/51G%2BRq...,"{u'also_bought': [u'0471730726', u'0132834871'...",{u'Health & Personal Care': 168429},,,Health & Personal Care,health & personal care
1,615208479,By now we all know the benefits of exercise fo...,Brain Fitness Exercises Software,http://ecx.images-amazon.com/images/I/41kbZB04...,,{u'Health & Personal Care': 1346973},,,"Health & Personal Care, Personal Care","health & personal care, personal care"


In [304]:
amz_meta_small = amz_meta_health[amz_meta_health.categories_lower.str.contains(patterns, regex = True,  na = False)]
len(amz_meta_small)

49761

In [307]:
# Proportion of ASINs retained:
print("%.2f" % (len(amz_meta_small)*1.0/len(amz_meta_health)*1.0))

0.19


In [308]:
# check for duplicates that would affect table joins:
a = amz_meta_small.asin.tolist()
print [item for item, count in collections.Counter(a).items() if count > 1]

[]


In [309]:
data_all = amz_data_health.merge(amz_meta_small, how='inner', on='asin')
len(data_all)

585678

In [310]:
# Proportion of reviews retained:
print("%.2f" % (len(data_all)*1.0/len(amz_data_health)*1.0))

0.20


In [313]:
data_all[:2]

Unnamed: 0,reviewerID,asin,reviewerName,helpful,unixReviewTime,reviewText,overall,reviewTime,summary,description,title,imUrl,related,salesRank,price,brand,categories_clean,categories_lower
0,AY5NWX45GCYUE,929619730,seasalt,"[0, 0]",1401494400,B-flax-D is a regular at our house. It does it...,5.0,"05 31, 2014",Dpes the job well,Contains Organic Cold-Milled Flaxseed\nValuabl...,New Generation B-Flax-D,http://ecx.images-amazon.com/images/I/41YpSZ%2...,"{u'also_bought': [u'B000VUJY6A', u'B000VULNBO'...",{u'Health & Personal Care': 199107},24.95,New Generation B-Flax-D,"Health & Personal Care, Vitamins & Dietary Sup...","health & personal care, vitamins & dietary sup..."
1,A36Y3ZIL9416N8,978559088,Adam P. Simer,"[1, 1]",1266624000,Studies show that Resveratrol is poorly absorb...,4.0,"02 20, 2010","Fast shipping, good communication",Everyone knows that resveratrol is an amazing ...,Nutrihill Resveratrol Lozenges,http://ecx.images-amazon.com/images/I/31znLKmN...,,{u'Health & Personal Care': 405706},,,"Health & Personal Care, Vitamins & Dietary Sup...","health & personal care, vitamins & dietary sup..."


In [314]:
# keep only those columns related to reviews and star ratings 
data = data_all[['asin', 'helpful', 'reviewText', 'overall', 'summary', 'description', 'title', 'categories_clean']]

In [315]:
# filter out pet/vet supplies! 
len(data.title.unique())

48315

In [316]:
data.columns[data.isnull().any()].tolist()

['description', 'title']

In [317]:
# Prop. data with missing description or title 
print ("%.2f" % (len(data[pd.isnull(data).any(axis=1)])*1.0/len(data)*1.0))

0.02


In [318]:
# Fill na with 'none'
data.fillna('none', inplace = True)

In [320]:
pet_patterns_list = ['Canine', 'Feline', 'Pet ', 'Cat', 'Dog']
# use capitalized version as 'pet' is in many not-pet related titles
pet_pat = re.compile('|'.join(pet_patterns_list))
data_keep = data[data.title.str.match(pet_pat) == False]

In [322]:
# Number pet-related records removed
len(data) - len(data_keep)

234

In [324]:
data_keep.to_csv('data_clean', sep='|', index = False)

### Spacy NLP

In [1]:
text = u'We got this GPS for my husband who is an (OTR) over the road trucker.  Very Impressed with the shipping time, it arrived a few days earlier than expected...  within a week of use however it started freezing up... could of just been a glitch in that unit.  Worked great when it worked!  Will work great for the normal person as well but does have the "trucker" option. (the big truck routes - tells you when a scale is coming up ect...)  Love the bigger screen, the ease of use, the ease of putting addresses into memory.  Nothing really bad to say about the unit with the exception of it freezing which is probably one in a million and that\'s just my luck.  I contacted the seller and within minutes of my email I received a email back with instructions for an exchange! VERY impressed all the way around!'

In [29]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)

(u'We', u'-PRON-', u'PRON', u'PRP', u'nsubj', u'Xx', True, False)
(u'got', u'get', u'VERB', u'VBD', u'ROOT', u'xxx', True, False)
(u'this', u'this', u'DET', u'DT', u'det', u'xxxx', True, True)
(u'GPS', u'gps', u'NOUN', u'NN', u'dobj', u'XXX', True, False)
(u'for', u'for', u'ADP', u'IN', u'prep', u'xxx', True, True)
(u'my', u'-PRON-', u'ADJ', u'PRP$', u'poss', u'xx', True, True)
(u'husband', u'husband', u'NOUN', u'NN', u'pobj', u'xxxx', True, False)
(u'who', u'who', u'NOUN', u'WP', u'nsubj', u'xxx', True, True)
(u'is', u'be', u'VERB', u'VBZ', u'relcl', u'xx', True, True)
(u'an', u'an', u'DET', u'DT', u'det', u'xx', True, True)
(u'(', u'(', u'PUNCT', u'-LRB-', u'punct', u'(', False, False)
(u'OTR', u'otr', u'PROPN', u'NNP', u'attr', u'XXX', True, False)
(u')', u')', u'PUNCT', u'-RRB-', u'punct', u')', False, False)
(u'over', u'over', u'ADP', u'IN', u'prep', u'xxxx', True, True)
(u'the', u'the', u'DET', u'DT', u'det', u'xxx', True, True)
(u'road', u'road', u'NOUN', u'NN', u'compound', u'x

In [None]:
nlp = spacy.load('en')
doc = nlp(text)
displacy.serve(doc, style = 'dep')
#displacy.render(doc, style='dep', jupyter = True)


[93m    Serving on port 5000...[0m
    Using the 'dep' visualizer



127.0.0.1 - - [12/Apr/2018 17:01:22] "GET / HTTP/1.1" 200 113098
127.0.0.1 - - [12/Apr/2018 17:01:22] "GET /favicon.ico HTTP/1.1" 200 113098
