In [1]:
import os
import json
import gzip
import pandas as pd
from urllib.request import urlopen

In [15]:
### load the meta data

# Each line of the gzipped file is one JSON document; decode them all into a list of dicts.
with gzip.open('data/meta_AMAZON_FASHION.json.gz') as meta_file:
    data = [json.loads(raw_line.strip()) for raw_line in meta_file]

# number of records read (one per line of the file)
print(len(data))

# first record of the list
print(data[0])

186637
{'title': 'Slime Time Fall Fest [With CDROM and Collector Cards and Neutron Balls, Incredi-Ball and Glow Stick Necklace, Paper Fram', 'brand': 'Group Publishing (CO)', 'feature': ['Product Dimensions:\n                    \n8.7 x 3.6 x 11.4 inches', 'Shipping Weight:\n                    \n2.4 pounds'], 'rank': '13,052,976inClothing,Shoesamp;Jewelry(', 'date': '8.70 inches', 'asin': '0764443682', 'imageURL': ['https://images-na.ssl-images-amazon.com/images/I/51bSrINiWpL._US40_.jpg'], 'imageURLHighRes': ['https://images-na.ssl-images-amazon.com/images/I/51bSrINiWpL.jpg']}


In [None]:
# build a pandas DataFrame with one row per loaded record

df = pd.DataFrame(data)

print(len(df))

18772


In [None]:
### split off rows whose 'title' is unformatted (i.e. still contains raw page content)

df3 = df.fillna('')
# a title containing 'getTime' is treated as unformatted; build the mask once and reuse it
unformatted_mask = df3['title'].str.contains('getTime')
df4 = df3[unformatted_mask]   # unformatted rows
df5 = df3[~unformatted_mask]  # everything else
print(len(df4))
print(len(df5))

135
18637


In [None]:
# Show the first row of df4 (titles that matched the 'getTime' filter) as an example of an unformatted record
df4.iloc[0]

description                                                      
title           var aPageStart = (new Date()).getTime();\nvar ...
image                                                            
brand                                              DS Miller Inc.
rank            [>#1,826,312 in Electronics (See Top 100 in El...
main_cat                                                Computers
date                                                March 1, 2010
asin                                                   B0016C5EXY
feature                                                          
tech1                                                            
also_buy                                                         
price                                                            
also_view                                                        
tech2                                                            
details                                                          
similar_it

In [3]:
def parse(path):
    """Lazily yield one decoded JSON object per line of a gzipped file.

    Args:
        path: location of a gzip-compressed file holding one JSON document
            per line.

    Yields:
        The Python object produced by ``json.loads`` for each line, in file
        order.

    Raises:
        OSError: if the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: if a line is not valid JSON.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed early; the bare gzip.open() used previously was never closed.
    with gzip.open(path, 'r') as g:
        for line in g:
            yield json.loads(line)
    
def getDF(path):
    """Read the file at ``path`` via ``parse`` into a DataFrame.

    Every object yielded by ``parse`` becomes one row; rows are labelled
    0, 1, 2, ... in the order they are yielded.
    """
    # Number the records by position, then let pandas treat each dict entry as a row.
    numbered_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(numbered_records, orient='index')


In [4]:
# Read the gzipped review file into a DataFrame (one row per JSON line).
# NOTE: this rebinds `df`, which an earlier cell used for the product metadata frame.
df = getDF('data/AMAZON_FASHION_5.json.gz')

In [8]:
# keep only the first row for each (reviewerID, asin) pair
df = df.drop_duplicates(subset=['reviewerID', 'asin'])

In [9]:
# display the review frame after duplicate (reviewerID, asin) pairs were dropped
df

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"09 4, 2015",ALJ66O1Y6SLHA,B000K2PJ4K,"{'Size:': ' Big Boys', 'Color:': ' Blue/Orange'}",Tonya B.,Great product and price!,Five Stars,1441324800,,
5,3.0,True,"05 6, 2015",A3W11493KS6Z2L,B000K2PJ4K,"{'Size:': ' Little Boys', 'Color:': ' White/Bl...",NaeNae,Waaay too small. Will use for futur children!,Oops!,1430870400,,
10,2.0,True,"01 25, 2018",A3HX4X3TIABWOV,B000KPIHQ4,"{'Size Name:': ' Men's 6-6.5, Women's 8-8.5', ...",Denise A. Conte,Relieved my Plantar Fascitis for 3 Days. Then ...,These were recommended by my Podiatrist,1516838400,,
11,2.0,True,"01 5, 2017",AW8UBYMNJ894V,B000KPIHQ4,"{'Size Name:': ' Men's 8-8.5, Women's 10-10.5'...",Cognizant Consumer,This is my 6th pair and they are the best thin...,Not the same as all my other pairs.,1483574400,,
12,5.0,True,"10 17, 2016",A265UZVOZWTTXQ,B000KPIHQ4,,William_Jasper,We have used these inserts for years. They pr...,Great inserts,1476662400,,
...,...,...,...,...,...,...,...,...,...,...,...,...
3171,5.0,True,"07 2, 2018",A2077NII5H62R2,B005AGO4LU,"{'Size:': ' 8.5 B(M) US', 'Color:': ' Green Gl...",Amazon Customer,Perfect fit!,Five Stars,1530489600,,
3172,5.0,True,"06 28, 2018",A2IBS6PIPAGAB5,B005AGO4LU,"{'Size:': ' 5 B(M) US', 'Color:': ' Wolf Grey/...",J. Avila,My favorite cross trainers!,Comfortable,1530144000,,
3173,5.0,True,"06 25, 2018",A1GTC5EVSJNCQ8,B005AGO4LU,"{'Size:': ' 8 B(M) US', 'Color:': ' Blue Tint/...",Amazon Customer,Love them fit perfect,Five Stars,1529884800,,
3174,5.0,True,"06 20, 2018",A311XHHLM12MUT,B005AGO4LU,"{'Size:': ' 9 B(M) US', 'Color:': ' Blue Tint/...",Peter,Favorite Nike shoe ever! The flex sole is exce...,Love them!,1529452800,,


In [11]:
# summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

Unnamed: 0,overall,unixReviewTime
count,3042.0,3042.0
mean,4.41979,1493214000.0
std,1.023712,21766680.0
min,1.0,1261699000.0
25%,4.0,1480291000.0
50%,5.0,1492992000.0
75%,5.0,1508976000.0
max,5.0,1530749000.0


In [14]:
# number of distinct values in the 'overall' column (NaN is excluded by the pandas default)
df['overall'].nunique()

5