In [2]:
import gzip
import json

In [3]:
path = 'Data/amazon_reviews/Gift_Cards.json.gz'
file = gzip.open(path, 'rt')

In [4]:
line = file.readline()
json.loads(line)

{'overall': 1.0,
 'vote': '25',
 'verified': False,
 'reviewTime': '12 19, 2008',
 'reviewerID': 'APV13CM0919JD',
 'asin': 'B001GXRQW0',
 'style': {'Gift Amount:': ' 50'},
 'reviewerName': 'LEH',
 'reviewText': 'Amazon,\nI am shopping for Amazon.com gift cards for Christmas gifts and am really so disappointed that out of five choices there isn\'t one that says "Merry Christmas" or mentions Christmas at all!  I am sure I am not alone in wanting a card that reflects the actual "holiday" we are celebrating. On principle, I cannot send a Amazon gift card this Christmas.  What\'s up with all the Political Correctness?  Bad marketing decision.\nLynn',
 'summary': 'Merry Christmas.',
 'unixReviewTime': 1229644800}

In [5]:
data = []
for i in range(50000):
    data.append(json.loads(file.readline()))

In [6]:
# Store ratings into a list
# Method 1: Using list comprehension 
ratings = [index['overall'] for index in data ]

# Method 2: Using for-loop -> Essentially same as the list comprehension but easier to follow
ratings = []
for record in data:
    ratings.append(record['overall'])

In [7]:
# To find average rating
avg_rating = sum(ratings)/len(ratings)
print('Average Rating: {}'.format(avg_rating))

# To find rating distribution - E.g. How many 1s, 2s ...
# Method 1: My own method using for-loops
ratings_dist = {0:0,
                1:0,
                2:0,
                3:0,
                4:0,
                5:0}

for record in data:
    if record['overall'] == 0:
        ratings_dist[0] +=1
    elif record['overall'] == 1:
        ratings_dist[1] +=1
    elif record['overall'] == 2:
        ratings_dist[2] +=1
    elif record['overall'] == 3:
        ratings_dist[3] +=1
    elif record['overall'] == 4:
        ratings_dist[4] +=1
    else:
        ratings_dist[5] +=1

for key, value in ratings_dist.items():
    ratings_dist[key] = value/len(data)

# Method 2: List comprehension -> Really makes the code shorter
for record in data:
    ratings_dist[record['overall']] +=1

for key, value in ratings_dist.items():
    ratings_dist[key] = value/len(data)

    
# Printing out the distribution in a neat format
print('Distribution of Ratings \n------------------------\n0: {zero}\n1: {one}\n2: {two}\n3: {three}\n4: {four}\n5: {five}'.format(zero=ratings_dist[0], 
                                                                                                                                   one=ratings_dist[1], 
                                                                                                                                   two=ratings_dist[2], 
                                                                                                                                   three=ratings_dist[3], 
                                                                                                                                   four=ratings_dist[4], 
                                                                                                                                   five=ratings_dist[5]))

Average Rating: 4.69238
Distribution of Ratings 
------------------------
0: 0.0
1: 0.0410408208
2: 0.0121202424
3: 0.022300446
4: 0.06250125
5: 0.8620572408


In [8]:
# Method 3: Using default dict -> Smartest way to do it
# Above 2 methods you're required to intialize the dictionary manually
# This works well when the number of unique entries are known & small
# Its not possible/excessive tedious when no. of unique entries are large
# defaultdict() automates this process -> Automatically records first entry as 1
from collections import defaultdict

ratings_dict = defaultdict(int)

for record in data:
    ratings_dict[record['overall']] += 1

ratings_dict

defaultdict(int, {5.0: 43102, 1.0: 2052, 3.0: 1115, 4.0: 3125, 2.0: 606})

In [9]:
# Counting the number of verified and non-verified purchases
verified_dict = defaultdict(int)

for record in data:
    verified_dict[record['verified']] += 1

verified_dict

defaultdict(int, {False: 3563, True: 46437})

In [10]:
# Determining product's popularity
product_dict = defaultdict(int)

for record in data:
    product_dict[record['asin']] += 1

# To get top 10 most popular products by count
# Essentially, we're going to need to create a list of tuples
# Tuple format -> (count, product_id)
# Then make use of list sorting function to arrange the counts
# sort() will look at the first item in the tuple to determine sort sequence
# Cause of this we need to format it in (count, product_id)
# Then we do a [-10:] splice to the list to get last 10 entries cause sort is in ascending order
# Method 1: My method
counts = []
for i in product_dict:
    counts.append((product_dict[i], i))
counts.sort()
counts[-10:]

# Method 2: List comprehension
counts = [(product_dict[p], p) for p in product_dict]
counts.sort()
counts[-10:]

[(1108, 'B004LLIL4G'),
 (1330, 'B005DHN6E2'),
 (1461, 'B005ESMJ02'),
 (1704, 'B007V6ETDK'),
 (1768, 'B005ESMMWW'),
 (1827, 'B004LLIL5A'),
 (2213, 'B0066AZGD4'),
 (2224, 'B004LLIKY2'),
 (5410, 'B004LLIKVU'),
 (8164, 'B004Q7CK9M')]

In [11]:
# Now we wanna compute each product's average ratings
# Method 1: My method
# First initialize a default dictionary
# Iterate through each record in the data
# Then for each record, take down the product id
# Then use this product id to get the count of this product from previous dictionary
# Then get this record's cummulative score by taking this record's given score divided by the count of this record's product
# Then do a cummulative add for every record
product_ratings = defaultdict(int)

for record in data:
    pid = record['asin']
    p_count = product_dict[pid]
    avg_cummulative_score = record['overall'] / p_count
    
    product_ratings[record['asin']] += avg_cummulative_score

pdt_ratings = []

for i in product_ratings:
    pdt_ratings.append((product_ratings[i], i))
pdt_ratings.sort()
pdt_ratings[-10:]


# Method 2
# Different method, but this uses a 'list' type default dict
product_ratings = defaultdict(list)

# Iterate through each record
# Then for each record, for every unique product id, append the score into a list in the value part
# E.g. pid_1 : [score_1, score_2...], pid_2 : [score_1, score_2, ....]
for record in data:
    product_ratings[record['asin']].append(record['overall'])

# Create a new dictionary
# Then iterate through the above dictionary
# This time when you iterate, it will go through different keys
# The keys are the product_ids due to how you structured the dict above
# Then when you refer to the key, you'll get the entire list of ratings for that product
# So in the new dictionary, you do a sum/len of each list from the previous dictionary
output_dict = {}
for product in product_ratings:
    output_dict[product] = sum(product_ratings[product]) / len(product_ratings[product])

output_dict   

{'B001GXRQW0': 4.110791366906475,
 'B002BSHDJK': 4.594059405940594,
 'B002DN7XS4': 1.0,
 'B002QFXC7U': 2.8,
 'B002XNLC04': 4.153846153846154,
 'B002YEWXZ0': 2.5,
 'B00414Y7Y6': 4.3,
 'B004KNWWO0': 4.574468085106383,
 'B004KNWWUE': 4.739130434782608,
 'B004KNWWT0': 4.867924528301887,
 'B004KNWWQS': 4.69811320754717,
 'B004KNWX3K': 5.0,
 'B004KNWWR2': 4.526315789473684,
 'B004KNWX2Q': 4.65,
 'B004KNWWNG': 4.5,
 'B004KNWWRC': 4.608695652173913,
 'B004KNWWPY': 4.596774193548387,
 'B004KNWX12': 3.6666666666666665,
 'B004KNWX8U': 4.883720930232558,
 'B004KNWX0I': 4.5,
 'B004KNWWOA': 4.7727272727272725,
 'B004KNWWTK': 4.72,
 'B004KNWWPO': 4.84375,
 'B004KNWX4O': 4.822429906542056,
 'B004KNWXA8': 4.481481481481482,
 'B004KNWWP4': 4.666666666666667,
 'B004KNWX1W': 4.824675324675325,
 'B004KNWWRM': 4.742857142857143,
 'B004KNWX9Y': 4.765625,
 'B004KNWX3A': 4.6976744186046515,
 'B004KNWX6C': 4.84055459272097,
 'B004KNWX4Y': 4.864864864864865,
 'B004KNWX5I': 4.384615384615385,
 'B004KNWWTA': 4.744

In [17]:
# To sort the products into top N most popular products
# We wanna avoid products with very little reviews so we needa build that into our logic
# Method 1: My method
# First, create a new dictionary
# Iterate through the previously generated ratings dictionary -> product_id : avg_ratings
# Then for each product_id, we check against the previous counts dictionary to see if the counts is 50 & above
# If it is, then we include this in our new dictionary
filtered = {}
for product in output_dict:
    pid = product
    if product_dict[product] > 49:
        filtered[product] = output_dict[product]

# We just do the usual sorting and converting into a list of tuples here
filtered_list = []
for i in filtered:
    filtered_list.append((filtered[i], i))
filtered_list.sort()
filtered_list[-10:]
        


[(4.849206349206349, 'B005EISOUQ'),
 (4.867924528301887, 'B004KNWWT0'),
 (4.884984025559105, 'B007V6ETXA'),
 (4.885057471264368, 'B004LLIL0A'),
 (4.890052356020942, 'B0062ONAM8'),
 (4.890909090909091, 'B004LLIL6Y'),
 (4.910256410256411, 'B004LLILC8'),
 (4.912621359223301, 'B004KNWX94'),
 (4.941747572815534, 'B004KNWX76'),
 (4.956521739130435, 'B004LLIL5K')]

In [18]:
# Let's try to count number of unique users and items
# Using sets here because sets only allow 1 unique item in it
# Meaning if you try to add something already inside into it, it wont work

items = set()
users = set()
for record in data:
    items.add(record['asin'])
    users.add(record['reviewerID'])
    
print('No. of Unique Items: {}\nNo. of Unique Users: {}'.format(len(items), len(users)))

No. of Unique Items: 187
No. of Unique Users: 48736


In [27]:
# Counting average rating for verified vs unverified customers
# Method 1: My method
# Create a new dict to store the averages for both types of customers
comparison = defaultdict(int)

# Iterate through the data
# For each record, calculate the ratings given / total number of ratings in both types
# Add this cummulatively to the new dictionary
for record in data:
    state = record['verified']
    state_count = verified_dict[state]
    adj_cumm_score = record['overall'] / state_count
    comparison[state] += adj_cumm_score
    
    

# Method 2
# Do a individual counters then just iterate through each record and do a cummulative count
# Then do a simple total / total number to get average
tot_verified = 0
tot_unverified = 0
num_verified = 0
num_unverified = 0

for record in data:
    if record['verified'] == True:
        tot_verified += record['overall']
        num_verified += 1
    else:
        tot_unverified += record['overall']
        num_unverified += 1

avg_verified = tot_verified / num_verified
avg_unverified = tot_unverified / num_unverified

print('Method 1')
print(comparison)
print('----------------------------------------------')
print('Method 2')
print('Average Verified: {}\nAveraged Unverified: {}'.format(avg_verified, avg_unverified))

Method 1
defaultdict(<class 'int'>, {False: 4.252596126859385, True: 4.726123565260736})
----------------------------------------------
Method 2
Average Verified: 4.72612356526046
Averaged Unverified: 4.252596126859388


In [28]:
string = "seller_id, product_id, price, customer_id, review_id"
string.split()

['seller_id,', 'product_id,', 'price,', 'customer_id,', 'review_id']