In [39]:
import gzip
import json
from collections import defaultdict

In [40]:
path = 'Data/amazon_reviews/Gift_Cards.json.gz'
file = gzip.open(path, 'rt')

In [41]:
# Loading in the data
data = []
for line in file:
    data.append(json.loads(file.readline()))
    
data[0]

{'overall': 5.0,
 'verified': False,
 'reviewTime': '12 17, 2008',
 'reviewerID': 'A3G8U1G1V082SN',
 'asin': 'B001GXRQW0',
 'style': {'Gift Amount:': ' 50'},
 'reviewerName': 'Tali',
 'reviewText': 'I got this gift card from a friend, and it was the best! The site has so much to choose from... great gift.',
 'summary': 'Gift card with best selection',
 'unixReviewTime': 1229472000}

In [42]:
# Filtering out all the records with the reviewTime field
# Saving the filtered records into the list new_data
new_data = []
for row in data:
    if 'reviewTime' in row:
        new_data.append(row)

len(new_data)

73597

In [43]:
# Creating a new field called yearInt by taking the last 4 characters of the reviewTime string
# Then converting it to a integer type
for row in new_data:
    row['yearInt'] = int(row['reviewTime'][-4:])
    
new_data[0:2]

[{'overall': 5.0,
  'verified': False,
  'reviewTime': '12 17, 2008',
  'reviewerID': 'A3G8U1G1V082SN',
  'asin': 'B001GXRQW0',
  'style': {'Gift Amount:': ' 50'},
  'reviewerName': 'Tali',
  'reviewText': 'I got this gift card from a friend, and it was the best! The site has so much to choose from... great gift.',
  'summary': 'Gift card with best selection',
  'unixReviewTime': 1229472000,
  'yearInt': 2008},
 {'overall': 5.0,
  'verified': False,
  'reviewTime': '12 17, 2008',
  'reviewerID': 'A9YKGBH3SV22C',
  'asin': 'B001GXRQW0',
  'style': {'Gift Amount:': ' 25'},
  'reviewerName': 'Giotravels',
  'reviewText': "You can always get someone something from Amazon with the safety net that they can return it if they don't like it. But returning things is such a hassle. The card takes care of that--no more returns, no more hassles.",
  'summary': 'Totally make sense',
  'unixReviewTime': 1229472000,
  'yearInt': 2008}]

In [44]:
# Use the newly created yearInt field to filter old old reviews
# Lets take out reviews that were written before 2010
data_aft_2010 = []
for row in new_data:
    if row['yearInt'] > 2009:
        data_aft_2010.append(row)

len(data_aft_2010)

73516

In [45]:
# Lets try to now filter out reviews of users who have only written a single review
# We will first need to perform a count of the number of reviews each user has written
review_count = defaultdict(int)

for row in data_aft_2010:
    review_count[row['reviewerID']] += 1

In [46]:
# Next, we create a new list to store records w more than 1 review
data_morethan1review = []

# Iterate through the previous list
# For every record iterated on, if the count in review_count is more than 1, we add to our new list
for row in data_aft_2010:
    if review_count[row['reviewerID']] > 1:
        data_morethan1review.append(row)

len(data_morethan1review)

10028

In [47]:
# Lets now try to filter reviews that are very short as they may not be very helpful
# We first need to make sure that there's reviewText in all the data
data_with_reviews = []
for row in data_morethan1review:
    if 'reviewText' in row:
        data_with_reviews.append(row)

print(len(data_with_reviews))

# We define short reviews as those with less than 10 words
# To count the words, we need to do a .split() on the reviewText field
# Then if there's 10 words or more, we add to final_data
final_data = []
for row in data_with_reviews:
    if len(row['reviewText'].split()) >= 10 :
        final_data.append(row)

len(final_data)

10019


3971