In [1]:
from html import unescape

import numpy as np
import pandas as pd

In [2]:
# Load the line-delimited JSON review dump into a DataFrame
reviews = pd.read_json(path_or_buf='data/reviews_Electronics_5.json', lines=True)

In [3]:
# Peek at the first five rows (five is the default for head())
reviews.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,528881469,"[0, 0]",5,We got this GPS for my husband who is an (OTR)...,"06 2, 2013",AO94DHGC771SJ,amazdnu,Gotta have GPS!,1370131200
1,528881469,"[12, 15]",1,"I'm a professional OTR truck driver, and I bou...","11 25, 2010",AMO214LNFCEI4,Amazon Customer,Very Disappointed,1290643200
2,528881469,"[43, 45]",3,"Well, what can I say. I've had this unit in m...","09 9, 2010",A3N7T0DY83Y4IG,C. A. Freeman,1st impression,1283990400
3,528881469,"[9, 10]",2,"Not going to write a long review, even thought...","11 24, 2010",A1H8PY3QHMQQA0,"Dave M. Shaw ""mack dave""","Great grafics, POOR GPS",1290556800
4,528881469,"[0, 0]",1,I've had mine for a year and here's what we go...,"09 29, 2011",A24EV6RXELQZ63,Wayne Smith,"Major issues, only excuses for support",1317254400


In [4]:
# How large is the dataset? `.shape` reports (rows, columns).
reviews.shape

(1689188, 9)

In [5]:
# While exploring the data I found that there are some empty strings in the review column.
# It will work out better if we convert these empty string values to NaN
# (the replacement is applied to every column of the frame, not only the review text).
# Note: the `np.NaN` alias was removed in NumPy 2.0; `np.nan` works on every version.
reviews = reviews.replace('', np.nan)

In [6]:
# Let's see a sample of what some of our review text looks like.
# The sampler is seeded so the same ten reviews are printed on every
# Restart & Run All.
sample_reviews = reviews['reviewText'].sample(n=10, random_state=17)

for review in sample_reviews:
    print(review)
    print('- - - - - -')

I strongly recommend this product I use it on all of my devices, laptop and glasses. It is very helpful and requires no liquid
- - - - - -
I was looking for a way to stream Rhapsody into my A/V Receiver, and maybe have some other music options too. The Squeezebox Touch is just what I needed. I hard wired it into my home network and used a digital cable to go into my A/V Receiver. I now have a massive collection of digital audio from Rhapsody to choose from. I'm in heaven. The unit was as easy as pie to hook up. The touch screen is reactive, and the remote control is very nice too. What a fabulous product. It does everything I was looking for and more. The fact that Amazon got it here fast and at a reasonable price just makes my enjoyment of it that much more. I love my Squeezebox Touch and I love Amazon. Thanks
- - - - - -
Well made and sized right for the Macbook Air.  Only minor observation is that it would be nice it had a handle.
- - - - - -
Not all GPS devices are made equal.  The

In [7]:
# Only the review text and the review score are used from here on, so discard
# every row where either of them is missing.

# Shape before the drop
print(reviews.shape)
reviews = reviews.dropna(subset=['overall', 'reviewText'])

# Shape after the drop
reviews.shape

(1689188, 9)


(1688117, 9)

In [8]:
# Cut the data down to a seeded random sample of 200,000 reviews to keep it manageable
reviews_subset = reviews.sample(
    n=200_000,
    random_state=17,
).copy()
reviews_subset.shape

(200000, 9)

In [9]:
# Preview the first five rows of the subset
reviews_subset.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
674608,B002W7CW32,"[1, 3]",5,got my first projector yesterday from visuala...,"04 7, 2010",A33XWDU85V5ZGM,"Prasad N. Badal ""PB""",updated review- still loving it,1270598400
572683,B0028PJ6OQ,"[0, 0]",4,It is a decently made keyboard with a very bri...,"08 16, 2013",AG2CSACQ7SXRK,"Ginny's Store ""Ginny""",Very brite,1376611200
1142343,B005HMO6A6,"[0, 0]",5,This keyboard is spot on. The first thing that...,"08 14, 2013",A3REQ9RHP8LLWZ,theonerandom,Quality Keyboard A+,1376438400
1101357,B005BHX0A4,"[1, 1]",4,This fits nicely on the Canon HF G10 and gives...,"12 31, 2012",A1HHOD2XB7R64C,Michael Griffith,Good way to convert,1356912000
667661,B002V88HFE,"[0, 0]",5,I've ordered these in the past and they outlas...,"01 21, 2013",A1YQDDM0ACMR97,L. Capps,Great Batteries,1358726400


In [10]:
# While exploring the file I noticed that HTML entities are escaped: instead of
# a literal " the text contains &#34;, which is the HTML entity for a double quote.
# Let's clean this up by unescaping the HTML entities in the review text.

# First, preview the rows whose review text contains the entity
reviews_subset.loc[reviews_subset['reviewText'].str.contains("&#34;")]

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
1232663,B006K553LU,"[1, 1]",5,I chose this product because it seemed to be t...,"07 14, 2013",A1QDWPIXFKS0CU,Lexi,Great value and awesome product,1373760000
369887,B0012S4APK,"[0, 0]",5,I mounted my LG 42&#34; tv. Easy and did the j...,"01 6, 2014",A1428302YLXZXD,Jerry Choate,"Good, Solid Mount!",1388966400
393272,B0015EDVVU,"[0, 0]",5,This is my second set of Wiremold kits. I had...,"10 27, 2013",A38LVONPQQT3GW,"A. Mcintyre ""Frugal Shopper""",Great addition to any home entertainment system,1382832000
1072538,B0053O9ZNQ,"[0, 0]",4,I chose it because it had almost the same key ...,"01 24, 2014",A3AVSGDYKZMCOX,chriek,Good compact keyboard,1390521600
1659055,B00FF6J532,"[6, 6]",5,Memo from a hard core iPad user: Lets face it....,"03 11, 2014",ATGLHCY1BZ0TO,conis,Totally SLICK!,1394496000
1311812,B007MW73C2,"[2, 2]",5,I am an OLD IT person....I started programming...,"05 31, 2013",AQXB4AR9HCZTE,1912 House,"Leap of Faith - gaming, school and work",1369958400
997812,B004OVECUA,"[0, 0]",5,Works great. If you have ever used a Harmony ...,"09 17, 2013",A3RACIXI8CK305,Ken,This is my favorite remote. Period,1379376000
48490,B00007E7C8,"[0, 0]",3,I only had them on 15 minutes before my right ...,"01 11, 2014",ALIYGP2HGXOZT,"Spots ""shialavati""",Ear cups too shallow,1389398400
553053,B001XURP7W,"[0, 0]",5,Great Deal......yes they are a &#34;Bulk-Pack&...,"11 30, 2013",A3MJR0F0NUWJL7,Steven,Great Deal: Two for the price of One!,1385769600
1614296,B00DGNZ9G8,"[9, 10]",4,The first thing I want to put out first is tha...,"11 16, 2013",AYANKSMR7OVJX,YourName,"Missing parts, but never the less a good value.",1384560000


In [11]:
# How many rows of the subset contain the escaped double quote?
reviews_subset.loc[reviews_subset['reviewText'].str.contains("&#34;")].shape

(8060, 9)

In [12]:
# Unescape HTML entities in the review text. `unescape` (from the standard
# library's `html` module) is handed to `apply` directly; wrapping it in a
# lambda adds nothing.
reviews_subset['reviewText'] = reviews_subset['reviewText'].apply(unescape)

# Verify that it worked by counting the number of rows with &#34; (the HTML entity for ")
reviews_subset[reviews_subset['reviewText'].str.contains("&#34;")].shape

(0, 9)

In [13]:
# Persist the cleaned subset to disk as CSV, then show its shape for reference
reviews_subset.to_csv(path_or_buf='data/amazon_electronics_reviews_subset.csv')
reviews_subset.shape

(200000, 9)

In [14]:
# Round-trip check: reload the CSV, taking the first row as the header and
# the first column as the index
test_df = pd.read_csv(
    'data/amazon_electronics_reviews_subset.csv',
    index_col=0,
    header=0,
)
test_df.head(3)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
674608,B002W7CW32,"[1, 3]",5,got my first projector yesterday from visuala...,"04 7, 2010",A33XWDU85V5ZGM,"Prasad N. Badal ""PB""",updated review- still loving it,1270598400
572683,B0028PJ6OQ,"[0, 0]",4,It is a decently made keyboard with a very bri...,"08 16, 2013",AG2CSACQ7SXRK,"Ginny's Store ""Ginny""",Very brite,1376611200
1142343,B005HMO6A6,"[0, 0]",5,This keyboard is spot on. The first thing that...,"08 14, 2013",A3REQ9RHP8LLWZ,theonerandom,Quality Keyboard A+,1376438400


In [15]:
# Shape of the reloaded frame, for comparison with the shape shown when it was saved
test_df.shape

(200000, 9)

In [16]:
# Let's make another data set that has the reviews for the single product with the most reviews.
# Which products have the most reviews? Show the ten largest counts.
reviews['asin'].value_counts().head(10)

B007WTAJTO    4914
B003ES5ZUU    4143
B00DR0PDNE    3798
B0019EHU8G    3435
B002WE6D44    2813
B003ELYQGG    2652
B0002L5R78    2598
B009SYZ8OC    2542
B00BGGDVOO    2104
B002V88HFE    2081
Name: asin, dtype: int64

In [17]:
# Pull out every review of the most-reviewed product to see what kind of product it is
is_top_product = reviews['asin'].eq("B007WTAJTO")
most_reviews = reviews[is_top_product].copy()

In [18]:
# Print a handful of reviews for the most-reviewed product.
# The sampler is seeded so the same five reviews are printed on every
# Restart & Run All.
sample_most_reviews = most_reviews['reviewText'].sample(n=5, random_state=17)

for review in sample_most_reviews:
    print(review)
    print('- - - - - -')

This card has worked flawlessly so far. I use it in my Garmin Montana 600 and it allows me to load all my USGS 1:24,000 quads (US - SW, W, S Central, N Central and inland lakes) I was at first concerned that the 600 would not take 32 gb micro SD but no problem.
- - - - - -
Works perfect in Samsung Tab 2. It quadrupled my storage capability and seems to be very fast.  I would recommend to anyone
- - - - - -
I use these in my tablet as well as my camera. I find slower speed cards do not work well when taking pictures or video. It would be nice if the SD adapter had built in WiFi, that way for my DSLR, I would only need to buy one WiFi card and simply replace it with these micro SD cards and save money.
- - - - - -
I bought two of these for our new telephones. They work fine and were just what we needed if we have extra pictures on the phones. Pictures do take up a lot of room but now we don't have to worry about running out of room.
- - - - - -
Good size card and a great price. My only c

In [19]:
# It looks like these are reviews for a Sandisk MicroSD card: http://a.co/6lnlOVo
# Let's save these reviews as another data file and go through the same
# steps to unescape the HTML entities.

In [20]:
# Show the MicroSD card reviews whose text contains the escaped double quote
most_reviews.loc[most_reviews['reviewText'].str.contains("&#34;")]

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
1336624,B007WTAJTO,"[0, 0]",5,I like this SD Card because it can take music ...,"05 11, 2013",A1RTQROTWR5NCB,808TREX50,32GB MicroSD Card.,1368230400
1336670,B007WTAJTO,"[0, 0]",4,Used in my phone it's plenty fast and works ju...,"11 5, 2013",A1R0JMNRV7HVQM,ACZ,Works just like it should in my phone,1383609600
1336674,B007WTAJTO,"[0, 0]",5,You'll pay $10 more for these anywhere else. I...,"02 12, 2014",AI1NJ92OUOM2V,Adam M Barnes,Great product at a great price.,1392163200
1336699,B007WTAJTO,"[0, 0]",5,Lot of questions about this memory card fittin...,"02 17, 2014",A16PH8I60IGM4F,A. GONZALEZ MEDINA,Works OK with the Samsung Galaxy Tab 3 10.1...,1392595200
1336727,B007WTAJTO,"[0, 0]",4,I bought this for my son's tablet for cartoon ...,"04 12, 2013",A1NZR4UN6DJQIO,Albert,Works great,1365724800
1336860,B007WTAJTO,"[0, 0]",5,I use this in my Samsung Galaxy Note II. For t...,"05 30, 2013",A2HY1IGOK2MX5G,Amazon Customer,Great Quality,1369872000
1336864,B007WTAJTO,"[0, 0]",5,I wasn't absolutely certain if this would work...,"04 27, 2013",A33M3XQJF4PDE1,Amazon Customer,Reliable expansion for your MicroSDXC needs,1367020800
1336910,B007WTAJTO,"[0, 3]",2,Thus microSD card worked fine for a year in my...,"04 28, 2014",A308JP3BK4VAL7,Amazon Customer,Inadequate life span - lasted 1 year then failed,1398643200
1336915,B007WTAJTO,"[1, 1]",5,I have this regularly plugged into my ASUS Tra...,"09 2, 2013",A2O8NMK3RVDBAE,Amazon Customer,"Speed, capacity and price!",1378080000
1336966,B007WTAJTO,"[0, 0]",4,I just bought two of these for our two new Sam...,"10 4, 2013",A3SMW7RDCEUJ24,"Amazon Customer ""ZundapMan""",Probably could have scrimpted and paid less...,1380844800


In [21]:
# How many of the MicroSD card reviews contain the escaped double quote?
most_reviews.loc[most_reviews['reviewText'].str.contains("&#34;")].shape

(135, 9)

In [22]:
# Unescape HTML entities in the review text of the single-product data set.
# `unescape` (from the standard library's `html` module) is handed to `apply`
# directly, so there is no need to re-import it or re-define a lambda wrapper here.
most_reviews['reviewText'] = most_reviews['reviewText'].apply(unescape)

# Verify that it worked by counting the number of rows with &#34; (the HTML entity for ")
most_reviews[most_reviews['reviewText'].str.contains("&#34;")].shape

(0, 9)

In [23]:
# Persist the single-product reviews to disk as CSV, then show their shape for reference
most_reviews.to_csv(path_or_buf='data/sandisk_sd_card_reviews.csv')
most_reviews.shape

(4914, 9)

In [24]:
# Round-trip check: reload the CSV, taking the first row as the header and
# the first column as the index
test_df = pd.read_csv(
    'data/sandisk_sd_card_reviews.csv',
    index_col=0,
    header=0,
)
test_df.head(3)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
1336614,B007WTAJTO,"[0, 0]",4,No issues.,"07 23, 2014",A3SBTW3WS4IQSN,,Four Stars,1406073600
1336615,B007WTAJTO,"[0, 0]",5,"Purchased this for my device, it worked as adv...","10 25, 2013",A18K1ODH1I2MVB,0mie,MOAR SPACE!!!,1382659200
1336616,B007WTAJTO,"[0, 0]",4,it works as expected. I should have sprung for...,"12 23, 2012",A2FII3I2MBMUIA,1K3,nothing to really say....,1356220800


In [25]:
# Shape of the reloaded frame, for comparison with the shape shown when it was saved
test_df.shape

(4914, 9)