# Amazon Wireless Headphone Review 
### Scraped by Google Chrome Extension
#### https://chrome.google.com/webstore/detail/web-scraper/jnhgnonknehpejjnehehllkliplmbmhn/related

In [1]:
# https://www.scrapehero.com/amazon-review-scraper/
# pandas does all the loading/cleaning below; numpy is imported but is not
# referenced in the cells visible in this notebook.
import pandas as pd
import numpy as np

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import warnings
# NOTE(review): this silences *every* warning for the whole session, including
# pandas Future/DeprecationWarnings (e.g. about str.replace's regex default).
warnings.filterwarnings('ignore')

# Show up to 500 columns when a DataFrame is displayed.
pd.options.display.max_columns = 500

### Apple Airpods Reviews

In [2]:
# Load the scraped Apple AirPods reviews and preview the first five rows
airpods = pd.read_csv(filepath_or_buffer='../datasets/airpod_amazon_review_scrape.csv')
airpods.head(n=5)

Unnamed: 0,web-scraper-order,web-scraper-start-url,ReviewTitle,ReviewBody,ReviewStar,next,next-href
0,1573042898-1816,https://www.amazon.com/Apple-AirPods-Charging-...,Great wireless music,Product as expecred,5.0 out of 5 stars,Next page→,https://www.amazon.com/Apple-AirPods-Charging-...
1,1573042596-467,https://www.amazon.com/Apple-AirPods-Charging-...,battery life dies really quick,i really like my airpods but the battery life ...,4.0 out of 5 stars,Next page→,https://www.amazon.com/Apple-AirPods-Charging-...
2,1573042820-1471,https://www.amazon.com/Apple-AirPods-Charging-...,The audio sound muffled (right ear),I bought Airpods 2 on Amazon. But right ears w...,3.0 out of 5 stars,Next page→,https://www.amazon.com/Apple-AirPods-Charging-...
3,1573042975-2170,https://www.amazon.com/Apple-AirPods-Charging-...,Comprarlo sin problemas,Perfect,5.0 out of 5 stars,Next page→,https://www.amazon.com/Apple-AirPods-Charging-...
4,1573043073-2606,https://www.amazon.com/Apple-AirPods-Charging-...,good！,so,5.0 out of 5 stars,Next page→,https://www.amazon.com/Apple-AirPods-Charging-...


In [3]:
airpods.shape

(3414, 7)

In [4]:
# Dropping the unnecessary web-scraper bookkeeping columns.
# errors='ignore' keeps the cell re-runnable: once the columns are gone, a
# second execution is a no-op instead of raising KeyError.
airpods.drop(columns=['web-scraper-order', 'web-scraper-start-url', 'next', 'next-href'],
             errors='ignore', inplace=True)

In [5]:
# Tag every row with the product these reviews belong to
airpods = airpods.assign(Product='Apple Airpods-Latest Model')

In [6]:
airpods.tail()

Unnamed: 0,ReviewTitle,ReviewBody,ReviewStar,Product
3409,Didn’t work,Air pods wouldn’t connect. Checked serial numb...,1.0 out of 5 stars,Apple Airpods-Latest Model
3410,Really work well,These are great. Battery lasts for hours.,5.0 out of 5 stars,Apple Airpods-Latest Model
3411,Great!,Airpod! It shows everything.,5.0 out of 5 stars,Apple Airpods-Latest Model
3412,Not latest model,This is not the wireless charging model this...,1.0 out of 5 stars,Apple Airpods-Latest Model
3413,Great gift for my girlfriend,This is a gift for my girlfriend. She likes it...,5.0 out of 5 stars,Apple Airpods-Latest Model


In [7]:
airpods.dtypes

ReviewTitle    object
ReviewBody     object
ReviewStar     object
Product        object
dtype: object

In [8]:
airpods['ReviewStar'].value_counts()

5.0 out of 5 stars    2565
1.0 out of 5 stars     371
4.0 out of 5 stars     245
3.0 out of 5 stars     135
2.0 out of 5 stars      97
Name: ReviewStar, dtype: int64

In [9]:
# Converting the star-rating text (e.g. '5.0 out of 5 stars') to a number.
# The result is float, not int, while unmapped/missing entries are present as NaN.
star_lookup = {'5.0 out of 5 stars': 5,
               '4.0 out of 5 stars': 4,
               '3.0 out of 5 stars': 3,
               '2.0 out of 5 stars': 2,
               '1.0 out of 5 stars': 1}
raw_stars = airpods['ReviewStar']
# Fall back to the value itself when it is already numeric, so re-running this
# cell keeps the ratings instead of mapping every one of them to NaN.
airpods['ReviewStar'] = raw_stars.map(star_lookup).fillna(pd.to_numeric(raw_stars, errors='coerce'))
airpods['ReviewStar'].value_counts()

5.0    2565
1.0     371
4.0     245
3.0     135
2.0      97
Name: ReviewStar, dtype: int64

In [10]:
airpods.columns

Index(['ReviewTitle', 'ReviewBody', 'ReviewStar', 'Product'], dtype='object')

In [11]:
# Switch to the short lowercase column names (title / body / rating / product)
airpods_column_names = {
    'ReviewTitle': 'title',
    'ReviewBody': 'body',
    'ReviewStar': 'rating',
    'Product': 'product',
}
airpods = airpods.rename(columns=airpods_column_names)

In [12]:
airpods.head()

Unnamed: 0,title,body,rating,product
0,Great wireless music,Product as expecred,5.0,Apple Airpods-Latest Model
1,battery life dies really quick,i really like my airpods but the battery life ...,4.0,Apple Airpods-Latest Model
2,The audio sound muffled (right ear),I bought Airpods 2 on Amazon. But right ears w...,3.0,Apple Airpods-Latest Model
3,Comprarlo sin problemas,Perfect,5.0,Apple Airpods-Latest Model
4,good！,so,5.0,Apple Airpods-Latest Model


In [13]:
# Checking for nulls
airpods.isnull().sum()

title      3
body       1
rating     1
product    0
dtype: int64

In [14]:
# Only a handful of rows have a missing value; they are assumed to be missing
# at random, so any row containing a null is simply removed.
airpods.dropna(axis=0, how='any', inplace=True)

In [15]:
#Checking to see there are no more nulls
airpods.isnull().sum()

title      0
body       0
rating     0
product    0
dtype: int64

In [16]:
# Remove rows that are exact duplicates (first occurrence is kept), then report the size
airpods.drop_duplicates(keep='first', inplace=True)
airpods.shape

(3362, 4)

In [17]:
# Write the cleaned AirPods reviews to their own CSV (row index omitted)
airpods.to_csv(path_or_buf='../datasets/apple_reviews_eda.csv', index=False)

### Tozo T10 Bluetooth Wireless Earbuds Reviews

In [18]:
# Load the scraped TOZO T10 Bluetooth Wireless Earbuds reviews and report the size
tozo = pd.read_csv(filepath_or_buffer='../datasets/tozot10_bluetooth_wireless_earbuds_amazon_reviews.csv')
tozo.shape

(5001, 7)

In [19]:
tozo.head()

Unnamed: 0,web-scraper-order,web-scraper-start-url,title,body,rating,next,next-href
0,1573068482-22176,https://www.amazon.com/TOZO-Bluetooth-Wireless...,"Sounds great, but low battery life","They really do sound good, but the battery lif...",3.0 out of 5 stars,Next page→,https://www.amazon.com/TOZO-Bluetooth-Wireless...
1,1573067991-20581,https://www.amazon.com/TOZO-Bluetooth-Wireless...,Comfortable & great sounding earbuds,Really like my new earbuds. They are comforta...,5.0 out of 5 stars,Next page→,https://www.amazon.com/TOZO-Bluetooth-Wireless...
2,1573068236-21380,https://www.amazon.com/TOZO-Bluetooth-Wireless...,These are the best earbuds for music!!!,I am soooo pleased with these earbuds! I’ve t...,5.0 out of 5 stars,Next page→,https://www.amazon.com/TOZO-Bluetooth-Wireless...
3,1573067954-20468,https://www.amazon.com/TOZO-Bluetooth-Wireless...,Great Sound Quality and Waterproof In Steam Room,"Love these earbuds, needed them to use in a st...",5.0 out of 5 stars,Next page→,https://www.amazon.com/TOZO-Bluetooth-Wireless...
4,1573068262-21459,https://www.amazon.com/TOZO-Bluetooth-Wireless...,"Excellent Quality, Sound, and battery life",These earbuds are excellent! They have some of...,4.0 out of 5 stars,Next page→,https://www.amazon.com/TOZO-Bluetooth-Wireless...


In [20]:
tozo.columns

Index(['web-scraper-order', 'web-scraper-start-url', 'title', 'body', 'rating',
       'next', 'next-href'],
      dtype='object')

In [21]:
# Drop the web-scraper bookkeeping columns.
# errors='ignore' makes a second run of this cell a no-op instead of a KeyError.
tozo.drop(columns=['web-scraper-order', 'web-scraper-start-url', 'next', 'next-href'],
          errors='ignore', inplace=True)

In [22]:
tozo.columns

Index(['title', 'body', 'rating'], dtype='object')

In [23]:
tozo.rating.value_counts()

5.0 out of 5 stars    3228
4.0 out of 5 stars     665
1.0 out of 5 stars     398
2.0 out of 5 stars     360
3.0 out of 5 stars     349
Name: rating, dtype: int64

In [24]:
# Converting the star-rating text (e.g. '5.0 out of 5 stars') to a number.
# The result is float, not int, while unmapped/missing entries are present as NaN.
star_lookup = {'5.0 out of 5 stars': 5,
               '4.0 out of 5 stars': 4,
               '3.0 out of 5 stars': 3,
               '2.0 out of 5 stars': 2,
               '1.0 out of 5 stars': 1}
raw_stars = tozo['rating']
# Fall back to the value itself when it is already numeric, so re-running this
# cell keeps the ratings instead of mapping every one of them to NaN.
tozo['rating'] = raw_stars.map(star_lookup).fillna(pd.to_numeric(raw_stars, errors='coerce'))
tozo['rating'].value_counts()

5.0    3228
4.0     665
1.0     398
2.0     360
3.0     349
Name: rating, dtype: int64

In [25]:
# Checking for nulls
tozo.isnull().sum()

title     1
body      1
rating    1
dtype: int64

In [26]:
# Remove every row containing a null (treated as missing at random),
# then confirm that no nulls remain in any column
tozo.dropna(axis=0, how='any', inplace=True)
tozo.isna().sum()

title     0
body      0
rating    0
dtype: int64

In [27]:
# Remove rows that are exact duplicates (first occurrence is kept), then report the size
tozo.drop_duplicates(keep='first', inplace=True)
tozo.shape

(5000, 3)

In [28]:
# Tag every row with the product these reviews belong to, then preview
tozo = tozo.assign(product='Tozo10 Bluetooth Wireless Earbuds')
tozo.head(n=5)

Unnamed: 0,title,body,rating,product
0,"Sounds great, but low battery life","They really do sound good, but the battery lif...",3.0,Tozo10 Bluetooth Wireless Earbuds
1,Comfortable & great sounding earbuds,Really like my new earbuds. They are comforta...,5.0,Tozo10 Bluetooth Wireless Earbuds
2,These are the best earbuds for music!!!,I am soooo pleased with these earbuds! I’ve t...,5.0,Tozo10 Bluetooth Wireless Earbuds
3,Great Sound Quality and Waterproof In Steam Room,"Love these earbuds, needed them to use in a st...",5.0,Tozo10 Bluetooth Wireless Earbuds
4,"Excellent Quality, Sound, and battery life",These earbuds are excellent! They have some of...,4.0,Tozo10 Bluetooth Wireless Earbuds


In [29]:
# Write the cleaned TOZO reviews to their own CSV (row index omitted)
tozo.to_csv(path_or_buf='../datasets/tozo_reviews_eda.csv', index=False)

### Samsung Galaxybuds Amazon Reviews

In [30]:
# Load the scraped Samsung Galaxy Buds reviews and report the size
samsung = pd.read_csv(filepath_or_buffer='../datasets/samsung_galaxybud_amazon_reviews.csv')
samsung.shape

(2768, 7)

In [31]:
samsung.columns

Index(['web-scraper-order', 'web-scraper-start-url', 'ReviewTitle',
       'ReviewBody', 'ReviewRating', 'next', 'next-href'],
      dtype='object')

In [32]:
# Dropping the unneeded web-scraper bookkeeping columns.
# errors='ignore' keeps the whole cell re-runnable: on a second execution the
# columns are already gone and the drop becomes a no-op instead of a KeyError
# (the rename below already tolerates missing keys).
samsung.drop(columns=['web-scraper-order',
                      'web-scraper-start-url',
                      'next', 'next-href'],
             errors='ignore', inplace=True)
# Renaming the columns to the shared title / body / rating schema
samsung.rename(columns={
    'ReviewTitle': 'title',
    'ReviewBody': 'body',
    'ReviewRating': 'rating'
}, inplace=True)
# Adding the product column
samsung['product'] = 'Samsung Galaxybuds (latest model)'
# Checking
samsung.head()

Unnamed: 0,title,body,rating,product
0,Stay in well,They kinda hurt after a couple of hours but o...,4.0 out of 5 stars,Samsung Galaxybuds (latest model)
1,Terrible sound quality,It was good at first but the sound quality wen...,1.0 out of 5 stars,Samsung Galaxybuds (latest model)
2,Sweet buds,Excellent sound for buds and crystal clear voi...,5.0 out of 5 stars,Samsung Galaxybuds (latest model)
3,Very bad bluetooth and poor volume,I got mine for free with the S10 and figured I...,1.0 out of 5 stars,Samsung Galaxybuds (latest model)
4,Please get it right Samsung!,Maybe I am doing it wrong but I bought these h...,3.0 out of 5 stars,Samsung Galaxybuds (latest model)


In [33]:
# Converting the star-rating text (e.g. '5.0 out of 5 stars') to a number.
# The result is float, not int, while unmapped/missing entries are present as NaN.
star_lookup = {'5.0 out of 5 stars': 5,
               '4.0 out of 5 stars': 4,
               '3.0 out of 5 stars': 3,
               '2.0 out of 5 stars': 2,
               '1.0 out of 5 stars': 1}
raw_stars = samsung['rating']
# Fall back to the value itself when it is already numeric, so re-running this
# cell keeps the ratings instead of mapping every one of them to NaN.
samsung['rating'] = raw_stars.map(star_lookup).fillna(pd.to_numeric(raw_stars, errors='coerce'))
samsung['rating'].value_counts()

5.0    1590
4.0     409
1.0     313
2.0     230
3.0     225
Name: rating, dtype: int64

In [34]:
# Checking for nulls
samsung.isnull().sum()

title      2
body       1
rating     1
product    0
dtype: int64

In [35]:
# Remove every row containing a null (treated as missing at random),
# then confirm that no nulls remain in any column
samsung.dropna(axis=0, how='any', inplace=True)
samsung.isna().sum()

title      0
body       0
rating     0
product    0
dtype: int64

In [36]:
# Remove rows that are exact duplicates (first occurrence is kept), then report the size
samsung.drop_duplicates(keep='first', inplace=True)
samsung.shape

(2746, 4)

In [37]:
samsung.head()

Unnamed: 0,title,body,rating,product
0,Stay in well,They kinda hurt after a couple of hours but o...,4.0,Samsung Galaxybuds (latest model)
1,Terrible sound quality,It was good at first but the sound quality wen...,1.0,Samsung Galaxybuds (latest model)
2,Sweet buds,Excellent sound for buds and crystal clear voi...,5.0,Samsung Galaxybuds (latest model)
3,Very bad bluetooth and poor volume,I got mine for free with the S10 and figured I...,1.0,Samsung Galaxybuds (latest model)
4,Please get it right Samsung!,Maybe I am doing it wrong but I bought these h...,3.0,Samsung Galaxybuds (latest model)


In [38]:
# Write the cleaned Samsung reviews to their own CSV (row index omitted)
samsung.to_csv(path_or_buf='../datasets/samsung_galaxybuds_reviews_eda.csv', index=False)

### TrelabJ1 Elite Wireless Running Earphone Reviews

In [39]:
# Load the scraped Trelab J1 Elite wireless running earphone reviews
trelab = pd.read_csv(filepath_or_buffer='../datasets/trelabj1_elite_wireless_running_earphones_amazon_reviews.csv')

In [40]:
trelab.shape

(1480, 7)

In [41]:
trelab.columns

Index(['web-scraper-order', 'web-scraper-start-url', 'title', 'body', 'rating',
       'next', 'next-href'],
      dtype='object')

In [42]:
# Dropping the unneeded web-scraper bookkeeping columns.
# errors='ignore' keeps the cell re-runnable: on a second execution the
# columns are already gone and the drop becomes a no-op instead of a KeyError.
trelab.drop(columns=['web-scraper-order',
                     'web-scraper-start-url',
                     'next', 'next-href'],
            errors='ignore', inplace=True)

# Adding the product column
trelab['product'] = 'TrelabJ1 Elite Wireless Running Earphones'
# Checking
trelab.head()

Unnamed: 0,title,body,rating,product
0,I think the sound quality is good when you get...,I think the sound quality is good when you ge...,4.0 out of 5 stars,TrelabJ1 Elite Wireless Running Earphones
1,I was expecting more,I feel like $50 is a reasonable amount to spen...,2.0 out of 5 stars,TrelabJ1 Elite Wireless Running Earphones
2,Ive had these headphones for a little over a m...,I had a problem with these headphones and I ta...,5.0 out of 5 stars,TrelabJ1 Elite Wireless Running Earphones
3,Great Customer Support,Really liked this product for the first few mo...,5.0 out of 5 stars,TrelabJ1 Elite Wireless Running Earphones
4,Five Stars,amazing pair of earbuds very very high quality...,5.0 out of 5 stars,TrelabJ1 Elite Wireless Running Earphones


In [43]:
# Converting the star-rating text (e.g. '5.0 out of 5 stars') to a number.
# The result is float, not int, while unmapped/missing entries are present as NaN.
star_lookup = {'5.0 out of 5 stars': 5,
               '4.0 out of 5 stars': 4,
               '3.0 out of 5 stars': 3,
               '2.0 out of 5 stars': 2,
               '1.0 out of 5 stars': 1}
raw_stars = trelab['rating']
# Fall back to the value itself when it is already numeric, so re-running this
# cell keeps the ratings instead of mapping every one of them to NaN.
trelab['rating'] = raw_stars.map(star_lookup).fillna(pd.to_numeric(raw_stars, errors='coerce'))
trelab['rating'].value_counts()

5.0    770
4.0    273
1.0    177
2.0    132
3.0    127
Name: rating, dtype: int64

In [44]:
# Checking for nulls
trelab.isnull().sum()

title      2
body       1
rating     1
product    0
dtype: int64

In [45]:
# Remove every row containing a null (treated as missing at random),
# then confirm that no nulls remain in any column
trelab.dropna(axis=0, how='any', inplace=True)
trelab.isna().sum()

title      0
body       0
rating     0
product    0
dtype: int64

In [46]:
# Remove rows that are exact duplicates (first occurrence is kept), then report the size
trelab.drop_duplicates(keep='first', inplace=True)
trelab.shape

(1478, 4)

In [47]:
# Write the cleaned Trelab reviews to their own CSV (row index omitted)
trelab.to_csv(path_or_buf='../datasets/trelabJ1_wireless_reviews_eda.csv', index=False)

### Bose SoundSport Wireless Reviews

In [48]:
# Load the scraped Bose SoundSport Wireless reviews and report the size
bose = pd.read_csv(filepath_or_buffer='../datasets/bose_soundsportwireless_amazon_reviews.csv')
bose.shape

(5001, 7)

In [49]:
bose.columns

Index(['web-scraper-order', 'web-scraper-start-url', 'title', 'content',
       'rating', 'next', 'next-href'],
      dtype='object')

In [50]:
# Dropping the unneeded web-scraper bookkeeping columns.
# errors='ignore' keeps the whole cell re-runnable: on a second execution the
# columns are already gone and the drop becomes a no-op instead of a KeyError
# (the rename below already tolerates missing keys).
bose.drop(columns=['web-scraper-order',
                   'web-scraper-start-url',
                   'next', 'next-href'],
          errors='ignore', inplace=True)
# Renaming the content column to match the other product frames
bose.rename(columns={'content': 'body'}, inplace=True)
# Adding the product column
bose['product'] = 'Bose SoundSport Wireless Earphones'
# Checking
# NOTE(review): the first preview rows talk about a "Cover" and "protection for
# my tablet" -- possibly reviews of other products in this scrape; verify the source.
bose.head()

Unnamed: 0,title,body,rating,product
0,Cover,Buena calidad,5.0 out of 5 stars,Bose SoundSport Wireless Earphones
1,Great Customer Service,Excellent customer service. I received a fault...,5.0 out of 5 stars,Bose SoundSport Wireless Earphones
2,Five Stars,Follow the directions and watch the online vid...,5.0 out of 5 stars,Bose SoundSport Wireless Earphones
3,Good,Good,5.0 out of 5 stars,Bose SoundSport Wireless Earphones
4,Good protection.,Good protection for my tablet.,5.0 out of 5 stars,Bose SoundSport Wireless Earphones


In [51]:
# Converting the star-rating text (e.g. '5.0 out of 5 stars') to a number.
# The result is float, not int, while unmapped/missing entries are present as NaN.
star_lookup = {'5.0 out of 5 stars': 5,
               '4.0 out of 5 stars': 4,
               '3.0 out of 5 stars': 3,
               '2.0 out of 5 stars': 2,
               '1.0 out of 5 stars': 1}
raw_stars = bose['rating']
# Fall back to the value itself when it is already numeric, so re-running this
# cell keeps the ratings instead of mapping every one of them to NaN.
bose['rating'] = raw_stars.map(star_lookup).fillna(pd.to_numeric(raw_stars, errors='coerce'))
bose['rating'].value_counts()

5.0    3636
4.0     498
1.0     418
3.0     225
2.0     223
Name: rating, dtype: int64

In [52]:
# checking for nulls
bose.isnull().sum()

title      1
body       1
rating     1
product    0
dtype: int64

In [53]:
# Remove every row containing a null (treated as missing at random),
# then confirm that no nulls remain in any column
bose.dropna(axis=0, how='any', inplace=True)
bose.isna().sum()

title      0
body       0
rating     0
product    0
dtype: int64

In [54]:
# Remove rows that are exact duplicates (first occurrence is kept), then report the size
bose.drop_duplicates(keep='first', inplace=True)
bose.shape

(4945, 4)

In [55]:
bose.head()

Unnamed: 0,title,body,rating,product
0,Cover,Buena calidad,5.0,Bose SoundSport Wireless Earphones
1,Great Customer Service,Excellent customer service. I received a fault...,5.0,Bose SoundSport Wireless Earphones
2,Five Stars,Follow the directions and watch the online vid...,5.0,Bose SoundSport Wireless Earphones
3,Good,Good,5.0,Bose SoundSport Wireless Earphones
4,Good protection.,Good protection for my tablet.,5.0,Bose SoundSport Wireless Earphones


In [56]:
# Write the cleaned Bose reviews to their own CSV (row index omitted)
bose.to_csv(path_or_buf='../datasets/bose_wireless_reviews_eda.csv', index=False)

### Kaggle Dataset 

In [57]:
# Load the Kaggle product-review dataset and preview the first five rows
df = pd.read_csv(filepath_or_buffer='../datasets/AllProductReviews.csv')
df.head(n=5)

Unnamed: 0,ReviewTitle,ReviewBody,ReviewStar,Product
0,Honest review of an edm music lover\n,No doubt it has a great bass and to a great ex...,3,boAt Rockerz 255
1,Unreliable earphones with high cost\n,"This earphones are unreliable, i bought it be...",1,boAt Rockerz 255
2,Really good and durable.\n,"i bought itfor 999,I purchased it second time,...",4,boAt Rockerz 255
3,stopped working in just 14 days\n,Its sound quality is adorable. overall it was ...,1,boAt Rockerz 255
4,Just Awesome Wireless Headphone under 1000...😉\n,Its Awesome... Good sound quality & 8-9 hrs ba...,5,boAt Rockerz 255


In [58]:
# Bring the Kaggle column names in line with the scraped frames
# (title / body / rating / product), then preview.
# NOTE(review): the previewed titles end in a newline character; consider
# stripping it before the frames are combined.
kaggle_column_names = {
    'ReviewTitle': 'title',
    'ReviewBody': 'body',
    'ReviewStar': 'rating',
    'Product': 'product',
}
df = df.rename(columns=kaggle_column_names)
df.head(n=5)

Unnamed: 0,title,body,rating,product
0,Honest review of an edm music lover\n,No doubt it has a great bass and to a great ex...,3,boAt Rockerz 255
1,Unreliable earphones with high cost\n,"This earphones are unreliable, i bought it be...",1,boAt Rockerz 255
2,Really good and durable.\n,"i bought itfor 999,I purchased it second time,...",4,boAt Rockerz 255
3,stopped working in just 14 days\n,Its sound quality is adorable. overall it was ...,1,boAt Rockerz 255
4,Just Awesome Wireless Headphone under 1000...😉\n,Its Awesome... Good sound quality & 8-9 hrs ba...,5,boAt Rockerz 255


df.dtypes

In [59]:
#checking for nulls
df.isnull().sum()

title      0
body       0
rating     0
product    0
dtype: int64

In [60]:
# Creating two large datasets, one with and one without the Kaggle data.
# The five scraped frames are listed once so both outputs stay in sync.
scraped_frames = [airpods, tozo, samsung, bose, trelab]

# ignore_index=True gives each combined frame a fresh 0..n-1 index; without it
# every source frame keeps its own row labels, so labels repeat across products.
reviews = pd.concat(scraped_frames + [df], axis=0, ignore_index=True)

review_nok = pd.concat(scraped_frames, axis=0, ignore_index=True)


In [61]:
reviews.columns

Index(['title', 'body', 'rating', 'product'], dtype='object')

In [62]:
review_nok.columns

Index(['title', 'body', 'rating', 'product'], dtype='object')

In [63]:
reviews.shape

(31868, 4)

In [64]:
review_nok.shape

(17531, 4)

In [65]:
# Persist both combined frames as the master CSVs.
# The row index is written as well (no index=False here); it comes back as the
# 'Unnamed: 0' column when reviews.csv is reloaded in the EDA section.
reviews.to_csv(path_or_buf='../datasets/reviews.csv')
review_nok.to_csv(path_or_buf='../datasets/reviews_no_kaggle.csv')

### EDA

In [4]:
# Reload the combined master file for EDA.
# Note: this rebinds `df`, which earlier in the notebook held the Kaggle frame.
df = pd.read_csv(filepath_or_buffer='../datasets/reviews.csv')

In [5]:
df.columns

Index(['Unnamed: 0', 'title', 'body', 'rating', 'product'], dtype='object')

In [6]:
# The index saved with the CSV comes back as 'Unnamed: 0'; discard it.
# errors='ignore' lets this cell be re-run (or the CSV be written without an
# index) without raising KeyError on the already-missing column.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

Unnamed: 0,title,body,rating,product
0,Great wireless music,Product as expecred,5.0,Apple Airpods-Latest Model
1,battery life dies really quick,i really like my airpods but the battery life ...,4.0,Apple Airpods-Latest Model
2,The audio sound muffled (right ear),I bought Airpods 2 on Amazon. But right ears w...,3.0,Apple Airpods-Latest Model
3,Comprarlo sin problemas,Perfect,5.0,Apple Airpods-Latest Model
4,good！,so,5.0,Apple Airpods-Latest Model


In [7]:
# Normalising the review text: lower-case it, then strip punctuation with a regex.
# The vectorised .str accessor passes missing values through as NaN, whereas
# calling x.lower() inside apply() raises AttributeError on a NaN float.
df['title'] = df['title'].str.lower()
df['body'] = df['body'].str.lower()
# remove punctuation from text (anything that is not a word character or whitespace).
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would silently leave all punctuation in place. The raw
# string avoids the invalid-escape warning for '\w' and '\s'.
df['title'] = df['title'].str.replace(r'[^\w\s]', '', regex=True)
df['body'] = df['body'].str.replace(r'[^\w\s]', '', regex=True)

In [8]:
df.head()

Unnamed: 0,title,body,rating,product
0,great wireless music,product as expecred,5.0,Apple Airpods-Latest Model
1,battery life dies really quick,i really like my airpods but the battery life ...,4.0,Apple Airpods-Latest Model
2,the audio sound muffled right ear,i bought airpods 2 on amazon but right ears wa...,3.0,Apple Airpods-Latest Model
3,comprarlo sin problemas,perfect,5.0,Apple Airpods-Latest Model
4,good,so,5.0,Apple Airpods-Latest Model
