# Packages 

In [1]:
import pandas as pd # pandas package
import numpy as np # numpy package
import matplotlib.pyplot as plt # matplotlib package
import seaborn as sns # seaborn package
import plotly 
from pathlib import Path # path package
import re #regex package
from textblob import TextBlob #import textblob package

# Load Data set

In [2]:
# Location of the merged review data, relative to this notebook
merged_csv_path = Path(r"../Data/Merged Data/merged_data.csv")

# Read the CSV into the working frame and show five random rows as a sanity check
df = pd.read_csv(merged_csv_path)
df.sample(5)

Unnamed: 0,review_title,review_text,review_star,date,author,page,product
6320,I am the Streamster!!!<U+0001F917><U+0001F92F>...,It works!!,5.0 out of 5 stars,"Reviewed in the United States on August 11, 2020",Christopher E Stuart,133,Google ChromeCast
13497,"Ads, Ads and more ads. I switched to a Chromec...",I bought this to replace a Chromecast.Pressing...,1.0 out of 5 stars,"Reviewed in the United States on July 31, 2020",DPO,350,Roku Remote
84,New features,I had the original fire stick for about 2 year...,5.0 out of 5 stars,"Reviewed in the United States on February 18, ...",Jason,9,Amazon Fire TV Stick
11610,Good quality Product,The Roku streaming stick was easy to install a...,5.0 out of 5 stars,"Reviewed in the United States on October 13, 2019",Truman,162,Roku Remote
9484,No type c cable included !!!,i just purchased chromecast and it didn't come...,1.0 out of 5 stars,"Reviewed in the United States on July 21, 2019",Dae Kang,449,Google ChromeCast


# Pre-cleaning 

1. Remove the rows where the `review_text` value is missing
2. Fix `review_star` so that it holds the numeric value of the rating
3. Fix `date`: extract the review date, parse it, and store it as an `MM-DD-YYYY` formatted string
4. Make sure the text columns have an actual string dtype
5. Convert `review_title`, `review_text`, `author`, and `product` to string types
6. Adding Sentiment Polarity Scores for `review_text` as a new column `polarity`
7. Adding Length of Reviews as a new column `review_len`
8. Adding the Word Count of each Review as a new column `word_count`


## Remove missing values in `review_text`

In [3]:
# (rows, columns) of the frame as loaded, before any rows are dropped
df.shape

(17740, 7)

In [4]:
# Keep only the rows that have a review text.
# The explicit .copy() makes the filtered frame independent of the frame it was
# selected from, so the column assignments made in the following cells
# (df['review_star'] = ..., df['date'] = ..., ...) do not raise
# SettingWithCopyWarning.
df = df.loc[df['review_text'].notna()].copy()
df.shape

(17696, 7)

## Cleaning `review_star`

In [5]:
# Extract the leading numeric rating from each value (e.g. the "5.0" in
# "5.0 out of 5 stars") and store it as a float.
#
# - The vectorised .str.extract yields NaN for a value that has no leading
#   number, where indexing re.findall(...)[0] per row would raise instead.
# - Casting to str first means the cell can be re-run after the column has
#   already been converted to float (5.0 -> "5.0" -> 5.0; NaN stays NaN).
df['review_star'] = (
    df['review_star']
    .astype(str)
    .str.extract(r'^([0-9]+(?:\.[0-9]+)?)', expand=False)
    .astype(float)
)

# sample only 5 rows
df['review_star'].sample(5)


6258     5.0
3431     5.0
15590    4.0
303      5.0
17000    1.0
Name: review_star, dtype: float64

## Cleaning `date`

In [6]:
# Clean the review date in three explicit steps:
#   1. keep only the text that follows "on " in each raw value,
#   2. parse that text with pandas,
#   3. write it back formatted as MM-DD-YYYY (the column ends up holding
#      formatted strings, not datetime values).
def _text_after_on(raw_date):
    """Return everything after the first "on " in ``raw_date``."""
    return re.findall(r'(?<=on\ ).*', raw_date)[0]


parsed_dates = pd.to_datetime(df['date'].apply(_text_after_on))
df['date'] = parsed_dates.dt.strftime('%m-%d-%Y')

# sample only 5 rows
df['date'].sample(5)

3121     03-27-2022
10257    11-19-2021
4812     03-01-2022
13246    01-06-2020
6862     08-06-2020
Name: date, dtype: object

## Convert `review_title`, `review_text`, `author`, and `product` to string types

In [7]:
# Cast each free-text column to pandas' dedicated "string" dtype
text_columns = ['review_title', 'review_text', 'author', 'product']
for column_name in text_columns:
    df[column_name] = df[column_name].astype("string")

## Adding Sentiment Polarity Scores as `polarity`

In [8]:
def _polarity_of(text):
    """Return the TextBlob sentiment polarity score of ``text``."""
    return TextBlob(text).sentiment.polarity


# Score every review text and store the result in a new `polarity` column
df['polarity'] = df['review_text'].apply(_polarity_of)

# sample only 5 rows
df['polarity'].sample(5)

15704    0.445518
6905     0.616667
3375     0.500000
831      0.493182
5763     0.000000
Name: polarity, dtype: float64

### Preview 5 randomly selected reviews with the highest sentiment polarity score

In [9]:
print('5 random reviews with the highest positive sentiment polarity: \n')

# Reviews whose polarity score is exactly 1
top_positive_reviews = df.loc[df.polarity == 1, 'review_text']
for review in top_positive_reviews.sample(5):
    print(review)

5 random reviews with the highest positive sentiment polarity: 

best buy!!!
Works great!
Works <U+0001F44D> streaming everything is wonderful
excellent
Works great!


### Preview 5 randomly selected reviews with the most neutral sentiment polarity score

In [10]:
print('5 random reviews with the most neutral sentiment(zero) polarity: \n')

# Reviews whose polarity score is exactly 0
neutral_reviews = df.loc[df.polarity == 0, 'review_text']
for review in neutral_reviews.sample(5):
    print(review)

5 random reviews with the most neutral sentiment(zero) polarity: 

I wish that the TV Guide was easier to understand unless it's me.
<U+0001F44D><U+0001F60A>
works as advertised
La comodidad de control remoto y la calidad que ofrece
Just dumped the catv provider


### Preview 5 randomly selected reviews with the most negative sentiment polarity score

In [11]:
print('5 reviews with the most negative polarity: \n')

# Reviews whose polarity score is -1.0 or lower; five of them are drawn at random
most_negative_reviews = df.loc[df.polarity <= -1.0, 'review_text']
for review in most_negative_reviews.sample(5):
    print(review)

5 reviews with the most negative polarity: 

No mute button WTF!!!!!!!!!!
You have to use your phone to control this. Very annoying. Wish I got the fire stick or Roku stick instead
This is a horrible device, although I am an IT Data tech. Cannot get it working after I downloaded 3 apps , gave them my CC card . Now there are fraudulent charges on my account. Please do not purchase this product, you will regret it. !!
Just awful...one of apples worst products...highly recommend going with something else
Terrible


## Adding Length of Reviews as `review_len`

In [12]:
# Character count of each review, stored in a new `review_len` column
review_text_as_str = df['review_text'].astype(str)
df['review_len'] = review_text_as_str.map(len)

# sample only 5 rows
df['review_len'].sample(5)

3290      11
1965      40
2997     123
3610       9
12914     18
Name: review_len, dtype: int64

## Adding the Word Count of each Review as `word_count`

In [13]:
def _count_words(review):
    """Return the number of whitespace-separated tokens in ``review``."""
    return len(str(review).split())


# Number of words in each review, stored in a new `word_count` column
df['word_count'] = df['review_text'].map(_count_words)

# sample only 5 rows
df['word_count'].sample(5)

9834      2
17058    54
11417    36
16620     2
17663    28
Name: word_count, dtype: int64

## Re-examine data now

In [14]:
# Preview 10 randomly sampled rows of the cleaned frame, which now also
# carries the added polarity, review_len and word_count columns
df.sample(10)

Unnamed: 0,review_title,review_text,review_star,date,author,page,product,polarity,review_len,word_count
2531,Its cool,Its cool,5.0,01-13-2022,Tommy Terry,254,Amazon Fire TV Stick,0.35,8,2
16772,Love amazon not this time.,It's not working. Plug it and no power. Its junk.,2.0,07-23-2020,Jenny Casella,178,Apple TV,0.0,49,10
7000,Works as expected.,"Works as expected. But, this was a second one ...",5.0,09-08-2020,ray,201,Google ChromeCast,-0.033333,129,27
14107,#1 Video streamer.,Excellent.. here is what i think:1. They produ...,5.0,01-25-2018,M. Reddy,411,Roku Remote,0.566942,414,69
6486,Essential,I've been using chromecast for the past 7 year...,5.0,09-16-2021,Brittany Campbell,149,Google ChromeCast,0.175024,1114,210
8279,great product,work as expected,5.0,04-18-2020,Amazon Customer,328,Google ChromeCast,-0.1,16,3
2420,Works great,"Dont have cable, watch everything thru this un...",5.0,07-05-2022,Diarra P Collazo,243,Amazon Fire TV Stick,0.8,66,11
2350,good value for the money,Fast shipping. exactly as it said. Easy to ins...,5.0,02-16-2022,Nancy Kepner,236,Amazon Fire TV Stick,0.270833,69,12
15998,ATV FTW,I don’t have cable or satellite right now so I...,5.0,08-09-2018,Julian Salas,100,Apple TV,0.362245,300,60
823,A great accessory to have for sure,It's the best. You don't even really need cabl...,5.0,11-17-2021,Stephanie,83,Amazon Fire TV Stick,0.435714,200,40


# Exploratory Data Analysis 