# Packages 

In [1]:
import pandas as pd # pandas package
import numpy as np # numpy package
import matplotlib.pyplot as plt # matplotlib package
import seaborn as sns # seaborn package
import plotly 
from pathlib import Path # path package
import re #regex package
from textblob import TextBlob #import textblob package

# Load Data set

In [2]:
# location of the merged reviews CSV, relative to this notebook
merged_csv = Path(r"../Data/Merged Data/merged_data.csv")

# read the merged data set into a pandas data frame
df = pd.read_csv(merged_csv)

# preview 5 randomly chosen rows
df.sample(n=5)

Unnamed: 0,review_title,review_text,review_star,date,author,page,product
13969,Buy one if you want other Streaming,I bought 2 of them for myself and a friend. We...,5.0 out of 5 stars,"Reviewed in the United States on April 22, 2021",Jay H,397,Roku Remote
4058,Sus progamas,Si imagen y la gran variedad de programas,5.0 out of 5 stars,"Reviewed in the United States on April 27, 2022",Edeard Antonio Peguero Homez,406,Amazon Fire TV Stick
2570,"Very good image, even on a conventional tv",Very goodbye,5.0 out of 5 stars,"Reviewed in the United States on December 30, ...",RRBB,258,Amazon Fire TV Stick
13652,Flawless streaming quality all,I purchased an Amazon fire stick first before ...,5.0 out of 5 stars,"Reviewed in the United States on March 18, 2020",eg61,366,Roku Remote
4000,fast shipping,Great product and fast shipping.,5.0 out of 5 stars,"Reviewed in the United States on June 13, 2022",John Fras,401,Amazon Fire TV Stick


# Pre-cleaning 

1. Remove the rows where the `review_text` value is missing
2. Convert `review_star` to a numeric rating
3. Parse `date` and normalize it to an `MM-DD-YYYY` date string
4. Make sure the text columns have an actual string dtype
5. Convert `review_title`, `review_text`, `author`, and `product` to string types
6. Add sentiment polarity scores for `review_text` as a new column `polarity`
7. Add the length of each review as a new column `review_len`
8. Add the word count of each review as a new column `word_count`


## Remove missing values in `review_text`

In [3]:
# dimensions (rows, columns) of the data frame as loaded, recorded before the
# rows with a missing `review_text` are removed so the two shapes can be compared
df.shape

(17740, 7)

In [4]:
# keep only the rows that have a review text; the explicit .copy() makes `df`
# an independent frame, so the column assignments in the following cells do not
# trigger pandas' SettingWithCopyWarning (assigning into a filtered slice)
df = df[df['review_text'].notna()].copy()

# data frame dimensions after removing null values
df.shape

(17696, 7)

## Cleaning `review_star`

In [5]:
## extract the leading numeric rating (e.g. "5.0" from "5.0 out of 5 stars") and convert to float type
# - vectorised `.str.extract` yields NaN for a value without a leading number,
#   instead of aborting the whole cell with IndexError/TypeError
# - casting to str first keeps the cell safe to re-run once the column is already float
df['review_star'] = (
    df['review_star']
    .astype(str)
    .str.extract(r'^([0-9]+(?:\.[0-9]+)?)', expand=False)
    .astype(float)
)

# sample only 5 rows
df['review_star'].sample(5)


1000     5.0
15407    5.0
2705     5.0
10199    5.0
16172    3.0
Name: review_star, dtype: float64

## Cleaning `date`

In [6]:
# pull the calendar date out of strings such as
# "Reviewed in the United States on April 22, 2021".
# The greedy `.*` followed by `\bon\s+` anchors on the LAST standalone word "on",
# so a country name that merely ends in "on" is not mistaken for the separator.
date_text = df['date'].str.extract(r'^.*\bon\s+(.+)$', expand=False)

# values without the "... on " prefix (e.g. when this cell is re-run on already
# cleaned values) are handed to the parser unchanged instead of raising IndexError
date_text = date_text.fillna(df['date'])

# parse the text, then store it as an 'MM-DD-YYYY' string.
# NOTE: because of the final strftime the column holds text (object dtype),
# not datetime64 values.
df['date'] = pd.to_datetime(date_text).dt.strftime('%m-%d-%Y')

# sample only 5 rows
df['date'].sample(5)

15398    01-21-2019
7068     01-22-2021
8867     05-11-2020
14425    05-18-2019
2816     01-03-2022
Name: date, dtype: object

## Convert `review_title`, `review_text`, `author`, and `product` to string types

In [7]:
# free-text columns that should use pandas' dedicated "string" dtype
text_columns = ['review_title', 'review_text', 'author', 'product']

# cast each of them in turn
for column in text_columns:
    df[column] = df[column].astype("string")

## Adding Sentiment Polarity Scores as `polarity`

In [8]:
def _text_polarity(text):
    """Return the TextBlob sentiment polarity score of a single review text."""
    return TextBlob(text).sentiment.polarity


# score every review and store the result in the new `polarity` column
df['polarity'] = df['review_text'].map(_text_polarity)

# sample only 5 rows
df['polarity'].sample(n=5)

11382    0.018750
13064    0.550000
4024     0.500000
9562    -0.083333
14788    0.220052
Name: polarity, dtype: float64

### Preview 5 randomly selected reviews with the highest sentiment polarity score

In [9]:
# Announce what this cell displays
print('5 random reviews with the highest positive sentiment polarity: \n')

# Rows whose polarity score equals +1
is_max_positive = df['polarity'].eq(1)

# Draw five of those reviews at random, keeping only the text column
cl = df.loc[is_max_positive, ['review_text']].sample(5).to_numpy()

# Show one review per line (each sampled row holds a single text value)
for (text,) in cl:
    print(text)

5 random reviews with the highest positive sentiment polarity: 

Wonderful idea for gifts
Very happy with our Firestick
It’s very good device for streaming!
excellent
Great device!


### Preview 5 randomly selected reviews with the most neutral sentiment polarity score

In [10]:
# Announce what this cell displays
print('5 random reviews with the most neutral sentiment(zero) polarity: \n')

# Rows whose polarity score is exactly zero
is_neutral = df['polarity'].eq(0)

# Draw five of those reviews at random, keeping only the text column
cl = df.loc[is_neutral, ['review_text']].sample(5).to_numpy()

# Show one review per line (each sampled row holds a single text value)
for (text,) in cl:
    print(text)

5 random reviews with the most neutral sentiment(zero) polarity: 

No me sirvió
Movies and sports
I’m buying a second one. It works both all the way across the house from the WiFi router and downstairs in the basement halfway across the house from the router.
Was a hift
Works as expected, no issues so far


### Preview 5 randomly selected reviews with the most negative sentiment polarity score

In [11]:
# Announce what this cell displays
print('5 reviews with the most negative polarity: \n')

# Rows whose polarity score is -1.0 or lower
is_most_negative = df['polarity'].le(-1.0)

# Draw five of those reviews at random, keeping only the text column
cl = df.loc[is_most_negative, ['review_text']].sample(5).to_numpy()

# Show one review per line (each sampled row holds a single text value)
for (text,) in cl:
    print(text)

5 reviews with the most negative polarity: 

Arrived today and it has been used! No place in description does it mention that this item was refurbished. Box wasn’t even sealed and you can tell that it has been used and returned. Very unhappy!!!
Just awful...one of apples worst products...highly recommend going with something else
Very annoying to use app. Have to pay for monthly fee for app.
Customer service doesn't exist, I bought 3 of these pieces of CRAP for $50.00 and now they are going for $30.00Find some thing else to spend your money on. You can't believe the reviews. Roku will play you for a sap!!
It’s horrible, it doesn’t work with anything, the streaming is garbage, etc. Just get an apple tv


## Adding Length of Reviews as `review_len`

In [12]:
# character count of every review, stored in the new `review_len` column
review_as_str = df['review_text'].astype(str)
df['review_len'] = review_as_str.str.len()

# sample only 5 rows
df['review_len'].sample(n=5)

14227    176
962       70
7460      29
9970      48
9181       4
Name: review_len, dtype: int64

## Adding the Word Count of Each Review as `word_count`

In [13]:
# number of whitespace-separated words in every review, stored as `word_count`
review_words = df['review_text'].astype(str).str.split()
df['word_count'] = review_words.str.len()

# sample only 5 rows
df['word_count'].sample(n=5)

7295    19
9897    49
1087    10
1877    32
2795    28
Name: word_count, dtype: int64

## Re-examine data now

In [14]:
# show 10 randomly chosen rows, including the newly added
# `polarity`, `review_len` and `word_count` columns
df.sample(n=10)

Unnamed: 0,review_title,review_text,review_star,date,author,page,product,polarity,review_len,word_count
9607,Good for music,I have this hooked into a stereo and use it fo...,4.0,12-28-2020,Nathan,461,Google ChromeCast,0.458333,222,43
15220,Excellent Streaming Device.,The Apple TV fits right in at our house since ...,5.0,12-26-2018,William,23,Apple TV,0.228571,122,24
3343,Excelente,Recomendado muy buena programación,5.0,02-22-2022,Darlin Flores,335,Amazon Fire TV Stick,0.0,34,4
8626,Easy to connect,"Easy setup, works well with my phone.",5.0,07-26-2019,S Brum,363,Google ChromeCast,0.433333,37,7
14075,Bad Quality Control!,"Ordered a ROKU+ about 6 months ago, worked fin...",2.0,08-06-2018,Amazon Customer,408,Roku Remote,0.027887,758,142
1565,tips on how to use features on firestick,Firestick is great. Picture quality really nic...,5.0,11-17-2021,Sabrina Fowler,157,Amazon Fire TV Stick,0.4,334,59
8350,GREAT PRODUCT!!,IT WORKS GREAT!!!,5.0,01-19-2020,Franklin B.,336,Google ChromeCast,1.0,17,3
4281,Amazing Product,It is truly amazing…..,5.0,01-08-2022,Ukesh,429,Amazon Fire TV Stick,0.0,22,4
11529,Don't pay for 4k if you don't have a 4k tv,"Roku is great, I love it. But if you don't hav...",5.0,10-18-2018,BillyBob,153,Roku Remote,0.359091,131,29
7704,Great,Works good,5.0,05-11-2021,harlan paulson,271,Google ChromeCast,0.7,10,2


# Exploratory Data Analysis 