# Packages 

In [10]:
import pandas as pd # pandas package
import numpy as np # numpy package
import matplotlib.pyplot as plt # matplotlib package
import seaborn as sns # seaborn package
import plotly 
from pathlib import Path # path package
import re #regex package
from textblob import TextBlob #import textblob package

# Load Data set

In [11]:
# Location of the merged reviews CSV, relative to this notebook
merged_csv_path = Path(r"../Data/Merged Data/merged_data.csv")

# Read the file into the working frame and preview 5 random rows
df = pd.read_csv(merged_csv_path)
df.sample(n=5)

Unnamed: 0,review_title,review_text,review_star,date,author,page,product
12871,Meets my need exactly!,This Roku stick package had everything needed ...,5.0 out of 5 stars,"Reviewed in the United States on December 5, 2019",RAM,288,Roku Remote
16350,Nice,We upgrade our Apple TV and this works very si...,4.0 out of 5 stars,"Reviewed in the United States on December 25, ...",Dana Hornung,136,Apple TV
16546,"Apple TV 4K (64GB, Latest Model)",Wordies Great!!,5.0 out of 5 stars,"Reviewed in the United States on December 14, ...",Marion DeMar Aller III,155,Apple TV
5713,Works great!,Great product!,5.0 out of 5 stars,"Reviewed in the United States on May 28, 2020",Megan,72,Google ChromeCast
4135,Good Seller....!!!!!,Very Fast,5.0 out of 5 stars,"Reviewed in the United States on February 22, ...",Rodrigo,414,Amazon Fire TV Stick


# Pre-cleaning 

1. Remove the rows where the `review_text` column is missing
2. Fix `review_star` to hold the numeric value of the rating
3. Fix `date` by parsing the review date out of the raw text and storing it as an `MM-DD-YYYY` formatted date string
4. Make sure we have actual string dtype columns
5. Convert `review_title`, `review_text`, `author`, and `product` to string types
6. Add sentiment polarity scores for `review_text` as a new column `polarity`
7. Add the length of each review as a new column `review_len`
8. Add the word count of each review as a new column `word_count`


## Remove missing values in `review_text`

In [12]:
# (rows, columns) of the loaded frame, shown before rows with missing review_text are removed
df.shape

(17740, 7)

In [13]:
# Keep only the rows that have review text.
# The explicit .copy() makes df an independent frame, so the column assignments
# in the following cells operate on df itself and do not trigger pandas'
# SettingWithCopyWarning (which a bare boolean-filtered frame would).
df = df[df['review_text'].notna()].copy()

# (rows, columns) after the filter
df.shape

(17696, 7)

## Cleaning `review_star`

In [14]:
## extract the leading "<digit>.<digit>" rating from the text and convert to float type
# - astype(str) lets this cell be re-run after the column is already numeric (5.0 -> "5.0")
# - str.extract returns NaN when a value is missing or does not start with a rating,
#   instead of raising from re.findall(...)[0] as a per-row lambda would
df['review_star'] = (
    df['review_star']
    .astype(str)
    .str.extract(r'^([0-9]\.[0-9]?)', expand=False)
    .astype(float)
)

# sample only 5 rows
df['review_star'].sample(5)


3327     5.0
14705    5.0
4515     5.0
10000    5.0
455      5.0
Name: review_star, dtype: float64

## Cleaning `date`

In [15]:
# Step 1: keep only the text that follows "on " in each raw date string
date_text = df['date'].apply(lambda raw: re.findall(r'(?<=on\ ).*', raw)[0])

# Step 2: parse that text as a datetime, then store it formatted as MM-DD-YYYY.
# Note: strftime returns text, so the resulting column holds strings, not datetimes.
parsed_dates = pd.to_datetime(date_text)
df['date'] = parsed_dates.dt.strftime('%m-%d-%Y')

# preview 5 random rows
df['date'].sample(n=5)

3735     12-20-2021
16761    07-28-2019
4368     03-20-2022
4008     01-13-2022
13843    12-16-2020
Name: date, dtype: object

## Convert `review_title`, `review_text`, `author`, and `product` to string types

In [16]:
# Cast each free-text column (title, text, author, product) to the pandas "string" dtype
for text_col in ('review_title', 'review_text', 'author', 'product'):
    df[text_col] = df[text_col].astype("string")

## Adding Sentiment Polarity Scores as `polarity`

In [17]:
def review_polarity(text):
    """Return the TextBlob sentiment polarity score for one review text."""
    return TextBlob(text).sentiment.polarity


# new column: sentiment polarity score of each review_text
df['polarity'] = df['review_text'].map(review_polarity)

# preview 5 random rows
df['polarity'].sample(n=5)

1777     0.58
12795    1.00
8540     0.00
5250     1.00
1396     0.80
Name: polarity, dtype: float64

## Adding Length of Reviews as `review_len`

In [18]:
# new column review_len: number of characters in each review's text
review_text_plain = df['review_text'].astype(str)
df['review_len'] = review_text_plain.map(len)

# preview 5 random rows
df['review_len'].sample(n=5)

8517      13
1360      40
1009      47
10070    883
5082     257
Name: review_len, dtype: int64

## Adding the Word Count of Each Review as `word_count`

In [19]:
def count_words(review):
    """Return the number of whitespace-separated tokens in str(review)."""
    return len(str(review).split())


# new column word_count: number of words in each review
df['word_count'] = df['review_text'].apply(count_words)

# preview 5 random rows
df['word_count'].sample(n=5)

989       8
3401      3
901      43
9775     38
11462    19
Name: word_count, dtype: int64

## Re-examine data now

In [20]:
# show 10 random rows of the frame, including the cleaned and newly added columns
df.sample(n=10)

Unnamed: 0,review_title,review_text,review_star,date,author,page,product,polarity,review_len,word_count
3161,Like it very much; easy to use,Streaming great. Haven’t used voice or Alexa.,5.0,06-06-2022,George Waterhouse,317,Amazon Fire TV Stick,0.8,45,7
5396,Works great,Works as advertised,5.0,03-07-2022,J. Parks,40,Google ChromeCast,0.0,19,3
17236,Money saver,I absolutely love this product. Every night I ...,5.0,02-05-2018,Cameron Iz,224,Apple TV,0.190278,573,112
4434,Amazing,I like everything about it,5.0,04-16-2022,Daniel Perez,444,Amazon Fire TV Stick,0.0,26,5
4004,Pleased,Love everything about my Firestick.,5.0,03-24-2022,Barbara,401,Amazon Fire TV Stick,0.5,35,5
4061,Luvbug,I’ve been enjoying the firestick,5.0,03-18-2022,Lah111320,407,Amazon Fire TV Stick,0.5,32,5
16423,The Apple TV is slow.,The Apple TV is slow and the remote does not a...,2.0,05-14-2019,Keith Cannada,143,Apple TV,-0.2375,157,30
11641,Great way to watch TV. Awesome!!!,"Soon as it arrived, I put it on the kitchen TV...",5.0,03-12-2018,lonlives,165,Roku Remote,0.1,159,28
17049,Advertising is Wrong,This product advertising is wrong! This is NOT...,1.0,06-21-2021,William Mazejko,205,Apple TV,-0.111111,113,20
1160,Easy to Install,Easy to Install. I like the user interface as ...,5.0,03-13-2022,Adrienne A,117,Amazon Fire TV Stick,0.433333,51,10


# Exploratory Data Analysis 