<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Packages" data-toc-modified-id="Packages-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Packages</a></span></li><li><span><a href="#Removing-Unecessary-Columns" data-toc-modified-id="Removing-Unecessary-Columns-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Removing Unecessary Columns</a></span></li><li><span><a href="#Cleaning-Reviews" data-toc-modified-id="Cleaning-Reviews-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Cleaning Reviews</a></span></li><li><span><a href="#Split-the-data-set-for-training,-validation-and-test" data-toc-modified-id="Split-the-data-set-for-training,-validation-and-test-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Split the data set for training, validation and test</a></span></li></ul></div>

# Packages 

In [1]:
import pandas as pd # pandas package
pd.options.display.max_columns = 40

import numpy as np # numpy package

# matplotlib packages
import matplotlib
import matplotlib.pyplot as plt 
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)

import seaborn as sns # seaborn package
# dictionary package
from collections import Counter, defaultdict

import warnings  # warnings package
warnings.filterwarnings('ignore')

# plotly packages
from chart_studio import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot

# cufflink packages
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

# interactive shell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'last_expr'

from pathlib import Path # path package
import re #regex package
from textblob import TextBlob #import textblob package

# word cloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# nltk packages
import nltk

#nltk.download('stopwords')
# stop words
from nltk.corpus import stopwords
sw = set(stopwords.words("english"))

# punctuation
from string import punctuation

# detokenizer 
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [2]:
# location of the merged EDA data set, relative to this notebook
eda_data_path = Path(r"../Data/EDA Data/eda_data.csv")

# read the merged data set into a pandas DataFrame
df = pd.read_csv(eda_data_path)

# preview 5 randomly chosen rows
df.sample(5)

Unnamed: 0,review_title,review_text,review_star,date,author,page,product,polarity,review_len,word_count,clean_reviews,bigrams,trigrams
3434,Excellent,Excellent,5.0,04-26-2022,Luis Felipe Sandoval,345,Amazon Fire TV Stick,1.0,9,1,['excellent'],[],[]
6187,Muy bueno,"Muy buen equipo, se debe tener un buen internet",5.0,10-01-2019,Carlos Martínez,124,Google ChromeCast,0.0,47,9,"['muy', 'buen', 'equipo', 'se', 'debe', 'tener...","['muy buen', 'buen equipo', 'equipo se', 'se d...","['muy buen equipo', 'buen equipo se', 'equipo ..."
4021,Very easy to use. Better than cable TV,Works just fine for me,5.0,03-25-2022,Donald Wallace,405,Amazon Fire TV Stick,0.416667,22,5,"['works', 'fine']",[],[]
8779,Less functionality,It ’s a pity that you can only connect to YouT...,3.0,05-14-2020,Jovy H,383,Google ChromeCast,-0.05,64,14,"['pity', 'connect', 'youtube', 'browser']",[],[]
5302,Works great and easy to install,Easy to install with google home app. Great de...,5.0,09-17-2020,La T.,35,Google ChromeCast,0.616667,100,16,"['easy', 'install', 'home', 'app', 'great', 'd...","['home app', 'app great', 'great device', 'str...","['home app great', 'app great device']"


# Removing Unnecessary Columns 

In [3]:
# columns that are not needed for the text-cleaning step
drop_columns = [
    'review_title',
    'date',
    'author',
    'page',
    'product',
    'polarity',
    'review_len',
    'word_count',
    'clean_reviews',
    'bigrams',
    'trigrams',
]

# remove them from the frame in place
df.drop(columns=drop_columns, inplace=True)

# preview 10 randomly chosen rows
df.sample(10)

Unnamed: 0,review_text,review_star
2258,why waist your time with cable when you can bu...,5.0
10942,Works well.,5.0
10232,Our Roku worked just fine for two months. We e...,5.0
11618,This is a great streaming device and the third...,5.0
17182,Excellent interface and user experience. Simpl...,5.0
3204,"Fáciles instrucciones ,cómodo de usar,una exce...",5.0
730,"TV at most hotels is getting sparser, and this...",5.0
11023,I'm very happy with my purchase. It was super ...,5.0
4672,Nice Convenience,5.0
14673,It was easy to set up and use but it only last...,3.0


# Cleaning Reviews 

- Fold to lowercase
- Expand contractions (e.g. "don't" becomes "do not")
- Remove punctuation
- Remove extra whitespace
- Tokenize on whitespace
- Remove stop words
- Remove numbers
- Remove non-ASCII (Unicode) characters

In [4]:
# punctuation characters to strip: string.punctuation plus the curly quotes
# that commonly appear in review text
punctuation = set(punctuation)
include_punctuation = {'’', '”', '“'}
punctuation |= include_punctuation

# extra stop words: filler words and product/brand names that should be excluded.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python ('got' 'must' -> 'gotmust'), which previously kept
# 'must' and 'cast' from ever being filtered.
# NOTE: the pipeline lowercases text and splits it on whitespace before
# stop-word removal, so the capitalised and multi-word entries below cannot
# match a token; they are retained only to leave the set contents unchanged.
include_stopwords = {
    'could', 'shouldn', 'oh', 'know', 'im', 'en',
    'go', 'get', 'got', 'gonna', 'la', 'na', 'de', 'gon', 'must', 'would', 'also',
    'apple', 'Apple', 'Amazon', 'amazon',
    'roku', 'Roku', 'roku remote', 'Rokue Remote',
    'Google', 'google', 'chromecast', 'Chromecast',
    'Chrome Cast', 'chrome cast', 'chrome', 'cast',
    'Fire TV Stick', 'prime', 'firestick4ktv',
    'firestick', 'fire tv', 'fire tv stick', 'fire',
    'firesticks', 'tv', 'remote', '4k', 'stick', 'dont', "it's", 'tvs',
    'etc',
}

# extend the NLTK stop-word set with the project-specific words
sw |= include_stopwords

# one or more whitespace characters; used to split text into tokens
whitespace_pattern = re.compile(r"\s+")

def decontracted(phrase):
    """
    Expand contractions and strip markup and digit noise from one text string.

    Steps, in order:
      1. split the pronoun pairs "she/her", "he/him", "they/them"
      2. insert a space after punctuation that is glued to the next character
      3. expand irregular contractions ("won't", "can't")
      4. remove anything that looks like an <...> tag
      5. expand regular contraction suffixes (n't, 're, 's, 'd, 'll, 't, 've, 'm)
         for both the straight (') and the curly (’) apostrophe
      6. remove tokens that mix letters and digits, then any remaining digits

    Parameters
    ----------
    phrase : str
        Text to normalise. The patterns are lowercase, so the caller is
        expected to lowercase the text first.

    Returns
    -------
    str
        The normalised text. Extra spaces left behind by removals are not
        collapsed here.
    """
    # pronoun pairs: handled before the spacing step, which would otherwise
    # rewrite "she/her" to "she/ her" so these patterns could never match
    phrase = re.sub(r"she/her", "she her", phrase)
    phrase = re.sub(r"he/him", "he him", phrase)
    phrase = re.sub(r"they/them", "they them", phrase)

    # add a space after punctuation directly followed by a non-space character
    phrase = re.sub(r"(?<=[.,!?()/:;])(?=[^\s])", " ", phrase)

    # irregular contractions must run before the generic n't rule
    phrase = re.sub(r"won[’']t", "will not", phrase)
    phrase = re.sub(r"can[’']t", "can not", phrase)

    # strip tags one at a time; a greedy "<.*>" would also delete all the
    # text between the first "<" and the last ">" in the review
    phrase = re.sub(r"<[^>]*>", "", phrase)

    # regular contraction suffixes; n't must precede the bare 't rule
    suffix_rules = (
        (r"n[’']t", " not"),
        (r"[’']re", " are"),
        (r"[’']s", " is"),
        (r"[’']d", " would"),
        (r"[’']ll", " will"),
        (r"[’']t", " not"),
        (r"[’']ve", " have"),
        (r"[’']m", " am"),
    )
    for pattern, expansion in suffix_rules:
        phrase = re.sub(pattern, expansion, phrase)

    # drop tokens that mix word characters and digits (e.g. "4k"), then
    # replace any digits that are left with a space
    phrase = re.sub(r"\w+\d+", "", phrase)
    phrase = re.sub(r"\d+\w+", "", phrase)
    phrase = re.sub(r"\d+", " ", phrase)

    return phrase

def remove_stop(tokens) :
    """
    Return the tokens that are not in the stop-word set `sw`, in their
    original order.
    """
    kept = []
    for token in tokens:
        # skip anything listed as a stop word
        if token in sw:
            continue
        kept.append(token)
    return kept
 
def remove_punctuation(text) :
    """
    Return `text` with every character found in `punctuation` removed.
    """
    # keep only the characters that are not punctuation marks
    kept_chars = (char for char in text if char not in punctuation)
    return "".join(kept_chars)

def tokenize(text) :
    """
    Split `text` on runs of whitespace and return the list of pieces.

    Leading or trailing whitespace produces empty-string pieces at the
    corresponding end of the list.
    """
    return whitespace_pattern.split(text)


def remove_whitespace_token(tokens):
    """
    Return the tokens with every empty-string token removed.

    A new list is built instead of removing items from `tokens` while
    iterating over it: the old loop stopped early on inputs such as
    ['', ''] (the tokens of whitespace-only text) and left an empty token
    behind.

    Parameters
    ----------
    tokens : list of str
        Tokens produced by splitting text on whitespace.

    Returns
    -------
    list of str
        The non-empty tokens, in their original order.
    """
    return [token for token in tokens if token != '']

def prepare(text, pipeline) :
    """
    Run `text` through every function in `pipeline`, in order.

    The input is first converted with str(); each step then receives the
    previous step's result, and the last result is returned.
    """
    result = str(text)

    for step in pipeline :
        # feed the output of one step into the next
        result = step(result)

    return result


# cleaning steps, applied to each review in this order
my_pipeline = [
    str.lower,
    decontracted,
    remove_punctuation,
    tokenize,
    remove_whitespace_token,
    remove_stop,
]



In [5]:
# clean and tokenize the reviews; missing reviews are treated as empty text
# so they yield no tokens instead of the literal word "nan"
df['clean_reviews'] = df['review_text'].fillna('').apply(prepare, pipeline=my_pipeline)

# remove non-ASCII characters token by token and drop tokens that end up empty.
# Series.replace(..., regex=True) only rewrites string values, so it had no
# effect on this column of token lists.
non_ascii_pattern = re.compile(r'[^\x00-\x7F]+')
df['clean_reviews'] = df['clean_reviews'].apply(
    lambda tokens: [
        ascii_token
        for ascii_token in (non_ascii_pattern.sub('', token) for token in tokens)
        if ascii_token
    ]
)

# drop original reviews column
df = df.drop(columns=['review_text'])

# drop rows with no tokens; .copy() so the assignment below writes to an
# independent frame rather than to a slice of the old one
df = df[df['clean_reviews'].str.len() != 0].copy()

# join the tokens of each review back into a single string,
# reusing one detokenizer instead of building one per row
detokenizer = TreebankWordDetokenizer()
df['clean_reviews'] = df['clean_reviews'].apply(detokenizer.detokenize)

# 10 random rows
df.sample(10)

Unnamed: 0,review_star,clean_reviews
7859,5.0,like days already problem working properly slo...
16232,5.0,product exactly described good value price
1318,5.0,everything connected quickly works great
517,5.0,really like upgraded version like alexa voice ...
6031,5.0,works great
7002,1.0,box plugged philips flashing onoff screen unsu...
12115,5.0,great product love things
8491,4.0,love kids
14760,5.0,buying second one works way across house wifi ...
10363,5.0,simple design buy motivation stop buffering is...


# Split the data set for training, validation and test 


- Checking for Class Imbalance
- Downsampling the Majority class
- Split Data on balanced data set
