<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Packages" data-toc-modified-id="Packages-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Packages</a></span></li><li><span><a href="#Removing-Unecessary-Columns" data-toc-modified-id="Removing-Unecessary-Columns-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Removing Unecessary Columns</a></span></li><li><span><a href="#Cleaning-Reviews" data-toc-modified-id="Cleaning-Reviews-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Cleaning Reviews</a></span></li><li><span><a href="#Split-the-data-set-for-training,-validation-and-test" data-toc-modified-id="Split-the-data-set-for-training,-validation-and-test-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Split the data set for training, validation and test</a></span><ul class="toc-item"><li><span><a href="#Checking-for-Class-Imbalance" data-toc-modified-id="Checking-for-Class-Imbalance-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Checking for Class Imbalance</a></span></li><li><span><a href="#Convert-Review-Stars-into-Positive,-Neutral,-and-Negative-classes" data-toc-modified-id="Convert-Review-Stars-into-Positive,-Neutral,-and-Negative-classes-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Convert Review Stars into Positive, Neutral, and Negative classes</a></span></li><li><span><a href="#Checking-for-class-imbalance-after-new-target-classes" data-toc-modified-id="Checking-for-class-imbalance-after-new-target-classes-4.3"><span class="toc-item-num">4.3&nbsp;&nbsp;</span>Checking for class imbalance after new target classes</a></span></li><li><span><a href="#Downsampling/Upsampling-the-Majority/Minority-class" data-toc-modified-id="Downsampling/Upsampling-the-Majority/Minority-class-4.4"><span class="toc-item-num">4.4&nbsp;&nbsp;</span>Downsampling/Upsampling the Majority/Minority class</a></span></li></ul></li></ul></div>

# Packages 

In [None]:
import pandas as pd # pandas package
pd.options.display.max_columns = 40

import numpy as np # numpy package

# matplotlib packages
import matplotlib
import matplotlib.pyplot as plt 
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)

import seaborn as sns # seaborn package
# dictionary package
from collections import Counter, defaultdict

import warnings  # warnings package
warnings.filterwarnings('ignore')

# plotly packages
from chart_studio import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot

# cufflink packages
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

# interactive shell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'last_expr'

from pathlib import Path # path package
import re #regex package
from textblob import TextBlob #import textblob package

# word cloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# nltk packages
import nltk

#nltk.download('stopwords')
# stop words
from nltk.corpus import stopwords
sw = set(stopwords.words("english"))

# punctuation
from string import punctuation

# detokenizer 
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [None]:
# read the merged data set from its CSV file into a pandas DataFrame
df = pd.read_csv(
    Path(r"../Data/EDA Data/eda_data.csv")
)

# preview 5 randomly selected rows
df.sample(n=5)

# Removing Unecessary Columns 

In [None]:
# names of the columns to remove from the data frame
drop_columns = [
    'review_title', 'date', 'author', 'page', 'product',
    'polarity', 'review_len', 'word_count', 'clean_reviews',
    'bigrams', 'trigrams',
]

# remove the listed columns in place
df.drop(columns=drop_columns, inplace=True)

# preview 10 randomly selected rows
df.sample(n=10)

# Cleaning Reviews 

- Remove Punctuation
- Remove extra white space
- Tokenize on white space pattern
- Fold to lowercase
- Remove stopwords
- Remove numbers
- Remove non-ASCII (Unicode) characters

In [None]:
# punctuation characters to strip: everything in `string.punctuation`
# plus the curly apostrophe and curly double quotes
punctuation = set(punctuation)
include_punctuation = {'’', '”', '“'}
punctuation |= include_punctuation

# extra stop words (filler words plus product / brand names) to be excluded.
# Every entry must be followed by a comma: two adjacent string literals are
# silently concatenated by Python, which previously turned 'got' 'must' into
# 'gotmust' and 'cast' 'Fire TV Stick' into 'castFire TV Stick'.
# NOTE(review): the cleaning pipeline lower-cases the text and splits it on
# whitespace before stop words are removed, so the capitalised and multi-word
# entries below can never match a token; they are kept unchanged here.
include_stopwords = {
    'could', 'shouldn', 'oh', 'know', 'im', 'en',
    'go', 'get', 'got', 'gonna', 'la', 'na', 'de', 'gon', 'must', 'would', 'also',
    'apple', 'Apple', 'Amazon', 'amazon',
    'roku', 'Roku', 'roku remote', 'Rokue Remote',
    'Google', 'google', 'chromecast', 'Chromecast',
    'Chrome Cast', 'chrome cast', 'chrome', 'cast',
    'Fire TV Stick', 'prime', 'firestick4ktv',
    'firestick', 'fire tv', 'fire tv stick', 'fire',
    'firesticks', 'tv', 'remote', '4k', 'stick', 'dont', "it's", 'tvs',
    'etc',
}

# merge the extra stop words into the existing stop-word set (in place)
sw.update(include_stopwords)

# compiled pattern matching one or more consecutive whitespace characters
whitespace_pattern = re.compile(r"\s+")

# Ordered (pattern, replacement) tables used by `decontracted`.
# All patterns are lower-case and case-sensitive.

# slash-separated pronoun pairs
_PRONOUN_PAIRS = (
    (r"she/her", "she her"),
    (r"he/him", "he him"),
    (r"they/them", "they them"),
)

# contractions whose stem changes; must run before the general "n't" rule
_IRREGULAR_CONTRACTIONS = (
    (r"won[’']t", "will not"),
    (r"can[’']t", "can not"),
)

# suffix contractions; each accepts a straight or a curly apostrophe
_GENERAL_CONTRACTIONS = (
    (r"n[’']t", " not"),
    (r"[’']re", " are"),
    (r"[’']s", " is"),
    (r"[’']d", " would"),
    (r"[’']ll", " will"),
    (r"[’']t", " not"),
    (r"[’']ve", " have"),
    (r"[’']m", " am"),
)

# tokens that contain digits are dropped; lone digits become a space
_NUMERIC_PATTERNS = (
    (r"\w+\d+", ""),
    (r"\d+\w+", ""),
    (r"\d+", " "),
)


def decontracted(phrase):
    """
    Expand contractions and strip markup / numbers from a piece of text.

    Steps, in order:
      1. split slash-separated pronoun pairs ("she/her" -> "she her")
      2. insert a space after . , ! ? ( ) / : ; when the next character is
         not whitespace
      3. expand "won't" / "can't"
      4. remove <...> tags
      5. expand the remaining suffix contractions (n't, 're, 's, 'd, 'll,
         't, 've, 'm)
      6. remove tokens containing digits and replace lone digits by a space

    Parameters
    ----------
    phrase : str
        Text to process. The patterns are lower-case, so the text is
        expected to have been lower-cased already.

    Returns
    -------
    str
        The processed text.
    """
    # pronoun pairs are handled before the punctuation-spacing step: that
    # step puts a space after "/", after which "she/her" could never match
    for pattern, replacement in _PRONOUN_PAIRS:
        phrase = re.sub(pattern, replacement, phrase)

    # add extra white space after punctuation that is glued to the next word
    phrase = re.sub(r"(?<=[.,!?()/:;])(?=\S)", " ", phrase)

    for pattern, replacement in _IRREGULAR_CONTRACTIONS:
        phrase = re.sub(pattern, replacement, phrase)

    # remove each tag on its own; a greedy "<.*>" would also delete all the
    # text between the first "<" and the last ">" of a line
    phrase = re.sub(r"<[^<>\n]*>", "", phrase)

    for pattern, replacement in _GENERAL_CONTRACTIONS:
        phrase = re.sub(pattern, replacement, phrase)

    for pattern, replacement in _NUMERIC_PATTERNS:
        phrase = re.sub(pattern, replacement, phrase)

    return phrase

def remove_stop(tokens):
    """
    Filter stop words out of a sequence of tokens.

    Returns a new list holding, in their original order, the tokens that
    are not members of the module-level stop-word set `sw`.
    """
    kept_tokens = []
    for token in tokens:
        if token in sw:
            continue
        kept_tokens.append(token)
    return kept_tokens
 
def remove_punctuation(text):
    """
    Return `text` with every character found in the module-level
    `punctuation` collection removed.
    """
    kept_chars = filter(lambda ch: ch not in punctuation, text)
    return "".join(kept_chars)

def tokenize(text):
    """
    Split `text` into a list of tokens on runs of whitespace, using the
    module-level compiled `whitespace_pattern`.
    """
    return whitespace_pattern.split(text)


def remove_whitespace_token(tokens):
    """
    Remove empty-string tokens from a list of tokens.

    Parameters
    ----------
    tokens : list of str
        Tokens, possibly containing '' entries (e.g. produced by splitting
        text that starts or ends with whitespace).

    Returns
    -------
    list of str
        A new list with every empty token removed; the input list is left
        unmodified.
    """
    # Build a new list instead of calling tokens.remove('') inside a loop
    # over `tokens`: removing while iterating shortens the list under the
    # iterator, so e.g. ['', ''] used to come back as [''].
    return [token for token in tokens if token]

def prepare(text, pipeline):
    """
    Run `text` through a sequence of cleaning transformations.

    `text` is first converted with `str()`; each callable in `pipeline` is
    then applied in order, receiving the previous callable's result. The
    result of the last callable is returned (or the string itself when
    `pipeline` is empty).
    """
    result = str(text)

    for step in pipeline:
        result = step(result)

    return result


# ordered cleaning steps handed to `prepare`; each step consumes the output
# of the previous one
my_pipeline = [
    str.lower,                # fold to lowercase
    decontracted,             # expand contractions, strip tags and numbers
    remove_punctuation,       # drop punctuation characters
    tokenize,                 # split on whitespace
    remove_whitespace_token,  # drop empty tokens
    remove_stop,              # drop stop words
]



In [None]:
# clean and tokenize the reviews (one list of tokens per row)
review_tokens = df['review_text'].apply(prepare, pipeline=my_pipeline)

# remove non-ASCII characters from every token and discard tokens that end
# up empty. This is done per token because `Series.replace(..., regex=True)`
# only rewrites string cells, so applying it to a column of token lists
# leaves the data unchanged.
non_ascii_pattern = re.compile(r'[^\x00-\x7F]+')
review_tokens = review_tokens.apply(
    lambda tokens: [
        token
        for token in (non_ascii_pattern.sub('', raw) for raw in tokens)
        if token
    ]
)
df['clean_reviews'] = review_tokens

# drop original reviews column
df.drop(columns=['review_text'], inplace=True)

# drop rows with no tokens; `.copy()` makes the filtered frame independent
# so the column assignment below does not write to a slice of the old frame
df = df[df['clean_reviews'].str.len() != 0].copy()

# join each row's tokens back into a single review string
# (one detokenizer instance is reused for every row)
detokenizer = TreebankWordDetokenizer()
df['clean_reviews'] = df['clean_reviews'].apply(detokenizer.detokenize)

# 10 random rows
df.sample(10)

# Split the data set for training, validation and test 


- Checking for Class Imbalance
- Convert Review Stars into Positive, Neutral, and Negative classes
- Checking for class imbalance after new target classes
- Downsampling/Upsampling the Majority/Minority class
- Split Data on balanced data set


## Checking for Class Imbalance

In [None]:
# Build the per-rating review counts in one chain:
#   - count the rows for each `review_star` value and turn the result into a frame
#   - rename the rating column to 'Ratings' and a counts column labelled 0 to 'count'
#   - keep a copy of the original ratings to sort by, then convert the
#     displayed 'Ratings' column to str for the visualization
#   - order the rows by the original rating value
rating_counts = (
    pd.DataFrame(df[['review_star']].value_counts())
    .reset_index()
    .rename(columns={'review_star': 'Ratings', 0:'count'})
    .assign(
        Ratings_for_sorting=lambda counts: counts['Ratings'],
        Ratings=lambda counts: counts['Ratings'].astype(str),
    )
    .sort_values(by=['Ratings_for_sorting'])
)

# one colour per star rating (keys are the ratings rendered as strings)
star_colors = {
    '1.0':'red',
    '2.0':'orange',
    '3.0': 'yellow',
    '4.0':'rgb(179,226,205)',
    '5.0':'rgb(27,158,119)',
}

# bar chart of the number of reviews per star rating
fig = px.bar(
    rating_counts,
    x="Ratings",
    y="count",
    color="Ratings",
    title= "Customer Ratings - Checking for Class Imbalance",
    text_auto = '.2s',
    color_discrete_map=star_colors,
)

# figure customization: bar labels placed outside the bars, outlined and
# slightly transparent bars
fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False,
    marker_line_width=1.5,
    opacity = 0.75,
)


## Convert Review Stars into Positive, Neutral, and Negative classes

In [None]:
def convert_ratings(x):
    """
    Convert a single `review_star` value into a target class label.

    Parameters
    ----------
    x : float
        Star rating.

    Returns
    -------
    str or float
        'Negative' when x <= 2.0, 'Neutral' when 2.0 < x <= 4.0,
        'Positive' otherwise. A missing rating (NaN / None) is returned
        as NaN instead of being assigned a class.
    """
    # NaN compares False against both thresholds, so without this guard a
    # missing rating would fall through to 'Positive'
    if pd.isna(x):
        return np.nan
    if x <= 2.0:
        return 'Negative'
    # NOTE(review): 4-star reviews are labelled 'Neutral' by this threshold;
    # confirm that this is the intended class boundary
    if x <= 4.0:
        return 'Neutral'
    return 'Positive'

# label every review with its class derived from the star rating
df['new_rating'] = df['review_star'].apply(convert_ratings)

# store the labels as a categorical with an explicit category list
df['new_rating'] = pd.Categorical(
    df['new_rating'],
    categories=["Negative", "Neutral", "Positive"],
)

# preview 10 randomly selected rows
df.sample(n=10)

## Checking for class imbalance after new target classes

In [None]:
# Build the per-class review counts in one chain:
#   - count the rows for each `new_rating` value and turn the result into a frame
#   - rename the class column to 'Ratings' and a counts column labelled 0 to 'count'
#   - keep a copy of the original class column to sort by, then convert the
#     displayed 'Ratings' column to str for the visualization
#   - order the rows by the original class column
rating_counts = (
    pd.DataFrame(df[['new_rating']].value_counts())
    .reset_index()
    .rename(columns={'new_rating': 'Ratings', 0:'count'})
    .assign(
        Ratings_for_sorting=lambda counts: counts['Ratings'],
        Ratings=lambda counts: counts['Ratings'].astype(str),
    )
    .sort_values(by=['Ratings_for_sorting'])
)

# one colour per target class
class_colors = {
    'Negative':'red',
    'Neutral':'yellow',
    'Positive':'rgb(27,158,119)',
}

# bar chart of the number of reviews per target class
fig = px.bar(
    rating_counts,
    x="Ratings",
    y="count",
    color="Ratings",
    title= "Customer Ratings - Checking for Class Imbalance",
    text_auto = '.2s',
    color_discrete_map=class_colors,
)

# figure customization: bar labels placed outside the bars, outlined and
# slightly transparent bars
fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False,
    marker_line_width=1.5,
    opacity = 0.75,
)


## Downsampling/Upsampling the Majority/Minority class