# Packages 

In [1]:
import pandas as pd # pandas package
pd.options.display.max_columns = 40

import numpy as np # numpy package

# matplotlib packages
import matplotlib
import matplotlib.pyplot as plt 
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)

import seaborn as sns # seaborn package
# dictionary package
from collections import Counter, defaultdict

import warnings  # warnings package
warnings.filterwarnings('ignore')

# plotly packages
from chart_studio import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot

# cufflink packages
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

# interactive shell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'last_expr'

from pathlib import Path # path package
import re #regex package
from textblob import TextBlob #import textblob package

# word cloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# nltk packages
import nltk

#nltk.download('stopwords')
# stop words
from nltk.corpus import stopwords
sw = set(stopwords.words("english"))

# punctuation
from string import punctuation

# detokenizer 
from nltk.tokenize.treebank import TreebankWordDetokenizer

# sklearn packages
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline

# imblearn packages for undersampling/oversampling
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import NearMiss 
from imblearn.under_sampling import OneSidedSelection

# pickle package
import _pickle as cPickle

In [2]:
# Read the merged EDA data set from disk into a DataFrame
eda_csv_path = Path(r"../Data/EDA Data/eda_data.csv")
df = pd.read_csv(eda_csv_path)

# Preview five randomly chosen rows
df.sample(n=5)

Unnamed: 0,review_title,review_text,review_star,date,author,page,product,polarity,review_len,word_count,clean_reviews,bigrams,trigrams
5610,Easy set up,Easy to connect and use.,5.0,03-07-2021,Richard D. Rocafort,66,Google ChromeCast,0.433333,24,5,"['easy', 'connect', 'use']",[],[]
10976,Works great on regular TV.,Works great!,5.0,11-16-2020,Sue A. Davidson,103,Roku Remote,1.0,12,2,"['works', 'great']",['works great'],[]
3744,The BEST!,No more buffering!,5.0,01-30-2022,Mary,377,Amazon Fire TV Stick,-0.3125,18,3,['buffering'],[],[]
3772,It performs as advertised,"It works properly and was priced fairly, on sale.",5.0,06-07-2022,Robert B.,380,Amazon Fire TV Stick,0.35,49,9,"['works', 'properly', 'priced', 'fairly', 'sale']","['works properly', 'priced fairly']",[]
8233,Excellent product!,Excellent product! It’s been 2 years since my ...,5.0,03-31-2020,michelle antunez,328,Google ChromeCast,1.0,81,13,"['excellent', 'product', 'years', 'since', 'pu...","['excellent product', '2 years', 'years since'...","['2 years since', 'purchase still working', 's..."


# Removing Unnecessary Columns 

In [3]:
# Columns produced during EDA that are not needed for modeling
drop_columns = [
    'review_title', 'date', 'author', 'page', 'product',
    'polarity', 'review_len', 'word_count', 'clean_reviews',
    'bigrams', 'trigrams',
]

# Keep only the remaining columns
df = df.drop(columns=drop_columns)

# Preview ten randomly chosen rows
df.sample(n=10)

Unnamed: 0,review_text,review_star
12894,"Product worked fine, until mid-movie the other...",5.0
3144,Love everything about it. Easy to install and ...,5.0
17612,Great picture quality. Easy installation,5.0
6097,"Excelente producto, muy fácil de instalar, exc...",5.0
13064,Like this item very much,5.0
11,I've been using Fire Sticks for years. Each ne...,1.0
6375,My neighbor turned me onto to this device. He ...,3.0
9989,"I bought a few of these, cause we had tried al...",1.0
6738,Many smart TVs have YouTube built-in but using...,5.0
8883,Worth the money and works!!,5.0


# Cleaning Reviews 

- Remove Punctuation
- Remove extra white space
- Tokenize on white space pattern
- Fold to lowercase
- Remove stopwords
- Remove numbers
- Remove unicode characters

In [4]:
# Characters treated as punctuation: ASCII punctuation plus the curly quotes
# that appear in the scraped reviews.
punctuation = set(punctuation)
include_punctuation = {'’', '”', '“'}
punctuation |= include_punctuation

# Extra stop words (filler words plus brand/product names) to exclude.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python ('got' 'must' -> 'gotmust'), which previously
# kept 'must' and 'cast' out of the stop-word set.
include_stopwords = {
    'could', 'shouldn', 'oh', 'know', 'im', 'en',
    'go', 'get', 'got', 'gonna', 'la', 'na', 'de', 'gon', 'must', 'would', 'also',
    'apple', 'Apple', 'Amazon', 'amazon',
    'roku', 'Roku', 'roku remote', 'Rokue Remote',
    'Google', 'google', 'chromecast', 'Chromecast',
    'Chrome Cast', 'chrome cast', 'chrome', 'cast',
    'Fire TV Stick', 'prime', 'firestick4ktv',
    'firestick', 'fire tv', 'fire tv stick', 'fire',
    'firesticks', 'tv', 'remote', '4k', 'stick', 'dont', "it's", 'tvs',
    'etc',
}

# Merge the custom entries into the NLTK English stop-word set
sw |= include_stopwords

# Pattern matching one or more whitespace characters (used by `tokenize`)
whitespace_pattern = re.compile(r"\s+")

def decontracted(phrase):
    """
    Expand English contractions and strip tags and digit-bearing tokens.

    The function is case-sensitive and matches lowercase contractions only,
    so callers should lowercase the text first. Both the straight (') and
    the curly (’) apostrophe are recognised.

    Parameters
    ----------
    phrase : str
        Text to normalise.

    Returns
    -------
    str
        The text with a space inserted after punctuation that was glued to the
        next character, contractions expanded, ``<...>`` tags removed, tokens
        that mix word characters and digits removed, and remaining digit runs
        replaced by a single space.
    """
    # add a space after punctuation that is directly followed by a character
    phrase = re.sub(r"(?<=[.,!?()/:;])(?=[^\s])", " ", phrase)

    # irregular contractions: must run before the generic "n't" rule
    phrase = re.sub(r"won['’]t", "will not", phrase)
    phrase = re.sub(r"can['’]t", "can not", phrase)

    # remove tags one at a time; the non-greedy match keeps the text that sits
    # between two separate tags (a greedy `<.*>` would delete it)
    phrase = re.sub(r"<.*?>", "", phrase)

    # generic contractions, applied in this order.
    # The former she/her, he/him, they/them, don’t, it's and we've rules were
    # unreachable (the spacing step or an earlier rule always fired first),
    # so they are omitted.
    contraction_rules = (
        (r"n['’]t", " not"),
        (r"['’]re", " are"),
        (r"['’]s", " is"),
        (r"['’]d", " would"),
        (r"['’]ll", " will"),
        (r"['’]t", " not"),
        (r"['’]ve", " have"),
        (r"['’]m", " am"),
    )
    for pattern, expansion in contraction_rules:
        phrase = re.sub(pattern, expansion, phrase)

    # drop tokens that mix word characters with digits, then lone digit runs
    phrase = re.sub(r"\w+\d+", "", phrase)
    phrase = re.sub(r"\d+\w+", "", phrase)
    phrase = re.sub(r"\d+", " ", phrase)

    return phrase

def remove_stop(tokens) :
    """
    Return the tokens that are not in the stop-word set `sw`, in order.
    """
    kept_tokens = []
    for token in tokens:
        if token in sw:
            continue
        kept_tokens.append(token)
    return kept_tokens
 
def remove_punctuation(text) : 
    """
    Return `text` with every character found in `punctuation` removed.
    """
    surviving_chars = filter(lambda char: char not in punctuation, text)
    return "".join(surviving_chars)

def tokenize(text) : 
    """Split `text` on runs of whitespace and return the list of pieces."""
    return whitespace_pattern.split(text)


def remove_whitespace_token(tokens):
    """
    Remove every empty-string token from `tokens`.

    The list is updated in place and also returned. The previous version
    removed items while iterating over the same list, which skipped elements
    and could leave empty tokens behind (e.g. ['', ''] became ['']).
    """
    # slice assignment keeps the caller's list object while filtering it
    tokens[:] = [token for token in tokens if token != '']
    return tokens

def prepare(text, pipeline) :
    """
    Run `text` through a sequence of transformations.

    The input is first converted with `str`; each callable in `pipeline`
    is then applied in order to the result of the previous one, and the
    final result is returned.
    """
    current = str(text)
    for step in pipeline:
        current = step(current)
    return current


# ordered cleaning steps applied to every review by `prepare`
my_pipeline = [
    str.lower,
    decontracted,
    remove_punctuation,
    tokenize,
    remove_whitespace_token,
    remove_stop,
]



In [5]:
# clean and tokenize every review
df['clean_reviews'] = df['review_text'].apply(prepare, pipeline=my_pipeline)

# remove non-ASCII characters from every token and drop tokens left empty.
# A regex `replace` on this column does nothing because each cell holds a
# list of tokens rather than a string, so the substitution is done per token.
non_ascii_pattern = re.compile(r'[^\x00-\x7F]+')
df['clean_reviews'] = df['clean_reviews'].apply(
    lambda tokens: [
        ascii_token
        for ascii_token in (non_ascii_pattern.sub('', token) for token in tokens)
        if ascii_token
    ]
)

# drop original reviews column
df = df.drop(columns=['review_text'])

# drop rows with no tokens; copy so later assignments do not target a slice
df = df[df['clean_reviews'].str.len() != 0].copy()

# join the tokens of each review back into a single string
detokenizer = TreebankWordDetokenizer()
df['clean_reviews'] = df['clean_reviews'].apply(detokenizer.detokenize)

# 10 random rows
df.sample(10)

Unnamed: 0,review_star,clean_reviews
13485,5.0,easy install many available channels
8374,4.0,good product hooked universe change wifi param...
4157,5.0,yes
12169,5.0,easy set easy use
16110,4.0,defaults site wanting buy stuff nonstop gets o...
7837,5.0,already one needed another quality fine easy use
1281,5.0,easy setup works surround sound systems curren...
15774,5.0,finally updated previousgeneration version wow...
11205,5.0,great first streaming family used since cut ca...
13956,4.0,worked great items included package evening co...


# Split the data set for training and testing 


- Checking for Class Imbalance
- Split the data with stratification so the training and test sets keep the same class proportions
- Use TF-IDF and Ordinal Encoder for data pre-processing on training data



- Over-sampling the Minority class with SMOTE-ENN
- Checking for Class Imbalance after Oversampling (SMOTE-ENN)



- Under-sampling the Majority class with Near-Miss Undersampling
- Checking for Class Imbalance after Near-Miss Undersampling



- Under-sampling the Majority class with One-Sided Selection (OSS) Undersampling
- Checking for Class Imbalance after One-Sided Selection (OSS) Undersampling


## Checking for Class Imbalance

In [6]:
# count the number of reviews per star rating and name the count column
star_counts = df[['review_star']].value_counts()
rating_counts = pd.DataFrame(star_counts).reset_index()
rating_counts = rating_counts.rename(columns={'review_star': 'Ratings', 0: 'count'})

# keep a numeric copy of the rating for sorting
rating_counts['Ratings_for_sorting'] = rating_counts['Ratings']

# plotly needs string categories to apply the discrete color map
rating_counts['Ratings'] = rating_counts['Ratings'].astype(str)

# order the bars by rating
rating_counts = rating_counts.sort_values(by=['Ratings_for_sorting'])

# one color per star rating
rating_colors = {'1.0':'red', '2.0':'orange', '3.0': 'yellow',
                 '4.0':'rgb(179,226,205)', '5.0':'rgb(27,158,119)'}

# bar chart of the number of reviews per rating
fig = px.bar(
    rating_counts,
    x="Ratings",
    y="count",
    color="Ratings",
    title= "Customer Ratings - Checking for Class Imbalance",
    text_auto = '.2s',
    color_discrete_map=rating_colors,
)

# figure customization
fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False,
    marker_line_width=1.5,
    opacity = 0.75,
)


## Train-Test-Split

In [7]:
# Model input: the cleaned review text
X = df['clean_reviews']

# Target: the star rating, kept as a one-column frame
y = df[['review_star']]

# 80% training / 20% test, stratified on the target so both parts
# keep the same class proportions
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Report the number of rows in each part
print('Size of Training Data ', X_train.shape[0])
print('Size of Test Data ', X_test.shape[0])

Size of Training Data  14084
Size of Test Data  3522


## Use TF-IDF and Ordinal Encoder for data pre-processing

In [8]:
# Fit the TF-IDF vocabulary on the training reviews only, then vectorize them.
# `stop_words` is passed as a list: recent scikit-learn releases validate this
# parameter as 'english', a list or None and reject a set. Sorting keeps the
# argument deterministic.
tfidf = TfidfVectorizer(stop_words=sorted(sw))
X_train = tfidf.fit_transform(X_train)

# Encode the star ratings as ordinal codes, fitted on the training labels only.
# NOTE: X_train / Y_train are overwritten here, so this cell must run exactly
# once after the train-test split.
ord_en = OrdinalEncoder()
Y_train = ord_en.fit_transform(Y_train)

## Over-sampling the Minority class using SMOTE with Edited Nearest Neighbor (ENN)

This method combines SMOTE, which generates synthetic examples for the minority classes, with Edited Nearest Neighbours (ENN), which then removes observations from any class whose label differs from the majority label of their K nearest neighbours.

In [9]:
# SMOTE-ENN resampling, fitted on the training set only.
# A fixed random_state (the same seed as the train-test split) makes the
# synthetic samples reproducible across kernel restarts.
oversample = SMOTEENN(random_state=42)
X_train_smote, Y_train_smote = oversample.fit_resample(X_train, Y_train)

## Checking for Class Imbalance after SMOTE with ENN

In [10]:
# count the number of training rows per encoded rating after SMOTE-ENN
smote_labels = pd.DataFrame(Y_train_smote).rename(columns = {0:'review_star'})
rating_counts = pd.DataFrame(smote_labels.value_counts()).reset_index()
rating_counts = rating_counts.rename(columns={'review_star': 'Ratings', 0: 'count'})

# keep a numeric copy of the rating for sorting
rating_counts['Ratings_for_sorting'] = rating_counts['Ratings']

# plotly needs string categories to apply the discrete color map
rating_counts['Ratings'] = rating_counts['Ratings'].astype(str)

# order the bars by rating
rating_counts = rating_counts.sort_values(by=['Ratings_for_sorting'])

# one color per encoded rating (the ordinal encoder's codes start at 0.0)
rating_colors = {'0.0':'red', '1.0':'orange', '2.0': 'yellow',
                 '3.0':'rgb(179,226,205)', '4.0':'rgb(27,158,119)'}

# bar chart of the number of rows per rating
fig = px.bar(
    rating_counts,
    x="Ratings",
    y="count",
    color="Ratings",
    title= "Customer Ratings - After SMOTE (Oversampling)",
    text_auto = '.2s',
    color_discrete_map=rating_colors,
)

# figure customization
fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False,
    marker_line_width=1.5,
    opacity = 0.75,
)


## Near-Miss Undersampling (Version 3)

NearMiss-1 selects examples from the majority class that have the smallest average distance to the three closest examples from the minority class. NearMiss-2 selects examples from the majority class that have the smallest average distance to the three furthest examples from the minority class. NearMiss-3 involves selecting a given number of majority class examples for each example in the minority class that are closest.

We will use version 3.

In [11]:
# NearMiss undersampling, version 3, as stated in the section text and in the
# export list (the code previously requested version 1).
nr = NearMiss(version = 3, n_neighbors=3) 

# fit to training data only
X_train_near, Y_train_near= nr.fit_resample(X_train, Y_train) 


## Checking for Class Imbalance after Near-miss Undersampling 

In [12]:
# count the number of training rows per encoded rating after NearMiss
near_labels = pd.DataFrame(Y_train_near).rename(columns = {0:'review_star'})
rating_counts = pd.DataFrame(near_labels.value_counts()).reset_index()
rating_counts = rating_counts.rename(columns={'review_star': 'Ratings', 0: 'count'})

# keep a numeric copy of the rating for sorting
rating_counts['Ratings_for_sorting'] = rating_counts['Ratings']

# plotly needs string categories to apply the discrete color map
rating_counts['Ratings'] = rating_counts['Ratings'].astype(str)

# order the bars by rating
rating_counts = rating_counts.sort_values(by=['Ratings_for_sorting'])

# one color per encoded rating (the ordinal encoder's codes start at 0.0)
rating_colors = {'0.0':'red', '1.0':'orange', '2.0': 'yellow',
                 '3.0':'rgb(179,226,205)', '4.0':'rgb(27,158,119)'}

# bar chart of the number of rows per rating
fig = px.bar(
    rating_counts,
    x="Ratings",
    y="count",
    color="Ratings",
    title= "Customer Ratings - After Near-Miss Undersampling",
    text_auto = '.2s',
    color_discrete_map=rating_colors,
)

# figure customization
fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False,
    marker_line_width=1.5,
    opacity = 0.75,
)

## One-Sided Selection Undersampling

One-Sided Selection, or OSS for short, is an undersampling technique that combines Tomek Links and the Condensed Nearest Neighbor (CNN) Rule.

In [13]:
# One-Sided Selection undersampling.
# A fixed random_state (the same seed as the train-test split) makes the
# seed-sample selection reproducible across kernel restarts.
oss = OneSidedSelection(n_neighbors=1, n_seeds_S=200, random_state=42)

# fit to training data only
X_train_oss, Y_train_oss= oss.fit_resample(X_train, Y_train) 

## Checking for Class Imbalance after One-Sided Selection Undersampling

In [14]:
# count the number of training rows per encoded rating after One-Sided Selection
oss_labels = pd.DataFrame(Y_train_oss).rename(columns = {0:'review_star'})
rating_counts = pd.DataFrame(oss_labels.value_counts()).reset_index()
rating_counts = rating_counts.rename(columns={'review_star': 'Ratings', 0: 'count'})

# keep a numeric copy of the rating for sorting
rating_counts['Ratings_for_sorting'] = rating_counts['Ratings']

# plotly needs string categories to apply the discrete color map
rating_counts['Ratings'] = rating_counts['Ratings'].astype(str)

# order the bars by rating
rating_counts = rating_counts.sort_values(by=['Ratings_for_sorting'])

# one color per encoded rating (the ordinal encoder's codes start at 0.0)
rating_colors = {'0.0':'red', '1.0':'orange', '2.0': 'yellow',
                 '3.0':'rgb(179,226,205)', '4.0':'rgb(27,158,119)'}

# bar chart of the number of rows per rating
fig = px.bar(
    rating_counts,
    x="Ratings",
    y="count",
    color="Ratings",
    title= "Customer Ratings - After One-Sided Selection Undersampling",
    text_auto = '.2s',
    color_discrete_map=rating_colors,
)

# figure customization
fig.update_traces(
    textfont_size=12,
    textangle=0,
    textposition="outside",
    cliponaxis=False,
    marker_line_width=1.5,
    opacity = 0.75,
)

# Export the Resampled Training Data and the Test Data

- SMOTE with ENN training data 
- Near-miss Undersampling (Version 3) training data
- One-sided Selection Undersampling training data
- Original Test Data

In [15]:
# SMOTE-ENN X training data
x_smote_path = Path(r"../Data/Preparation for Modeling Data/Multi-classification/X_train_smote.pickle")
with x_smote_path.open('wb') as output_file:
    cPickle.dump(X_train_smote, output_file)

# SMOTE-ENN Y training data
y_smote_path = Path(r"../Data/Preparation for Modeling Data/Multi-classification/Y_train_smote.pickle")
with y_smote_path.open('wb') as output_file:
    cPickle.dump(Y_train_smote, output_file)

In [16]:
# Near-miss X training data
x_near_path = Path(r"../Data/Preparation for Modeling Data/Multi-classification/X_train_near.pickle")
with x_near_path.open('wb') as output_file:
    cPickle.dump(X_train_near, output_file)

# Near-miss Y training data
y_near_path = Path(r"../Data/Preparation for Modeling Data/Multi-classification/Y_train_near.pickle")
with y_near_path.open('wb') as output_file:
    cPickle.dump(Y_train_near, output_file)

In [17]:
# One-Sided Selection X training data
x_oss_path = Path(r"../Data/Preparation for Modeling Data/Multi-classification/X_train_oss.pickle")
with x_oss_path.open('wb') as output_file:
    cPickle.dump(X_train_oss, output_file)

# One-Sided Selection Y training data
y_oss_path = Path(r"../Data/Preparation for Modeling Data/Multi-classification/Y_train_oss.pickle")
with y_oss_path.open('wb') as output_file:
    cPickle.dump(Y_train_oss, output_file)

In [18]:
# vectorize the test reviews with the TF-IDF model fitted on the training data
X_test = tfidf.transform(X_test)

# encode the test ratings with the encoder fitted on the training data
Y_test = ord_en.transform(Y_test)

# Original X test data
x_test_path = Path(r"../Data/Preparation for Modeling Data/Multi-classification/X_test.pickle")
with x_test_path.open('wb') as output_file:
    cPickle.dump(X_test, output_file)

# Original Y test data
y_test_path = Path(r"../Data/Preparation for Modeling Data/Multi-classification/Y_test.pickle")
with y_test_path.open('wb') as output_file:
    cPickle.dump(Y_test, output_file)

In [19]:
# How to read pickle files
#with open(Path(r"../Data/Preparation for Modeling Data/Multi-classification/X_train_smote.pickle"), "rb") as input_file:
    #X_train_smote_pickle = cPickle.load(input_file)
#X_train_smote_pickle