# Packages 

In [1]:
import pandas as pd # pandas package
pd.options.display.max_columns = 40

import numpy as np # numpy package

# matplotlib packages
import matplotlib
import matplotlib.pyplot as plt 
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)

import seaborn as sns # seaborn package
# dictionary package
from collections import Counter, defaultdict

import warnings  # warnings package
# NOTE(review): with no category given, this filter ignores every warning for the rest
# of the session, which also hides warnings emitted while models are being tuned.
warnings.filterwarnings('ignore')
from sklearn.exceptions import DataConversionWarning
# Already covered by the blanket filter above; states explicitly that DataConversionWarning
# is meant to be silenced.
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

# plotly packages
from chart_studio import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot

# cufflink packages
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

# interactive shell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'last_expr'

from pathlib import Path # path package
import re #regex package
from textblob import TextBlob #import textblob package

# word cloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# nltk packages
import nltk

#nltk.download('stopwords')
# stop words
from nltk.corpus import stopwords
sw = set(stopwords.words("english"))

# punctuation
from string import punctuation

# detokenizer 
from nltk.tokenize.treebank import TreebankWordDetokenizer

# sklearn packages
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# imblearn packages for undersampling/oversampling
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import NearMiss 
from imblearn.under_sampling import OneSidedSelection

# pickle package
import _pickle as cPickle
import joblib

# Loading Training and Test Sets

In [2]:
## Load the persisted tfidf object; it is used at the end of the notebook to transform new text
tfidf_file = 'tfidf.pkl'
tfidf = joblib.load(tfidf_file)

## SMOTE+ENN Training Data

In [3]:
# Features (X) of the SMOTE+ENN resampled training set
x_train_smote_path = Path(r"../Data/Preparation for Modeling Data/Binary Classification/X_train_smote.pickle")
with x_train_smote_path.open("rb") as input_file:
    X_train_smote = cPickle.load(input_file)

# Matching target (Y); squeeze() drops any length-1 axis from the unpickled object
y_train_smote_path = Path(r"../Data/Preparation for Modeling Data/Binary Classification/Y_train_smote.pickle")
with y_train_smote_path.open("rb") as input_file:
    Y_train_smote = cPickle.load(input_file).squeeze()

## Near-miss Sampling Training Data 

In [None]:
# Features (X) of the Near-miss undersampled training set
x_train_near_path = Path(r"../Data/Preparation for Modeling Data/Binary Classification/X_train_near.pickle")
with x_train_near_path.open("rb") as input_file:
    X_train_near = cPickle.load(input_file)

# Matching target (Y); squeeze() drops any length-1 axis from the unpickled object
y_train_near_path = Path(r"../Data/Preparation for Modeling Data/Binary Classification/Y_train_near.pickle")
with y_train_near_path.open("rb") as input_file:
    Y_train_near = cPickle.load(input_file).squeeze()

## One-sided Selection Sampling Training Data 

In [None]:
# Features (X) of the One-sided Selection undersampled training set
x_train_oss_path = Path(r"../Data/Preparation for Modeling Data/Binary Classification/X_train_oss.pickle")
with x_train_oss_path.open("rb") as input_file:
    X_train_oss = cPickle.load(input_file)

# Matching target (Y); squeeze() drops any length-1 axis from the unpickled object
y_train_oss_path = Path(r"../Data/Preparation for Modeling Data/Binary Classification/Y_train_oss.pickle")
with y_train_oss_path.open("rb") as input_file:
    Y_train_oss = cPickle.load(input_file).squeeze()

## Test Data

In [None]:
# Features (X) of the test set that every model below is evaluated on
x_test_path = Path(r"../Data/Preparation for Modeling Data/Binary Classification/X_test.pickle")
with x_test_path.open("rb") as input_file:
    X_test = cPickle.load(input_file)

# Matching target (Y); squeeze() drops any length-1 axis from the unpickled object
y_test_path = Path(r"../Data/Preparation for Modeling Data/Binary Classification/Y_test.pickle")
with y_test_path.open("rb") as input_file:
    Y_test = cPickle.load(input_file).squeeze()

# Baseline Model - Dummy Classifier 

## Dummy Classifier with SMOTE+ENN Training Data

In [None]:
# Baseline: always predict the most frequent class seen in the SMOTE+ENN training data
clf = DummyClassifier(strategy='most_frequent').fit(X_train_smote, Y_train_smote)

# Score the baseline on the test data
Y_pred_smote_baseline = clf.predict(X_test)
smote_baseline_accuracy = round(accuracy_score(Y_test, Y_pred_smote_baseline), 3)

# Report accuracy (3 decimals) followed by the per-class report
print('Accuracy Score with SMOTE+ENN Training Data ', smote_baseline_accuracy)
print(classification_report(Y_test, Y_pred_smote_baseline))

## Dummy Classifier with Near-miss Sampling Training Data

In [None]:
# Baseline: always predict the most frequent class seen in the Near-miss training data
clf = DummyClassifier(strategy='most_frequent').fit(X_train_near, Y_train_near)

# Score the baseline on the test data
Y_pred_near_baseline = clf.predict(X_test)
near_baseline_accuracy = round(accuracy_score(Y_test, Y_pred_near_baseline), 3)

# Report accuracy (3 decimals) followed by the per-class report
print('Accuracy Score with near-miss sampling Training Data ', near_baseline_accuracy)
print(classification_report(Y_test, Y_pred_near_baseline))

## Dummy Classifier with One-sided Selection Training Data

In [None]:
# Baseline: always predict the most frequent class seen in the One-sided Selection training data
clf = DummyClassifier(strategy='most_frequent').fit(X_train_oss, Y_train_oss)

# Score the baseline on the test data
Y_pred_oss_baseline = clf.predict(X_test)
oss_baseline_accuracy = round(accuracy_score(Y_test, Y_pred_oss_baseline), 3)

# Report accuracy (3 decimals) followed by the per-class report
print('Accuracy Score with one-sided selection sampling Training Data ', oss_baseline_accuracy)
print(classification_report(Y_test, Y_pred_oss_baseline))

# SVM Classifier - Hyperparameter Tuning

- Hyper-parameter tuning on  Training SMOTE+ENN Data
- Hyper-parameter tuning on  Training Near-miss Sampling Data
- Hyper-parameter tuning on  Training One-sided Selection Sampling Data

In [None]:
# Single-step pipeline wrapping the linear SVM; the step name 'model' is the prefix
# used by the 'model__*' keys in the search grid below.
training_pipeline = Pipeline(
    steps=[('model', LinearSVC(random_state=42, tol=1e-5))]
)

# GridSearchCV explores each dictionary as an independent sub-grid.
# LinearSVC does not support penalty='l1' together with loss='hinge', so the L1 penalty
# gets its own sub-grid with the combination it does support (squared_hinge, dual=False).
# Previously the l1/hinge candidate could never be fitted and only wasted CV folds.
grid_param = [
    {
        # L2-regularised hinge loss; this combination requires the dual formulation
        'model__penalty': ['l2'],
        'model__loss': ['hinge'],
        'model__dual': [True],
        'model__max_iter': [10000],
    },
    {
        # L1-regularised squared hinge loss; only available in the primal formulation
        'model__penalty': ['l1'],
        'model__loss': ['squared_hinge'],
        'model__dual': [False],
        'model__max_iter': [10000],
    },
    {
        # Regularisation strength and stopping tolerance with the default penalty/loss
        'model__C': [1, 10],
        'model__tol': [1e-2, 1e-3],
    },
]

## SVM Classifier - Training on SMOTE+ENN Data 

In [None]:
# 5-fold grid search over grid_param, using all available cores
gridSearchProcessor = GridSearchCV(
    estimator=training_pipeline,
    param_grid=grid_param,
    cv=5,
    n_jobs=-1,
)

# Tune on the SMOTE+ENN training data
gridSearchProcessor.fit(X_train_smote, Y_train_smote)

# Keep the winning parameters, the refitted best estimator and its mean CV score
smote_best_params = gridSearchProcessor.best_params_
smote_best_model = gridSearchProcessor.best_estimator_
smote_best_result = gridSearchProcessor.best_score_

print("Best alpha parameter identified by grid search for SMOTE+ENN Data ", smote_best_params)
print("Best result identified by grid search for SMOTE+ENN Data", smote_best_result)

# Top five candidates by rank, to see how close the runners-up were
gridsearch_results = pd.DataFrame(gridSearchProcessor.cv_results_)
summary_columns = ['rank_test_score', 'mean_test_score', 'params']
gridsearch_results[summary_columns].sort_values(by=['rank_test_score']).head(5)

In [None]:
# Evaluate the SVM tuned on SMOTE+ENN data against the test set
Y_pred_smote = smote_best_model.predict(X_test)
smote_svm_accuracy = accuracy_score(Y_test, Y_pred_smote)
print('Accuracy Score for SMOTE+ENN Data ', smote_svm_accuracy)
print(classification_report(Y_test, Y_pred_smote))

## SVM Classifier - Training on Near-miss Sampling Data 

In [None]:
# 5-fold grid search over grid_param, using all available cores
gridSearchProcessor = GridSearchCV(
    estimator=training_pipeline,
    param_grid=grid_param,
    cv=5,
    n_jobs=-1,
)

# Tune on the Near-miss training data
gridSearchProcessor.fit(X_train_near, Y_train_near)

# Keep the winning parameters, the refitted best estimator and its mean CV score
near_best_params = gridSearchProcessor.best_params_
near_best_model = gridSearchProcessor.best_estimator_
near_best_result = gridSearchProcessor.best_score_

print("Best alpha parameter identified by grid search for Near-miss Sampling Data ", near_best_params)
print("Best result identified by grid search for Near-miss Sampling Data", near_best_result)

# Top five candidates by rank, to see how close the runners-up were
gridsearch_results = pd.DataFrame(gridSearchProcessor.cv_results_)
summary_columns = ['rank_test_score', 'mean_test_score', 'params']
gridsearch_results[summary_columns].sort_values(by=['rank_test_score']).head(5)

In [None]:
# Evaluate the SVM tuned on Near-miss data against the test set
Y_pred_near = near_best_model.predict(X_test)
near_svm_accuracy = accuracy_score(Y_test, Y_pred_near)
print('Accuracy Score for Near-miss Sampling Data ', near_svm_accuracy)
print(classification_report(Y_test, Y_pred_near))

## SVM Classifier - Training on One-sided Selection Sampling Data 

In [None]:
# 5-fold grid search over grid_param, using all available cores
gridSearchProcessor = GridSearchCV(
    estimator=training_pipeline,
    param_grid=grid_param,
    cv=5,
    n_jobs=-1,
)

# Tune on the One-sided Selection training data
gridSearchProcessor.fit(X_train_oss, Y_train_oss)

# Keep the winning parameters, the refitted best estimator and its mean CV score
oss_best_params = gridSearchProcessor.best_params_
oss_best_model = gridSearchProcessor.best_estimator_
oss_best_result = gridSearchProcessor.best_score_

print("Best alpha parameter identified by grid search for One-sided selection Sampling Data ", oss_best_params)
print("Best result identified by grid search for One-sided selection Sampling Data", oss_best_result)

# Top five candidates by rank, to see how close the runners-up were
gridsearch_results = pd.DataFrame(gridSearchProcessor.cv_results_)
summary_columns = ['rank_test_score', 'mean_test_score', 'params']
gridsearch_results[summary_columns].sort_values(by=['rank_test_score']).head(5)

In [None]:
# Evaluate the SVM tuned on One-sided Selection data against the test set
Y_pred_oss = oss_best_model.predict(X_test)
oss_svm_accuracy = accuracy_score(Y_test, Y_pred_oss)
print('Accuracy Score for One-sided selection Sampling Data ', oss_svm_accuracy)
print(classification_report(Y_test, Y_pred_oss))

# Random Forest Classifier - Hyperparameter Tuning

- Hyper-parameter tuning on  Training SMOTE+ENN Data
- Hyper-parameter tuning on  Training Near-miss Sampling Data
- Hyper-parameter tuning on  Training One-sided Selection Sampling Data

In [None]:
# Number of features to consider at every split.
# For RandomForestClassifier 'auto' meant the same as 'sqrt' (and was deprecated in
# scikit-learn 1.1, removed in 1.3), so listing both searched one setting twice.
# 'log2' is used as the second, genuinely different option.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree: 11 evenly spaced depths from 10 to 50, plus
# None for no depth limit
max_depth = [int(x) for x in np.linspace(10, 50, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Rf model (seeded so that a given parameter set always builds the same forest)
rf = RandomForestClassifier(random_state=42)

# Search space sampled by the RandomizedSearchCV cells below
random_grid = {'n_estimators': [100,300,500],
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}


## RF Classifier - Training on SMOTE+ENN Data 

In [None]:
# Random search to find best parameters: 100 candidates sampled from random_grid,
# each scored with 3-fold CV on all available cores.
# random_state pins which candidates are sampled, so the reported best parameters
# are reproducible across kernel restarts.
rf_gridSearchProcessor = RandomizedSearchCV(estimator=rf,
                                            param_distributions=random_grid,
                                            cv=3, n_jobs = -1,
                                            n_iter = 100,
                                            random_state=42)

# SMOTE+ENN data
rf_gridSearchProcessor.fit(X_train_smote, Y_train_smote)

# best parameters for SMOTE+ENN Data
rf_smote_best_params = rf_gridSearchProcessor.best_params_

# best model for SMOTE+ENN Data (refitted on the full SMOTE+ENN training data)
rf_smote_best_model = rf_gridSearchProcessor.best_estimator_

# print out best parameters and best mean CV score for SMOTE+ENN Data
print("Best alpha parameter identified by grid search for SMOTE+ENN Data ", rf_smote_best_params)
rf_smote_best_result = rf_gridSearchProcessor.best_score_
print("Best result identified by grid search for SMOTE+ENN Data", rf_smote_best_result)


# see other parameter results: the five best-ranked candidates
gridsearch_results = pd.DataFrame(rf_gridSearchProcessor.cv_results_)
gridsearch_results[['rank_test_score', 'mean_test_score',
'params']].sort_values(by=['rank_test_score'])[:5]

In [None]:
# Evaluate the random forest tuned on SMOTE+ENN data against the test set
Y_pred_smote_rf = rf_smote_best_model.predict(X_test)
smote_rf_accuracy = accuracy_score(Y_test, Y_pred_smote_rf)
print('Accuracy Score for SMOTE+ENN Data ', smote_rf_accuracy)
print(classification_report(Y_test, Y_pred_smote_rf))

## RF Hyper-parameter tuning on Training Near-miss Sampling Data

In [None]:
# Random search to find best parameters: 100 candidates sampled from random_grid,
# each scored with 3-fold CV on all available cores.
# random_state pins which candidates are sampled, so the reported best parameters
# are reproducible across kernel restarts.
rf_gridSearchProcessor = RandomizedSearchCV(estimator=rf,
                                            param_distributions=random_grid,
                                            cv=3, n_jobs = -1,
                                            n_iter = 100,
                                            random_state=42)


# Near-miss data
rf_gridSearchProcessor.fit(X_train_near, Y_train_near)

# best parameters for Near-miss sampling data
rf_near_best_params = rf_gridSearchProcessor.best_params_

# best model for Near-miss sampling data (refitted on the full Near-miss training data)
rf_near_best_model = rf_gridSearchProcessor.best_estimator_

# print out best parameters and best mean CV score for Near-miss sampling data
print("Best alpha parameter identified by grid search for Near-miss sampling data ", rf_near_best_params)
rf_near_best_result = rf_gridSearchProcessor.best_score_
print("Best result identified by grid search for Near-miss sampling data ", rf_near_best_result)


# see other parameter results: the five best-ranked candidates
gridsearch_results = pd.DataFrame(rf_gridSearchProcessor.cv_results_)
gridsearch_results[['rank_test_score', 'mean_test_score',
'params']].sort_values(by=['rank_test_score'])[:5]

In [None]:
# Model Evaluation: random forest tuned on Near-miss data, scored on the test set.
# The label previously read "SMOTE+ENN Data" (copy-paste from the cell above), which
# attributed this accuracy to the wrong training set.
Y_pred_near_rf = rf_near_best_model.predict(X_test)
print('Accuracy Score for Near-miss Sampling Data ', accuracy_score(Y_test, Y_pred_near_rf))
print(classification_report(Y_test, Y_pred_near_rf))

## RF Hyper-parameter tuning on Training One-sided Selection Sampling Data


In [None]:
# Random search to find best parameters: 100 candidates sampled from random_grid,
# each scored with 3-fold CV on all available cores.
# random_state pins which candidates are sampled, so the reported best parameters
# are reproducible across kernel restarts.
rf_gridSearchProcessor = RandomizedSearchCV(estimator=rf,
                                            param_distributions=random_grid,
                                            cv=3, n_jobs = -1,
                                            n_iter = 100,
                                            random_state=42)

# oss data
rf_gridSearchProcessor.fit(X_train_oss, Y_train_oss)

# best parameters for oss sampling data
rf_oss_best_params = rf_gridSearchProcessor.best_params_

# best model for oss sampling data (refitted on the full oss training data)
rf_oss_best_model = rf_gridSearchProcessor.best_estimator_

# print out best parameters and best mean CV score for oss sampling data
print("Best alpha parameter identified by grid search for oss sampling data ", rf_oss_best_params)
rf_oss_best_result = rf_gridSearchProcessor.best_score_
print("Best result identified by grid search for oss sampling data ", rf_oss_best_result)


# see other parameter results: the five best-ranked candidates
gridsearch_results = pd.DataFrame(rf_gridSearchProcessor.cv_results_)
gridsearch_results[['rank_test_score', 'mean_test_score',
'params']].sort_values(by=['rank_test_score'])[:5]

In [None]:
# Evaluate the random forest tuned on One-sided Selection data against the test set
Y_pred_oss_rf = rf_oss_best_model.predict(X_test)
oss_rf_accuracy = accuracy_score(Y_test, Y_pred_oss_rf)
print('Accuracy Score for One-sided Sampling Data ', oss_rf_accuracy)
print(classification_report(Y_test, Y_pred_oss_rf))

# Model Comparison 

In [None]:
# NOTE: `model_comp` is built and displayed by the next cell. Referencing it here, before
# it exists, raised NameError on a fresh kernel (Restart & Run All), so this cell is
# intentionally left without executable code.

In [None]:
## Build the model performance comparison table: one row per (sampling strategy, model)
data = 3 * ['SMOTE+ENN Sampling', 'Near-miss Sampling', 'One-sided Selection Sampling Data']
models = ['baseline'] * 3 + ['SVM'] * 3 + ['Random Forest'] * 3

# The baseline has no tuned parameters, so its strategy name stands in for them
params = ['Most Frequent'] * 3 + [
    smote_best_params, near_best_params, oss_best_params,
    rf_smote_best_params, rf_near_best_params, rf_oss_best_params,
]

# Test-set accuracy for every prediction vector, in the same row order as above
performance = [
    accuracy_score(Y_test, predictions)
    for predictions in (
        Y_pred_smote_baseline, Y_pred_near_baseline, Y_pred_oss_baseline,
        Y_pred_smote, Y_pred_near, Y_pred_oss,
        Y_pred_smote_rf, Y_pred_near_rf, Y_pred_oss_rf,
    )
]

# Name the DataFrame columns at construction time
model_comp = pd.DataFrame(
    list(zip(data, models, params, performance)),
    columns=['data', 'model', 'parameters', 'accuracy score'],
)
model_comp

# Final Model: Test on Unseen Data and Export

In [None]:
# Raw review text used as an unseen example.
# Adjacent string literals are joined by the parser, and each piece ends with an explicit
# space. The previous backslash continuations fused words across lines ("abouthow",
# "interestin"), which changed the tokens handed to the vectorizer.
unseen_text = (
    "Very disappointed in the Apple TV 4K. "
    "Horrible issues with audio video syncing. The audio is never in sync with the video play back . "
    "It actually moves around from a little out of sync to way out of sync. Apple support? No help at all. "
    "And Apple actually said they are only concerned with Apple designed products and don't really care about "
    "how another company products work with Apple TV. This appears to be a firmware issue, That Apple has no interest "
    "in addressing."
)

# Transform the text with the loaded tfidf object
unseen_data = tfidf.transform([unseen_text])

# select final model: the SVM pipeline tuned on the One-sided Selection data
final_model = oss_best_model

# predict unseen data; printed explicitly because only the last expression of a cell
# is displayed and this one is not last
unseen_pred = final_model.predict(unseen_data).tolist()
print(unseen_pred)

# probabilities/likelihood set up: wrap the already-fitted model (cv='prefit') so that
# predict_proba becomes available.
# NOTE(review): the calibrator is fitted on the same One-sided Selection training data the
# SVM was fitted on, so the probabilities may be over-confident; a held-out calibration
# split would be preferable if one can be made available.
final_model_proba = CalibratedClassifierCV(final_model, cv='prefit')
final_model_proba.fit(X_train_oss, Y_train_oss)

# class predictions of the calibrated model on the test set (not used further in this cell)
pred_class = final_model_proba.predict(X_test)

# put the unseen example's class probabilities into a data frame, one column per class
pd.DataFrame(np.round(final_model_proba.predict_proba(unseen_data),3),
             columns=final_model_proba.classes_)


In [None]:
# Final model export: persist the calibrated classifier next to the notebook
export_file = 'final_model.pkl'
joblib.dump(final_model_proba, export_file)