# Packages 

In [1]:
import pandas as pd # pandas package
pd.options.display.max_columns = 40

import numpy as np # numpy package

# matplotlib packages
import matplotlib
import matplotlib.pyplot as plt 
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)

import seaborn as sns # seaborn package
# dictionary package
from collections import Counter, defaultdict

import warnings  # warnings package
warnings.filterwarnings('ignore')

# plotly packages
from chart_studio import plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.offline import iplot

# cufflink packages
import cufflinks
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='pearl')

# interactive shell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'last_expr'

from pathlib import Path # path package
import re #regex package
from textblob import TextBlob #import textblob package

# word cloud
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# nltk packages
import nltk

#nltk.download('stopwords')
# stop words
from nltk.corpus import stopwords
sw = set(stopwords.words("english"))

# punctuation
from string import punctuation

# detokenizer 
from nltk.tokenize.treebank import TreebankWordDetokenizer

# sklearn packages
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier

#imblean packages for undersampling/oversampling
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import NearMiss 
from imblearn.under_sampling import OneSidedSelection

# pickle package
import _pickle as cPickle


# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe


# Loading Training and Test Sets

## SMOTE+ENN Training Data

In [2]:
# SMOTE+ENN Training Data - X variables
with open(Path(r"../Data/Preparation for Modeling Data/Binary Classification/X_train_smote.pickle"), "rb") as input_file:
    X_train_smote = cPickle.load(input_file)
    
# SMOTE+ENN Training Data - Target Variable Y
with open(Path(r"../Data/Preparation for Modeling Data/Binary Classification/Y_train_smote.pickle"), "rb") as input_file:
    Y_train_smote = cPickle.load(input_file)

## Near-miss Sampling Training Data 

In [3]:
# Near-miss Training Data - X variables
with open(Path(r"../Data/Preparation for Modeling Data/Binary Classification/X_train_near.pickle"), "rb") as input_file:
    X_train_near = cPickle.load(input_file)
    
# Near-miss Training Data - Target Variable Y
with open(Path(r"../Data/Preparation for Modeling Data/Binary Classification/Y_train_near.pickle"), "rb") as input_file:
    Y_train_near = cPickle.load(input_file)

## One-sided Selection Sampling Training Data 

In [4]:
# One-sided Selection Training Data - X variables
with open(Path(r"../Data/Preparation for Modeling Data/Binary Classification/X_train_oss.pickle"), "rb") as input_file:
    X_train_oss = cPickle.load(input_file)
    
# One-sided Selection Training Data - Target Variable Y
with open(Path(r"../Data/Preparation for Modeling Data/Binary Classification/Y_train_oss.pickle"), "rb") as input_file:
    Y_train_oss = cPickle.load(input_file)

## Test Data

In [5]:
# Test Data - X variables
with open(Path(r"../Data/Preparation for Modeling Data/Binary Classification/X_test.pickle"), "rb") as input_file:
    X_test = cPickle.load(input_file)
    
# Test Data - Target Variable Y
with open(Path(r"../Data/Preparation for Modeling Data/Binary Classification/Y_test.pickle"), "rb") as input_file:
    Y_test = cPickle.load(input_file)

# Baseline Model - Dummy Classifier 

## Dummy Classifier with SMOTE+ENN Training Data

In [6]:
# Baseline Model - Dummy Classifier
clf = DummyClassifier(strategy='most_frequent')

# Fit to SMOTE+ENN training data
clf.fit(X_train_smote, Y_train_smote)

# Make predictions on original unsampled test data
Y_pred_smote_baseline = clf.predict(X_test)

# Print out Performance scores
print ('Accuracy Score with SMOTE+ENN Training Data ', round(accuracy_score(Y_test, Y_pred_smote_baseline),3))
print(classification_report(Y_test, Y_pred_smote_baseline))

Accuracy Score with SMOTE+ENN Training Data  0.111
              precision    recall  f1-score   support

    Negative       0.11      1.00      0.20       375
    Positive       0.00      0.00      0.00      3013

    accuracy                           0.11      3388
   macro avg       0.06      0.50      0.10      3388
weighted avg       0.01      0.11      0.02      3388



## Dummy Classifier with Near-miss Sampling Training Data

In [7]:
# Baseline Model - Dummy Classifier
clf = DummyClassifier(strategy='most_frequent')

# Fit to Near-miss sampling training data
clf.fit(X_train_near, Y_train_near)

# Make predictions on original unsampled test data
Y_pred_near_baseline = clf.predict(X_test)

# Print out Performance scores
print ('Accuracy Score with near-miss sampling Training Data ', round(accuracy_score(Y_test, Y_pred_near_baseline),3))
print(classification_report(Y_test, Y_pred_near_baseline))

Accuracy Score with near-miss sampling Training Data  0.111
              precision    recall  f1-score   support

    Negative       0.11      1.00      0.20       375
    Positive       0.00      0.00      0.00      3013

    accuracy                           0.11      3388
   macro avg       0.06      0.50      0.10      3388
weighted avg       0.01      0.11      0.02      3388



## Dummy Classifier with One-sided Selection Training Data

In [8]:
# Baseline Model - Dummy Classifier
clf = DummyClassifier(strategy='most_frequent')

# Fit to One-Sided Selection sampling training data
clf.fit(X_train_oss, Y_train_oss)

# Make predictions on original unsampled test data
Y_pred_oss_baseline = clf.predict(X_test)

# Print out Performance scores
print ('Accuracy Score with one-sided selection sampling Training Data ', 
       round(accuracy_score(Y_test, Y_pred_oss_baseline),3))
print(classification_report(Y_test, Y_pred_oss_baseline))

Accuracy Score with one-sided selection sampling Training Data  0.889
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00       375
    Positive       0.89      1.00      0.94      3013

    accuracy                           0.89      3388
   macro avg       0.44      0.50      0.47      3388
weighted avg       0.79      0.89      0.84      3388



# SVM Classifier - Hyperparameter Tuning

## SVM Classifier - Training on SMOTE+ENN Data 

In [9]:
# Add parameters in pipeline
training_pipeline = Pipeline(
steps=[
('model', LinearSVC(random_state=42, tol=1e-5))])
grid_param = [{
'model__penalty': ['l1', 'l2'],
'model__loss': ['hinge'],
'model__max_iter': [10000]
}, {
'model__C': [1, 10],
'model__tol': [1e-2, 1e-3]
}]

# grid search to find best parameters
gridSearchProcessor = GridSearchCV(estimator=training_pipeline,
param_grid=grid_param, cv=5, n_jobs = -1)

# SMOTE+ENN data
gridSearchProcessor.fit(X_train_smote, Y_train_smote)

# best parameters for SMOTE+ENN Data
smote_best_params = gridSearchProcessor.best_params_

# best model for SMOTE+ENN Data
smote_best_model = gridSearchProcessor.best_estimator_

# print out best parameters for SMOTE+ENN Data
print("Best alpha parameter identified by grid search for SMOTE+ENN Data ", smote_best_params)
smote_best_result = gridSearchProcessor.best_score_
print("Best result identified by grid search for SMOTE+ENN Data", smote_best_result)


# see other parameter results
gridsearch_results = pd.DataFrame(gridSearchProcessor.cv_results_)
gridsearch_results[['rank_test_score', 'mean_test_score',
'params']].sort_values(by=['rank_test_score'])[:5]

Best alpha parameter identified by grid search for SMOTE+ENN Data  {'model__C': 10, 'model__tol': 0.01}
Best result identified by grid search for SMOTE+ENN Data 0.9873017837495774


Unnamed: 0,rank_test_score,mean_test_score,params
4,1,0.987302,"{'model__C': 10, 'model__tol': 0.01}"
5,1,0.987302,"{'model__C': 10, 'model__tol': 0.001}"
2,3,0.986249,"{'model__C': 1, 'model__tol': 0.01}"
3,3,0.986249,"{'model__C': 1, 'model__tol': 0.001}"
1,5,0.984638,"{'model__loss': 'hinge', 'model__max_iter': 10..."


In [10]:
# Model Evaluation
Y_pred = smote_best_model.predict(X_test)
print('Accuracy Score for SMOTE+ENN Data ', accuracy_score(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

Accuracy Score for SMOTE+ENN Data  0.7573789846517119
              precision    recall  f1-score   support

    Negative       0.30      0.93      0.46       375
    Positive       0.99      0.74      0.84      3013

    accuracy                           0.76      3388
   macro avg       0.65      0.83      0.65      3388
weighted avg       0.91      0.76      0.80      3388



## SVM Classifier - Training on Near-miss Sampling Data 

In [11]:
# Add parameters in pipeline
training_pipeline = Pipeline(
steps=[
('model', LinearSVC(random_state=42, tol=1e-5))])
grid_param = [{
'model__penalty': ['l1', 'l2'],
'model__loss': ['hinge'],
'model__max_iter': [10000]
}, {
'model__C': [1, 10],
'model__tol': [1e-2, 1e-3]
}]

# grid search to find best parameters
gridSearchProcessor = GridSearchCV(estimator=training_pipeline,
param_grid=grid_param, cv=5, n_jobs = -1)

# Near-miss sampling data
gridSearchProcessor.fit(X_train_near, Y_train_near)

# best parameters for Near-miss Sampling Data
near_best_params = gridSearchProcessor.best_params_

# best model for Near-miss Sampling Data
near_best_model = gridSearchProcessor.best_estimator_

# print out best parameters for Near-miss Sampling Data
print("Best alpha parameter identified by grid search for Near-miss Sampling Data ", near_best_params)
near_best_result = gridSearchProcessor.best_score_
print("Best result identified by grid search for Near-miss Sampling Data", near_best_result)


# see other parameter results
gridsearch_results = pd.DataFrame(gridSearchProcessor.cv_results_)
gridsearch_results[['rank_test_score', 'mean_test_score',
'params']].sort_values(by=['rank_test_score'])[:5]

Best alpha parameter identified by grid search for Near-miss Sampling Data  {'model__loss': 'hinge', 'model__max_iter': 10000, 'model__penalty': 'l2'}
Best result identified by grid search for Near-miss Sampling Data 0.8761602671118531


Unnamed: 0,rank_test_score,mean_test_score,params
1,1,0.87616,"{'model__loss': 'hinge', 'model__max_iter': 10..."
2,2,0.873492,"{'model__C': 1, 'model__tol': 0.01}"
3,2,0.873492,"{'model__C': 1, 'model__tol': 0.001}"
4,4,0.856471,"{'model__C': 10, 'model__tol': 0.01}"
5,5,0.856137,"{'model__C': 10, 'model__tol': 0.001}"


In [12]:
# Model Evaluation
Y_pred = near_best_model.predict(X_test)
print('Accuracy Score for Near-miss Sampling Data ', accuracy_score(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

Accuracy Score for Near-miss Sampling Data  0.7795159386068476
              precision    recall  f1-score   support

    Negative       0.32      0.91      0.48       375
    Positive       0.99      0.76      0.86      3013

    accuracy                           0.78      3388
   macro avg       0.65      0.84      0.67      3388
weighted avg       0.91      0.78      0.82      3388



## SVM Classifier - Training on One-sided Selection Sampling Data 

In [13]:
# Add parameters in pipeline
training_pipeline = Pipeline(
steps=[
('model', LinearSVC(random_state=42, tol=1e-5))])
grid_param = [{
'model__penalty': ['l1', 'l2'],
'model__loss': ['hinge'],
'model__max_iter': [10000]
}, {
'model__C': [1, 10],
'model__tol': [1e-2, 1e-3]
}]

# grid search to find best parameters
gridSearchProcessor = GridSearchCV(estimator=training_pipeline,
param_grid=grid_param, cv=5, n_jobs = -1)

# One-sided selection sampling data
gridSearchProcessor.fit(X_train_oss, Y_train_oss)

# best parameters for One-sided selection Sampling Data
oss_best_params = gridSearchProcessor.best_params_

# best model for One-sided selection Sampling Data
oss_best_model = gridSearchProcessor.best_estimator_

# print out best parameters for One-sided selection Sampling Data
print("Best alpha parameter identified by grid search for One-sided selection Sampling Data ", oss_best_params)
oss_best_result = gridSearchProcessor.best_score_
print("Best result identified by grid search for One-sided selection Sampling Data", oss_best_result)


# see other parameter results
gridsearch_results = pd.DataFrame(gridSearchProcessor.cv_results_)
gridsearch_results[['rank_test_score', 'mean_test_score',
'params']].sort_values(by=['rank_test_score'])[:5]

Best alpha parameter identified by grid search for One-sided selection Sampling Data  {'model__loss': 'hinge', 'model__max_iter': 10000, 'model__penalty': 'l2'}
Best result identified by grid search for One-sided selection Sampling Data 0.9286975358148165


Unnamed: 0,rank_test_score,mean_test_score,params
1,1,0.928698,"{'model__loss': 'hinge', 'model__max_iter': 10..."
2,2,0.928342,"{'model__C': 1, 'model__tol': 0.01}"
3,2,0.928342,"{'model__C': 1, 'model__tol': 0.001}"
4,4,0.916459,"{'model__C': 10, 'model__tol': 0.01}"
5,4,0.916459,"{'model__C': 10, 'model__tol': 0.001}"


In [14]:
# Model Evaluation
Y_pred = oss_best_model.predict(X_test)
print('Accuracy Score for One-sided selection Sampling Data ', accuracy_score(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

Accuracy Score for One-sided selection Sampling Data  0.9439197166469894
              precision    recall  f1-score   support

    Negative       0.80      0.66      0.72       375
    Positive       0.96      0.98      0.97      3013

    accuracy                           0.94      3388
   macro avg       0.88      0.82      0.85      3388
weighted avg       0.94      0.94      0.94      3388



# XGBoost Model

In [15]:
le = LabelEncoder()
Y_train_oss = le.fit_transform(Y_train_oss)
Y_test_oss = le.transform(Y_test)

In [16]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }

In [17]:


def objective(space, X_train = X_train_oss, Y_train = Y_train_oss, Y_test= Y_test_oss):
    clf=XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train, Y_train), ( X_test, Y_test)]
    
    clf.fit(X_train, Y_train,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(Y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }



In [18]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

SCORE:                                                                                                                  
0.8893152302243211                                                                                                      
SCORE:                                                                                                                  
0.8893152302243211                                                                                                      
SCORE:                                                                                                                  
0.8893152302243211                                                                                                      
SCORE:                                                                                                                  
0.8893152302243211                                                                                                      
SCORE:                          

0.8893152302243211                                                                                                      
SCORE:                                                                                                                  
0.8893152302243211                                                                                                      
SCORE:                                                                                                                  
0.8893152302243211                                                                                                      
SCORE:                                                                                                                  
0.8893152302243211                                                                                                      
SCORE:                                                                                                                  
0.8893152302243211              

SCORE:                                                                                                                  
0.8893152302243211                                                                                                      
SCORE:                                                                                                                  
0.8893152302243211                                                                                                      
SCORE:                                                                                                                  
0.8893152302243211                                                                                                      
SCORE:                                                                                                                  
0.8893152302243211                                                                                                      
SCORE:                          

In [19]:
best_hyperparams

{'colsample_bytree': 0.6032739335582951,
 'gamma': 4.651952284119747,
 'max_depth': 16.0,
 'min_child_weight': 2.0,
 'reg_alpha': 150.0,
 'reg_lambda': 0.5791175610186193}

In [20]:
clf=XGBClassifier(colsample_bytree= 0.7035349155522628,
                     gamma= 8.074420556741606,
 max_depth= 8,
 min_child_weight= 3.0,
 reg_alpha= 110.0,
 reg_lambda= 0.3053922914267756)

clf.fit(X_train_oss, Y_train_oss)

# Model Evaluation
Y_pred = clf.predict(X_test)
print('Accuracy Score for One-sided selection Sampling Data ', accuracy_score(Y_test_oss, Y_pred))
print(classification_report(Y_test_oss, Y_pred))

Accuracy Score for One-sided selection Sampling Data  0.9028925619834711
              precision    recall  f1-score   support

           0       0.84      0.15      0.26       375
           1       0.90      1.00      0.95      3013

    accuracy                           0.90      3388
   macro avg       0.87      0.57      0.60      3388
weighted avg       0.90      0.90      0.87      3388

