# Table of Contents

* [Import and downsampling data](#Import-and-downsampling-data)
* [Function to log results](#Function-to-log-results)
* [Modeling](#Modeling)
    * [1. Random Forest](#1.-Random-Forest)
    * [2. XGboost](#2.-XGboost )
    * [3. LightGBM](#3.-LightGBM)
    * [4. Logistic Regression with regularizations](#4.-Logistic-Regression-with-regularizations)
    * [5. KNN](#5.-KNN)      
    * [6. SVM](#6.-SVM)
* [Summary](#Summary)

In [2]:
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")
    
import warnings
warnings.filterwarnings("ignore")

In [3]:
#import sys
#!conda install --yes --prefix {sys.prefix} -c conda-forge lightgbm
#!pip install xgboost
import pandas as pd
import numpy as np
from numpy.random import seed
import matplotlib.pyplot as plt
%matplotlib inline  
import statistics
from scipy import stats
from scipy.stats import t
from scipy.stats import norm
import seaborn as sns
import sklearn
import sqlite3
from sqlite3 import Error
import csv
import lightgbm

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.svm import LinearSVC
from xgboost.sklearn import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
#from sklearn.metrics import plot_precision_recall_curve

## Import and downsampling data

Use the code below to create a balanced training dataset to be used by all ML models that we build shortly. Test sets are still imbalanced.

In [4]:
from sklearn.model_selection import train_test_split

data_all = pd.read_csv('sm_data.csv')
X = data_all.iloc[:,1:]
y = data_all['TARGET']

# train test split using test_size = 0.2 
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=123)

X_train_0 = X_train.loc[y_train == 0,:].copy().sample(frac = 0.09)
X_train_1 = X_train.loc[y_train == 1,:].copy()

X_train_0['target'] = 0
X_train_1['target'] = 1

# use frac = 1 to randomize the rows
X_train_new = X_train_0.append(X_train_1).sample(frac = 1)

X_train_new_x = X_train_new.drop(columns='target')
y_train_new = X_train_new['target']

In [5]:
print("X train shape: ", X_train_new_x.shape)
print("y train shape: ", y_train_new.shape)
print("X test shape: ", X_test.shape)
print("y test shape: ", y_test.shape)

X train shape:  (40209, 350)
y train shape:  (40209,)
X test shape:  (61501, 350)
y test shape:  (61501,)


Save X_train_new_x, y_train_new, X_test and y_test for future model evaluations

In [None]:
X_train_new_x.to_csv("X_train_new_x.csv", index=False)
y_train_new.to_csv("y_train_new.csv", index=False)
X_test.to_csv("X_test.csv", index=False)
y_test.to_csv("y_test.csv", index=False)

In [6]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, random_state=123, shuffle=True)
kf.get_n_splits(X_train_new_x)

i = 1
for train_index, val_index in kf.split(X_train_new_x):
    print(f"Fold {i}: ", "Train:", train_index, "Validation:", val_index)
    i = i + 1

Fold 1:  Train: [    0     2     3 ... 40206 40207 40208] Validation: [    1    15    19 ... 40184 40193 40197]
Fold 2:  Train: [    0     1     2 ... 40205 40206 40208] Validation: [    8    12    21 ... 40203 40204 40207]
Fold 3:  Train: [    0     1     2 ... 40206 40207 40208] Validation: [    3     9    10 ... 40198 40201 40202]
Fold 4:  Train: [    0     1     3 ... 40205 40206 40207] Validation: [    2     5     6 ... 40196 40199 40208]
Fold 5:  Train: [    1     2     3 ... 40204 40207 40208] Validation: [    0     4    13 ... 40187 40205 40206]


## Function to log results

In [7]:
def model_log(cv_clf, modelname):
    rlt_dict = {}

    rlt_dict['best_estimator_'] = [cv_clf.best_estimator_]
    rlt_dict['best_params_'] = [cv_clf.best_params_]
    rlt_dict['best_score_'] = [cv_clf.best_score_]
    rlt_dict['best_index_'] = [cv_clf.best_index_]

    rlt_dict['candidate_params'] = [cv_clf.cv_results_['params']]
    rlt_dict['mean_test_score'] = [cv_clf.cv_results_['mean_test_score']]
    rlt_dict['std_test_score'] = [cv_clf.cv_results_['std_test_score']]
    rlt_dict['mean_train_score'] = [cv_clf.cv_results_['mean_train_score']]
    rlt_dict['std_train_score'] = [cv_clf.cv_results_['std_train_score']]

    rlt_dict['split0_test_score'] = [cv_clf.cv_results_['split0_test_score']]
    rlt_dict['split1_test_score'] = [cv_clf.cv_results_['split1_test_score']]
    rlt_dict['split2_test_score'] = [cv_clf.cv_results_['split2_test_score']]
    rlt_dict['split3_test_score'] = [cv_clf.cv_results_['split3_test_score']]
    rlt_dict['split4_test_score'] = [cv_clf.cv_results_['split4_test_score']]

    rlt_dict['split0_train_score'] = [cv_clf.cv_results_['split0_train_score']]
    rlt_dict['split1_train_score'] = [cv_clf.cv_results_['split1_train_score']]
    rlt_dict['split2_train_score'] = [cv_clf.cv_results_['split2_train_score']]
    rlt_dict['split3_train_score'] = [cv_clf.cv_results_['split3_train_score']]
    rlt_dict['split4_train_score'] = [cv_clf.cv_results_['split4_train_score']]
    
    rlt_dict['std_of_best_mean_test_score'] = [cv_clf.cv_results_['std_test_score'][max(enumerate(cv_clf.cv_results_['mean_test_score']),key=(lambda x: x[1]))[0]]]

    rlt_df = pd.DataFrame.from_dict(rlt_dict)
    
    filename = modelname + '_' + 'cv_rlt.csv'
    rlt_df.to_csv(filename)

## Modeling

The first 3 models we build are tree based. Since tree based models are robust regardless of the skewed distributions of the features, we do not need to scale the features. Later on when we train other classification models that are more sensitive to the scale of features, such as logistic regression, KNN and SVM, we first transform some of the features before fitting the models.

## 1. Random Forest

In [None]:
import time
start = time.process_time()

#when max depth is large (eg. 20), it's obvious that the RF model is overfitting 
#(CV training score 97%, test score 66%)

niter, verbose, random_state = [5, 0, 123] 
param_space = {'n_estimators': range(100,500,100), 'max_depth': range(1,10)} 

clf = RandomForestClassifier(random_state=random_state)
cv_clf = RandomizedSearchCV(clf, param_space, cv=kf, n_iter=niter, scoring='roc_auc', return_train_score=True, 
                            verbose=verbose, n_jobs=-1) 
cv_clf.fit(X_train_new_x, y_train_new)

print('completed in {} s'.format(time.process_time() - start))

# write out results
model_log(cv_clf, 'randomforest')

In [8]:
import re
import ast
result = pd.read_csv('randomforest_cv_rlt.csv')
candidate_params = ast.literal_eval(result.candidate_params.values[0])
print("1. Five candidate parameters are: ", candidate_params)
best_n = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['n_estimators']
best_d = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['max_depth']
print("2. Best number of trees and depth are: {} and {}".format(best_n, best_d))
best_score = result.best_score_.values[0]
print("3. Best average CV validation score is: ", best_score)
std_of_best_score = result.std_of_best_mean_test_score.values[0]
print("4. Standard Deviation of Best average CV validation score is: ", std_of_best_score)
mean_test_score = result.mean_test_score.values[0]
print("5. Average CV validation score: ", mean_test_score)
std_test_score = result.std_test_score.values[0]
print("6. Standard Deviation of CV validation score: ", std_test_score)
mean_train_score = result.mean_train_score.values[0]
print("7. Average CV training score: ", mean_train_score)
std_train_score = result.std_train_score.values[0]
print("8. Standard Deviation of CV training score: ", std_train_score)

print("1st fold validation score: ", result.split0_test_score.values[0])
print("2nd fold validation score: ", result.split1_test_score.values[0])
print("3rd fold validation score: ", result.split2_test_score.values[0])
print("4th fold validation score: ", result.split3_test_score.values[0])
print("5th fold validation score: ", result.split4_test_score.values[0])
print("1st fold training score: ", result.split0_train_score.values[0])
print("2nd fold training score: ", result.split1_train_score.values[0])
print("3rd fold training score: ", result.split2_train_score.values[0])
print("4th fold training score: ", result.split3_train_score.values[0])
print("5th fold training score: ", result.split4_train_score.values[0])

1. Five candidate parameters are:  [{'n_estimators': 100, 'max_depth': 5}, {'n_estimators': 400, 'max_depth': 4}, {'n_estimators': 400, 'max_depth': 8}, {'n_estimators': 100, 'max_depth': 8}, {'n_estimators': 400, 'max_depth': 9}]
2. Best number of trees and depth are: 400 and 9
3. Best average CV validation score is:  0.7499708327404105
4. Standard Deviation of Best average CV validation score is:  0.007575448338780573
5. Average CV validation score:  [0.73404878 0.73197666 0.74795543 0.74571211 0.74997083]
6. Standard Deviation of CV validation score:  [0.00770114 0.00750812 0.00733293 0.00707123 0.00757545]
7. Average CV training score:  [0.75566938 0.74596377 0.82225717 0.8197234  0.85312945]
8. Standard Deviation of CV training score:  [0.00226952 0.00211268 0.00089419 0.00108859 0.00083326]
1st fold validation score:  [0.74598698 0.74386643 0.76052436 0.75719985 0.76300074]
2nd fold validation score:  [0.72379893 0.72182914 0.73822735 0.73587173 0.7399635 ]
3rd fold validation sc

## 2. XGboost 

In [None]:
import time
start = time.process_time()

niter, verbose, random_state = [5, 0, 123] 
param_space = {'n_estimators': range(100,500,100), 'learning_rate': [0.01,0.1,0.5], 
               'max_depth': range(1,10), 'gamma': [0.001,0.01,1,10]}
               
clf = XGBClassifier(objective='binary:logistic', verbosity=verbose, booster='gbtree', tree_method='auto', 
                            subsample=1, colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, 
                            reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=random_state)
cv_clf = RandomizedSearchCV(clf, param_space, cv=kf, n_iter=niter, scoring='roc_auc', return_train_score=True, 
                            verbose=verbose, n_jobs=-1)
cv_clf.fit(X_train_new_x, y_train_new)

print('completed in {} s'.format(time.process_time() - start))

# write out results
model_log(cv_clf, 'xgboost')

In [9]:
import re
import ast
result = pd.read_csv('xgboost_cv_rlt.csv')
candidate_params = ast.literal_eval(result.candidate_params.values[0])
print("1. Five candidate parameters are: ", candidate_params)
best_n = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['n_estimators']
best_d = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['max_depth']
best_lr = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['learning_rate']
best_gamma = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['gamma']
print("2. Best number of trees, learning rate, depth and gamma are: {}, {}, {} and {}".format(best_n, best_lr, best_d, best_gamma))
best_score = result.best_score_.values[0]
print("3. Best average CV validation score is: ", best_score)
std_of_best_score = result.std_of_best_mean_test_score.values[0]
print("4. Standard Deviation of Best average CV validation score is: ", std_of_best_score)
mean_test_score = result.mean_test_score.values[0]
print("5. Average CV validation score: ", mean_test_score)
std_test_score = result.std_test_score.values[0]
print("6. Standard Deviation of CV validation score: ", std_test_score)
mean_train_score = result.mean_train_score.values[0]
print("7. Average CV training score: ", mean_train_score)
std_train_score = result.std_train_score.values[0]
print("8. Standard Deviation of CV training score: ", std_train_score)

print("1st fold validation score: ", result.split0_test_score.values[0])
print("2nd fold validation score: ", result.split1_test_score.values[0])
print("3rd fold validation score: ", result.split2_test_score.values[0])
print("4th fold validation score: ", result.split3_test_score.values[0])
print("5th fold validation score: ", result.split4_test_score.values[0])
print("1st fold training score: ", result.split0_train_score.values[0])
print("2nd fold training score: ", result.split1_train_score.values[0])
print("3rd fold training score: ", result.split2_train_score.values[0])
print("4th fold training score: ", result.split3_train_score.values[0])
print("5th fold training score: ", result.split4_train_score.values[0])

1. Five candidate parameters are:  [{'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.001}, {'n_estimators': 400, 'max_depth': 9, 'learning_rate': 0.01, 'gamma': 10}, {'n_estimators': 200, 'max_depth': 2, 'learning_rate': 0.01, 'gamma': 0.001}, {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.5, 'gamma': 0.001}, {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.01, 'gamma': 10}]
2. Best number of trees, learning rate, depth and gamma are: 200, 0.1, 5 and 0.001
3. Best average CV validation score is:  0.7722415850612538
4. Standard Deviation of Best average CV validation score is:  0.007403126618375358
5. Average CV validation score:  [0.77224159 0.76618283 0.72106212 0.74706994 0.73872224]
6. Standard Deviation of CV validation score:  [0.00740313 0.00762454 0.00702354 0.00595042 0.00797753]
7. Average CV training score:  [0.88045    0.86465668 0.72665755 0.93424846 0.75471596]
8. Standard Deviation of CV training score:  [0.00189341 0.0006524  0.001

## 3. LightGBM

In [None]:
import time
start = time.process_time()

niter, verbose, random_state = [5, 0, 123] 
param_space = {'n_estimators': range(100,500,100), 'learning_rate': [0.01,0.1,0.5], 
               'max_depth': range(1,5)}

clf = LGBMClassifier(colsample_bytree=1, subsample=1, reg_alpha=0, reg_lambda=1, verbose=verbose)
cv_clf = RandomizedSearchCV(clf, param_space, cv=kf, n_iter=niter, scoring='roc_auc', return_train_score=True, 
                            verbose=verbose, n_jobs = -1)  
cv_clf.fit(X_train_new_x, y_train_new)

print('completed in {} s'.format(time.process_time() - start))

# write out results
model_log(cv_clf, 'lightgbm')

In [10]:
import re
import ast
result = pd.read_csv('lightgbm_cv_rlt.csv')
candidate_params = ast.literal_eval(result.candidate_params.values[0])
print("1. Five candidate parameters are: ", candidate_params)
best_n = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['n_estimators']
best_d = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['max_depth']
best_lr = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['learning_rate']
print("2. Best number of trees, learning rate and depth are: {}, {} and {}".format(best_n, best_lr, best_d))
best_score = result.best_score_.values[0]
print("3. Best average CV validation score is: ", best_score)
std_of_best_score = result.std_of_best_mean_test_score.values[0]
print("4. Standard Deviation of Best average CV validation score is: ", std_of_best_score)
mean_test_score = result.mean_test_score.values[0]
print("5. Average CV validation score: ", mean_test_score)
std_test_score = result.std_test_score.values[0]
print("6. Standard Deviation of CV validation score: ", std_test_score)
mean_train_score = result.mean_train_score.values[0]
print("7. Average CV training score: ", mean_train_score)
std_train_score = result.std_train_score.values[0]
print("8. Standard Deviation of CV training score: ", std_train_score)

print("1st fold validation score: ", result.split0_test_score.values[0])
print("2nd fold validation score: ", result.split1_test_score.values[0])
print("3rd fold validation score: ", result.split2_test_score.values[0])
print("4th fold validation score: ", result.split3_test_score.values[0])
print("5th fold validation score: ", result.split4_test_score.values[0])
print("1st fold training score: ", result.split0_train_score.values[0])
print("2nd fold training score: ", result.split1_train_score.values[0])
print("3rd fold training score: ", result.split2_train_score.values[0])
print("4th fold training score: ", result.split3_train_score.values[0])
print("5th fold training score: ", result.split4_train_score.values[0])

1. Five candidate parameters are:  [{'n_estimators': 100, 'max_depth': 2, 'learning_rate': 0.5}, {'n_estimators': 200, 'max_depth': 2, 'learning_rate': 0.1}, {'n_estimators': 300, 'max_depth': 1, 'learning_rate': 0.5}, {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.5}, {'n_estimators': 400, 'max_depth': 1, 'learning_rate': 0.5}]
2. Best number of trees, learning rate and depth are: 400, 0.5 and 1
3. Best average CV validation score is:  0.7703665520630054
4. Standard Deviation of Best average CV validation score is:  0.006516696411119587
5. Average CV validation score:  [0.76875892 0.76774851 0.76936503 0.75036595 0.77036655]
6. Standard Deviation of CV validation score:  [0.00729377 0.00597662 0.00626044 0.00461198 0.0065167 ]
7. Average CV training score:  [0.7985437  0.78386748 0.78580417 0.92184532 0.78923898]
8. Standard Deviation of CV training score:  [0.0014364  0.00158614 0.00161512 0.00126126 0.00166479]
1st fold validation score:  [0.78130973 0.77719603 0.78014901 

## 4. Logistic Regression with regularizations

In [None]:
import time
start = time.process_time()

niter, verbose, random_state = [5, 0, 123] 
param_space = {'penalty': ['l1','l2'], 'C': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1, 1, 10, 100]}
        
clf = LogisticRegression(random_state=random_state)
cv_clf = RandomizedSearchCV(clf, param_space, cv=kf, n_iter=niter, scoring='roc_auc', return_train_score=True, 
                            verbose=verbose, n_jobs = -1) 
cv_clf.fit(X_train_new_x, y_train_new)

print('completed in {} s'.format(time.process_time() - start))

# write out results
model_log(cv_clf, 'logistic')

In [11]:
import re
import ast
result = pd.read_csv('logistic_cv_rlt.csv')
candidate_params = ast.literal_eval(result.candidate_params.values[0])
print("1. Five candidate parameters are: ", candidate_params)
best_c = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['C']
best_penalty = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['penalty']
print("2. Best C and penalty are: {} and {}".format(best_c, best_penalty))
best_score = result.best_score_.values[0]
print("3. Best average CV validation score is: ", best_score)
std_of_best_score = result.std_of_best_mean_test_score.values[0]
print("4. Standard Deviation of Best average CV validation score is: ", std_of_best_score)
mean_test_score = result.mean_test_score.values[0]
print("5. Average CV validation score: ", mean_test_score)
std_test_score = result.std_test_score.values[0]
print("6. Standard Deviation of CV validation score: ", std_test_score)
mean_train_score = result.mean_train_score.values[0]
print("7. Average CV training score: ", mean_train_score)
std_train_score = result.std_train_score.values[0]
print("8. Standard Deviation of CV training score: ", std_train_score)

print("1st fold validation score: ", result.split0_test_score.values[0])
print("2nd fold validation score: ", result.split1_test_score.values[0])
print("3rd fold validation score: ", result.split2_test_score.values[0])
print("4th fold validation score: ", result.split3_test_score.values[0])
print("5th fold validation score: ", result.split4_test_score.values[0])
print("1st fold training score: ", result.split0_train_score.values[0])
print("2nd fold training score: ", result.split1_train_score.values[0])
print("3rd fold training score: ", result.split2_train_score.values[0])
print("4th fold training score: ", result.split3_train_score.values[0])
print("5th fold training score: ", result.split4_train_score.values[0])

1. Five candidate parameters are:  [{'penalty': 'l1', 'C': 100}, {'penalty': 'l2', 'C': 0.001}, {'penalty': 'l2', 'C': 100}, {'penalty': 'l2', 'C': 0.005}, {'penalty': 'l2', 'C': 0.1}]
2. Best C and penalty are: 100 and l1
3. Best average CV validation score is:  0.7652214570941079
4. Standard Deviation of Best average CV validation score is:  0.003987849080163703
5. Average CV validation score:  [0.76522146 0.66303347 0.66356286 0.66162847 0.66300642]
6. Standard Deviation of CV validation score:  [0.00398785 0.00695754 0.00434593 0.0055261  0.00776671]
7. Average CV training score:  [0.77381036 0.66467191 0.66526745 0.6631809  0.6645457 ]
8. Standard Deviation of CV training score:  [0.00100667 0.00239779 0.00441104 0.00189288 0.00219546]
1st fold validation score:  [0.75939492 0.65572886 0.65939621 0.65401359 0.65451265]
2nd fold validation score:  [0.7649992  0.66580904 0.6709678  0.66900502 0.67202593]
3rd fold validation score:  [0.76626005 0.67422651 0.665995   0.66627048 0.6725

## 5. KNN

In [None]:
import time
start = time.process_time()

niter, verbose, random_state = [5, 0, 123] 
param_space = {'n_neighbors': range(1,20)}

clf = KNeighborsClassifier()
cv_clf = RandomizedSearchCV(clf, param_space, cv=kf, n_iter=niter, scoring='roc_auc', return_train_score=True, 
                            verbose=verbose, n_jobs = -1)
cv_clf.fit(X_train_new_x, y_train_new)

print('completed in {} s'.format(time.process_time() - start))

# write out results
model_log(cv_clf, 'knn')

In [12]:
import re
import ast
result = pd.read_csv('knn_cv_rlt.csv')
candidate_params = ast.literal_eval(result.candidate_params.values[0])
print("1. Five candidate parameters are: ", candidate_params)
best_k = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['n_neighbors']
print("2. Best number of neighbors is: {}".format(best_k))
best_score = result.best_score_.values[0]
print("3. Best average CV validation score is: ", best_score)
std_of_best_score = result.std_of_best_mean_test_score.values[0]
print("4. Standard Deviation of Best average CV validation score is: ", std_of_best_score)
mean_test_score = result.mean_test_score.values[0]
print("5. Average CV validation score: ", mean_test_score)
std_test_score = result.std_test_score.values[0]
print("6. Standard Deviation of CV validation score: ", std_test_score)
mean_train_score = result.mean_train_score.values[0]
print("7. Average CV training score: ", mean_train_score)
std_train_score = result.std_train_score.values[0]
print("8. Standard Deviation of CV training score: ", std_train_score)

print("1st fold validation score: ", result.split0_test_score.values[0])
print("2nd fold validation score: ", result.split1_test_score.values[0])
print("3rd fold validation score: ", result.split2_test_score.values[0])
print("4th fold validation score: ", result.split3_test_score.values[0])
print("5th fold validation score: ", result.split4_test_score.values[0])
print("1st fold training score: ", result.split0_train_score.values[0])
print("2nd fold training score: ", result.split1_train_score.values[0])
print("3rd fold training score: ", result.split2_train_score.values[0])
print("4th fold training score: ", result.split3_train_score.values[0])
print("5th fold training score: ", result.split4_train_score.values[0])

1. Five candidate parameters are:  [{'n_neighbors': 7}, {'n_neighbors': 9}, {'n_neighbors': 18}, {'n_neighbors': 1}, {'n_neighbors': 10}]
2. Best number of neighbors is: 18
3. Best average CV validation score is:  0.5860421015466559
4. Standard Deviation of Best average CV validation score is:  0.005290393187255072
5. Average CV validation score:  [0.56435268 0.56938242 0.5860421  0.52727264 0.57399634]
6. Standard Deviation of CV validation score:  [0.00527947 0.00477997 0.00529039 0.0020675  0.0059998 ]
7. Average CV training score:  [0.74477514 0.72450065 0.68253512 1.         0.71711453]
8. Standard Deviation of CV training score:  [0.00059892 0.00086726 0.00100905 0.         0.00084448]
1st fold validation score:  [0.57242651 0.57436509 0.5901431  0.52323235 0.58135607]
2nd fold validation score:  [0.55632186 0.56359608 0.57942082 0.52851801 0.56761344]
3rd fold validation score:  [0.56378392 0.56562997 0.58420869 0.52743362 0.56803958]
4th fold validation score:  [0.5667624  0.57

## 6. SVM

In [None]:
scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train_new_x)
X_train_new = scaling.transform(X_train_new_x)
X_test_new = scaling.transform(X_test)
X_train_new = pd.DataFrame(X_train_new)
X_train_new.columns = X_train_new_x.columns

In [None]:
import time
start = time.process_time()

#param_space = {'C': [10], 'kernel': ['rbf'], 'gamma': [0.001]}
Kfold, niter, verbose, random_state = [2, 2, 10, 1234] 
param_space = {'C': range(1,101), 'kernel': ['linear','rbf','poly'], 'gamma': [0.001, 0.0001]}

clf = svm.SVC(random_state=random_state)
cv_clf = RandomizedSearchCV(clf, param_space, cv=Kfold, n_iter=niter, scoring='roc_auc', return_train_score=True, 
                            verbose=verbose, n_jobs = -1) 
cv_clf.fit(X_train_new.values, y_train_new.values)

print('completed in {} s'.format(time.process_time() - start))

# write out results
rlt_dict = {}
rlt_dict['best_estimator_'] = [cv_clf.best_estimator_]
rlt_dict['best_params_'] = [cv_clf.best_params_]
rlt_dict['best_score_'] = [cv_clf.best_score_]
rlt_dict['best_index_'] = [cv_clf.best_index_]
rlt_dict['candidate_params'] = [cv_clf.cv_results_['params']]
rlt_dict['mean_test_score'] = [cv_clf.cv_results_['mean_test_score']]
rlt_dict['std_test_score'] = [cv_clf.cv_results_['std_test_score']]
rlt_dict['mean_train_score'] = [cv_clf.cv_results_['mean_train_score']]
rlt_dict['std_train_score'] = [cv_clf.cv_results_['std_train_score']]

rlt_dict['split0_test_score'] = [cv_clf.cv_results_['split0_test_score']]
rlt_dict['split1_test_score'] = [cv_clf.cv_results_['split1_test_score']]

rlt_dict['split0_train_score'] = [cv_clf.cv_results_['split0_train_score']]
rlt_dict['split1_train_score'] = [cv_clf.cv_results_['split1_train_score']]

rlt_dict['std_of_best_mean_test_score'] = [cv_clf.cv_results_['std_test_score'][max(enumerate(cv_clf.cv_results_['mean_test_score']),key=(lambda x: x[1]))[0]]]

rlt_df = pd.DataFrame.from_dict(rlt_dict)
rlt_df.to_csv('kernelsvm_cv_rlt.csv')

In [13]:
import re
import ast
result = pd.read_csv('kernelsvm_cv_rlt.csv')
candidate_params = ast.literal_eval(result.candidate_params.values[0])
print("1. Two candidate parameters are: ", candidate_params)
best_c = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['C']
best_kernel = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['kernel']
best_gamma = ast.literal_eval(re.search('({.+})', result.best_params_.values[0]).group(0))['gamma']
print("2. Best C, kernel and gamma are: {}, {} and {}".format(best_c, best_kernel, best_gamma))
best_score = result.best_score_.values[0]
print("3. Best average CV validation score is: ", best_score)
std_of_best_score = result.std_of_best_mean_test_score.values[0]
print("4. Standard Deviation of Best average CV validation score is: ", std_of_best_score)
mean_test_score = result.mean_test_score.values[0]
print("5. Average CV validation score: ", mean_test_score)
std_test_score = result.std_test_score.values[0]
print("6. Standard Deviation of CV validation score: ", std_test_score)
mean_train_score = result.mean_train_score.values[0]
print("7. Average CV training score: ", mean_train_score)
std_train_score = result.std_train_score.values[0]
print("8. Standard Deviation of CV training score: ", std_train_score)

print("1st fold validation score: ", result.split0_test_score.values[0])
print("2nd fold validation score: ", result.split1_test_score.values[0])
print("1st fold training score: ", result.split0_train_score.values[0])
print("2nd fold training score: ", result.split1_train_score.values[0])

1. Two candidate parameters are:  [{'kernel': 'rbf', 'gamma': 0.001, 'C': 87}, {'kernel': 'poly', 'gamma': 0.001, 'C': 87}]
2. Best C, kernel and gamma are: 87, poly and 0.001
3. Best average CV validation score is:  0.7565343438326259
4. Standard Deviation of Best average CV validation score is:  0.0013193399255403296
5. Average CV validation score:  [0.75568958 0.75653434]
6. Standard Deviation of CV validation score:  [0.00101853 0.00131934]
7. Average CV training score:  [0.81502068 0.79615193]
8. Standard Deviation of CV training score:  [0.00039024 0.00034963]
1st fold validation score:  [0.75467108 0.75521504]
2nd fold validation score:  [0.75670814 0.75785372]
1st fold training score:  [0.81541093 0.79650157]
2nd fold training score:  [0.81463044 0.7958023 ]


## Summary

Choose the final model which has the highest average cross validation AUC

In [14]:
def read_cv_results(filename):
    result = pd.read_csv(filename)
    best_score = result.best_score_.values[0]
    std_score = result.std_of_best_mean_test_score.values[0]
    return [best_score, std_score]

cv_RFT = read_cv_results('randomforest_cv_rlt.csv')[0]
cv_XGB = read_cv_results('xgboost_cv_rlt.csv')[0]
cv_LGB = read_cv_results('lightgbm_cv_rlt.csv')[0]
cv_LGI = read_cv_results('logistic_cv_rlt.csv')[0]
cv_KNN = read_cv_results('knn_cv_rlt.csv')[0]
cv_SVM = read_cv_results('kernelsvm_cv_rlt.csv')[0]  

std_cv_RFT = read_cv_results('randomforest_cv_rlt.csv')[1]
std_cv_XGB = read_cv_results('xgboost_cv_rlt.csv')[1]
std_cv_LGB = read_cv_results('lightgbm_cv_rlt.csv')[1]
std_cv_LGI = read_cv_results('logistic_cv_rlt.csv')[1]
std_cv_KNN = read_cv_results('knn_cv_rlt.csv')[1]
std_cv_SVM = read_cv_results('kernelsvm_cv_rlt.csv')[1] 

print(f"Average Cross Validation AUC for RandomForest, XGboost, LogReg, LightGBM, KNN, SVM are: {cv_RFT:.4f}, {cv_XGB:.4f}, {cv_LGB:.4f}, {cv_LGI:.4f}, {cv_KNN:.4f}, {cv_SVM:.4f}")
print(f"Standard Deviation of Average Cross Validation AUC for RandomForest, XGboost, LogReg, LightGBM, KNN, SVM are: {std_cv_RFT:.4f}, {std_cv_XGB:.4f}, {std_cv_LGB:.4f}, {std_cv_LGI:.4f}, {std_cv_KNN:.4f}, {std_cv_SVM:.4f}")

all_scores = [cv_RFT, cv_XGB, cv_LGB, cv_LGI, cv_KNN, cv_SVM]
modelnames = ['RandomForest','XGboost','LightGBM','Logistic','KNN','SVM']
print("Best model is: ", modelnames[max(enumerate(all_scores),key=(lambda x: x[1]))[0]])


Average Cross Validation AUC for RandomForest, XGboost, LogReg, LightGBM, KNN, SVM are: 0.7500, 0.7722, 0.7704, 0.7652, 0.5860, 0.7565
Standard Deviation of Average Cross Validation AUC for RandomForest, XGboost, LogReg, LightGBM, KNN, SVM are: 0.0076, 0.0074, 0.0065, 0.0040, 0.0053, 0.0013
Best model is:  XGboost
