In [1]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

## Data cleaning

In [2]:
column_names = np.arange(0, 55, 1, dtype = int)

df_covtype = pd.read_csv('covtype.data', names = column_names, index_col = False)

In [3]:
df_covtype

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,5
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,5
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,2
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,2
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
581007,2396,153,20,85,17,108,240,237,118,837,...,0,0,0,0,0,0,0,0,0,3
581008,2391,152,19,67,12,95,240,237,119,845,...,0,0,0,0,0,0,0,0,0,3
581009,2386,159,17,60,7,90,236,241,130,854,...,0,0,0,0,0,0,0,0,0,3
581010,2384,170,15,60,5,90,230,245,143,864,...,0,0,0,0,0,0,0,0,0,3


In [4]:
X = df_covtype.iloc[:, 0:54]
print(X.shape)
X.head()

(581012, 54)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
0,2596,51,3,258,0,510,221,232,148,6279,...,0,0,0,0,0,0,0,0,0,0
1,2590,56,2,212,-6,390,220,235,151,6225,...,0,0,0,0,0,0,0,0,0,0
2,2804,139,9,268,65,3180,234,238,135,6121,...,0,0,0,0,0,0,0,0,0,0
3,2785,155,18,242,118,3090,238,238,122,6211,...,0,0,0,0,0,0,0,0,0,0
4,2595,45,2,153,-1,391,220,234,150,6172,...,0,0,0,0,0,0,0,0,0,0


In [5]:
X = StandardScaler().fit_transform(X)
X.shape

(581012, 54)

In [6]:
Y = df_covtype.loc[:, [54]]
np.unique(Y, return_counts = True)

(array([1, 2, 3, 4, 5, 6, 7], dtype=int64),
 array([211840, 283301,  35754,   2747,   9493,  17367,  20510],
       dtype=int64))

In [7]:
Y.loc[Y[54] != 2, [54]] = 0
Y.loc[Y[54] == 2, [54]] = 1
print(Y.shape)
print(np.unique(Y, return_counts = True))

(581012, 1)
(array([0, 1], dtype=int64), array([297711, 283301], dtype=int64))


In [8]:
#Creat training and testing data sets for 5 trials
X_train_arr = []
X_test_arr = []
Y_train_arr = []
Y_test_arr = []

for i in [1, 2, 3, 4, 5]:
    X_train, X_test, Y_train, Y_test = train_test_split( 
        X, Y, train_size = 5000, stratify = Y, 
        shuffle = True, random_state = i)
    X_train_arr.append(X_train)
    X_test_arr.append(X_test)
    Y_train_arr.append(Y_train)
    Y_test_arr.append(Y_test)

In [9]:
for i in [0, 1, 2, 3, 4]:
    Y_train_arr[i] = Y_train_arr[i].values.ravel()
    Y_test_arr[i] = Y_test_arr[i].values.ravel()

## Logistic Regression

In [10]:
# C_list for SVM.
C_logre = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 
         1e1, 1e2, 1e3, 1e4]

In [11]:
%%time
pipe = Pipeline([('logRe', LogisticRegression())])

search_space = [{'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['saga'],
                 'logRe__penalty': ['l1', 'l2'],
                 'logRe__C': C_logre},
                {'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['lbfgs', 'sag', 'newton-cg'],
                 'logRe__penalty': ['l2'],
                 'logRe__C': C_logre},
                {'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['saga', 'lbfgs', 'sag', 'newton-cg'],
                 'logRe__penalty': ['none']}
                ]

search = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits=5), 
                   scoring = ['accuracy', 'roc_auc', 'f1'], refit = False,
                   verbose = 0, n_jobs = -1)

best_logre_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_logre_arr
for i in [0, 1, 2, 3, 4]:
    best_logre = search.fit(X_train_arr[i], Y_train_arr[i])
    best_logre_arr.append(best_logre)

Wall time: 10min 47s


In [12]:
#show the result of best models
#each group of three is the result for a trial
#within each group, each result is the best model for each performance metric
for i in [0, 1, 2, 3, 4]:
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])])
    print(' ')

{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.1, 'logRe__penalty': 'l2', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.1, 'logRe__penalty': 'l2', 'logRe__solver': 'saga'}
 
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.1, 'logRe__penalty': 'l2', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.1, 'logRe__penalty': 'l2', 'logRe__solver': 'saga'}
 
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.1, 'logRe__penalty': 'l2', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.1, 'logRe__penalty': 'l2', 'logRe__solver': 'saga

## acc

In [13]:
# logre_lam_acc_train 
# logre_lam_acc_test 
logre_lam_acc_train = []
logre_lam_acc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 5000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    logre_lam_acc_train.append(train_acc)
    logre_lam_acc_test.append(test_acc)
    
print(logre_lam_acc_train)
print(logre_lam_acc_test)

0.7512
0.7533384721151642
0.7614
0.7529634799275015
0.7606
0.7536023555064825
0.7704
0.7545415720505823
0.7652
0.7544773372776956
[0.7512, 0.7614, 0.7606, 0.7704, 0.7652]
[0.7533384721151642, 0.7529634799275015, 0.7536023555064825, 0.7545415720505823, 0.7544773372776956]


In [14]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_acc_train_mean = sum(logre_lam_acc_train)/len(logre_lam_acc_train)
logre_lam_acc_test_mean = sum(logre_lam_acc_test)/len(logre_lam_acc_test)
print(logre_lam_acc_train_mean)
print(logre_lam_acc_test_mean)

0.76176
0.7537846433754851


## roc_auc

In [15]:
# logre_lam_roc_train stores
# logre_lam_roc_test stores
logre_lam_roc_train = []
logre_lam_roc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 5000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    logre_lam_roc_train.append(train_roc)
    logre_lam_roc_test.append(test_roc)
    
print(logre_lam_roc_train)
print(logre_lam_roc_test)

0.751338102986861
0.7537483357160824
0.7608612400971093
0.7535146176637891
0.7616353161848664
0.7544650557295874
0.7720516426422908
0.7552259020865517
0.7634348229535094
0.7550988063594606
[0.751338102986861, 0.7608612400971093, 0.7616353161848664, 0.7720516426422908, 0.7634348229535094]
[0.7537483357160824, 0.7535146176637891, 0.7544650557295874, 0.7552259020865517, 0.7550988063594606]


In [16]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_roc_train_mean = sum(logre_lam_roc_train)/len(logre_lam_roc_train)
logre_lam_roc_test_mean = sum(logre_lam_roc_test)/len(logre_lam_roc_test)
print(logre_lam_roc_train_mean)
print(logre_lam_roc_test_mean)

0.7618642249729274
0.7544105435110942


## f1

In [17]:
# logre_lam_f1_train stores
# logre_lam_f1_test stores
logre_lam_f1_train = []
logre_lam_f1_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 5000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    logre_lam_f1_train.append(train_f1)
    logre_lam_f1_test.append(test_f1)
    
print(logre_lam_f1_train)
print(logre_lam_f1_test)

0.7503010839020473
0.752705240604108
0.7577664974619289
0.7506784991546137
0.7588152327221439
0.7515914202452109
0.7689210950080514
0.7536120255050563
0.7652
0.7524379847743015
[0.7503010839020473, 0.7577664974619289, 0.7588152327221439, 0.7689210950080514, 0.7652]
[0.752705240604108, 0.7506784991546137, 0.7515914202452109, 0.7536120255050563, 0.7524379847743015]


In [18]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_f1_train_mean = sum(logre_lam_f1_train)/len(logre_lam_f1_train)
logre_lam_f1_test_mean = sum(logre_lam_f1_test)/len(logre_lam_f1_test)
print(logre_lam_f1_train_mean)
print(logre_lam_f1_test_mean)

0.7602007818188343
0.7522050340566582


## SVM

In [19]:
# C list for SVM
C_svm = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
C_svm_linear = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]

# gamma list for SVM with kernel rbf
gamma_svm = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]

In [20]:
%%time
pipe = Pipeline([('svm_classifier', svm.SVC())])

search_space = [{'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['linear'],
                 'svm_classifier__C': C_svm_linear},
                {'svm_classifier': [svm.SVC(max_iter = 1000000)],
                 'svm_classifier__kernel': ['linear'],
                 'svm_classifier__C': [100, 1000]},
                {'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['poly'],
                 'svm_classifier__degree': [2, 3],
                 'svm_classifier__C': C_svm},
                {'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['rbf'],
                 'svm_classifier__gamma': gamma_svm,
                 'svm_classifier__C': C_svm}
                ]

clf = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits=5), 
                   scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                   verbose=0, n_jobs = -1)

best_svm_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_svm_arr
for i in [0, 1, 2, 3, 4]:
    best_svm = clf.fit(X_train_arr[i], Y_train_arr[i])
    best_svm_arr.append(best_svm)

Wall time: 24min 42s


In [21]:
#show the result of best models
#each group of three is the result for a trial
#within each group, each result is the best model for each performance metric
for i in [0, 1, 2, 3, 4]:
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])])
    print(' ')

{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.05, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.1, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.05, 'svm_classifier__kernel': 'rbf'}
 
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.05, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.1, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.05, 'svm_classifier__kernel': 'rbf'}
 
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.05, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.1, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_class

## acc

In [22]:
# svm_lam_acc_train stores
# svm_lam_acc_test stores
svm_lam_acc_train = []
svm_lam_acc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    svm_lam_acc_train.append(train_acc)
    svm_lam_acc_test.append(test_acc)
    
print(svm_lam_acc_train)
print(svm_lam_acc_test)

0.8632
0.8013964986840552
0.8736
0.7988496767428457
0.8704
0.7972316548960785
0.8762
0.7994208453990542
0.8736
0.7955476622014819
[0.8632, 0.8736, 0.8704, 0.8762, 0.8736]
[0.8013964986840552, 0.7988496767428457, 0.7972316548960785, 0.7994208453990542, 0.7955476622014819]


In [23]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_acc_train_mean = sum(svm_lam_acc_train)/len(svm_lam_acc_train)
svm_lam_acc_test_mean = sum(svm_lam_acc_test)/len(svm_lam_acc_test)
print(svm_lam_acc_train_mean)
print(svm_lam_acc_test_mean)

0.8714000000000001
0.7984892675847032


## roc_auc

In [24]:
# svm_lam_roc_train stores
# svm_lam_roc_test stores
svm_lam_roc_train = []
svm_lam_roc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    svm_lam_roc_train.append(train_roc)
    svm_lam_roc_test.append(test_roc)
    
print(svm_lam_roc_train)
print(svm_lam_roc_test)

0.9033456737231668
0.8033561508482511
0.9074605245210015
0.7989142078651003
0.9096999818768535
0.7981917098165305
0.9178669248734742
0.801881814447859
0.9138976035821071
0.797133971827019
[0.9033456737231668, 0.9074605245210015, 0.9096999818768535, 0.9178669248734742, 0.9138976035821071]
[0.8033561508482511, 0.7989142078651003, 0.7981917098165305, 0.801881814447859, 0.797133971827019]


In [25]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_roc_train_mean = sum(svm_lam_roc_train)/len(svm_lam_roc_train)
svm_lam_roc_test_mean = sum(svm_lam_roc_test)/len(svm_lam_roc_test)
print(svm_lam_roc_train_mean)
print(svm_lam_roc_test_mean)

0.9104541417153206
0.7998955709609519


## f1

In [26]:
# svm_lam_f1_train stores
# svm_lam_f1_test stores
svm_lam_f1_train = []
svm_lam_f1_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    svm_lam_f1_train.append(train_f1)
    svm_lam_f1_test.append(test_f1)
    
print(svm_lam_f1_train)
print(svm_lam_f1_test)

0.8638535031847134
0.802123405197465
0.8732959101844426
0.7984329034603815
0.8700361010830324
0.7936073172326413
0.8748230535894843
0.7959636345655291
0.8743038981702467
0.7953689690343678
[0.8638535031847134, 0.8732959101844426, 0.8700361010830324, 0.8748230535894843, 0.8743038981702467]
[0.802123405197465, 0.7984329034603815, 0.7936073172326413, 0.7959636345655291, 0.7953689690343678]


In [27]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_f1_train_mean = sum(svm_lam_f1_train)/len(svm_lam_f1_train)
svm_lam_f1_test_mean = sum(svm_lam_f1_test)/len(svm_lam_f1_test)
print(svm_lam_f1_train_mean)
print(svm_lam_f1_test_mean)

0.8712624932423839
0.7970992458980769


## KNN

In [28]:
#K list for KNN
K_knn = np.arange(1, 105, 4)
K_knn

array([  1,   5,   9,  13,  17,  21,  25,  29,  33,  37,  41,  45,  49,
        53,  57,  61,  65,  69,  73,  77,  81,  85,  89,  93,  97, 101])

In [29]:
%%time
pipe = Pipeline([('knn', KNeighborsClassifier())])

search_space = [{'knn': [KNeighborsClassifier()],
                 'knn__weights': ['uniform', 'distance'],
                 'knn__n_neighbors': K_knn}
                ]

search = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits=5), 
                   scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                   verbose=0, n_jobs = -1)

best_knn_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_svm_arr
for i in [0, 1, 2, 3, 4]:
    best_knn = search.fit(X_train_arr[i], Y_train_arr[i])
    best_knn_arr.append(best_knn)

Wall time: 52.7 s


In [30]:
#show the result of best models
#each group of three is the result for a trial
#within each group, each result is the best model for each performance metric
for i in [0, 1, 2, 3, 4]:
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])])

{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 9, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 9, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 9, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 9, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'kn

## acc

In [31]:
# knn_lam_acc_train stores
# knn_lam_acc_test stores
knn_lam_acc_train = []
knn_lam_acc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])].get('knn__n_neighbors')
                              )
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    knn_lam_acc_train.append(train_acc)
    knn_lam_acc_test.append(test_acc)
    
print(knn_lam_acc_train)
print(knn_lam_acc_test)

1.0
0.7816382297591022
1.0
0.7798188232189607
1.0
0.780393464025055
1.0
0.7838482531613925
1.0
0.7805514468448574
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.7816382297591022, 0.7798188232189607, 0.780393464025055, 0.7838482531613925, 0.7805514468448574]


In [32]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_acc_train_mean = sum(knn_lam_acc_train)/len(knn_lam_acc_train)
knn_lam_acc_test_mean = sum(knn_lam_acc_test)/len(knn_lam_acc_test)
print(knn_lam_acc_train_mean)
print(knn_lam_acc_test_mean)

1.0
0.7812500434018737


## roc_auc

In [33]:
# knn_lam_roc_train stores
# knn_lam_roc_test stores
knn_lam_roc_train = []
knn_lam_roc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])].get('knn__n_neighbors')
                              )
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    knn_lam_roc_train.append(train_roc)
    knn_lam_roc_test.append(test_roc)
    
print(knn_lam_roc_train)
print(knn_lam_roc_test)

1.0
0.7827344773150235
1.0
0.7787132865643966
1.0
0.7789959346589663
1.0
0.781575570214404
1.0
0.776740456987496
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.7827344773150235, 0.7787132865643966, 0.7789959346589663, 0.781575570214404, 0.776740456987496]


In [34]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_roc_train_mean = sum(knn_lam_roc_train)/len(knn_lam_roc_train)
knn_lam_roc_test_mean = sum(knn_lam_roc_test)/len(knn_lam_roc_test)
print(knn_lam_roc_train_mean)
print(knn_lam_roc_test_mean)

1.0
0.7797519451480573


## f1

In [35]:
# knn_lam_f1_train stores
# knn_lam_f1_test stores
knn_lam_f1_train = []
knn_lam_f1_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])].get('knn__n_neighbors')
                              )
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    knn_lam_f1_train.append(train_f1)
    knn_lam_f1_test.append(test_f1)
    
print(knn_lam_f1_train)
print(knn_lam_f1_test)

1.0
0.7810235116949137
1.0
0.7790889883280003
1.0
0.7753945368542166
1.0
0.7802600043063435
1.0
0.7781188947809098
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.7810235116949137, 0.7790889883280003, 0.7753945368542166, 0.7802600043063435, 0.7781188947809098]


In [36]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_f1_train_mean = sum(knn_lam_f1_train)/len(knn_lam_f1_train)
knn_lam_f1_test_mean = sum(knn_lam_f1_test)/len(knn_lam_f1_test)
print(knn_lam_f1_train_mean)
print(knn_lam_f1_test_mean)

1.0
0.7787771871928768
