In [1]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

## Data cleaning

In [2]:
column_names = np.arange(0, 11, 1)

df_magic04 = pd.read_csv('magic04.data', names = column_names, index_col = False, sep = ',')

In [3]:
df_magic04

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.0110,-8.2027,40.0920,81.8828,g
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.2610,g
2,162.0520,136.0310,4.0612,0.0374,0.0187,116.7410,-64.8580,-45.2160,76.9600,256.7880,g
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.4490,116.7370,g
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.6480,356.4620,g
...,...,...,...,...,...,...,...,...,...,...,...
19015,21.3846,10.9170,2.6161,0.5857,0.3934,15.2618,11.5245,2.8766,2.4229,106.8258,h
19016,28.9452,6.7020,2.2672,0.5351,0.2784,37.0816,13.1853,-2.9632,86.7975,247.4560,h
19017,75.4455,47.5305,3.4483,0.1417,0.0549,-9.3561,41.0562,-9.4662,30.2987,256.5166,h
19018,120.5135,76.9018,3.9939,0.0944,0.0683,5.8043,-93.5224,-63.8389,84.6874,408.3166,h


In [4]:
X = df_magic04.iloc[:, :10]
print(X.shape)
X

(19020, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,28.7967,16.0021,2.6449,0.3918,0.1982,27.7004,22.0110,-8.2027,40.0920,81.8828
1,31.6036,11.7235,2.5185,0.5303,0.3773,26.2722,23.8238,-9.9574,6.3609,205.2610
2,162.0520,136.0310,4.0612,0.0374,0.0187,116.7410,-64.8580,-45.2160,76.9600,256.7880
3,23.8172,9.5728,2.3385,0.6147,0.3922,27.2107,-6.4633,-7.1513,10.4490,116.7370
4,75.1362,30.9205,3.1611,0.3168,0.1832,-5.5277,28.5525,21.8393,4.6480,356.4620
...,...,...,...,...,...,...,...,...,...,...
19015,21.3846,10.9170,2.6161,0.5857,0.3934,15.2618,11.5245,2.8766,2.4229,106.8258
19016,28.9452,6.7020,2.2672,0.5351,0.2784,37.0816,13.1853,-2.9632,86.7975,247.4560
19017,75.4455,47.5305,3.4483,0.1417,0.0549,-9.3561,41.0562,-9.4662,30.2987,256.5166
19018,120.5135,76.9018,3.9939,0.0944,0.0683,5.8043,-93.5224,-63.8389,84.6874,408.3166


In [5]:
X = StandardScaler().fit_transform(X)

In [6]:
Y = df_magic04.loc[:, [10]]
Y = Y.replace('g', 1)
Y = Y.replace('h', 0)
np.unique(Y, return_counts = True)

(array([0, 1], dtype=int64), array([ 6688, 12332], dtype=int64))

In [7]:
#Creat training and testing data sets for 5 trials
X_train_arr = []
X_test_arr = []
Y_train_arr = []
Y_test_arr = []

for i in [1, 2, 3, 4, 5]:
    X_train, X_test, Y_train, Y_test = train_test_split( 
        X, Y, train_size = 5000, stratify = Y, 
        shuffle = True, random_state = i)
    X_train_arr.append(X_train)
    X_test_arr.append(X_test)
    Y_train_arr.append(Y_train)
    Y_test_arr.append(Y_test)

In [8]:
for i in [0, 1, 2, 3, 4]:
    Y_train_arr[i] = Y_train_arr[i].values.ravel()
    Y_test_arr[i] = Y_test_arr[i].values.ravel()

## Logistic Regression

In [9]:
# C_list for SVM.
C_logre = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 
         1e1, 1e2, 1e3, 1e4]

In [10]:
%%time
pipe = Pipeline([('logRe', LogisticRegression())])

search_space = [{'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['saga'],
                 'logRe__penalty': ['l1', 'l2'],
                 'logRe__C': C_logre},
                {'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['lbfgs', 'sag', 'newton-cg'],
                 'logRe__penalty': ['l2'],
                 'logRe__C': C_logre},
                {'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['saga', 'lbfgs', 'sag', 'newton-cg'],
                 'logRe__penalty': ['none']}
                ]

search = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits=5), 
                   scoring = ['accuracy', 'roc_auc', 'f1'], refit = False,
                   verbose = 0, n_jobs = -1)

best_logre_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_logre_arr
for i in [0, 1, 2, 3, 4]:
    best_logre = search.fit(X_train_arr[i], Y_train_arr[i])
    best_logre_arr.append(best_logre)

Wall time: 4.36 s


In [11]:
#show the result of best models
#each group of three is the result for a trial
#within each group, each result is the best model for each performance metric
for i in [0, 1, 2, 3, 4]:
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])])
    print(' ')

{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
 
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
 
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 1.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga

## acc

In [12]:
# logre_lam_acc_train 
# logre_lam_acc_test 
logre_lam_acc_train = []
logre_lam_acc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 5000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    logre_lam_acc_train.append(train_acc)
    logre_lam_acc_test.append(test_acc)
    
print(logre_lam_acc_train)
print(logre_lam_acc_test)

0.7926
0.7914407988587732
0.794
0.792867332382311
0.7868
0.793509272467903
0.7888
0.7927246790299572
0.8042
0.7868045649072754
[0.7926, 0.794, 0.7868, 0.7888, 0.8042]
[0.7914407988587732, 0.792867332382311, 0.793509272467903, 0.7927246790299572, 0.7868045649072754]


In [13]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_acc_train_mean = sum(logre_lam_acc_train)/len(logre_lam_acc_train)
logre_lam_acc_test_mean = sum(logre_lam_acc_test)/len(logre_lam_acc_test)
print(logre_lam_acc_train_mean)
print(logre_lam_acc_test_mean)

0.79328
0.791469329529244


## roc_auc

In [14]:
# logre_lam_roc_train stores
# logre_lam_roc_test stores
logre_lam_roc_train = []
logre_lam_roc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 7000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    logre_lam_roc_train.append(train_roc)
    logre_lam_roc_test.append(test_roc)
    
print(logre_lam_roc_train)
print(logre_lam_roc_test)

0.7465926453073602
0.745267630211297
0.7480627907743853
0.7487348734873487
0.7414691558954254
0.7507151830801742
0.7437925436832697
0.7481143043310416
0.7611358387040401
0.7409960570093521
[0.7465926453073602, 0.7480627907743853, 0.7414691558954254, 0.7437925436832697, 0.7611358387040401]
[0.745267630211297, 0.7487348734873487, 0.7507151830801742, 0.7481143043310416, 0.7409960570093521]


In [15]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_roc_train_mean = sum(logre_lam_roc_train)/len(logre_lam_roc_train)
logre_lam_roc_test_mean = sum(logre_lam_roc_test)/len(logre_lam_roc_test)
print(logre_lam_roc_train_mean)
print(logre_lam_roc_test_mean)

0.7482105948728961
0.7467656096238426


## f1

In [16]:
# logre_lam_f1_train stores
# logre_lam_f1_test stores
logre_lam_f1_train = []
logre_lam_f1_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 5000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    logre_lam_f1_train.append(train_f1)
    logre_lam_f1_test.append(test_f1)
    
print(logre_lam_f1_train)
print(logre_lam_f1_test)

0.8493389510387911
0.8485131074500051
0.8503776873910518
0.8489073881373569
0.8446969696969696
0.8489433863814245
0.8461090061206645
0.848960498960499
0.8571845368344274
0.8448642757045725
[0.8493389510387911, 0.8503776873910518, 0.8446969696969696, 0.8461090061206645, 0.8571845368344274]
[0.8485131074500051, 0.8489073881373569, 0.8489433863814245, 0.848960498960499, 0.8448642757045725]


In [17]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_f1_train_mean = sum(logre_lam_f1_train)/len(logre_lam_f1_train)
logre_lam_f1_test_mean = sum(logre_lam_f1_test)/len(logre_lam_f1_test)
print(logre_lam_f1_train_mean)
print(logre_lam_f1_test_mean)

0.8495414302163808
0.8480377313267716


## SVM

In [18]:
# C list for SVM
C_svm = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
C_svm_linear = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]

# gamma list for SVM with kernel rbf
gamma_svm = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]

In [19]:
%%time
##pipe = Pipeline(steps=[('std', StandardScaler()), ('svm_classifier', svm.SVC())])
pipe = Pipeline([('svm_classifier', svm.SVC())])

search_space = [{'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['linear'],
                 'svm_classifier__C': C_svm_linear},
                {'svm_classifier': [svm.SVC(max_iter = 1000000)],
                 'svm_classifier__kernel': ['linear'],
                 'svm_classifier__C': [100, 1000]},
                {'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['poly'],
                 'svm_classifier__degree': [2, 3],
                 'svm_classifier__C': C_svm},
                {'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['rbf'],
                 'svm_classifier__gamma': gamma_svm,
                 'svm_classifier__C': C_svm}
                ]

clf = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits=5), 
                   scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                   verbose=0, n_jobs = -1)

best_svm_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_svm_arr
for i in [0, 1, 2, 3, 4]:
    best_svm = clf.fit(X_train_arr[i], Y_train_arr[i])
    best_svm_arr.append(best_svm)

Wall time: 5min 16s


In [20]:
#show the result of best models
#each group of three is the result for a trial
#within each group, each result is the best model for each performance metric
for i in [0, 1, 2, 3, 4]:
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])])
    print(' ')

{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.05, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.1, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.05, 'svm_classifier__kernel': 'rbf'}
 
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.05, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.1, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.05, 'svm_classifier__kernel': 'rbf'}
 
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.05, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.1, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_class

## acc

In [21]:
# svm_lam_acc_train stores
# svm_lam_acc_test stores
svm_lam_acc_train = []
svm_lam_acc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    svm_lam_acc_train.append(train_acc)
    svm_lam_acc_test.append(test_acc)
    
print(svm_lam_acc_train)
print(svm_lam_acc_test)

0.8798
0.8666904422253923
0.8776
0.8643366619115549
0.8682
0.8689728958630528
0.874
0.8661911554921541
0.879
0.8587018544935806
[0.8798, 0.8776, 0.8682, 0.874, 0.879]
[0.8666904422253923, 0.8643366619115549, 0.8689728958630528, 0.8661911554921541, 0.8587018544935806]


In [22]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_acc_train_mean = sum(svm_lam_acc_train)/len(svm_lam_acc_train)
svm_lam_acc_test_mean = sum(svm_lam_acc_test)/len(svm_lam_acc_test)
print(svm_lam_acc_train_mean)
print(svm_lam_acc_test_mean)

0.87572
0.8649786019971468


## roc_auc

In [23]:
# svm_lam_roc_train stores
# svm_lam_roc_test stores
svm_lam_roc_train = []
svm_lam_roc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    svm_lam_roc_train.append(train_roc)
    svm_lam_roc_test.append(test_roc)
    
print(svm_lam_roc_train)
print(svm_lam_roc_test)

0.8634270127781065
0.8303958387725181
0.8533323297252571
0.8287731207197799
0.8480105750814642
0.8323296224145742
0.84891189233461
0.8297734844478363
0.8581333310874969
0.8227652481272467
[0.8634270127781065, 0.8533323297252571, 0.8480105750814642, 0.84891189233461, 0.8581333310874969]
[0.8303958387725181, 0.8287731207197799, 0.8323296224145742, 0.8297734844478363, 0.8227652481272467]


In [24]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_roc_train_mean = sum(svm_lam_roc_train)/len(svm_lam_roc_train)
svm_lam_roc_test_mean = sum(svm_lam_roc_test)/len(svm_lam_roc_test)
print(svm_lam_roc_train_mean)
print(svm_lam_roc_test_mean)

0.8543630282013869
0.8288074628963911


## f1

In [25]:
# svm_lam_f1_train stores
# svm_lam_f1_test stores
svm_lam_f1_train = []
svm_lam_f1_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    svm_lam_f1_train.append(train_f1)
    svm_lam_f1_test.append(test_f1)
    
print(svm_lam_f1_train)
print(svm_lam_f1_test)

0.9118639096641736
0.9029040469634786
0.9107090749927051
0.9011331739266036
0.9043402525765712
0.9045565542681976
0.9083236321303843
0.9028281363306745
0.9119487701935671
0.8977759430311162
[0.9118639096641736, 0.9107090749927051, 0.9043402525765712, 0.9083236321303843, 0.9119487701935671]
[0.9029040469634786, 0.9011331739266036, 0.9045565542681976, 0.9028281363306745, 0.8977759430311162]


In [26]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_f1_train_mean = sum(svm_lam_f1_train)/len(svm_lam_f1_train)
svm_lam_f1_test_mean = sum(svm_lam_f1_test)/len(svm_lam_f1_test)
print(svm_lam_f1_train_mean)
print(svm_lam_f1_test_mean)

0.9094371279114803
0.9018395709040142


## KNN

In [27]:
#K list for KNN
K_knn = np.arange(1, 105, 4)
K_knn

array([  1,   5,   9,  13,  17,  21,  25,  29,  33,  37,  41,  45,  49,
        53,  57,  61,  65,  69,  73,  77,  81,  85,  89,  93,  97, 101])

In [28]:
%%time
##pipe = Pipeline(steps=[('std', StandardScaler()), ('knn', KNeighborsClassifier())])
pipe = Pipeline([('knn', KNeighborsClassifier())])

search_space = [{'knn': [KNeighborsClassifier()],
                 'knn__weights': ['uniform', 'distance'],
                 'knn__n_neighbors': K_knn}
                ]

search = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits=5), 
                   scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                   verbose=0, n_jobs = -1)

best_knn_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_svm_arr
for i in [0, 1, 2, 3, 4]:
    best_knn = search.fit(X_train_arr[i], Y_train_arr[i])
    best_knn_arr.append(best_knn)

Wall time: 15 s


In [29]:
#show the result of best models
#each group of three is the result for a trial
#within each group, each result is the best model for each performance metric
for i in [0, 1, 2, 3, 4]:
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])])

{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 13, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 25, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 13, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 13, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 25, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 13, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 13, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 25, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 13, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 13, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 25, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 13, 'knn__weights': 'dis

## acc

In [30]:
# knn_lam_acc_train stores
# knn_lam_acc_test stores
knn_lam_acc_train = []
knn_lam_acc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])].get('knn__n_neighbors')
                              )
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    knn_lam_acc_train.append(train_acc)
    knn_lam_acc_test.append(test_acc)
    
print(knn_lam_acc_train)
print(knn_lam_acc_test)

1.0
0.8314550641940086
1.0
0.8310271041369472
1.0
0.8323823109843081
1.0
0.8294579172610557
1.0
0.8278174037089872
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.8314550641940086, 0.8310271041369472, 0.8323823109843081, 0.8294579172610557, 0.8278174037089872]


In [31]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_acc_train_mean = sum(knn_lam_acc_train)/len(knn_lam_acc_train)
knn_lam_acc_test_mean = sum(knn_lam_acc_test)/len(knn_lam_acc_test)
print(knn_lam_acc_train_mean)
print(knn_lam_acc_test_mean)

1.0
0.8304279600570613


## roc_auc

In [32]:
# knn_lam_roc_train stores
# knn_lam_roc_test stores
knn_lam_roc_train = []
knn_lam_roc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])].get('knn__n_neighbors')
                              )
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    knn_lam_roc_train.append(train_roc)
    knn_lam_roc_test.append(test_roc)
    
print(knn_lam_roc_train)
print(knn_lam_roc_test)

1.0
0.770040523322109
1.0
0.7700215559081263
1.0
0.7718644075360883
1.0
0.7631370540705186
1.0
0.7594636238471717
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.770040523322109, 0.7700215559081263, 0.7718644075360883, 0.7631370540705186, 0.7594636238471717]


In [33]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_roc_train_mean = sum(knn_lam_roc_train)/len(knn_lam_roc_train)
knn_lam_roc_test_mean = sum(knn_lam_roc_test)/len(knn_lam_roc_test)
print(knn_lam_roc_train_mean)
print(knn_lam_roc_test_mean)

1.0
0.7669054329368029


## f1

In [34]:
# knn_lam_f1_train stores
# knn_lam_f1_test stores
knn_lam_f1_train = []
knn_lam_f1_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])].get('knn__n_neighbors')
                              )
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    knn_lam_f1_train.append(train_f1)
    knn_lam_f1_test.append(test_f1)
    
print(knn_lam_f1_train)
print(knn_lam_f1_test)

1.0
0.880541934179263
1.0
0.8802386128102726
1.0
0.8807953738459977
1.0
0.8791630868752212
1.0
0.8785347690449834
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.880541934179263, 0.8802386128102726, 0.8807953738459977, 0.8791630868752212, 0.8785347690449834]


In [35]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_f1_train_mean = sum(knn_lam_f1_train)/len(knn_lam_f1_train)
knn_lam_f1_test_mean = sum(knn_lam_f1_test)/len(knn_lam_f1_test)
print(knn_lam_f1_train_mean)
print(knn_lam_f1_test_mean)

1.0
0.8798547553511475
