In [1]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier

## Data cleaning

In [2]:
column_names = ['target', 'feature1', 'feature2', 'feature3', 'feature4', 
                'feature5', 'feature6', 'feature7', 'feature8', 'feature9', 'feature10', 
                'feature11', 'feature12', 'feature13', 'feature14', 'feature15', 'feature16']

## df_adult = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', names = column_names, index_col = False)
df_letter = pd.read_csv('letter-recognition.data', names = column_names, index_col = False)

In [3]:
print(df_letter.shape)
df_letter.head()

(20000, 17)


Unnamed: 0,target,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [4]:
X = df_letter.iloc[:, 1:]
print(X.shape)
X.head()

(20000, 16)


Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16
0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [5]:
X = StandardScaler().fit_transform(X)
X.shape

(20000, 16)

In [6]:
def letter_replace_1 (letter):
    df_letter.loc[df_letter['target'] == letter, ['target']] = 1
    
def letter_replace_0 (letter):
    df_letter.loc[df_letter['target'] == letter, ['target']] = 0

positive_letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M'] 
negative_letters = ['N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']

for i in positive_letters:
    letter_replace_1(i)

for i in negative_letters:
    letter_replace_0(i)

In [7]:
df_letter['target'].describe()

count     20000
unique        2
top           0
freq      10060
Name: target, dtype: int64

In [8]:
Y = df_letter.loc[:, ['target']]
Y = Y.astype(int)

In [9]:
#Creat training and testing data sets for 5 trials
X_train_arr = []
X_test_arr = []
Y_train_arr = []
Y_test_arr = []

for i in [1, 2, 3, 4, 5]:
    X_train, X_test, Y_train, Y_test = train_test_split( 
        X, Y, train_size = 5000, stratify = Y, 
        shuffle = True, random_state = i)
    X_train_arr.append(X_train)
    X_test_arr.append(X_test)
    Y_train_arr.append(Y_train)
    Y_test_arr.append(Y_test)

In [10]:
for i in [0, 1, 2, 3, 4]:
    Y_train_arr[i] = Y_train_arr[i].values.ravel()
    Y_test_arr[i] = Y_test_arr[i].values.ravel()

## Logistic Regression

In [11]:
# C_list for SVM.
C_logre = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 
         1e1, 1e2, 1e3, 1e4]

In [12]:
%%time
##pipe = Pipeline(steps=[('std', StandardScaler()), ('logRe', LogisticRegression())])
pipe = Pipeline([('logRe', LogisticRegression())])

search_space = [{'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['saga'],
                 'logRe__penalty': ['l1', 'l2'],
                 'logRe__C': C_logre},
                {'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['lbfgs', 'sag', 'newton-cg'],
                 'logRe__penalty': ['l2'],
                 'logRe__C': C_logre},
                {'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['saga', 'lbfgs', 'sag', 'newton-cg'],
                 'logRe__penalty': ['none']}
                ]

search = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits=5), 
                   scoring = ['accuracy', 'roc_auc', 'f1'], refit = False,
                   verbose = 0, n_jobs = -1)

best_logre_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_logre_arr
for i in [0, 1, 2, 3, 4]:
    best_logre = search.fit(X_train_arr[i], Y_train_arr[i])
    best_logre_arr.append(best_logre)

Wall time: 4.41 s


In [13]:
#show the result of best models
#each group of three is the result for a trial
#within each group, each result is the best model for each performance metric
for i in [0, 1, 2, 3, 4]:
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])])
    print(' ')

{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.1, 'logRe__penalty': 'l2', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 10000.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 10.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
 
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.1, 'logRe__penalty': 'l2', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 10000.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 10.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
 
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.1, 'logRe__penalty': 'l2', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 10000.0, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 10.0, 'logRe__penalty': 'l1', 'logRe_

## acc

In [14]:
# logre_lam_acc_train 
# logre_lam_acc_test 
logre_lam_acc_train = []
logre_lam_acc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 5000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    logre_lam_acc_train.append(train_acc)
    logre_lam_acc_test.append(test_acc)
    
print(logre_lam_acc_train)
print(logre_lam_acc_test)

0.7246
0.7266666666666667
0.7266
0.7243333333333334
0.7298
0.7235333333333334
0.7188
0.7296
0.7194
0.7281333333333333
[0.7246, 0.7266, 0.7298, 0.7188, 0.7194]
[0.7266666666666667, 0.7243333333333334, 0.7235333333333334, 0.7296, 0.7281333333333333]


In [15]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_acc_train_mean = sum(logre_lam_acc_train)/len(logre_lam_acc_train)
logre_lam_acc_test_mean = sum(logre_lam_acc_test)/len(logre_lam_acc_test)
print(logre_lam_acc_train_mean)
print(logre_lam_acc_test_mean)

0.72384
0.7264533333333334


## roc_auc

In [16]:
# logre_lam_roc_train stores
# logre_lam_roc_test stores
logre_lam_roc_train = []
logre_lam_roc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 5000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    logre_lam_roc_train.append(train_roc)
    logre_lam_roc_test.append(test_roc)
    
print(logre_lam_roc_train)
print(logre_lam_roc_test)

0.7250657023652851
0.7282747512243775
0.725041701501254
0.7250895698911826
0.7322587613154075
0.7257459935224335
0.7187158737714558
0.7305422995227827
0.7202535291270484
0.7297040026774297
[0.7250657023652851, 0.725041701501254, 0.7322587613154075, 0.7187158737714558, 0.7202535291270484]
[0.7282747512243775, 0.7250895698911826, 0.7257459935224335, 0.7305422995227827, 0.7297040026774297]


In [17]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_roc_train_mean = sum(logre_lam_roc_train)/len(logre_lam_roc_train)
logre_lam_roc_test_mean = sum(logre_lam_roc_test)/len(logre_lam_roc_test)
print(logre_lam_roc_train_mean)
print(logre_lam_roc_test_mean)

0.7242671136160901
0.7278713233676413


## f1

In [18]:
# logre_lam_f1_train stores
# logre_lam_f1_test stores
logre_lam_f1_train = []
logre_lam_f1_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 5000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    logre_lam_f1_train.append(train_f1)
    logre_lam_f1_test.append(test_f1)
    
print(logre_lam_f1_train)
print(logre_lam_f1_test)

0.7270560190703218
0.7333856619570904
0.7257131458208658
0.7247014875592022
0.733134328358209
0.7280777109627965
0.7227586206896552
0.7355961022823883
0.7213375796178343
0.7358551989061787
[0.7270560190703218, 0.7257131458208658, 0.733134328358209, 0.7227586206896552, 0.7213375796178343]
[0.7333856619570904, 0.7247014875592022, 0.7280777109627965, 0.7355961022823883, 0.7358551989061787]


In [19]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_f1_train_mean = sum(logre_lam_f1_train)/len(logre_lam_f1_train)
logre_lam_f1_test_mean = sum(logre_lam_f1_test)/len(logre_lam_f1_test)
print(logre_lam_f1_train_mean)
print(logre_lam_f1_test_mean)

0.7259999387113771
0.7315232323335312


## SVM

In [20]:
# C list for SVM
C_svm = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
C_svm_linear = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]

# gamma list for SVM with kernel rbf
gamma_svm = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]

In [21]:
%%time
##pipe = Pipeline(steps=[('std', StandardScaler()), ('svm_classifier', svm.SVC())])
pipe = Pipeline([('svm_classifier', svm.SVC())])

search_space = [{'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['linear'],
                 'svm_classifier__C': C_svm_linear},
                {'svm_classifier': [svm.SVC(max_iter = 1000000)],
                 'svm_classifier__kernel': ['linear'],
                 'svm_classifier__C': [100, 1000]},
                {'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['poly'],
                 'svm_classifier__degree': [2, 3],
                 'svm_classifier__C': C_svm},
                {'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['rbf'],
                 'svm_classifier__gamma': gamma_svm,
                 'svm_classifier__C': C_svm}
                ]

clf = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits=5), 
                   scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                   verbose=0, n_jobs = -1)

best_svm_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_svm_arr
for i in [0, 1, 2, 3, 4]:
    best_svm = clf.fit(X_train_arr[i], Y_train_arr[i])
    best_svm_arr.append(best_svm)

Wall time: 5min 51s


In [22]:
#show the result of best models
#each group of three is the result for a trial
#within each group, each result is the best model for each performance metric
for i in [0, 1, 2, 3, 4]:
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])])
    print(' ')

{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.5, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.5, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.5, 'svm_classifier__kernel': 'rbf'}
 
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.5, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.5, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.5, 'svm_classifier__kernel': 'rbf'}
 
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.5, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier__gamma': 0.5, 'svm_classifier__kernel': 'rbf'}
{'svm_classifier': SVC(), 'svm_classifier__C': 10.0, 'svm_classifier

## acc

In [23]:
# svm_lam_acc_train stores
# svm_lam_acc_test stores
svm_lam_acc_train = []
svm_lam_acc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    svm_lam_acc_train.append(train_acc)
    svm_lam_acc_test.append(test_acc)
    
print(svm_lam_acc_train)
print(svm_lam_acc_test)

1.0
0.9606
1.0
0.9609333333333333
1.0
0.9610666666666666
1.0
0.9652
1.0
0.9641333333333333
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.9606, 0.9609333333333333, 0.9610666666666666, 0.9652, 0.9641333333333333]


In [24]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_acc_train_mean = sum(svm_lam_acc_train)/len(svm_lam_acc_train)
svm_lam_acc_test_mean = sum(svm_lam_acc_test)/len(svm_lam_acc_test)
print(svm_lam_acc_train_mean)
print(svm_lam_acc_test_mean)

1.0
0.9623866666666666


## roc_auc

In [25]:
# svm_lam_roc_train stores
# svm_lam_roc_test stores
svm_lam_roc_train = []
svm_lam_roc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    svm_lam_roc_train.append(train_roc)
    svm_lam_roc_test.append(test_roc)
    
print(svm_lam_roc_train)
print(svm_lam_roc_test)

1.0
0.9605837810161165
1.0
0.9608679245786181
1.0
0.9610332638641658
1.0
0.9651963470684943
1.0
0.964120041654833
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.9605837810161165, 0.9608679245786181, 0.9610332638641658, 0.9651963470684943, 0.964120041654833]


In [26]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_roc_train_mean = sum(svm_lam_roc_train)/len(svm_lam_roc_train)
svm_lam_roc_test_mean = sum(svm_lam_roc_test)/len(svm_lam_roc_test)
print(svm_lam_roc_train_mean)
print(svm_lam_roc_test_mean)

1.0
0.9623602716364456


## f1

In [27]:
# svm_lam_f1_train stores
# svm_lam_f1_test stores
svm_lam_f1_train = []
svm_lam_f1_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    svm_lam_f1_train.append(train_f1)
    svm_lam_f1_test.append(test_f1)
    
print(svm_lam_f1_train)
print(svm_lam_f1_test)

1.0
0.9602635648490553
1.0
0.9602711864406779
1.0
0.9606203641267701
1.0
0.964975845410628
1.0
0.9638440860215054
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.9602635648490553, 0.9602711864406779, 0.9606203641267701, 0.964975845410628, 0.9638440860215054]


In [28]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_f1_train_mean = sum(svm_lam_f1_train)/len(svm_lam_f1_train)
svm_lam_f1_test_mean = sum(svm_lam_f1_test)/len(svm_lam_f1_test)
print(svm_lam_f1_train_mean)
print(svm_lam_f1_test_mean)

1.0
0.9619950093697274


## KNN

In [29]:
#K list for KNN
K_knn = np.arange(1, 105, 4)
K_knn

array([  1,   5,   9,  13,  17,  21,  25,  29,  33,  37,  41,  45,  49,
        53,  57,  61,  65,  69,  73,  77,  81,  85,  89,  93,  97, 101])

In [30]:
%%time
##pipe = Pipeline(steps=[('std', StandardScaler()), ('knn', KNeighborsClassifier())])
pipe = Pipeline([('knn', KNeighborsClassifier())])

search_space = [{'knn': [KNeighborsClassifier()],
                 'knn__weights': ['uniform', 'distance'],
                 'knn__n_neighbors': K_knn}
                ]

search = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits=5), 
                   scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                   verbose=0, n_jobs = -1)

best_knn_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_svm_arr
for i in [0, 1, 2, 3, 4]:
    best_knn = search.fit(X_train_arr[i], Y_train_arr[i])
    best_knn_arr.append(best_knn)

Wall time: 44.9 s


In [31]:
#show the result of best models
#each group of three is the result for a trial
#within each group, each result is the best model for each performance metric
for i in [0, 1, 2, 3, 4]:
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])])

{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 5, 'knn__weights': 'distance'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNei

## acc

In [32]:
# knn_lam_acc_train stores
# knn_lam_acc_test stores
knn_lam_acc_train = []
knn_lam_acc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])].get('knn__n_neighbors')
                              )
    
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    knn_lam_acc_train.append(train_acc)
    knn_lam_acc_test.append(test_acc)
    
print(knn_lam_acc_train)
print(knn_lam_acc_test)

1.0
0.9528
1.0
0.9511333333333334
1.0
0.9516
1.0
0.9536666666666667
1.0
0.9572
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.9528, 0.9511333333333334, 0.9516, 0.9536666666666667, 0.9572]


In [33]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_acc_train_mean = sum(knn_lam_acc_train)/len(knn_lam_acc_train)
knn_lam_acc_test_mean = sum(knn_lam_acc_test)/len(knn_lam_acc_test)
print(knn_lam_acc_train_mean)
print(knn_lam_acc_test_mean)

1.0
0.95328


## roc_auc

In [34]:
# knn_lam_roc_train stores
# knn_lam_roc_test stores
knn_lam_roc_train = []
knn_lam_roc_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])].get('knn__n_neighbors')
                              )
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    knn_lam_roc_train.append(train_roc)
    knn_lam_roc_test.append(test_roc)
    
print(knn_lam_roc_train)
print(knn_lam_roc_test)

1.0
0.9483598742888079
1.0
0.945835116730869
1.0
0.9440546526341614
1.0
0.9467007478935908
1.0
0.9491511027730333
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.9483598742888079, 0.945835116730869, 0.9440546526341614, 0.9467007478935908, 0.9491511027730333]


In [35]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_roc_train_mean = sum(knn_lam_roc_train)/len(knn_lam_roc_train)
knn_lam_roc_test_mean = sum(knn_lam_roc_test)/len(knn_lam_roc_test)
print(knn_lam_roc_train_mean)
print(knn_lam_roc_test_mean)

1.0
0.9468202988640926


## f1

In [36]:
# knn_lam_f1_train stores
# knn_lam_f1_test stores
knn_lam_f1_train = []
knn_lam_f1_test = []

#calculate the training and testing performance looping through each trial
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])].get('knn__n_neighbors')
                              )
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    knn_lam_f1_train.append(train_f1)
    knn_lam_f1_test.append(test_f1)
    
print(knn_lam_f1_train)
print(knn_lam_f1_test)

1.0
0.9524257492272543
1.0
0.9506231054227012
1.0
0.95114401076716
1.0
0.9534400750318215
1.0
0.9569820423478961
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.9524257492272543, 0.9506231054227012, 0.95114401076716, 0.9534400750318215, 0.9569820423478961]


In [37]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_f1_train_mean = sum(knn_lam_f1_train)/len(knn_lam_f1_train)
knn_lam_f1_test_mean = sum(knn_lam_f1_test)/len(knn_lam_f1_test)
print(knn_lam_f1_train_mean)
print(knn_lam_f1_test_mean)

1.0
0.9529229965593666
