In [1]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
import seaborn as sns
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import resample

In [2]:
column_names = ['target', 'feature1', 'feature2', 'feature3', 'feature4', 
                'feature5', 'feature6', 'feature7', 'feature8', 'feature9', 'feature10', 
                'feature11', 'feature12', 'feature13', 'feature14', 'feature15', 'feature16']

## df_adult = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', names = column_names, index_col = False)
df_letter = pd.read_csv('letter-recognition.data', names = column_names, index_col = False)

In [3]:
print(df_letter.shape)
df_letter.head()

(20000, 17)


Unnamed: 0,target,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16
0,T,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,I,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,D,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,N,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,G,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [4]:
X = df_letter.iloc[:, 1:]
print(X.shape)
X.head()

(20000, 16)


Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16
0,2,8,3,5,1,8,13,0,6,6,10,8,0,8,0,8
1,5,12,3,7,2,10,5,5,4,13,3,9,2,8,4,10
2,4,11,6,8,6,10,6,2,6,10,3,7,3,7,3,9
3,7,11,6,6,3,5,9,4,6,4,4,10,6,10,2,8
4,2,1,3,1,1,8,6,6,6,6,5,9,1,7,5,10


In [5]:
df_letter.loc[df_letter['target'] == 'O', ['target']] = 1
df_letter.loc[df_letter['target'] != 1, ['target']] = 0

In [6]:
df_letter['target'].describe()

count     20000
unique        2
top           0
freq      19247
Name: target, dtype: int64

In [7]:
Y = df_letter.loc[:, ['target']]
Y = Y.astype(int)

In [8]:
#Creat training and testing data sets for 5 trials
X_train_arr = []
X_test_arr = []
Y_train_arr = []
Y_test_arr = []

for i in [1, 2, 3, 4, 5]:
    X_train, X_test, Y_train, Y_test = train_test_split( 
        X, Y, train_size = 5000, stratify = Y, 
        shuffle = True, random_state = i)
    X_train_arr.append(X_train)
    X_test_arr.append(X_test)
    Y_train_arr.append(Y_train)
    Y_test_arr.append(Y_test)

In [9]:
X_train_arr[0]

Unnamed: 0,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10,feature11,feature12,feature13,feature14,feature15,feature16
4146,7,9,10,6,6,6,7,3,7,10,8,9,3,8,3,7
2604,5,5,7,8,8,8,8,7,3,7,7,8,6,10,6,4
18579,5,10,6,8,7,8,6,6,4,8,6,8,7,7,6,11
15804,3,4,5,7,1,8,8,4,3,6,14,8,3,9,0,8
19986,9,15,6,8,5,5,7,7,4,10,7,10,5,9,5,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4933,4,10,5,7,5,8,9,8,6,7,4,6,2,6,9,8
12596,4,7,5,5,3,7,6,7,7,11,6,11,2,10,4,9
1180,2,4,4,3,2,7,7,2,7,11,6,8,2,8,4,9
14763,5,11,6,8,7,8,7,5,5,10,5,5,3,8,3,8


In [10]:
# upsample negative data points in training set since they only occupy 3% of the whole data
upsampled_arr = []

for i in [0, 1, 2, 3, 4]:
    X_train_with_Y = pd.concat([X_train_arr[i], Y_train_arr[i]], axis=1)

    large_data = X_train_with_Y[X_train_with_Y.target==0]
    small_data = X_train_with_Y[X_train_with_Y.target==1]

    small_data_upsampled = resample(small_data,
                                    replace = True,
                                    n_samples = 4812, # add 3500 positive class with replacement
                                    random_state = 22)
    X_train_with_Y_upsampled = pd.concat([large_data, small_data_upsampled])
    upsampled_arr.append(X_train_with_Y_upsampled) # could check with X_train_with_Y_upsampled.target.value_counts()

In [11]:
# assign upsampled training set back to X_train_arr[] and Y_train_arr[]
for i in [0, 1, 2, 3, 4]:
    X_train_arr[i] = upsampled_arr[i].iloc[:, 1:]
    Y_train_arr[i] = upsampled_arr[i].loc[:, ['target']]

In [12]:
for i in [0, 1, 2, 3, 4]:
    Y_train_arr[i] = Y_train_arr[i].values.ravel()
    Y_test_arr[i] = Y_test_arr[i].values.ravel()

In [13]:
np.unique(Y_train_arr[4], return_counts=True)

(array([0, 1]), array([4812, 4812], dtype=int64))

In [14]:
X_train_arr[4].shape

(9624, 16)

## Logistic Regression

In [15]:
# C_list for SVM.
C_logre = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 
         1e1, 1e2, 1e3, 1e4]

In [16]:
%%time
pipe = Pipeline(steps=[('std', StandardScaler()), ('logRe', LogisticRegression())])

search_space = [{'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['saga'],
                 'logRe__penalty': ['l1', 'l2'],
                 'logRe__C': C_logre},
                {'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['lbfgs', 'sag', 'newton-cg'],
                 'logRe__penalty': ['l2'],
                 'logRe__C': C_logre},
                {'logRe': [LogisticRegression(max_iter = 5000)],
                 'logRe__solver': ['saga', 'lbfgs', 'sag', 'newton-cg'],
                 'logRe__penalty': ['none']}
                ]

search = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits=5), 
                   scoring = ['accuracy', 'roc_auc', 'f1'], refit = False,
                   verbose = 0, n_jobs = -1)

best_logre_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_logre_arr
for i in [0, 1, 2, 3, 4]:
    best_logre = search.fit(X_train_arr[i], Y_train_arr[i])
    best_logre_arr.append(best_logre)

Wall time: 1min 27s


In [17]:
for i in [0, 1, 2, 3, 4]:
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])])
    print(' ')

{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.001, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.001, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.001, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
 
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.001, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.001, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.001, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
 
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.001, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.001, 'logRe__penalty': 'l1', 'logRe__solver': 'saga'}
{'logRe': LogisticRegression(max_iter=5000), 'logRe__C': 0.001, 'logRe__penalty': 'l1', 'log

## acc

In [18]:
# logre_lam_acc_train 
# logre_lam_acc_test 
logre_lam_acc_train = []
logre_lam_acc_test = []
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 5000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_accuracy'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    logre_lam_acc_train.append(train_acc)
    logre_lam_acc_test.append(test_acc)
    
print(logre_lam_acc_train)
print(logre_lam_acc_test)

0.9826475477971738
0.0378
0.9845178719866999
0.0378
0.9838944305901912
0.037733333333333334
0.9853491271820449
0.0378
0.980257689110557
0.03766666666666667
[0.9826475477971738, 0.9845178719866999, 0.9838944305901912, 0.9853491271820449, 0.980257689110557]
[0.0378, 0.0378, 0.037733333333333334, 0.0378, 0.03766666666666667]


In [19]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_acc_train_mean = sum(logre_lam_acc_train)/len(logre_lam_acc_train)
logre_lam_acc_test_mean = sum(logre_lam_acc_test)/len(logre_lam_acc_test)
print(logre_lam_acc_train_mean)
print(logre_lam_acc_test_mean)

0.9833333333333334
0.03776


## roc_auc

In [20]:
# logre_lam_roc_train stores
# logre_lam_roc_test stores
logre_lam_roc_train = []
logre_lam_roc_test = []
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 5000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_roc_auc'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    logre_lam_roc_train.append(train_roc)
    logre_lam_roc_test.append(test_roc)
    
print(logre_lam_roc_train)
print(logre_lam_roc_test)

0.9826475477971737
0.5000692760651195
0.9845178719866999
0.5000692760651195
0.9838944305901912
0.5000346380325598
0.9853491271820448
0.5000692760651195
0.980257689110557
0.5
[0.9826475477971737, 0.9845178719866999, 0.9838944305901912, 0.9853491271820448, 0.980257689110557]
[0.5000692760651195, 0.5000692760651195, 0.5000346380325598, 0.5000692760651195, 0.5]


In [21]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_roc_train_mean = sum(logre_lam_roc_train)/len(logre_lam_roc_train)
logre_lam_roc_test_mean = sum(logre_lam_roc_test)/len(logre_lam_roc_test)
print(logre_lam_roc_train_mean)
print(logre_lam_roc_test_mean)

0.9833333333333332
0.5000484932455838


## f1

In [22]:
# logre_lam_f1_train stores
# logre_lam_f1_test stores
logre_lam_f1_train = []
logre_lam_f1_test = []
for i in [0, 1, 2, 3, 4]:
    logre = LogisticRegression(max_iter = 5000, 
                               C = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__C'),
                               penalty = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__penalty'),
                               solver = best_logre_arr[i].cv_results_['params'][np.argmin(best_logre_arr[i].cv_results_['rank_test_f1'])].get('logRe__solver')
                              )
    logre.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], logre.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], logre.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    logre_lam_f1_train.append(train_f1)
    logre_lam_f1_test.append(test_f1)
    
print(logre_lam_f1_train)
print(logre_lam_f1_test)

0.9829435195587786
0.07260810897641844
0.9847539138442648
0.07260810897641844
0.9841497085591574
0.07260344384477
0.9855606758832565
0.07260810897641844
0.9806399021805584
0.07259877931256023
[0.9829435195587786, 0.9847539138442648, 0.9841497085591574, 0.9855606758832565, 0.9806399021805584]
[0.07260810897641844, 0.07260810897641844, 0.07260344384477, 0.07260810897641844, 0.07259877931256023]


In [23]:
#get the mean scores cross 5 trials for each algo/dataset combo
logre_lam_f1_train_mean = sum(logre_lam_f1_train)/len(logre_lam_f1_train)
logre_lam_f1_test_mean = sum(logre_lam_f1_test)/len(logre_lam_f1_test)
print(logre_lam_f1_train_mean)
print(logre_lam_f1_test_mean)

0.9836095440052033
0.07260531001731711


## SVM

In [24]:
# C list for SVM
C_svm = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
C_svm_linear = [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1]

# gamma list for SVM with kernel rbf
gamma_svm = [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2]

In [25]:
%%time
pipe = Pipeline(steps=[('std', StandardScaler()), ('svm_classifier', svm.SVC())])

search_space = [{'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['linear'],
                 'svm_classifier__C': C_svm_linear},
                {'svm_classifier': [svm.SVC(max_iter = 1000000)],
                 'svm_classifier__kernel': ['linear'],
                 'svm_classifier__C': [100, 1000]},
                {'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['poly'],
                 'svm_classifier__degree': [2, 3],
                 'svm_classifier__C': C_svm},
                {'svm_classifier': [svm.SVC()],
                 'svm_classifier__kernel': ['rbf'],
                 'svm_classifier__gamma': gamma_svm,
                 'svm_classifier__C': C_svm}
                ]

clf = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits=5), 
                   scoring=['accuracy', 'roc_auc', 'f1'], refit=False,
                   verbose=0, n_jobs = -1)

best_svm_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_svm_arr
for i in [0, 1, 2, 3, 4]:
    best_svm = clf.fit(X_train_arr[i], Y_train_arr[i])
    best_svm_arr.append(best_svm)

Wall time: 35min 21s


In [26]:
for i in [0, 1, 2, 3, 4]:
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])])
    print(' ')

{'svm_classifier': SVC(), 'svm_classifier__C': 0.001, 'svm_classifier__kernel': 'linear'}
{'svm_classifier': SVC(), 'svm_classifier__C': 0.0001, 'svm_classifier__kernel': 'linear'}
{'svm_classifier': SVC(), 'svm_classifier__C': 0.001, 'svm_classifier__kernel': 'linear'}
 
{'svm_classifier': SVC(), 'svm_classifier__C': 0.001, 'svm_classifier__kernel': 'linear'}
{'svm_classifier': SVC(), 'svm_classifier__C': 0.0001, 'svm_classifier__kernel': 'linear'}
{'svm_classifier': SVC(), 'svm_classifier__C': 0.001, 'svm_classifier__kernel': 'linear'}
 
{'svm_classifier': SVC(), 'svm_classifier__C': 0.001, 'svm_classifier__kernel': 'linear'}
{'svm_classifier': SVC(), 'svm_classifier__C': 0.0001, 'svm_classifier__kernel': 'linear'}
{'svm_classifier': SVC(), 'svm_classifier__C': 0.001, 'svm_classifier__kernel': 'linear'}
 
{'svm_classifier': SVC(), 'svm_classifier__C': 0.001, 'svm_classifier__kernel': 'linear'}
{'svm_classifier': SVC(), 'svm_classifier__C': 0.0001, 'svm_classifier__kernel': 'linear'}


## acc

In [27]:
# svm_lam_acc_train stores
# svm_lam_acc_test stores
svm_lam_acc_train = []
svm_lam_acc_test = []
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_accuracy'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    svm_lam_acc_train.append(train_acc)
    svm_lam_acc_test.append(test_acc)
    
print(svm_lam_acc_train)
print(svm_lam_acc_test)

1.0
0.0378
1.0
0.0378
1.0
0.037733333333333334
1.0
0.0378
1.0
0.03766666666666667
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.0378, 0.0378, 0.037733333333333334, 0.0378, 0.03766666666666667]


In [28]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_acc_train_mean = sum(svm_lam_acc_train)/len(svm_lam_acc_train)
svm_lam_acc_test_mean = sum(svm_lam_acc_test)/len(svm_lam_acc_test)
print(svm_lam_acc_train_mean)
print(svm_lam_acc_test_mean)

1.0
0.03776


## roc_auc

In [29]:
# svm_lam_roc_train stores
# svm_lam_roc_test stores
svm_lam_roc_train = []
svm_lam_roc_test = []
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_roc_auc'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    svm_lam_roc_train.append(train_roc)
    svm_lam_roc_test.append(test_roc)
    
print(svm_lam_roc_train)
print(svm_lam_roc_test)

0.8672069825436408
0.5008659508139938
0.8703241895261845
0.5002078281953586
0.8611803823773898
0.5005195704883962
0.855257689110557
0.5000692760651195
0.8853906899418122
0.5002078281953586
[0.8672069825436408, 0.8703241895261845, 0.8611803823773898, 0.855257689110557, 0.8853906899418122]
[0.5008659508139938, 0.5002078281953586, 0.5005195704883962, 0.5000692760651195, 0.5002078281953586]


In [30]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_roc_train_mean = sum(svm_lam_roc_train)/len(svm_lam_roc_train)
svm_lam_roc_test_mean = sum(svm_lam_roc_test)/len(svm_lam_roc_test)
print(svm_lam_roc_train_mean)
print(svm_lam_roc_test_mean)

0.8678719866999168
0.5003740907516454


## f1

In [31]:
# svm_lam_f1_train stores
# svm_lam_f1_test stores
svm_lam_f1_train = []
svm_lam_f1_test = []
for i in [0, 1, 2, 3, 4]:
    if best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__kernel') == 'poly':
        svm_cls = svm.SVC(kernel = 'poly',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C'),
                          degree = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__degree')
                         )
    elif best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__kernel') == 'linear':
        svm_cls = svm.SVC(kernel = 'linear',
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C')
                         )
    else:
        svm_cls = svm.SVC(kernel = 'rbf', 
                          C = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__C'),
                          gamma = best_svm_arr[i].cv_results_['params'][np.argmin(best_svm_arr[i].cv_results_['rank_test_f1'])].get('svm_classifier__gamma')
                         )
    
    svm_cls.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], svm_cls.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], svm_cls.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    svm_lam_f1_train.append(train_f1)
    svm_lam_f1_test.append(test_f1)
    
print(svm_lam_f1_train)
print(svm_lam_f1_test)

1.0
0.07260810897641844
1.0
0.07260810897641844
1.0
0.07260344384477
1.0
0.07260810897641844
1.0
0.07259877931256023
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.07260810897641844, 0.07260810897641844, 0.07260344384477, 0.07260810897641844, 0.07259877931256023]


In [32]:
#get the mean scores cross 5 trials for each algo/dataset combo
svm_lam_f1_train_mean = sum(svm_lam_f1_train)/len(svm_lam_f1_train)
svm_lam_f1_test_mean = sum(svm_lam_f1_test)/len(svm_lam_f1_test)
print(svm_lam_f1_train_mean)
print(svm_lam_f1_test_mean)

1.0
0.07260531001731711


## KNN

In [33]:
#K list for KNN
K_knn = np.arange(1, 105, 4)
K_knn.shape

(26,)

In [34]:
%%time
pipe = Pipeline(steps=[('std', StandardScaler()), ('knn', KNeighborsClassifier())])

search_space = [{'knn': [KNeighborsClassifier()],
                 'knn__weights': ['uniform', 'distance'],
                 'knn__n_neighbors': K_knn}
                ]

search = GridSearchCV(pipe, search_space, cv = StratifiedKFold(n_splits = 5), 
                   scoring=['accuracy', 'roc_auc', 'f1'], refit = False,
                   verbose=0, n_jobs = -1)

best_knn_arr = []

#search training set in stratified 5-fold manner cross 5 trials, and store the best models in best_svm_arr
for i in [0, 1, 2, 3, 4]:
    best_knn = search.fit(X_train_arr[i], Y_train_arr[i])
    best_knn_arr.append(best_knn)

Wall time: 4min 31s


In [35]:
for i in [0, 1, 2, 3, 4]:
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])])
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])])
    print(best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])])

{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighborsClassifier(), 'knn__n_neighbors': 1, 'knn__weights': 'uniform'}
{'knn': KNeighbo

In [36]:
best_knn_arr[i].cv_results_

{'mean_fit_time': array([0.01675324, 0.01695433, 0.01695461, 0.01675496, 0.01815176,
        0.06083665, 0.01934872, 0.09434729, 0.05166211, 0.08716683,
        0.05564971, 0.05505261, 0.01595721, 0.01496015, 0.01695466,
        0.05485296, 0.02034583, 0.01396232, 0.05206037, 0.05405588,
        0.05066385, 0.05425501, 0.05166173, 0.01136951, 0.05026269,
        0.05105963, 0.09315076, 0.0506649 , 0.05206008, 0.0159564 ,
        0.05525217, 0.12985258, 0.05704699, 0.05166187, 0.05545154,
        0.01615725, 0.01555834, 0.01914897, 0.05405574, 0.05545125,
        0.05505271, 0.01575799, 0.01635542, 0.05644879, 0.0540555 ,
        0.05405402, 0.01635613, 0.01635637, 0.01914845, 0.05624933,
        0.05545192, 0.05624948]),
 'std_fit_time': array([0.00074535, 0.00154502, 0.00252265, 0.00171544, 0.00230902,
        0.07571342, 0.00337427, 0.09641766, 0.07093119, 0.08884578,
        0.0769117 , 0.07671417, 0.00289083, 0.00109249, 0.00333808,
        0.07830884, 0.00873073, 0.0008918 , 0.072

In [37]:
model = KNeighborsClassifier(weights = 'uniform', n_neighbors = 1)
model.fit(X_train_arr[0], Y_train_arr[0])
f1_score(Y_test_arr[0], model.predict(X_test_arr[0]))

0.047728315741165664

## acc

In [38]:
# knn_lam_acc_train stores
# knn_lam_acc_test stores
knn_lam_acc_train = []
knn_lam_acc_test = []
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_accuracy'])].get('knn__n_neighbors')
                              )
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_acc = accuracy_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_acc = accuracy_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_acc)
    print(test_acc)
    
    knn_lam_acc_train.append(train_acc)
    knn_lam_acc_test.append(test_acc)
    
print(knn_lam_acc_train)
print(knn_lam_acc_test)

1.0
0.8616666666666667
1.0
0.8782666666666666
1.0
0.9008
1.0
0.8941333333333333
1.0
0.8852
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.8616666666666667, 0.8782666666666666, 0.9008, 0.8941333333333333, 0.8852]


In [39]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_acc_train_mean = sum(knn_lam_acc_train)/len(knn_lam_acc_train)
knn_lam_acc_test_mean = sum(knn_lam_acc_test)/len(knn_lam_acc_test)
print(knn_lam_acc_train_mean)
print(knn_lam_acc_test_mean)

1.0
0.8840133333333334


## roc_auc

In [40]:
# knn_lam_roc_train stores
# knn_lam_roc_test stores
knn_lam_roc_train = []
knn_lam_roc_test = []
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_roc_auc'])].get('knn__n_neighbors')
                              )
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_roc = roc_auc_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_roc = roc_auc_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_roc)
    print(test_roc)
    
    knn_lam_roc_train.append(train_roc)
    knn_lam_roc_test.append(test_roc)
    
print(knn_lam_roc_train)
print(knn_lam_roc_test)

1.0
0.49191309225671376
1.0
0.6042767241617136
1.0
0.5887742121380249
1.0
0.5419442051797653
1.0
0.5704650998832116
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.49191309225671376, 0.6042767241617136, 0.5887742121380249, 0.5419442051797653, 0.5704650998832116]


In [41]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_roc_train_mean = sum(knn_lam_roc_train)/len(knn_lam_roc_train)
knn_lam_roc_test_mean = sum(knn_lam_roc_test)/len(knn_lam_roc_test)
print(knn_lam_roc_train_mean)
print(knn_lam_roc_test_mean)

1.0
0.5594746667238858


## f1

In [42]:
# knn_lam_f1_train stores
# knn_lam_f1_test stores
knn_lam_f1_train = []
knn_lam_f1_test = []
for i in [0, 1, 2, 3, 4]:
    knn = KNeighborsClassifier(weights = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])].get('knn__weights'),
                               n_neighbors = best_knn_arr[i].cv_results_['params'][np.argmin(best_knn_arr[i].cv_results_['rank_test_f1'])].get('knn__n_neighbors')
                              )
    knn.fit(X_train_arr[i], Y_train_arr[i])
    
    train_f1 = f1_score(Y_train_arr[i], knn.predict(X_train_arr[i]))
    test_f1 = f1_score(Y_test_arr[i], knn.predict(X_test_arr[i]))
    
    print(train_f1)
    print(test_f1)
    
    knn_lam_f1_train.append(train_f1)
    knn_lam_f1_test.append(test_f1)
    
print(knn_lam_f1_train)
print(knn_lam_f1_test)

1.0
0.047728315741165664
1.0
0.16007359705611776
1.0
0.16027088036117385
1.0
0.10282485875706215
1.0
0.13118062563067606
[1.0, 1.0, 1.0, 1.0, 1.0]
[0.047728315741165664, 0.16007359705611776, 0.16027088036117385, 0.10282485875706215, 0.13118062563067606]


In [43]:
#get the mean scores cross 5 trials for each algo/dataset combo
knn_lam_f1_train_mean = sum(knn_lam_f1_train)/len(knn_lam_f1_train)
knn_lam_f1_test_mean = sum(knn_lam_f1_test)/len(knn_lam_f1_test)
print(knn_lam_f1_train_mean)
print(knn_lam_f1_test_mean)

1.0
0.12041565550923909
