In [1]:
import numpy as np
import os
from sklearn.semi_supervised import LabelSpreading, LabelPropagation
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, f_classif, mutual_info_classif
import time
import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression, LassoCV, RandomizedLasso
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold #, GridSearchCV, cross_val_score

%matplotlib inline 
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib
from mpl_toolkits.mplot3d import Axes3D #, axes3d
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

import operator
import winsound
import pandas as pd 
from sklearn.preprocessing import LabelEncoder


df = pd.read_csv("sonar.csv",header = None)
dataset = df.values
Xsonar = dataset[:, 0:60].astype(float)
Ysonar = dataset[:,60]
#change the string output to int 
encoder = LabelEncoder()
encoder.fit(Ysonar)
ysonar = encoder.transform(Ysonar)


In [5]:
# Simple nested CV, no preprocessing 
# %%time
# Nested CV. 
# The basic idea is that cross-validation is used to assess the performance of a method for fitting a model, 
# not of the model itself. If you need to perform model selection, then you need to perform that independently 
# in each fold of the cross-validation procedure, as it is an integral part of the model fitting procedure. 
# If you use a cross-validation based model selection procedure, this means you end up with nested cross-validation.
# It is helpful to consider the purpose of each cross-validation - one is for model selection, the other for performance
# estimation. I would make my final model by fitting the model (including model selection) to the whole dataset, after
# using nested cross-validation to get an idea of the performance I could reasonably expect to get from that model.

# ========= It'd be better if we repeated the nested cv with random data partitions to report the errors' behaviour =========


struct = [
      [ {'gamma': np.arange(0.02,5,0.5), 'C':[0.1, 1, 10]},
       {},
       {'n_neighbors' : [2, 5, 10, 15]},
       {'alpha': [0.0001, 0.01, 0.1, 1], 'hidden_layer_sizes': [(100,100), (50,)]}
      ],
      [SVC(), 
       GaussianNB(),
       KNeighborsClassifier(),
       MLPClassifier()]]


classifiers = struct[1]
x_tot = np.copy(Xsonar)
y_tot = np.copy(ysonar)
n_outer_folds = 5
n_inner_folds = 5
state = 42 

# define the number of bests ranking classifiers
n_top_classifiers = 2 
top_classifiers = []
outer_scores = np.zeros((n_top_classifiers, n_outer_folds)) #use this if you wanna keep the score of multiple "best" classifiers from the inner cv

#outer loop i.e. the one that evaluates the inner "best" model
outer_kfold = StratifiedKFold(n_splits = n_outer_folds, shuffle = True, random_state = state)
outer = outer_kfold.split(x_tot,y_tot)

for fold_out, (train_ind_out, test_ind_out) in enumerate(outer):
    print("Outer loop %d out of %d" %((fold_out + 1), n_outer_folds))
    x_trn_out, x_tst_out = x_tot[train_ind_out], x_tot[test_ind_out]
    y_trn_out, y_tst_out = y_tot[train_ind_out], y_tot[test_ind_out]  
    inner_scores = np.zeros((len(classifiers),n_inner_folds))
    inner_mean_scores = np.zeros(len(classifiers))

    for cl_ind,clf in enumerate(classifiers):
#         print("Classifier %d" %cl_ind)
        inner_kfold = StratifiedKFold( n_splits = n_inner_folds, shuffle = True, random_state = state)
        inner = inner_kfold.split(x_trn_out, y_trn_out)
        
        for fold_in, (train_ind_in, test_ind_in) in enumerate(inner): 
#             print("Inner fold %d " %fold_in)
            x_trn_in, x_tst_in = x_trn_out[train_ind_in], x_trn_out[test_ind_in]
            y_trn_in, y_tst_in = y_trn_out[train_ind_in], y_trn_out[test_ind_in]
            clf.fit(x_trn_in, y_trn_in)
            
            #inner scores calculated for every inner fold
            inner_scores[cl_ind][fold_in] = clf.score(x_tst_in, y_tst_in)
            
    # get the mean performance for every classifier
    inner_mean_scores = np.mean(inner_scores, axis = 1)
    print(inner_mean_scores)
    
    # sort the scores, low -> high 
    sorted_inds = inner_mean_scores.argsort()
    sorted_scores = inner_mean_scores[sorted_inds]
    
    print ("Best %d classifiers for outer fold %d are: \n " %(n_top_classifiers, fold_out))
    
    #get the inds of best performing classifiers 
    temp2 = 0
    for temp_ind in range(-1, -(n_top_classifiers +1), -1):
        actual_ind = sorted_inds[temp_ind]
#         top_classifiers[temp2] = classifiers[actual_ind]
        top_classifiers.append(classifiers[actual_ind])    
        print('\n', classifiers[actual_ind], '\n')
 
        #fit the best classifier on the outer test data
        classifiers[actual_ind].fit(x_trn_out, y_trn_out)
        
        #get the outer score 
        outer_scores[temp2, fold_out] = classifiers[actual_ind].score(x_tst_out, y_tst_out)
        print("Inner score: %f VS Outer score: %f \n" %(inner_mean_scores[actual_ind], outer_scores[temp2, fold_out]))
        print ("----------------------------------------------------------")
        temp2+=1

        
        
# print(outer_scores)  
# print(top_classifiers)
winsound.Beep(400,900)

Outer loop 1 out of 5




[ 0.56974153  0.7075869   0.7454434   0.80076872]
Best 2 classifiers for outer fold 0 are: 
 

 MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False) 





Inner score: 0.800769 VS Outer score: 0.651163 

----------------------------------------------------------

 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 

Inner score: 0.745443 VS Outer score: 0.767442 

----------------------------------------------------------
Outer loop 2 out of 5




[ 0.5902295   0.667959    0.76867201  0.7473262 ]
Best 2 classifiers for outer fold 1 are: 
 

 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 

Inner score: 0.768672 VS Outer score: 0.738095 

----------------------------------------------------------

 MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False) 





Inner score: 0.747326 VS Outer score: 0.809524 

----------------------------------------------------------
Outer loop 3 out of 5
[ 0.54512032  0.70650624  0.76017157  0.77816399]
Best 2 classifiers for outer fold 2 are: 
 

 MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False) 





Inner score: 0.778164 VS Outer score: 0.780488 

----------------------------------------------------------

 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 

Inner score: 0.760172 VS Outer score: 0.829268 

----------------------------------------------------------
Outer loop 4 out of 5




[ 0.59877451  0.66403743  0.78371212  0.82486631]
Best 2 classifiers for outer fold 3 are: 
 

 MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False) 





Inner score: 0.824866 VS Outer score: 0.756098 

----------------------------------------------------------

 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 

Inner score: 0.783712 VS Outer score: 0.804878 

----------------------------------------------------------
Outer loop 5 out of 5




[ 0.54512032  0.69474153  0.77174688  0.76512923]
Best 2 classifiers for outer fold 4 are: 
 

 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 

Inner score: 0.771747 VS Outer score: 0.731707 

----------------------------------------------------------

 MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False) 





Inner score: 0.765129 VS Outer score: 0.853659 

----------------------------------------------------------


In [7]:
# Preprocessing Data L: 34

struct = [
      [ {'gamma': np.arange(0.02,5,0.5), 'C':[0.1, 1, 10]},
       {},
       {'n_neighbors' : [2, 5, 10, 15]},
       {'alpha': [0.0001, 0.01, 0.1, 1], 'hidden_layer_sizes': [(100,100), (50,)]}
      ],
      [SVC(), 
       GaussianNB(),
       KNeighborsClassifier(),
       MLPClassifier()]]


classifiers = struct[1]
x_tot = np.copy(Xsonar)
y_tot = np.copy(ysonar)
n_outer_folds = 5
n_inner_folds = 5
state = 42 

# define the number of bests ranking classifiers
n_top_classifiers = 2 
top_classifiers = []
outer_scores = np.zeros((n_top_classifiers, n_outer_folds)) #use this if you wanna keep the score of multiple "best" classifiers from the inner cv

#outer loop i.e. the one that evaluates the inner "best" model
outer_kfold = StratifiedKFold(n_splits = n_outer_folds, shuffle = True, random_state = state)
outer = outer_kfold.split(x_tot,y_tot)

for fold_out, (train_ind_out, test_ind_out) in enumerate(outer):
    print("Outer loop %d out of %d" %((fold_out + 1), n_outer_folds))
#_-> -> ============================================
    x_tot = StandardScaler().fit_transform(x_tot)
    x_trn_out, x_tst_out = x_tot[train_ind_out], x_tot[test_ind_out]
    y_trn_out, y_tst_out = y_tot[train_ind_out], y_tot[test_ind_out]  
    inner_scores = np.zeros((len(classifiers),n_inner_folds))
    inner_mean_scores = np.zeros(len(classifiers))

    for cl_ind,clf in enumerate(classifiers):
#         print("Classifier %d" %cl_ind)
        inner_kfold = StratifiedKFold( n_splits = n_inner_folds, shuffle = True, random_state = state)
        inner = inner_kfold.split(x_trn_out, y_trn_out)
        
        for fold_in, (train_ind_in, test_ind_in) in enumerate(inner): 
#             print("Inner fold %d " %fold_in)
            x_trn_in, x_tst_in = x_trn_out[train_ind_in], x_trn_out[test_ind_in]
            y_trn_in, y_tst_in = y_trn_out[train_ind_in], y_trn_out[test_ind_in]
            clf.fit(x_trn_in, y_trn_in)
            
            #inner scores calculated for every inner fold
            inner_scores[cl_ind][fold_in] = clf.score(x_tst_in, y_tst_in)
            
    # get the mean performance for every classifier
    inner_mean_scores = np.mean(inner_scores, axis = 1)
    print(inner_mean_scores)
    
    # sort the scores, low -> high 
    sorted_inds = inner_mean_scores.argsort()
    sorted_scores = inner_mean_scores[sorted_inds]
    
    print ("Best %d classifiers for outer fold %d are: \n " %(n_top_classifiers, fold_out))
    
    #get the inds of best performing classifiers 
    temp2 = 0
    for temp_ind in range(-1, -(n_top_classifiers +1), -1):
        actual_ind = sorted_inds[temp_ind]
#         top_classifiers[temp2] = classifiers[actual_ind]
        top_classifiers.append(classifiers[actual_ind])    
        print('\n', classifiers[actual_ind], '\n')
 
        #fit the best classifier on the outer test data
        classifiers[actual_ind].fit(x_trn_out, y_trn_out)
        
        #get the outer score 
        outer_scores[temp2, fold_out] = classifiers[actual_ind].score(x_tst_out, y_tst_out)
        print("Inner score: %f VS Outer score: %f \n" %(inner_mean_scores[actual_ind], outer_scores[temp2, fold_out]))
        print ("----------------------------------------------------------")
        temp2+=1

        
        
# print(outer_scores)  
# print(top_classifiers)
winsound.Beep(400,900)

Outer loop 1 out of 5




[ 0.81841578  0.7075869   0.78184046  0.83035873]
Best 2 classifiers for outer fold 0 are: 
 

 MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False) 





Inner score: 0.830359 VS Outer score: 0.790698 

----------------------------------------------------------

 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

Inner score: 0.818416 VS Outer score: 0.767442 

----------------------------------------------------------
Outer loop 2 out of 5




[ 0.80688503  0.667959    0.77013146  0.85501337]
Best 2 classifiers for outer fold 1 are: 
 

 MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False) 





Inner score: 0.855013 VS Outer score: 0.904762 

----------------------------------------------------------

 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

Inner score: 0.806885 VS Outer score: 0.880952 

----------------------------------------------------------
Outer loop 3 out of 5




[ 0.8027852   0.70650624  0.8141934   0.82612522]
Best 2 classifiers for outer fold 2 are: 
 

 MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False) 





Inner score: 0.826125 VS Outer score: 0.853659 

----------------------------------------------------------

 KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 

Inner score: 0.814193 VS Outer score: 0.878049 

----------------------------------------------------------
Outer loop 4 out of 5




[ 0.86124109  0.66403743  0.83792335  0.87891043]
Best 2 classifiers for outer fold 3 are: 
 

 MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False) 





Inner score: 0.878910 VS Outer score: 0.829268 

----------------------------------------------------------

 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

Inner score: 0.861241 VS Outer score: 0.829268 

----------------------------------------------------------
Outer loop 5 out of 5




[ 0.83809046  0.69474153  0.80740865  0.82595811]
Best 2 classifiers for outer fold 4 are: 
 

 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

Inner score: 0.838090 VS Outer score: 0.878049 

----------------------------------------------------------

 MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False) 





Inner score: 0.825958 VS Outer score: 0.853659 

----------------------------------------------------------


In [27]:
# tuning MLP while keeping 20% of the data out as a test set

# 'alpha': [0.0001, 0.01, 0.1, 1], 'hidden_layer_sizes': [(100,100), (50,)]
a = [ 0.005, 0.01, 0.05]
hl = [(100,100), (50,), (200,)]

mean_scores = np.zeros((len(a),len(hl)))

X_tr, X_tst, y_tr, y_tst = train_test_split(x_tot, y_tot, test_size=42)


for ind1,param1 in enumerate(a):
    for ind2,param2 in enumerate(hl):

        # normal cross-validation
        scores = []
        kfoldss = StratifiedKFold( n_splits=5, shuffle=True, random_state=state)
        
        kfolds = kfoldss.split(X_tr, y_tr)
        for train_index, test_index in kfolds:
            # split the training data
            X_tr = StandardScaler().fit_transform(X_tr)
            X_train, X_test = X_tr[train_index], X_tr[test_index]
            y_train, y_test = y_tr[train_index], y_tr[test_index]

            # fit 
            clf2_5 = MLPClassifier(alpha = param1, hidden_layer_sizes= param2)
            clf2_5.fit(X_train, y_train)
            scores.append(clf2_5.score(X_test,y_test))
#             print(scores)

        # calculate mean score for folds
        mean_scores[ind1, ind2] = np.mean(scores, axis = 0)


print(mean_scores)
row_max, col_max = np.unravel_index(mean_scores.argmax(), mean_scores.shape)
print((a[row_max], hl[col_max]))

#fit best params 
best_clf = MLPClassifier(alpha = a[row_max], hidden_layer_sizes = hl[col_max])
best_clf.fit(X_tr,y_tr)
print("(external) Test set score = %f " %best_clf.score(X_tst,y_tst))
winsound.Beep(400,900)



[[ 0.80687389  0.8370098   0.8364639 ]
 [ 0.81900624  0.80705214  0.81900624]
 [ 0.82471034  0.8307598   0.82487745]]
(0.005, (50,))
(external) Test set score = 0.880952 


In [28]:
# tuning SVC while keeping 20% of the data out as a test set

g = np.arange(0.01, 2.5, 0.01)
c = [0.1, 1, 10, 100]
mean_scores = np.zeros((len(g),len(c)))

X_tr, X_tst, y_tr, y_tst = train_test_split(x_tot, y_tot, test_size=42)


for ind1,param1 in enumerate(g):
    for ind2,param2 in enumerate(c):

        # normal cross-validation
        scores = []
        kfoldss = StratifiedKFold( n_splits=5, shuffle=True, random_state=state)
        
        kfolds = kfoldss.split(X_tr, y_tr)
        for train_index, test_index in kfolds:
            # split the training data
            X_tr = StandardScaler().fit_transform(X_tr)
            X_train, X_test = X_tr[train_index], X_tr[test_index]
            y_train, y_test = y_tr[train_index], y_tr[test_index]

            # fit 
            clf2_5 = SVC(gamma = param1, C = param2)
            clf2_5.fit(X_train, y_train)
            scores.append(clf2_5.score(X_test,y_test))
#             print(scores)

        # calculate mean score for folds
        mean_scores[ind1, ind2] = np.mean(scores, axis = 0)


# print(mean_scores)
row_max, col_max = np.unravel_index(mean_scores.argmax(), mean_scores.shape)
print((g[row_max], c[col_max]))

#fit best params 
best_clf = SVC(gamma = g[row_max], C = c[col_max])
best_clf.fit(X_tr,y_tr)
print("(external) Test set score = %f " %best_clf.score(X_tst,y_tst))
winsound.Beep(400,900)

(0.02, 10)
(external) Test set score = 0.928571 
