In [1]:
import numpy as np
import os
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel, SelectKBest, chi2, f_classif, mutual_info_classif
from sklearn.model_selection import StratifiedKFold, GridSearchCV
%matplotlib inline 
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib
from mpl_toolkits.mplot3d import Axes3D #, axes3d
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import Pipeline
import itertools
import operator


In [2]:
# Adapted from the scikit-learn confusion-matrix plotting example.
# Prints the row-normalized matrix (fraction of each true class per predicted class).

def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print and plot a row-normalized confusion matrix.

    Every row of ``cm`` is divided by its sum, so each cell holds the fraction
    of that row's samples falling in the given column. The normalized matrix
    is printed and then drawn on the current matplotlib figure, with rows on
    the 'True label' axis and columns on the 'Predicted label' axis.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix of raw counts.
    classes : sequence
        Tick labels, one per class, used on both axes.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None
        The function only prints and draws.
    """
    counts = np.asarray(cm, dtype=float)
    row_totals = counts.sum(axis=1, keepdims=True)
    # Rows without any sample stay all-zero instead of turning into NaN (0 / 0).
    cm = np.divide(counts, row_totals,
                   out=np.zeros_like(counts),
                   where=row_totals != 0)
    print("Normalized confusion matrix")
    print(cm)

    # Draw the normalized values so that the cell colours, the annotations and
    # the text-colour threshold below all refer to the same numbers.
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # White text on dark cells, black text on light cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2f'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Called last so the axis labels are taken into account by the layout.
    plt.tight_layout()
    
# cnf_matrix = confusion_matrix(y_test, y_pred)
# np.set_printoptions(precision=2)

In [3]:
def make_pipe_clf(scaler,feature_selection,decomp,clf,order):
    """Assemble a scaler -> (selection / decomposition) -> classifier Pipeline.

    The step names are fixed: 'scaler', 'feature_selection', 'decomp' and
    'classifier'.

    order truthy : feature selection runs first, then the decomposition.
    order falsy  : the decomposition runs first, then feature selection is
                   applied to the transformed features.
    """
    selection_step = ('feature_selection', feature_selection)
    decomp_step = ('decomp', decomp)

    # Only the relative position of the two middle steps depends on `order`.
    middle_steps = [selection_step, decomp_step] if order else [decomp_step, selection_step]

    all_steps = [('scaler', scaler)] + middle_steps + [('classifier', clf)]
    return Pipeline(all_steps)
###########################################################################################
def make_pipe(scaler,feature_selection,decomp,order):
    """Assemble a preprocessing-only Pipeline (no final classifier step).

    The step names are fixed: 'scaler', 'feature_selection' and 'decomp'.

    order truthy : scaler -> feature_selection -> decomp
    order falsy  : scaler -> decomp -> feature_selection
    """
    steps = [('scaler', scaler)]
    if order:
        steps.append(('feature_selection', feature_selection))
        steps.append(('decomp', decomp))
    else:
        steps.append(('decomp', decomp))
        steps.append(('feature_selection', feature_selection))
    return Pipeline(steps)

In [4]:
def nested_cv(x_tot, y_tot, pip_steps, pip_params, n_outer_folds, n_inner_folds, n_top, state):
    """Estimate a pipeline's expected performance with nested cross-validation.

    Scaling, feature selection and decomposition are part of the model's
    construction, so they are fitted inside the cross-validation loops.
    Fitting them on the whole dataset would leak information from the
    held-out folds into the estimators and give over-optimistic scores.
    A ``GridSearchCV`` (inner folds) is wrapped in ``cross_val_score``
    (outer folds); no ``scoring`` argument is passed to either.

    Parameters
    ----------
    x_tot, y_tot
        The whole dataset (features and labels) to run nested CV on.
    pip_steps : sequence of length 4 or 5
        Pipeline components, ALWAYS in one of these layouts:
        ``[scaler, feature_selection, decomp, order]`` (built with
        ``make_pipe``) or ``[scaler, feature_selection, decomp, clf, order]``
        (built with ``make_pipe_clf``). ``order`` is forwarded to the builder:
        truthy = feature selection before decomposition, falsy = decomposition
        first.
        Example: ``[StandardScaler(), SelectKBest(), PCA(), SVC(), 1]``.
    pip_params : dict
        Parameter grid handed to ``GridSearchCV``. Keys MUST use the step
        names declared by the pipeline builders, e.g.
        ``{'feature_selection__k': [100], 'decomp__n_components': [50]}``.
    n_outer_folds, n_inner_folds : int
        Number of folds in the outer and inner CV respectively.
    n_top : int
        Intended as the number of best-ranking models to print. Currently
        unused; kept so existing callers keep working.
    state : int
        ``random_state`` used by both the outer and the inner
        ``StratifiedKFold`` (set to any integer for reproducibility).

    Returns
    -------
    float
        Mean of the scores obtained on the outer folds.

    Raises
    ------
    ValueError
        If ``pip_steps`` does not contain exactly 4 or 5 entries.

    Example
    -------
    steps = [[StandardScaler(), SelectKBest(), PCA(), SVC(), 1],
             [MinMaxScaler(), SelectKBest(), PCA(), SVC(), 1]]
    params = {'feature_selection__k': [4], 'decomp__n_components': [3, 2],
              'classifier__kernel': ['rbf'], 'classifier__C': [10],
              'classifier__gamma': [0.1, 1]}
    for step in steps:
        result = nested_cv(n_inner_folds=4, n_outer_folds=4, n_top=1,
                           pip_params=params, pip_steps=step,
                           x_tot=x_tot, y_tot=y_tot, state=42)
    """
    # Create the pipeline: 4 entries -> make_pipe, 5 entries -> make_pipe_clf.
    n_steps = len(pip_steps)
    if n_steps == 4:
        scaler, feature_selection, decomp, order = pip_steps
        pipe = make_pipe(scaler=scaler, feature_selection=feature_selection,
                         decomp=decomp, order=order)
    elif n_steps == 5:
        scaler, feature_selection, decomp, clf, order = pip_steps
        pipe = make_pipe_clf(scaler=scaler, feature_selection=feature_selection,
                             decomp=decomp, clf=clf, order=order)
    else:
        # Fail early with a clear message instead of hitting an unbound
        # `pipe` further down.
        raise ValueError(
            "pip_steps must have 4 entries [scaler, feature_selection, decomp, order] "
            "or 5 entries [scaler, feature_selection, decomp, clf, order]; "
            "got {} entries".format(n_steps))

    print(pipe)
    outer_cv = StratifiedKFold(n_splits=n_outer_folds, shuffle=True, random_state=state)
    inner_cv = StratifiedKFold(n_splits=n_inner_folds, shuffle=True, random_state=state)

    # Inner loop: hyper-parameter search. Outer loop: score the tuned model on
    # folds the search has never seen.
    grid_search = GridSearchCV(pipe, pip_params, cv=inner_cv, verbose=1)
    nested_score = cross_val_score(grid_search, X=x_tot, y=y_tot, cv=outer_cv)
    return nested_score.mean()