# **DEMO:  Feature Selection using GridSearchCV/RandomizedSearchCV into a Python Pipeline**

Importing needed packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
import math
import os

  import pandas.util.testing as tm


ReliefF package installation

In [2]:
!pip install --user skrebate



Specific functions importing

In [3]:
from sklearn.model_selection import train_test_split, GridSearchCV,cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.base import  BaseEstimator, TransformerMixin
from skrebate import ReliefF
from sklearn.metrics import roc_curve, auc
from scipy import interp
from itertools import cycle
from sklearn.metrics import pairwise_distances
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
from sklearn.linear_model import LogisticRegression, SGDClassifier, LassoCV
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from tqdm import tqdm
from numpy import array



Suppressing unnecessary warnings

In [0]:
# Ignore every warning category from here on. NOTE(review): this is a blanket filter,
# so any warning raised by the cells below is hidden as well.
warnings.filterwarnings("ignore")

Fixing a seed to set experimental reproducibility

In [0]:
# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): components created later without an explicit random_state presumably
# draw from this global generator - confirm before relying on exact reproducibility.
np.random.seed(42)

Since this notebook runs on Google Colab, we suggest mounting your Google Drive account

In [6]:
from google.colab import files, drive

drive.mount('/content/drive')
%cd /content/drive/My\ Drive/

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive
/content/drive/My Drive


Inline $\texttt{matplotlib}$ plots

In [0]:
%matplotlib inline

Useful functions to save figures and visualize results

In [0]:
# Function to store plots
def save_fig(path_img, fig_id, tight_layout = True, fig_extension = "png", resolution = 300):
    """Save the current matplotlib figure as ``<path_img>/<fig_id>.<fig_extension>``.

    Parameters
    ----------
    path_img : str
        Destination folder. It is created when it does not exist yet.
    fig_id : str
        File name without extension; it is also echoed in the progress message.
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str, default "png"
        Used both as the file extension and as the ``format`` passed to ``plt.savefig``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    # plt.savefig does not create missing folders, so make sure the target exists.
    # An empty path_img means "current directory" and needs no creation.
    if path_img:
        os.makedirs(path_img, exist_ok=True)
    path = os.path.join(path_img, fig_id + "." + fig_extension)
    print("Guardando...", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [0]:
# Function to plot ROC curve
def plot_roc_curve(fpr, tpr, label=None):
    """Draw one ROC curve plus the diagonal reference line on the current axes.

    ``fpr`` and ``tpr`` are the x and y coordinates of the curve; ``label`` is the
    optional legend entry for it. Both axes are limited to the [0, 1] range.
    """
    diagonal = [0, 1]
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot(diagonal, diagonal, 'k--')
    plt.axis([0, 1, 0, 1])
    # Same font size for both axis captions
    axis_captions = ((plt.xlabel, 'False Positive Rate'),
                     (plt.ylabel, 'True Positive Rate'))
    for set_caption, caption in axis_captions:
        set_caption(caption, fontsize=16)

In [0]:
# Function to compute class-wise ROC curve and ROC area
def roc_multiclass(ytrue,yscore):
    """Compute per-class and micro-average ROC curves and their areas.

    Column ``k`` of ``ytrue`` and ``yscore`` is used as the ground truth and the
    score of class ``k``; the number of classes is ``ytrue.shape[1]``.

    Returns
    -------
    roc_auc, fpr, tpr : dict
        Keyed by class index (0 .. n_classes-1) plus the key ``"micro"``.
    n_classes : int
    """
    n_classes = ytrue.shape[1]
    fpr, tpr, roc_auc = {}, {}, {}

    # One-vs-rest curve for every class column
    for k in range(n_classes):
        false_pos, true_pos, _ = roc_curve(ytrue[:, k], yscore[:, k])
        fpr[k] = false_pos
        tpr[k] = true_pos
        roc_auc[k] = auc(false_pos, true_pos)

    # Micro-average: all class columns flattened into a single binary problem
    micro_fpr, micro_tpr, _ = roc_curve(ytrue.ravel(), yscore.ravel())
    fpr["micro"] = micro_fpr
    tpr["micro"] = micro_tpr
    roc_auc["micro"] = auc(micro_fpr, micro_tpr)
    return roc_auc, fpr, tpr, n_classes

In [0]:
# Function to plot ROC curves and store them
def roc_auc_mc(roc_auc,fpr,tpr,n_classes,title,path_img):
    """Plot per-class, micro- and macro-average ROC curves and save the figure.

    Parameters
    ----------
    roc_auc, fpr, tpr : dict
        Keyed by class index (0 .. n_classes-1) and ``"micro"``. A ``"macro"``
        entry is added to each of the three dicts in place.
    n_classes : int
        Number of per-class curves to draw.
    title : str
        Figure title; it is also passed to ``save_fig`` as the file name.
    path_img : str
        Folder passed to ``save_fig``.
    """
    lw = 2
    # First, gather every false positive rate that appears in any class curve
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate each ROC curve over these points.
    # np.interp replaces scipy.interp, which was only a deprecated alias of it.
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

    # Finally, average the curves and compute the macro AUC
    mean_tpr /= n_classes

    fpr["macro"]     = all_fpr
    tpr["macro"]     = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

    # Each ROC curve is plotted
    plt.figure(figsize=(6,6))
    plt.plot(fpr["micro"], tpr["micro"],
             label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
             color='deeppink', linestyle=':', linewidth=4)

    plt.plot(fpr["macro"], tpr["macro"],
             label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
             color='navy', linestyle=':', linewidth=4)

    # One colour per class, taken from the current seaborn palette
    colors = sns.color_palette(None, n_classes)
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color, lw=lw,
                 label='AUC_class_{0} (area = {1:0.2f})'.format(i, roc_auc[i]))

    # Diagonal reference line
    plt.plot([0, 1], [0, 1], 'k--', lw=lw)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="best")
    # The figure must be saved before plt.show() is called
    save_fig(path_img,title)
    plt.show()

In [0]:
# Function to plot confusion matrices
def plot_confusion_matrix(y_true, y_pred, classes,normalize=False,title=None,cmap=plt.cm.Blues):
    """Plot the confusion matrix of ``y_pred`` against ``y_true`` and return its axes.

    With ``normalize=True`` every row is rescaled to percentages of that row's total.
    When ``title`` is empty a default title is chosen according to ``normalize``.
    ``classes`` is indexed with ``label - 1`` for every label found in the data, so
    labels are presumably 1-based integers - TODO confirm.
    """
    if not title:
        title = ('Normalized confusion matrix' if normalize
                 else 'Confusion matrix, without normalization')

    # Confusion matrix computation
    cm = confusion_matrix(y_true, y_pred)
    # Tick labels: only the classes that actually occur in the data
    present_labels = unique_labels(y_true, y_pred)
    classes = classes[present_labels-1]
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = 100*cm.astype('float') / row_totals

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)

    # All ticks are shown, each with its class label
    ax.set_xticks(np.arange(cm.shape[1]))
    ax.set_yticks(np.arange(cm.shape[0]))
    ax.set_xticklabels(classes)
    ax.set_yticklabels(classes)
    ax.set_title(title)
    ax.set_ylabel('True label')
    ax.set_xlabel('Predicted label')

    # Rotate the x tick labels and anchor them on their right end
    plt.setp(ax.get_xticklabels(), rotation = 45, ha = "right", rotation_mode = "anchor")

    plt.autoscale()
    return ax

In [0]:
# This class represents an input data matrix as a similarity/dissimilarity
# matrix with respect to the samples seen during fit
class Dist_Rep(BaseEstimator, TransformerMixin):
    """Transformer that maps samples to their distances to the training samples.

    Parameters
    ----------
    metric : default 'euclidean'
        Forwarded to ``pairwise_distances`` as ``metric``.
    n_jobs : default None
        Forwarded to ``pairwise_distances`` as ``n_jobs``.
    """
    def __init__(self, metric = 'euclidean', n_jobs = None):
        self.metric = metric
        self.n_jobs = n_jobs

    def fit(self, X, *_):
        """Store ``X`` as the reference (prototype) set; extra arguments are ignored."""
        self.Xtrain = X
        return self

    def transform(self, X):
        """Return the distances from every row of ``X`` to every stored reference row."""
        # The configured metric and n_jobs were previously ignored here
        return pairwise_distances(X, self.Xtrain,
                                  metric=self.metric, n_jobs=self.n_jobs)

    def fit_tranform(self,X,Y):
        """Fit on ``X`` and return the distances from ``Y`` to ``X``.

        NOTE(review): the name is misspelled, so it does not override
        ``fit_transform``; it is kept only for backward compatibility.
        Here ``Y`` is a second data matrix, not a label vector.
        """
        self.fit(X)
        return self.transform(Y)

Setting the image directory

In [0]:
# KIMIA 99-SHAPE DATABASE
# Base folder, relative to the current working directory, from which the
# result and figure paths used later in the notebook are built.
img_dir = './Machine Learning/ML Codes/MLPython/Databases/CorrectedDBs/Kimia99Shape_DB_Corrected'

# **Step 1: Load a specific feature representation**

In [15]:
# Database directory (path without the '.csv' extension)

# To load BoCF-based representation
data_dir = "./Machine Learning/ML Codes/MLPython/Databases/PreprocessingBoCFKimia99ShapeDB(csv-version)"

# To load DTW-CCS-based representation
# data_dir = "./Machine Learning/ML Codes/MLPython/Databases/Kimia99ShapeDB_DTW_CCSFeatRep(csv-version)"

# To load DTW-CS-based representation
# data_dir = "./Machine Learning/ML Codes/MLPython/Databases/Kimia99ShapeDB_DTW_CSFeatRep(csv-version)"

# Representation flag: 'BoCF', 'DTW-CCS' or 'DTW-CS'. It must match the data
# directory selected above
typeR = 'BoCF'

# The last column of the file holds the labels; every other column is a feature
dataset = pd.read_csv(data_dir + '.csv')
label_column = dataset.columns[-1]
y = np.array(dataset.iloc[:, -1])
X = dataset.drop(columns=label_column)
X.shape

(99, 31500)

# **Step 2: Variable declaration to store results and models**

In [0]:
# Variable declaration

# Experimental protocol
n_partitions = 10      # number of train/test partitions
test_per     = 0.5     # fraction of samples held out for testing
f_step       = 1500    # step between candidate numbers of selected features
n_classes    = len(np.unique(y))
fold         = 0       # partition counter, increased inside the training loop

# Train/test indices of every partition
train_idx, test_idx = [], []

# Feature-selection bookkeeping, one list per classifier
# (suffixes: _L linear, _LogR logistic regression, _lSVM linear SVM)
alpha_L, alpha_LogR, alpha_lSVM          = [], [], []
sel_fts_L, sel_fts_LogR, sel_fts_lSVM    = [], [], []
thld_L, thld_LogR, thld_lSVM             = [], [], []
sel_fts_Lt, sel_fts_LogRt, sel_fts_lSVMt = [], [], []
nfeats_L, nfeats_LogR, nfeats_lSVM       = [], [], []

# Accuracy per partition
accuracy_L    = np.zeros(n_partitions)
accuracy_LogR = np.zeros(n_partitions)
accuracy_lSVM = np.zeros(n_partitions)

# Confusion matrix per partition
cm_L    = np.zeros((n_partitions, n_classes, n_classes))
cm_LogR = np.zeros((n_partitions, n_classes, n_classes))
cm_lSVM = np.zeros((n_partitions, n_classes, n_classes))

# Classification reports, best models and best parameters
cr_L, cr_LogR, cr_lSVM                   = [], [], []
best_mod_L, best_mod_LogR, best_mod_lSVM = [], [], []
best_pms_L, best_pms_LogR, best_pms_lSVM = [], [], []

# Candidate numbers of features to select
if typeR in ('BoCF', 'DTW-CCS'):
    # Multiples of f_step derived from the number of columns, dropping the leading 0
    ftr_vec = np.arange(0, int(X.shape[1]) + f_step, f_step).astype(int)[1:]
elif typeR == 'DTW-CS':
    # Range derived from the number of rows and the test fraction
    ftr_vec = np.arange(1, math.ceil(X.shape[0] * test_per), f_step).astype(int)
    ftr_vec[0] = 1

# Setting the data partition scheme to work like HoldOut validation
sss = StratifiedShuffleSplit(n_splits = n_partitions, test_size = test_per, random_state=42)

In [0]:
"""Type of classification ensemble. 
   'N' : To run classification
   'PR': To run classification including prototype representation 
         (dissimilarity/similarity-based representation) 
   'NE': To run classification and Non-Exhaustive feature selection including 
         prototype representation
   'E' : To run classification and Exhaustive feature selection including 
         prototype representation       
   'NE-NPR' : To run classification and Non-Exhaustive feature selection without 
              including prototype representation 
   'E-NPR'  : To run classification and Exhaustive feature selection without 
              including prototype representation 
"""
typeC        = 'E-NPR'

# Model declaration
# The six ensembles differ only in their leading steps and in the feature
# selector, so the pipelines are assembled from small builders. An unknown
# typeC fails here instead of leaving `steps`/`parameters` undefined or stale.


def _leading_steps(ensemble):
    """Return fresh preprocessing/representation steps for the given ensemble."""
    if ensemble in ('PR', 'NE', 'E'):
        # Prototype (dissimilarity/similarity) representation
        return [('Preprocessing', StandardScaler()),
                ('Representation', Dist_Rep()),
                ('Preprocessing2', StandardScaler())]
    if ensemble == 'E-NPR':
        return [('Preprocessing', StandardScaler())]
    # 'N' and 'NE-NPR'
    return [('Preprocessing2', StandardScaler())]


def _selector_steps(ensemble):
    """Return the feature-selection step as a list (empty when there is none)."""
    if ensemble == 'NE':
        # Non-exhaustive selection: LASSO
        return [('FeatureSelectn',
                 SelectFromModel(LassoCV(n_alphas = 20, max_iter = 250), threshold="mean"))]
    if ensemble == 'NE-NPR':
        # Non-exhaustive selection without prototype representation: LASSO
        return [('FeatureSelectn',
                 SelectFromModel(LassoCV(n_alphas = 20, max_iter = 200, tol=1e-4), threshold="mean"))]
    if ensemble in ('E', 'E-NPR'):
        # Exhaustive selection: ReliefF
        return [('FeatureSelectnRel', ReliefF())]
    return []


def _selector_grid(ensemble):
    """Return the feature-selection part of the parameter grid for the ensemble."""
    if ensemble in ('NE', 'NE-NPR'):
        return {'FeatureSelectn__threshold': [1e-6,1e-5,1e-4,1e-3,1e-2,1e-1]}
    if ensemble in ('E', 'E-NPR'):
        return {'FeatureSelectnRel__n_features_to_select': ftr_vec,
                'FeatureSelectnRel__n_neighbors': [1]}
    return {}


_valid_ensembles = ('N', 'PR', 'NE', 'E', 'NE-NPR', 'E-NPR')
if typeC not in _valid_ensembles:
    raise ValueError("Unknown typeC %r; expected one of %s" % (typeC, ', '.join(_valid_ensembles)))

# Model labels, in the same order as the pipelines in `steps`
label_models = ['Linear','LogisticRegression','LinearSVC']

# Classifier constructors, in the same order as label_models
_classifier_types = [SGDClassifier,       # Linear Classifier
                     LogisticRegression,  # Logistic Regression
                     LinearSVC]           # SVM

# Classifier part of the parameter grid, in the same order as label_models
_classifier_grids = [
    {'Classification__penalty': ['l1', 'l2', 'elasticnet']},
    {'Classification__C': [0.01,0.1,1,10]},
    {'Classification__C': [0.1,1,10,100,1000]},
]

# Steps to be included in each Pipeline (every pipeline gets its own estimator instances)
steps = [_leading_steps(typeC) + _selector_steps(typeC) + [('Classification', make_classifier())]
         for make_classifier in _classifier_types]

# Parameter grid declaration: selector parameters first, then classifier parameters
parameters = [dict(_selector_grid(typeC), **classifier_grid)
              for classifier_grid in _classifier_grids]

if typeC in ('E', 'E-NPR'):
    # Feature weights storage (ReliefF only)
    scores_rlff_L    = []
    scores_rlff_LogR = []
    scores_rlff_lSVM = []
# ------------------------------------------------------------------------------

Here, we suggest simply uncommenting the lines that correspond to the classification ensemble you want to run. Feel free, however, to rename the directories as you prefer.

In [0]:
# Directory to save results and plots
# Exactly one rslt_dir/img_path pair should be left uncommented in this cell.
# img_path is the folder where the training loop saves its figures.
# NOTE(review): rslt_dir looks like the path prefix for stored result files - confirm
# against the cells that write the results.

# BoCF-based scheme, without applying Feature Selection
# rslt_dir     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_BoCF_Results/Results_BoCF_Py'
# img_path     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_BoCF_Results'

# BoCF-based scheme, applying Exhaustive Feature Selection
rslt_dir     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_BoCF_E_Results/Results_BoCF_E_Py'
img_path     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_BoCF_E_Results'

# BoCF-based scheme, applying Non-Exhaustive Feature Selection
# rslt_dir     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_BoCF_NE_Results/Results_BoCF_NE_Py'
# img_path     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_BoCF_NE_Results'


# Dissimilarity/similarity BoCF-based scheme, without applying Feature Selection
# rslt_dir     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_PR_BoCF_Results/Results_PR_BoCF_Py'
# img_path     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_PR_BoCF_Results'

# Dissimilarity/similarity BoCF-based scheme, applying Exhaustive Feature Selection
# rslt_dir     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_PR_BoCF_E_Results/Results_PR_BoCF_E_Py'
# img_path     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_PR_BoCF_E_Results'

# Dissimilarity/similarity BoCF-based scheme, applying Non-Exhaustive Feature Selection
# rslt_dir     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_PR_BoCF_NE_Results/Results_PR_BoCF_NE_Py'
# img_path     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_PR_BoCF_NE_Results'


# DTW-CCS scheme, without applying Feature Selection
# rslt_dir     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_DTW_CCS_Results/Results_DTW_CCS_Py'
# img_path     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_DTW_CCS_Results'

# DTW-CCS scheme, applying Exhaustive Feature Selection
# rslt_dir     = img_dir + '/FeatureSelection/Kimia99ShapeDB_DTW_CCS_E_Results/Results_DTW_CCS_E_Py'
# img_path     = img_dir + '/FeatureSelection/Kimia99ShapeDB_DTW_CCS_E_Results'

# DTW-CCS scheme, applying Non-Exhaustive Feature Selection
# rslt_dir     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_DTW_CCS_NE_Results/Results_DTW_CCS_NE_Py'
# img_path     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_DTW_CCS_NE_Results'


# DTW-CS scheme, without applying Feature Selection
# rslt_dir     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_DTW_CS_Results/Results_DTW_Py'
# img_path     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_DTW_CS_Results'

# DTW-CS scheme, applying Exhaustive Feature Selection
# rslt_dir     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_DTW_CS_E_Results/Results_DTW_CS_E_Py'
# img_path     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_DTW_CS_E_Results'

# DTW-CS scheme, applying Non-Exhaustive Feature Selection
# rslt_dir     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_DTW_CS_NE_Results/Results_DTW_CS_NE_Py'
# img_path     = img_dir + '/FeatureSelection/DemoTest/Kimia99ShapeDB_DTW_CS_NE_Results'

# **Step 3: Feature Selection loop**

In [0]:
# Training/Testing loop
# For every stratified partition: tune the three pipelines on the training half,
# record what the feature selector kept, then evaluate on the test half.
# NOTE: this cell appends to the lists and increases the `fold` counter declared in
# Step 2, so re-run that cell before running this one again.

# total= lets tqdm show real progress; without it the bar only reported "0it"
for train_index, test_index in tqdm(sss.split(X,y), total=n_partitions):
    # Training/testing index storage
    train_idx += [train_index]
    test_idx  += [test_index]
    
    # Number of partitions flag
    fold = fold + 1
    print("Iteration = ", str(fold) +'/'+ str(n_partitions))
    
    # Iteration file name 
    filename = img_path + "/Fold" + str(fold)
    
    # Train/Test 
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # ---------------------------------------------------------------------------------------------------------------------------
    # Training
    # NOTE(review): sel_fts_*t below keeps only coefficients strictly greater than the
    # threshold, so large negative coefficients are not counted - confirm this is intended.
    
    # Linear
    print('Linear Model')
    # Using GridSearchCV 
    # hs_Lineal = GridSearchCV(Pipeline(steps[0]), parameters[0], n_jobs = 6, cv = 5, scoring = 'balanced_accuracy', verbose = 50)
    # Using RandomizedSearchCV
    hs_Lineal = RandomizedSearchCV(Pipeline(steps[0]), param_distributions=parameters[0],n_iter=10, cv=5, iid=False, n_jobs=-1)
    hs_Lineal.fit(X_train,y_train)
    if typeC == 'NE':
      # Features selected by LASSO (the selector mask is mapped through train_index)
      alpha_L   += [hs_Lineal.best_estimator_.named_steps['FeatureSelectn'].estimator_.alpha_]
      sel_fts_L += [train_index[hs_Lineal.best_estimator_.named_steps['FeatureSelectn'].get_support()]]
      # Features selected according to threshold value from SelectFromModel()
      thld_L    += [hs_Lineal.best_estimator_.named_steps['FeatureSelectn'].threshold_]
      sel_fts_Lt+= [hs_Lineal.best_estimator_.named_steps['FeatureSelectn'].estimator_.coef_>thld_L[-1]]
      # Count only this fold's mask (summing the whole list accumulated previous folds)
      nfeats_L  += [np.sum(sel_fts_Lt[-1])]
    elif typeC == 'NE-NPR':
      # Features selected by LASSO
      alpha_L   += [hs_Lineal.best_estimator_.named_steps['FeatureSelectn'].estimator_.alpha_]
      sel_fts_L += [hs_Lineal.best_estimator_.named_steps['FeatureSelectn'].get_support()]
      # Features selected according to threshold value from SelectFromModel()
      thld_L    += [hs_Lineal.best_estimator_.named_steps['FeatureSelectn'].threshold_]
      sel_fts_Lt+= [hs_Lineal.best_estimator_.named_steps['FeatureSelectn'].estimator_.coef_>thld_L[-1]]
      # Count only this fold's mask
      nfeats_L  += [np.sum(sel_fts_Lt[-1])]
    elif typeC == 'E' or typeC == 'E-NPR':
      # Features selected by ReliefF method
      nfeats_L     += [hs_Lineal.best_estimator_.named_steps['FeatureSelectnRel'].n_features_to_select]
      sel_fts_L    += [hs_Lineal.best_estimator_.named_steps['FeatureSelectnRel'].top_features_]
      scores_rlff_L+= [hs_Lineal.best_estimator_.named_steps['FeatureSelectnRel'].feature_importances_]


    # Logistic Regression
    print('Logistic Regression Model')
    # Using GridSearchCV 
    # hs_LogR = GridSearchCV(Pipeline(steps[1]), parameters[1], n_jobs = 6, cv = 5, scoring = 'balanced_accuracy', verbose = 50)
    # Using RandomizedSearchCV
    hs_LogR = RandomizedSearchCV(Pipeline(steps[1]), param_distributions=parameters[1],n_iter=10, cv=5, iid=False,n_jobs=-1)
    hs_LogR.fit(X_train,y_train)
    if typeC == 'NE':
      # Features selected by LASSO (the selector mask is mapped through train_index)
      alpha_LogR   += [hs_LogR.best_estimator_.named_steps['FeatureSelectn'].estimator_.alpha_]
      sel_fts_LogR += [train_index[hs_LogR.best_estimator_.named_steps['FeatureSelectn'].get_support()]]
      # Features selected according to threshold value from SelectFromModel()
      thld_LogR    += [hs_LogR.best_estimator_.named_steps['FeatureSelectn'].threshold_]
      sel_fts_LogRt+= [hs_LogR.best_estimator_.named_steps['FeatureSelectn'].estimator_.coef_>thld_LogR[-1]]
      # Count only this fold's mask
      nfeats_LogR  += [np.sum(sel_fts_LogRt[-1])]
    elif typeC =='NE-NPR':
      # Features selected by LASSO
      alpha_LogR   += [hs_LogR.best_estimator_.named_steps['FeatureSelectn'].estimator_.alpha_]
      sel_fts_LogR += [hs_LogR.best_estimator_.named_steps['FeatureSelectn'].get_support()]
      # Features selected according to threshold value from SelectFromModel()
      thld_LogR    += [hs_LogR.best_estimator_.named_steps['FeatureSelectn'].threshold_]
      sel_fts_LogRt+= [hs_LogR.best_estimator_.named_steps['FeatureSelectn'].estimator_.coef_>thld_LogR[-1]]
      # Count only this fold's mask
      nfeats_LogR  += [np.sum(sel_fts_LogRt[-1])]
    elif typeC == 'E' or typeC == 'E-NPR':
      # Features selected by ReliefF method
      nfeats_LogR     += [hs_LogR.best_estimator_.named_steps['FeatureSelectnRel'].n_features_to_select]
      sel_fts_LogR    += [hs_LogR.best_estimator_.named_steps['FeatureSelectnRel'].top_features_]
      scores_rlff_LogR+= [hs_LogR.best_estimator_.named_steps['FeatureSelectnRel'].feature_importances_]
    

    # Linear SVM
    print('Linear SVM Model')
    # Using GridSearchCV 
    #hs_lSVM = GridSearchCV(Pipeline(steps[2]), parameters[2], n_jobs = 6, cv = 5, scoring = 'balanced_accuracy', verbose = 50)
    # Using RandomizedSearchCV
    hs_lSVM = RandomizedSearchCV(Pipeline(steps[2]), param_distributions=parameters[2],n_iter=10, cv=5, iid=False, n_jobs=-1)  
    hs_lSVM.fit(X_train,y_train)
    if typeC == 'NE':
      # Features selected by LASSO (the selector mask is mapped through train_index)
      alpha_lSVM   += [hs_lSVM.best_estimator_.named_steps['FeatureSelectn'].estimator_.alpha_]
      sel_fts_lSVM += [train_index[hs_lSVM.best_estimator_.named_steps['FeatureSelectn'].get_support()]]
      # Features selected according to threshold value from SelectFromModel()
      thld_lSVM    += [hs_lSVM.best_estimator_.named_steps['FeatureSelectn'].threshold_]
      sel_fts_lSVMt+= [hs_lSVM.best_estimator_.named_steps['FeatureSelectn'].estimator_.coef_>thld_lSVM[-1]]
      # Count only this fold's mask
      nfeats_lSVM  += [np.sum(sel_fts_lSVMt[-1])]
    elif typeC == 'NE-NPR':
      # Features selected by LASSO
      alpha_lSVM   += [hs_lSVM.best_estimator_.named_steps['FeatureSelectn'].estimator_.alpha_]
      sel_fts_lSVM += [hs_lSVM.best_estimator_.named_steps['FeatureSelectn'].get_support()]
      # Features selected according to threshold value from SelectFromModel()
      thld_lSVM    += [hs_lSVM.best_estimator_.named_steps['FeatureSelectn'].threshold_]
      sel_fts_lSVMt+= [hs_lSVM.best_estimator_.named_steps['FeatureSelectn'].estimator_.coef_>thld_lSVM[-1]]
      # Count only this fold's mask
      nfeats_lSVM  += [np.sum(sel_fts_lSVMt[-1])]
    elif typeC == 'E' or typeC == 'E-NPR':
      # Features selected by ReliefF method
      nfeats_lSVM     += [hs_lSVM.best_estimator_.named_steps['FeatureSelectnRel'].n_features_to_select]
      sel_fts_lSVM    += [hs_lSVM.best_estimator_.named_steps['FeatureSelectnRel'].top_features_]
      scores_rlff_lSVM+= [hs_lSVM.best_estimator_.named_steps['FeatureSelectnRel'].feature_importances_]

    # ---------------------------------------------------------------------------------------------------------------------------
    # Validation
    # Confusion matrices are stored row-normalized, as percentages.
    
    # Linear
    y_pred_L           = hs_Lineal.best_estimator_.predict(X_test)
    accuracy_L[fold-1] = accuracy_score(y_test,y_pred_L)
    cm_temp            = confusion_matrix(y_test,y_pred_L)
    cm_L[fold-1,:,:]   = 100*cm_temp.astype('float') / cm_temp.sum(axis=1)[:, np.newaxis]
    plot_confusion_matrix(y_test, y_pred_L, classes=np.unique(y),normalize=True,title='ACC = %.1f %% Fold %d' % (100*accuracy_L[fold-1],fold) + '_'+ label_models[0])
    plt.autoscale()
    save_fig(img_path,label_models[0]+'_Fold'+str(fold))                      
    plt.show()
    cr_L += [classification_report(y_test,y_pred_L)]
    print(cr_L[-1])
    # Best model storage
    # best_mod_L += [hs_Lineal.best_estimator_, accuracy_L,cm_L,cr_L, sel_fts_L]
    #best_mod_L += [hs_Lineal.best_estimator_]
    if typeC == 'E' or typeC == 'E-NPR':
      best_pms_L += [hs_Lineal.best_params_,nfeats_L,sel_fts_L,scores_rlff_L,accuracy_L,cm_L,cr_L]
    elif typeC == 'NE' or typeC == 'NE-NPR':
      best_pms_L += [hs_Lineal.best_params_,alpha_L,sel_fts_L,thld_L,sel_fts_Lt,nfeats_L,accuracy_L,cm_L,cr_L]
    elif typeC == 'N' or typeC == 'PR':
      best_pms_L += [hs_Lineal.best_params_,accuracy_L,cm_L,cr_L]

    
    # Logistic Regression
    y_pred_LogR          = hs_LogR.best_estimator_.predict(X_test)
    accuracy_LogR[fold-1]= accuracy_score(y_test,y_pred_LogR)
    cm_temp              = confusion_matrix(y_test,y_pred_LogR)
    cm_LogR[fold-1,:,:]  = 100*cm_temp.astype('float') / cm_temp.sum(axis=1)[:, np.newaxis]
    plot_confusion_matrix(y_test, y_pred_LogR, classes=np.unique(y),normalize=True,title='ACC = %.1f %% Fold %d' % (100*accuracy_LogR[fold-1],fold) + '_'+ label_models[1])
    plt.autoscale()
    save_fig(img_path,label_models[1]+'_Fold'+str(fold))                      
    plt.show()
    cr_LogR += [classification_report(y_test,y_pred_LogR)]
    print(cr_LogR[-1])
    # Best model storage
    # best_mod_LogR += [hs_LogR.best_estimator_, accuracy_LogR,cm_LogR,cr_LogR, sel_fts_LogR]
    # best_mod_LogR += [hs_LogR.best_estimator_]
    if typeC == 'E' or typeC == 'E-NPR':
      best_pms_LogR += [hs_LogR.best_params_,nfeats_LogR,sel_fts_LogR,scores_rlff_LogR,accuracy_LogR,cm_LogR,cr_LogR]
    elif typeC == 'NE' or typeC == 'NE-NPR':
      best_pms_LogR += [hs_LogR.best_params_,alpha_LogR,sel_fts_LogR,thld_LogR,sel_fts_LogRt,nfeats_LogR,accuracy_LogR,cm_LogR,cr_LogR]
    elif typeC == 'N' or typeC == 'PR':
      best_pms_LogR += [hs_LogR.best_params_,accuracy_LogR,cm_LogR,cr_LogR]
    

    # Linear SVM
    y_pred_lSVM          = hs_lSVM.best_estimator_.predict(X_test)
    accuracy_lSVM[fold-1]= accuracy_score(y_test,y_pred_lSVM)
    cm_temp              = confusion_matrix(y_test,y_pred_lSVM)
    # Store in cm_lSVM: writing to cm_LogR overwrote the logistic-regression matrix
    cm_lSVM[fold-1,:,:]  = 100*cm_temp.astype('float') / cm_temp.sum(axis=1)[:, np.newaxis]
    plot_confusion_matrix(y_test, y_pred_lSVM, classes=np.unique(y),normalize=True,title='ACC = %.1f %% Fold %d' % (100*accuracy_lSVM[fold-1],fold) + '_'+ label_models[2])
    plt.autoscale()
    save_fig(img_path,label_models[2]+'_Fold'+str(fold))                      
    plt.show()
    cr_lSVM += [classification_report(y_test,y_pred_lSVM)]
    print(cr_lSVM[-1])
    # Best model storage
    # best_mod_lSVM += [hs_lSVM.best_estimator_, accuracy_lSVM,cm_lSVM,cr_lSVM, sel_fts_lSVM]
    # best_mod_lSVM += [hs_lSVM.best_estimator_]
    if typeC == 'E' or typeC == 'E-NPR':
      best_pms_lSVM += [hs_lSVM.best_params_,nfeats_lSVM,sel_fts_lSVM,scores_rlff_lSVM,accuracy_lSVM,cm_lSVM,cr_lSVM]
    elif typeC == 'NE' or typeC == 'NE-NPR':
      # Use the SVM's own thresholds and confusion matrices (previously thld_L and cm_L)
      best_pms_lSVM += [hs_lSVM.best_params_,alpha_lSVM,sel_fts_lSVM,thld_lSVM,sel_fts_lSVMt,nfeats_lSVM,accuracy_lSVM,cm_lSVM,cr_lSVM]
    elif typeC == 'N' or typeC == 'PR':
      best_pms_lSVM += [hs_lSVM.best_params_,accuracy_lSVM,cm_lSVM,cr_lSVM]

0it [00:00, ?it/s]

Iteration =  1/10
Linear Model
Logistic Regression Model
Linear SVM Model


Once the feature selection loop has finished, the overall results of each classifier are collected into dictionaries and saved to disk

In [0]:
# Collect the results gathered for each classifier into one dictionary per
# model (Linear, Logistic Regression, Linear SVM) and persist them with joblib.
# Every experiment type stores accuracy, confusion matrices and classification
# reports; the 'NE*' and 'E*' types add their feature-selection variables.

# Fail early on an unknown experiment type instead of silently reusing
# dictionaries left over from a previous run (or hitting a NameError below).
if typeC not in ('N', 'PR', 'NE', 'NE-NPR', 'E', 'E-NPR'):
  raise ValueError("Unknown typeC %r: expected one of 'N', 'PR', 'NE', 'NE-NPR', 'E', 'E-NPR'" % (typeC,))

# Entries shared by every experiment type
L_dict = {'accuracy_L': accuracy_L,
          'cm_L': cm_L,
          'cr_L': cr_L}

LogR_dict = {'accuracy_LogR': accuracy_LogR,
             'cm_LogR': cm_LogR,
             'cr_LogR': cr_LogR}

lSVM_dict = {'accuracy_lSVM': accuracy_lSVM,
             'cm_lSVM': cm_lSVM,
             'cr_lSVM': cr_lSVM}

if typeC == 'NE' or typeC == 'NE-NPR':
  # Additional entries: selected features (both variants), alpha, threshold
  # and number of features.
  L_dict.update({'sel_fts_L': sel_fts_L,
                 'sel_fts_Lt': sel_fts_Lt,
                 'alpha_L': alpha_L,
                 'thld_L': thld_L,
                 'nfeats_L': nfeats_L})

  LogR_dict.update({'sel_fts_LogR': sel_fts_LogR,
                    'sel_fts_LogRt': sel_fts_LogRt,
                    'alpha_LogR': alpha_LogR,
                    'thld_LogR': thld_LogR,
                    'nfeats_LogR': nfeats_LogR})

  lSVM_dict.update({'sel_fts_lSVM': sel_fts_lSVM,
                    # Legacy misspelled key, kept as an alias so that existing
                    # readers of previously saved results keep working.
                    'sel_fts_lSVMR': sel_fts_lSVM,
                    'sel_fts_lSVMt': sel_fts_lSVMt,
                    'alpha_lSVM': alpha_lSVM,
                    'thld_lSVM': thld_lSVM,
                    'nfeats_lSVM': nfeats_lSVM})

elif typeC == 'E' or typeC == 'E-NPR':
  # Additional entries: selected features, scores_rlff_* and number of features.
  L_dict.update({'sel_fts_L': sel_fts_L,
                 'scores_rlff_L': scores_rlff_L,
                 'nfeats_L': nfeats_L})

  LogR_dict.update({'sel_fts_LogR': sel_fts_LogR,
                    'scores_rlff_LogR': scores_rlff_LogR,
                    'nfeats_LogR': nfeats_LogR})

  lSVM_dict.update({'sel_fts_lSVM': sel_fts_lSVM,
                    'scores_rlff_lSVM': scores_rlff_lSVM,
                    'nfeats_lSVM': nfeats_lSVM})

# For 'N' / 'PR' only the common entries above are stored.

# Order matters: later cells index Results as [Linear, LogR, lSVM].
Results = [L_dict, LogR_dict, lSVM_dict]

# The output file name is rslt_dir with a ".pkl" suffix appended.
joblib.dump(Results, rslt_dir + ".pkl")

In [0]:
# Report, for each classifier, the mean and the standard deviation of the
# accuracies stored in Results, both scaled to percent.
for clf_title, clf_pos, acc_key in (
    ('Linear Classifier', 0, 'accuracy_L'),
    ('Logistic Regression Classifier', 1, 'accuracy_LogR'),
    ('Linear SVM Classifier', 2, 'accuracy_lSVM'),
):
  fold_acc = np.array(Results[clf_pos][acc_key])
  if clf_pos:
    # Blank line between classifiers
    print()
  print(clf_title)
  print(np.mean(fold_acc)*100)
  print(np.std(fold_acc)*100)