# Classifiers OPTIMIZERS

#### Testando uma série de combinações de parâmetros de modelo & combinações de parâmetros de performance (número de características, características selecionadas)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Reading the data (.csv file)

data = pd.read_csv('3DRadiomics_checked.csv')
print ("Dataset read OK!")

Dataset read OK!


### Coding binary features -> Outcome:

'''
Immuno -> 0 = False = Negative for any hormone
         1 = True = Positive for one or more hormones (ACTH, LH, GH, PRL, FSH)
Hardy -> 0 = False = No sign of invasiveness
         1 = True = Some level of invasiveness
Progression -> 0 = False = Stable lesion in the observed period
               1 = True = Recurrent lesion in the observed period
'''

In [3]:
# Split the loaded table into a feature matrix and a target vector.

# Work on a copy of the table with the 'Patients' column removed.
data_new = data.drop(columns=['Patients']).copy()

# Cast every remaining column to float64 and expose the values as a NumPy array.
data_np = data_new.astype(np.float64).values

# Every column but the last is a feature (X_raw); the last column is the target (y).
X_raw, y = data_np[:, :-1], data_np[:, -1]

List holding the class names used in our classification
--->> Stable == 0 == False; Recurrent == 1 == True 

In [4]:
# Used in the confusion matrix

classes = ['Stable','Recurrent']

## Rescaling routine and function

In [5]:
# Scalers used by rescaling(). QuantileTransformer is imported from the public
# sklearn.preprocessing package: sklearn.preprocessing.data is a private module
# that is not importable on current scikit-learn releases.
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import scale
from sklearn import preprocessing as prep 
# NOTE: the name MinMaxScaler is bound to a configured *instance* (feature range
# 0..1), not to the class; rescaling() calls fit_transform() on this shared object.
MinMaxScaler = prep.MinMaxScaler((0,1))

def rescaling(normChosen,X_r0):
    """Rescale ``X_r0`` with the strategy identified by ``normChosen``.

    Strategy codes:
        0 -> RobustScaler with quantile range (25, 75)
        1 -> StandardScaler
        2 -> QuantileTransformer with a uniform output distribution
        3 -> Normalizer
        5 -> scale
        6 -> the module-level MinMaxScaler instance

    Any other code (4 included) returns ``X_r0`` itself, untouched.
    """
    # Ordered (code, transform) pairs. The lambdas defer building each scaler
    # until its code is actually requested.
    strategies = (
        (0, lambda m: RobustScaler(quantile_range=(25, 75)).fit_transform(m)),
        (1, lambda m: StandardScaler().fit_transform(m)),
        (2, lambda m: QuantileTransformer(output_distribution='uniform').fit_transform(m)),
        (3, lambda m: Normalizer().fit_transform(m)),
        (5, lambda m: scale(m)),
        (6, lambda m: MinMaxScaler.fit_transform(m)),
    )
    for code, transform in strategies:
        if normChosen == code:
            return transform(X_r0)
    # No strategy matched: hand the input back as-is.
    return X_r0
print ("Reascaling function OK!")

Reascaling function OK!


## Feature selection function

In [6]:
from sklearn.ensemble import ExtraTreesClassifier # extremely randomized clf
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA

def featureSelection(featSelChosen,X_f0,size):
    """Reduce ``X_f0`` to a subset (or projection) of its columns.

    The supervised strategies are fitted against the module-level target
    vector ``y``.

    Parameters
    ----------
    featSelChosen : int
        0 -> SelectFromModel on a fitted ExtraTreesClassifier (``size`` unused)
        1 -> SelectKBest with mutual_info_classif, keeping ``size`` features
        2 -> PCA with ``size`` components (svd_solver='full')
        3 -> ReliefF with 10 neighbours, keeping ``size`` features
    X_f0 : array-like
        Sample matrix to reduce.
    size : int
        Number of features/components to keep (strategies 1, 2 and 3).

    Returns
    -------
    The transformed sample matrix produced by the chosen strategy.

    Raises
    ------
    ValueError
        If ``featSelChosen`` is not one of the codes listed above.
    """
    if featSelChosen == 0: # ExtraTreeClass Feat. Sel.
        clfRF = ExtraTreesClassifier()
        clfRF = clfRF.fit(X_f0, y)
        model = SelectFromModel(clfRF, prefit=True)
        return model.transform(X_f0)
    elif featSelChosen == 1: # Select K best feat.
        select = SelectKBest(mutual_info_classif, k=size)
        return select.fit_transform(X_f0, y)
    elif featSelChosen == 2: # Principal Component Analysis
        pca = PCA(n_components=size,svd_solver='full')
        pca.fit(X_f0,y)
        return pca.transform(X_f0)
    elif featSelChosen == 3: # ReliefF feature selection filter
        # NOTE(review): ReliefF is not imported anywhere in the visible part of
        # this notebook; it must be imported before this branch can run.
        reliefF = ReliefF(n_neighbors=10, n_features_to_keep=size)
        return reliefF.fit_transform(X_f0, y)
    # Fail loudly instead of silently handing None to the rescaling step.
    raise ValueError(
        "Unknown feature selection strategy: %r (expected 0, 1, 2 or 3)" % (featSelChosen,)
    )
print ("Feature selection funtion OK!")

Feature selection funtion OK!


##  Selected Features identifier

In [7]:
def showFeatures (data_new,X_sub):
    """Return the names of the ``data_new`` columns that appear in ``X_sub``.

    Each column of ``data_new`` except the last one (the target column) is
    compared element by element with each column of ``X_sub``. A name is
    recorded once per matching ``X_sub`` column.

    The comparison is an exact element-wise equality. The earlier test, "mean
    of the difference equals zero", also accepted columns whose differences
    merely cancel out (e.g. [1, 2, 3] against [3, 2, 1]).

    Parameters
    ----------
    data_new : pandas.DataFrame
        Table whose last column is the target and whose other columns are
        the candidate features.
    X_sub : array-like
        Matrix of selected feature values, one feature per column.

    Returns
    -------
    numpy.ndarray
        Matching column names, in ``data_new`` column order (empty when
        nothing matches).
    """
    X_f = pd.DataFrame(X_sub)
    selected_Feat = []
    # The last column is the target, so it is never a candidate feature.
    for c in data_new.columns[:-1]:
        original_values = data_new[c].values
        for f in X_f.columns:
            if np.array_equal(original_values, X_f[f].values):
                selected_Feat.append(c)
    return np.array(selected_Feat)
print ("Show feature function OK!")

Show feature function OK!


## Preprocessing step caller

In [8]:
def preProcessX(size, normChosen=6, featSelChosen=1):
    '''
        Select features from the module-level X_raw, then rescale the subset.

        'size' indicates the number of desired selected features in the feature
        selection step.

        'normChosen' indicates standardization strategy (default 6):
            #0: Robust Scalar
            #1: Standard Scalar
            #2: Quantile Transformer (uniform) Scaler
            #3: Normalizer
            #4: No normalization
            #5: Scale
            #6: MinMax Scaller

        'featSelChosen' indicates features selection strategy (default 1):
            #0: ExtraTreeClass Feat. Sel.
            #1: Select K best feat.
            #2: Principal Component Analysis
            #3: ReliefF feature selection filter

        Returns (X_norm, X_sub): the rescaled subset and the un-scaled subset.

        NOTE(review): selection and rescaling are fitted on every sample of
        X_raw, before any train/test split done by the caller.
    '''
    # X_sub = subset of relevant features (original scale)
    X_sub = featureSelection(featSelChosen, X_raw, size)
    # X_norm = rescaled subset of relevant features
    X_norm = rescaling(normChosen, X_sub)

    return X_norm, X_sub
print ("Preprocess caller funtion OK!")

Preprocess caller funtion OK!


## Creating Classifiers

In [15]:
from sklearn import linear_model as lm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Every classifier is stored as a (name, estimator) pair; the pairs are kept
# in a list for the later processing steps.
clfs = [
    # Logistic regression
    ("Logistic_Regression", lm.LogisticRegression(solver='lbfgs')),
    # Random forest
    ("Random_Forest", RandomForestClassifier(n_estimators=150, max_depth=5,
                                             min_samples_split=2, random_state=0)),
    # k nearest neighbours
    ("k_Nearest_Neighbor", KNeighborsClassifier(n_neighbors=5)),
]

# Individual handles on the same three pairs.
clf_lr, clf_rf, clf_knn = clfs

print ("Classifiers creation OK!")

Classifiers creation OK!


In [10]:
# Global variables for performance
acc0, auc0, spe0 = 0.0, 0.0, 0.0

## Run - classifier executable function - ***With mean ACC, SPE, SEN, AUC***

In [11]:
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
from sklearn.metrics import roc_curve
try:
    from scipy import interp
except ImportError:
    # scipy.interp was only an alias of numpy.interp and is not available in
    # current SciPy releases; fall back to the NumPy function.
    from numpy import interp


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


def run(params):
    """Evaluate one (feature count, execution, classifier) configuration.

    Parameters
    ----------
    params : sequence
        ``[size, execution, (classifier_name, classifier)]``. ``size`` is the
        number of features to select. ``execution`` is only a repetition
        counter and is not read here.

    Returns
    -------
    tuple
        ``(classifier_name, accuracy, sensitivity, specificity, mean_auc,
        size, X_feature_names)``, where the metrics are means over the three
        cross-validation folds.

    Side effects
    ------------
    Appends ``[accuracy, sensitivity, specificity, mean_auc, params, size,
    X_feature_names]`` to the module-level ``final_result`` and prints one
    summary line. When this function runs inside a multiprocessing worker the
    append lands in that worker's copy of the list, so callers should rely on
    the return value.

    NOTE(review): preProcessX() fits feature selection and rescaling on all
    samples before the folds are split.
    """
    size = params[0]
    classifier_name, classifier = params[2][0], params[2][1]

    # X is the normalized subset, ready for processing.
    # X_sub is the raw subset, used only to identify the selected features by name.
    X, X_sub = preProcessX(size)

    kf = KFold(n_splits=3)
    mean_fpr = np.linspace(0, 1, 100)

    tprs = []  # interpolated true positive rates, one array per fold
    spes = []  # specificity per fold
    sens = []  # sensitivity per fold
    accs = []  # accuracy per fold

    # KFold only needs the number of samples, so splitting X is enough.
    for train_indices, test_indices in kf.split(X):
        X_train, y_train = X[train_indices], y[train_indices]
        X_test, y_test = X[test_indices], y[test_indices]

        classifier.fit(X_train, y_train)
        probas_ = classifier.predict_proba(X_test)
        y_pred = classifier.predict(X_test)

        fpr, tpr, _ = roc_curve(y_test, probas_[:, 1])
        tprs.append(interp(mean_fpr, fpr, tpr))
        tprs[-1][0] = 0.0  # force every curve to start at the origin

        # Pin the label order (0 = Stable, 1 = Recurrent) so the matrix is
        # always 2x2, even when a fold happens to contain a single class.
        cnf_matrix = confusion_matrix(y_test, y_pred, labels=[0.0, 1.0])
        tn, fp = float(cnf_matrix[0, 0]), float(cnf_matrix[0, 1])
        fn, tp = float(cnf_matrix[1, 0]), float(cnf_matrix[1, 1])

        accs.append(_safe_ratio(tp + tn, tp + fp + fn + tn))
        sens.append(_safe_ratio(tp, tp + fn))
        spes.append(_safe_ratio(tn, tn + fp))

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
    mean_auc = auc(mean_fpr, mean_tpr)

    accuracy = np.mean(accs, axis=0)
    sensitivity = np.mean(sens, axis=0)
    specificity = np.mean(spes, axis=0)

    # Identifying features by name
    X_feature_names = showFeatures(data_new, X_sub)

    # Record THIS task's configuration (params), not whatever global `config`
    # was left behind by the loop that generated the task list.
    clf_result = [accuracy, sensitivity, specificity, mean_auc, params, size, X_feature_names]
    final_result.append(clf_result)
    print(size,"%.3f" % accuracy,"%.3f" % specificity,"%.3f" % mean_auc)
    return classifier_name, accuracy, sensitivity, specificity, mean_auc, size, X_feature_names
    
    

## Best Solution assessment

In [25]:
def find_best(results):
    """Scan one classifier's result rows and report the best one.

    Each row is ``(name, accuracy, sensitivity, specificity, auc, size,
    feature_names)``. Starting from an all-zero baseline, a row replaces the
    current best when any of the following holds:

    * accuracy is higher and specificity is not lower, or AUC is higher;
    * accuracy ties, specificity is not lower and AUC is higher;
    * accuracy, specificity and AUC all tie and the row uses fewer features.

    One progress line is printed per row and a summary at the end.

    Parameters
    ----------
    results : sequence of rows
        Must be non-empty. The classifier name is read from the first row.

    Returns
    -------
    tuple
        ``("Best solution for:", clf, " is:", acc, sen, spe, auc,
        ". Size and feats:", best_size, "; ", best_features)``.
        ``best_features`` is an empty list (and ``best_size`` 0.0) when no row
        beats the all-zero baseline.

    Raises
    ------
    ValueError
        If ``results`` is empty.
    """
    if len(results) == 0:
        raise ValueError("find_best() needs at least one result row")

    clf = results[0][0]
    acc0, spe0, auc0, sen0, best_size = 0.0, 0.0, 0.0, 0.0, 0.0
    # Defined up front so the summary below works even if no row is selected.
    best_X = []
    for r in results:
        accuracy, sensitivity, specificity, roc_auc, size = r[1], r[2], r[3], r[4], r[5]
        higher_acc = accuracy > acc0
        same_acc = accuracy == acc0
        improved = (
            (higher_acc and (specificity >= spe0 or roc_auc > auc0)) or
            (same_acc and specificity >= spe0 and roc_auc > auc0) or
            (same_acc and specificity == spe0 and roc_auc == auc0 and size < best_size)
        )
        if improved:
            best_X = r[6]
            best_size, acc0, sen0, spe0, auc0 = size, accuracy, sensitivity, specificity, roc_auc

            print ('Better Result!')
            print (clf,best_size,"%.3f" % acc0,"%.3f" % spe0, "%.3f" % auc0)
        else:
            print (clf,size,"%.3f" % accuracy,"%.3f" % specificity, "%.3f" % roc_auc)

    print("The very best solution for ",clf,"is:")
    print(acc0, sen0, spe0, auc0)
    print("features",best_size,": ", best_X)

    return "Best solution for:",clf," is:",acc0, sen0, spe0, auc0,". Size and feats:",best_size,"; ", best_X

## P A R A L L E L                       E X E C U T I O N

In [13]:
from multiprocessing import Pool as ThreadPool

def runParallel(parameters, threads=7):
    """Apply ``run`` to every entry of ``parameters`` using a worker pool.

    Parameters
    ----------
    parameters : iterable
        One argument per ``run`` call.
    threads : int
        Number of workers in the pool. Despite the ``ThreadPool`` alias, the
        pool is ``multiprocessing.Pool``, i.e. these are worker processes.

    Returns
    -------
    list
        The values returned by ``run``, in the order of ``parameters``.
    """
    pool = ThreadPool(threads)
    try:
        results = pool.map(run, parameters)
    except BaseException:
        # A task failed (or the run was interrupted): stop the remaining
        # workers instead of leaving them behind.
        pool.terminate()
        raise
    else:
        # Normal completion: let the workers exit on their own.
        pool.close()
    finally:
        pool.join()

    return results

## MAIN Function

In [16]:
# Module-level list that run() appends one
# [accuracy, sensitivity, specificity, mean_auc, config, size, feature_names]
# row to for every evaluated configuration.
final_result = []

if __name__ == "__main__":

    # Numbers of features to try in the selected subset: 1 .. 15.
    num_feat = np.arange(1,16,1)
    # num_feature = number of desired features in the selected subset
    # Repetition counter: 1 .. 100 executions for each num_feat value.
    execution = np.arange(1,101,1)
    # execution = number of executions performed with the same num_feat

    # Generating every [num_feat, execution, classifier] configuration for the
    # classifiers stored in clfs.
    params = []
    for n in num_feat:
        for e in execution:
            for c in clfs:
                # NOTE: `config` stays bound at module level after this loop
                # and run() reads that global when it records its result row.
                config = [n,e,c]
                params.append(config)
    
    print("Here we go!")
    
    # Evaluate all configurations; 160 is passed as the pool size.
    res = runParallel(params, 160)
    
    #separate_classifier_results(res)

Here we go!
1 0.815 0.944 0.853
1 0.815 0.944 0.853
1 0.815 0.944 0.853
1 0.815 0.944 0.853
1 0.815 0.944 0.853
1 0.815 0.944 0.853
1 0.815 0.944 0.853
1 0.630 1.000 0.810
1 0.815 0.944 0.853
1 0.630 1.000 0.810
1 0.815 0.944 0.853
1 0.815 0.944 0.853
1 0.630 1.000 0.810
2 0.741 1.000 0.701
1 0.630 1.000 0.810
1 0.630 1.000 0.810
1 0.630 1.000 0.810
1 0.630 1.000 0.810
1 0.815 0.944 0.853
2 0.741 1.000 0.701
2 0.667 1.000 0.700
2 0.667 1.000 0.700
1 0.630 1.000 0.810
2 0.667 1.000 0.700
2 0.741 1.000 0.701
2 0.667 1.000 0.700
2 0.741 1.000 0.701
1 0.630 1.000 0.810
2 0.741 1.000 0.701
1 0.630 1.000 0.810
1 0.630 0.600 0.808
1 0.630 0.600 0.808
1 0.630 0.600 0.808
1 0.630 0.600 0.808
1 0.630 0.600 0.808
1 0.630 1.000 0.810
2 0.741 1.000 0.701
2 0.741 1.000 0.701
1 0.630 1.000 0.810
4 0.667 1.000 0.716
1 0.630 0.600 0.808
1 0.630 0.600 0.808
2 0.667 1.000 0.700
1 0.630 1.000 0.810
1 0.630 0.600 0.808
1 0.630 0.600 0.808
2 0.741 1.000 0.701
2 0.667 1.000 0.700
4 0.741 1.000 0.521
2 0.778 

## Separating results based on classifiers

In [20]:
def separate_classifier_results(res):
    """Group result rows by classifier name.

    Parameters
    ----------
    res : iterable of rows
        Rows whose first element is a classifier name.

    Returns
    -------
    dict
        One entry per classifier in the module-level ``clfs`` list, mapping
        the classifier name to the list of rows carrying that name (in their
        original order). Names with no rows map to an empty list.
    """
    res_clf = {}
    for c in clfs:
        name = c[0]
        res_clf[name] = [r for r in res if r[0] == name]
    return res_clf

In [23]:
res_clf = separate_classifier_results(res)

## Printing out best results by classifier

In [26]:
for c in clfs:
    print(find_best(res_clf[c[0]]))

Better Result!
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.630 1.000 0.810
Logistic_Regression 1 0.6

In [None]:
final_result