# READ ME

#### This notebook is implemeted in a repository in github with input and output folders.
#### It has 2 parts:
##### > Part 1: Pre-defined funtions for each technique.
##### > Part 2: Execution of model pipelines, here users can modify which combination of techniques they want to run. The scores will be printed as a csv in output folders. 

# PART 1: FUNCTIONS

# Import library

In [2]:
#importing libraries
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from BorutaShap import BorutaShap
from sklearn.feature_selection import RFE

from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN

from collections import Counter
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, precision_score, recall_score, roc_auc_score, log_loss, cohen_kappa_score, make_scorer

import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical 

from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

  from .autonotebook import tqdm as notebook_tqdm


# Read df

In [3]:
#this function is to read, transform and join 2 data frame

def read_features():
    path = 'input/secom.data'
    df = pd.read_csv(path, delimiter=' ', header=None, na_values=['NaN'])
    df.columns = ['feature_'+str(x+1) for x in range(len(df.columns))]
    return df


def read_target():
    path = 'input/secom_labels.data'
    df = pd.read_csv(path, delimiter=' ', header=None, na_values=['NaN'])
    df.columns = ['status','timestamp']
    df['timestamp'] = pd.to_datetime(df['timestamp'],dayfirst=True)
    return df


# Remove duplicated columns

In [4]:
#find the duplicated features (columns)
def remove_duplicated_columns(df):
    list_duplicate = []
    to_remove = []
    for i in range(0, len(df.columns)):
        l = []
        for j in range(i+1,len(df.columns)):
            if df.iloc[:,i].equals(df.iloc[:,j]) == True:
                if j not in list_duplicate:
                    l.append(j)
                    to_remove.append('feature_'+str(j+1))
                list_duplicate.append(i)
                list_duplicate.append(j)

    return df.drop(columns=to_remove, axis = 1)

# X = remove_duplicated_columns(X)
# X.shape


# Remove columns with Constant volatility (std=0)

In [5]:
def remove_constant_volatility(df):
    df_EDA= df.describe().T
    df_EDA= df_EDA[df_EDA["std"] == 0]
    df = df.drop(axis=1, columns=df_EDA.index)
    return df

# X = remove_constant_volatility(X)
# X.shape

# Remove columns with high %Missing values

In [6]:
def remove_cols_with_high_pct_null(df, null_threshold):
    list_column_with_pct_null = pd.concat([df.isnull().sum(), df.isnull().sum()/df.shape[0]],axis=1).rename(columns={0:'Missing_Records', 1:'Percentage (%)'})
    list_column_with_pct_null= list_column_with_pct_null[list_column_with_pct_null["Percentage (%)"] >= null_threshold]
    df = df.drop(axis=1, columns=list_column_with_pct_null.index)
    return df

# X = remove_cols_with_high_pct_null(X, 0.8)
# X.shape

# Split data

In [7]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1,stratify=y)

# Outlier treatment

In [8]:
#how = ['NaN', '3s' ,'nothing']
def replace_outlier(df, how):
    for col in df:
        ll_col = df[col].mean() - 3 * df[col].std()
        ul_col = df[col].mean() + 3 * df[col].std()
        if how == 'NaN':
            df[col] = np.where(df[col]>ul_col,np.NaN,np.where(df[col]<ll_col,np.NaN,df[col]))
        elif how == '3s':
            df[col] = np.where(df[col]>ul_col,ul_col,np.where(df[col]<ll_col,ll_col,df[col]))
    return df

# Missing value Imputation

In [9]:
#which_weights = ['distance','uniform']

def impute_null_with_knn(X_train, X_test, which_weights):
    #First scale the data 
    scaler = MinMaxScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), columns= X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns= X_test.columns)

    knn = KNNImputer(n_neighbors=5, weights=which_weights) #check this neighbors = 5

    X_train = pd.DataFrame(knn.fit_transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(knn.transform(X_test), columns=X_test.columns)
    
    X_train = pd.DataFrame(scaler.inverse_transform(X_train), columns= X_train.columns)
    X_test = pd.DataFrame(scaler.inverse_transform(X_test), columns= X_test.columns)
    return X_train, X_test

#X_train = impute_null_with_knn(X_train)

In [10]:
def impute_null_with_mice(X_train, X_test): 
    imp = IterativeImputer(max_iter=5, verbose=0, imputation_order='roman', random_state=0)
    X_train = pd.DataFrame(imp.fit_transform(X_train), columns=X_train.columns)
    X_test = pd.DataFrame(imp.transform(X_test), columns=X_test.columns)
    return X_train, X_test

# Feature Selection

In [11]:
#This is BorutaShap with TENTATIVE features

#list_method=['shap','gini']

def BorutaShap_FS (X, y, method_option) :
    #modelshap = RandomForestClassifier(n_jobs=-1,n_estimators=100, class_weight='balanced_subsample', max_depth=5, random_state=100)
    modelshap = RandomForestClassifier(n_jobs=-1,n_estimators=100, max_depth=5, random_state=100)

    # define model for resp. classifier
    modelshap.fit(X,y)
    feature_names = np.array(X.columns)
    # define Boruta Sahp feature selection method
    feature_selector = BorutaShap(model=modelshap,
                              importance_measure=method_option,
                              classification=True)  # find all relevant features
    feature_selector.fit(X,y,n_trials=100,sample = False, verbose = False,random_state=100)  
    #feature_selector.plot(which_features='accepted',figsize=(20,10))
    tentative=X.loc[:,feature_selector.tentative]
    selected=feature_selector.Subset()
    selten=pd.concat([selected,tentative],axis=1)
    # call transform() on X to filter it down to selected features
    return  selten

In [12]:
#RFE

#classifier = ['RF', 'SVM']

def RFE_FS (X, y,classify) :
    scaler = MinMaxScaler()
    X_scaled= pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    feature_names = np.array(X_scaled.columns)
    if classify == 'RF':
    # define random forest classifier
        model = RandomForestClassifier(n_jobs=-1, class_weight='balanced_subsample', max_depth=5, random_state=100)
       
    if classify== 'SVM':
        model = SVC(kernel='linear',C=5)
        #rfe = RFECV(estimator = model,scoring='accuracy')
    # find all relevant features
    model.fit(X_scaled, y)
    rfe = RFE(estimator = model,n_features_to_select = 30)
    rfe.fit(X_scaled,y)

     # zip feature names, ranks, and decisions 
    feature_ranks = list(zip(feature_names, 
                             rfe.ranking_, 
                             rfe.support_))

    final_features_rfe = list()
    indexes = np.where(rfe.ranking_ <= 2)
    for x in np.nditer(indexes):
        final_features_rfe.append(feature_names[x])
    
    
    # unscale the data before return
    X_unscaled=pd.DataFrame(scaler.inverse_transform(X_scaled), columns=X_scaled.columns)
    ff_rfe=pd.DataFrame(X_unscaled.filter(final_features_rfe))
    

 # call transform() on X to filter it down to selected features
    return  ff_rfe

In [13]:
#Boruta function with random forest

def BorutaPy_FS (X, y) :
    feature_names = np.array(X.columns)

    # define random forest classifier
    model = RandomForestClassifier(n_jobs=-1, class_weight='balanced_subsample', max_depth=5, random_state=100)
    model.fit(X, y)
    # define Boruta feature selection method
    
    feature_selector = BorutaPy(model, n_estimators='auto', verbose=0, random_state=100, max_iter=140)

    # find all relevant features
    feature_selector.fit(X.to_numpy(),y)

    # check selected features
    ##--feature_selector.support_

    # check ranking of features
    ##--feature_ranking=feature_selector.ranking_

    # zip feature names, ranks, and decisions 
    # feature_ranks = list(zip(feature_names, 
    #                          feature_selector.ranking_, 
    #                          feature_selector.support_))

    # print the results
    ##--for feat in feature_ranks:
    ##--    print('Feature: {:<30} Rank: {},  Keep: {}'.format(feat[0], feat[1], feat[2]))
        
    final_features = list()
    indexes = np.where(feature_selector.ranking_ <= 2) #change to 2
    for x in np.nditer(indexes):
        final_features.append(feature_names[x])
    ##--print(final_features)
    
 # call transform() on X to filter it down to selected features
    return pd.DataFrame(X.filter(final_features))

# Multicolinearity treatement

In [14]:
#Remove the highly collinear features from data
def remove_collinear_features(x, threshold):
    '''
    Objective:
        Remove collinear features in a dataframe with a correlation coefficient
        greater than the threshold. Removing collinear features can help a model 
        to generalize and improves the interpretability of the model.

    Inputs: 
        x: features dataframe
        threshold: features with correlations greater than this value are removed

    Output: 
        dataframe that contains only the non-highly-collinear features
    '''

    # Calculate the correlation matrix
    corr_matrix = x.corr()
    iters = range(len(corr_matrix.columns) - 1)
    drop_cols = []

    # Iterate through the correlation matrix and compare correlations
    for i in iters:
        for j in range(i+1):
            item = corr_matrix.iloc[j:(j+1), (i+1):(i+2)]
            col = item.columns
            row = item.index
            val = abs(item.values)

            # If correlation exceeds the threshold
            if val >= threshold:
                #Print the correlated features and the correlation value
                #print(col.values[0], "|", row.values[0], "|", round(val[0][0], 2))
                drop_cols.append(col.values[0])

    # Drop one of each pair of correlated columns
    drops = set(drop_cols)
    x = x.drop(columns=drops)

    return x

#remove_collinear_features(X, 0.7)

# Balancing

In [15]:
def sampling(X_train, y_train, sampler):
    
    #SMOTE
    if sampler == 'SMOTE':
        sampler = SMOTE(random_state=100)    
    
    #ROSE
    if sampler == 'ROSE':
        sampler = RandomOverSampler(random_state=100, shrinkage=1)

    #ADASYN
    if sampler == 'ADASYN':
        sampler = ADASYN(random_state=100)
    

    #SMOTTEENN
    if sampler == 'SMOTEENN' :
        sampler = SMOTEENN(random_state=100)
        
        
    #Random under Sampling
    if sampler == "randomunder":
        sampler = RandomUnderSampler(random_state=100)

    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    #counter = Counter(y_resampled)
    #print(counter)
    
    return X_resampled, y_resampled

# X_train, y_train = sampling(X_train, y_train,'SMOTE')
# X_train.shape

# Model

#### Model: Deep Neural Network

#Note:
<br>How to Use Keras Models in scikit-learn:
<br>-Keras models can be used in scikit-learn by wrapping them with the KerasClassifier or KerasRegressor class.
<br>-To use these wrappers you must define a function that creates and returns your Keras sequential model, then pass this function to the build_fn argument when constructing the KerasClassifier class.
<br>https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/


In [16]:
#the accuracy that is produced by keras is similar to the one being manually calculated by argmax, same with scikit learn evaluate
#grid seach does not build to apply on test set, after finding the best hyperparamter, we need to fit the model on train set again and use it for test set

# PART 2: EXECUTION

In [17]:
X = read_features()
y = read_target().iloc[:,0]


#step 1:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify=y)

# step 2:
X_train = remove_duplicated_columns(X_train)
#step 3:
X_train = remove_constant_volatility(X_train)
#step 4:
X_train = remove_cols_with_high_pct_null(X_train, 0.5) #this can be in the loop too, may be later
#step 5: remove the same columns from step 2-4 TRAIN_TEST split
X_test = X_test.loc[:,X_train.columns]



#step 6: oulier treatement (on both TRAIN & TEST split)
X_train = replace_outlier(X_train, '3s')
X_test = replace_outlier(X_test, '3s')

#step 7: missing value imputation (on both TRAIN & TEST split)
X_train, X_test = impute_null_with_knn(X_train, X_test, 'distance')

# #step 8: feature selection (on both TRAIN & TEST split)
# X_train = BorutaShap_FS(X_train, y_train, 'shap')

#make test set have the SAME features as train set
X_test = X_test.loc[:,X_train.columns]

#step 9: balancing only on TRAIN split
X_train, y_train = sampling(X_train, y_train, 'SMOTEENN')



In [18]:
X_train_backup, y_train_backup, X_test_backup, y_test_backup = X_train, y_train, X_test, y_test 

In [19]:
X_train.shape

(1808, 446)

In [24]:

model = Sequential()
model.add(Dense(10, activation='softmax', input_dim=5))
model.add(Dense(2, activation='softmax'))
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 10)                60        
                                                                 
 dense_8 (Dense)             (None, 2)                 22        
                                                                 
Total params: 82
Trainable params: 82
Non-trainable params: 0
_________________________________________________________________


In [62]:
#NN
batch_size = [100]
epochs = [20,50]
activation = ['linear','softmax','relu'] 
dropout_rate = [0,0.1]
neurons = [1,1.5,2]

def create_model_NN(batch_size=100, epochs=50, activation='linear', dropout_rate=0.0, neurons=10):

    input_dim = X_train.shape[1]

    model = Sequential()
    model.add(Dense(round((input_dim+2)/2)*neurons, activation=activation, input_dim=input_dim))
    model.add(Dropout(dropout_rate))
    #model.add(Dense(round((neurons*5+2)/2), activation=activation))
    model.add(Dense(2, activation='softmax'))

    # Compile the model
    model.compile(optimizer='adam', 
                loss='categorical_crossentropy', 
                metrics=['accuracy'])
    return model

model = KerasClassifier(build_fn=create_model_NN, epochs=100, batch_size=10, verbose=0) #epochs and batch_size here does not matter?

param_grid = dict(batch_size=batch_size, epochs=epochs, activation=activation, dropout_rate=dropout_rate, neurons=neurons)

# prepare the y set: to_categorical cannot work with negative numbers
y_train = y_train.replace(-1, 0)
y_test = y_test.replace(-1, 0)

#one hot encode outputs
# y_train_c = to_categorical(y_train)
# y_test_c = to_categorical(y_test)

#scoring = {'recall': make_scorer(recall_score),'accuracy': make_scorer(accuracy_score)}


grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring='f1_micro')
grid_result = grid.fit(X_train, y_train)


## summarize results
print("Best scores: %f (+-%f) using %s" % (grid_result.best_score_, grid_result.cv_results_['std_test_score'][grid_result.best_index_], grid_result.best_params_))
# means = grid_result.cv_results_['mean_test_score']
# stds = grid_result.cv_results_['std_test_score']
# params = grid_result.cv_results_['params']


# for mean, stdev, param in zip(means, stds, params):
#     print("%f (+-%f) with: %r" % (mean, stdev, param))

# df_gs_result = pd.DataFrame({'mean_acc': means, 'std_acc': stds, 'params':str(params)}, index = [i for i in range(means.shape[0])])





2022-07-04 10:27:16.432473: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-04 10:27:16.432611: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-04 10:27:16.432951: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the ap

Best scores: 0.850077 (+-0.064101) using {'activation': 'relu', 'batch_size': 100, 'dropout_rate': 0, 'epochs': 20, 'neurons': 1.5}


In [70]:
grid_result.cv_results_

{'mean_fit_time': array([4.63455192, 4.70958273, 4.38518031, 6.69540405, 6.93011141,
        7.23192032, 4.96219365, 5.20900734, 4.57080936, 9.03615967,
        7.9234101 , 7.67354059, 5.38781802, 3.93163276, 3.93308798,
        6.13986437, 7.05814131, 7.24230075, 4.29467924, 4.20269307,
        4.29121463, 6.28310402, 6.80867942, 7.43251832, 4.71969358,
        5.02972309, 4.40504853, 6.20263624, 7.33701173, 7.64497503,
        7.01784603, 4.90734959, 5.90564823, 7.27509793, 7.43507775,
        7.71234854]),
 'std_fit_time': array([0.01933183, 0.13492623, 0.35759974, 0.15493665, 0.05029474,
        0.35734784, 0.18493977, 0.14254074, 0.06432375, 2.37828935,
        0.51837459, 0.07669132, 0.28020669, 0.13884094, 0.06223558,
        0.52042024, 0.06381036, 0.07739656, 0.43396981, 0.32392887,
        0.37187422, 0.0298944 , 0.25780565, 0.21568663, 0.11810268,
        0.53189519, 0.36163872, 0.25688472, 0.68442742, 0.41939422,
        0.06483841, 0.42563661, 0.65638278, 0.24007044, 0.192

In [63]:
cv_results = pd.DataFrame.from_dict(grid_result.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_activation,param_batch_size,param_dropout_rate,param_epochs,param_neurons,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,4.634552,0.019332,0.723153,0.0155,linear,100,0.0,20,1.0,"{'activation': 'linear', 'batch_size': 100, 'd...",0.180763,0.932007,0.887043,0.666604,0.344032,11
1,4.709583,0.134926,0.693258,0.032939,linear,100,0.0,20,1.5,"{'activation': 'linear', 'batch_size': 100, 'd...",0.016584,0.386401,0.574751,0.325912,0.23185,25
2,4.38518,0.3576,1.165578,0.66719,linear,100,0.0,20,2.0,"{'activation': 'linear', 'batch_size': 100, 'd...",0.212272,0.898839,0.978405,0.696505,0.343942,10
3,6.695404,0.154937,0.954393,0.104962,linear,100,0.0,50,1.0,"{'activation': 'linear', 'batch_size': 100, 'd...",0.006633,0.557214,0.948505,0.504117,0.386346,19
4,6.930111,0.050295,0.807331,0.015935,linear,100,0.0,50,1.5,"{'activation': 'linear', 'batch_size': 100, 'd...",0.597015,0.524046,0.995017,0.705359,0.206974,7
5,7.23192,0.357348,0.836249,0.242957,linear,100,0.0,50,2.0,"{'activation': 'linear', 'batch_size': 100, 'd...",0.364842,0.923715,0.262458,0.517005,0.290609,17
6,4.962194,0.18494,1.114835,0.081871,linear,100,0.1,20,1.0,"{'activation': 'linear', 'batch_size': 100, 'd...",0.034826,0.696517,0.722591,0.484645,0.318248,20
7,5.209007,0.142541,1.154961,0.046756,linear,100,0.1,20,1.5,"{'activation': 'linear', 'batch_size': 100, 'd...",0.038143,0.875622,0.815615,0.57646,0.381435,15
8,4.570809,0.064324,2.21478,0.080781,linear,100,0.1,20,2.0,"{'activation': 'linear', 'batch_size': 100, 'd...",0.026534,0.661692,0.202658,0.296961,0.267739,29
9,9.03616,2.378289,1.394671,0.370759,linear,100,0.1,50,1.0,"{'activation': 'linear', 'batch_size': 100, 'd...",0.140962,0.45937,0.679402,0.426578,0.221037,21


In [76]:
#use the best param to apply for test set

list_of_params = grid_result.cv_results_['params']
dict_scores = {}
train_cf_matrix=[]
train_accuracy=[]
train_f1=[]
train_precision=[]
train_recall=[]
train_auc=[]
train_type_1_error_FP=[]
train_type_2_error_FN=[]
train_log_loss=[]
train_cohen_kappa_score=[]

test_cf_matrix=[]
test_accuracy=[]
test_f1=[]
test_precision=[]
test_recall=[]
test_auc=[]
test_type_1_error_FP=[]
test_type_2_error_FN=[]
test_log_loss=[]
test_cohen_kappa_score=[]

hyperparam = []

for p in list_of_params:
    real_model = create_model_NN(**p)
    real_model.fit(X_train,y_train_c, epochs= grid_result.best_params_['epochs'], batch_size = grid_result.best_params_['batch_size'], verbose=0)   

    hyperparam.append(p)


    #create predicted y
    y_pred = real_model.predict(X_train)
    #Converting y-predicted to labels
    pred = list()
    for i in range(len(y_pred)):
        pred.append(np.argmax(y_pred[i]))

    #Converting y-test to labels
    test = list()
    for i in range(len(y_train_c)):
        test.append(np.argmax(y_train_c[i]))

    train_cf_matrix.append(confusion_matrix(test, pred))
    train_accuracy.append(accuracy_score(test, pred)) #these are library from scikit learn, this yield the same result as the one of keras mode.evaluate()
    train_f1.append(f1_score(test, pred)) 
    train_precision.append(precision_score(test, pred))
    train_recall.append(recall_score(test, pred))
    train_auc.append(roc_auc_score(test, pred))
    train_type_1_error_FP.append(cf_matrix[1][0])
    train_type_2_error_FN.append(cf_matrix[0][1])
    train_log_loss.append(log_loss(test, pred))
    train_cohen_kappa_score.append(cohen_kappa_score(test, pred))


    #create predicted y
    y_pred = real_model.predict(X_test)
    #Converting y-predicted to labels
    pred = list()
    for i in range(len(y_pred)):
        pred.append(np.argmax(y_pred[i]))

    #Converting y-test to labels
    test = list()
    for i in range(len(y_test_c)):
        test.append(np.argmax(y_test_c[i]))
        
    test_cf_matrix.append(confusion_matrix(test, pred))
    test_accuracy.append(accuracy_score(test, pred)) #these are library from scikit learn, this yield the same result as the one of keras mode.evaluate()
    test_f1.append(f1_score(test, pred)) 
    test_precision.append(precision_score(test, pred))
    test_recall.append(recall_score(test, pred))
    test_auc.append(roc_auc_score(test, pred))
    test_type_1_error_FP.append(cf_matrix[1][0])
    test_type_2_error_FN.append(cf_matrix[0][1])
    test_log_loss.append(log_loss(test, pred))
    test_cohen_kappa_score.append(cohen_kappa_score(test, pred))


#create a dict of list
dict_scores['hyperparam']=hyperparam
dict_scores['train_cf_matrix']=train_cf_matrix
dict_scores['train_accuracy']=train_accuracy
dict_scores['train_f1']=train_f1
dict_scores['train_precision']=train_precision
dict_scores['train_recall']=train_recall
dict_scores['train_auc']=train_auc
dict_scores['train_type_1_error_FP']=train_type_1_error_FP
dict_scores['train_type_2_error_FN']=train_type_2_error_FN
dict_scores['train_log_loss']=train_log_loss
dict_scores['train_cohen_kappa_score']=train_cohen_kappa_score
dict_scores['test_cf_matrix']=test_cf_matrix
dict_scores['test_accuracy']=test_accuracy
dict_scores['test_f1']=test_f1
dict_scores['test_precision']=test_precision
dict_scores['test_recall']=test_recall
dict_scores['test_auc']=test_auc
dict_scores['test_type_1_error_FP']=test_type_1_error_FP
dict_scores['test_type_2_error_FN']=test_type_2_error_FN
dict_scores['test_log_loss']=test_log_loss
dict_scores['test_cohen_kappa_score']=test_cohen_kappa_score


df_scores = pd.DataFrame.from_dict(dict_scores)
df_scores



Unnamed: 0,hyperparam,train_cf_matrix,train_accuracy,train_f1,train_precision,train_recall,train_auc,train_type_1_error_FP,train_type_2_error_FN,train_log_loss,...,test_cf_matrix,test_accuracy,test_f1,test_precision,test_recall,test_auc,test_type_1_error_FP,test_type_2_error_FN,test_log_loss,test_cohen_kappa_score
0,"{'activation': 'linear', 'batch_size': 100, 'd...","[[42, 251], [0, 21]]",0.200637,0.143345,0.077206,1.0,0.571672,12,110,27.609661,...,"[[103, 545], [0, 1160]]",0.698562,0.809773,0.680352,1.0,0.579475,12,110,10.411543,0.195178
1,"{'activation': 'linear', 'batch_size': 100, 'd...","[[104, 189], [5, 16]]",0.382166,0.141593,0.078049,0.761905,0.558427,12,110,21.339725,...,"[[282, 366], [38, 1122]]",0.776549,0.847432,0.754032,0.967241,0.701213,12,110,7.717897,0.453034
2,"{'activation': 'linear', 'batch_size': 100, 'd...","[[239, 54], [15, 6]]",0.780255,0.148148,0.1,0.285714,0.550707,12,110,7.589869,...,"[[562, 86], [441, 719]]",0.708518,0.731807,0.893168,0.619828,0.743556,12,110,10.06748,0.434571
3,"{'activation': 'linear', 'batch_size': 100, 'd...","[[264, 29], [19, 2]]",0.847134,0.076923,0.064516,0.095238,0.498131,12,110,5.279887,...,"[[617, 31], [753, 407]]",0.566372,0.509387,0.929224,0.350862,0.651511,12,110,14.977005,0.243217
4,"{'activation': 'linear', 'batch_size': 100, 'd...","[[85, 208], [3, 18]]",0.328025,0.145749,0.079646,0.857143,0.573623,12,110,23.209707,...,"[[190, 458], [10, 1150]]",0.74115,0.830925,0.715174,0.991379,0.642295,12,110,8.94055,0.335828
5,"{'activation': 'linear', 'batch_size': 100, 'd...","[[222, 71], [10, 11]]",0.742038,0.213592,0.134146,0.52381,0.640744,12,110,8.909865,...,"[[522, 126], [420, 740]]",0.698009,0.730503,0.854503,0.637931,0.721743,12,110,10.430461,0.403123
6,"{'activation': 'linear', 'batch_size': 100, 'd...","[[158, 135], [8, 13]]",0.544586,0.153846,0.087838,0.619048,0.579148,12,110,15.729787,...,"[[389, 259], [118, 1042]]",0.791482,0.84681,0.800922,0.898276,0.749292,12,110,7.202061,0.523734
7,"{'activation': 'linear', 'batch_size': 100, 'd...","[[167, 126], [5, 16]]",0.582803,0.196319,0.112676,0.761905,0.665935,12,110,14.40981,...,"[[410, 238], [80, 1080]]",0.824115,0.871671,0.819423,0.931034,0.781875,12,110,6.074956,0.595811
8,"{'activation': 'linear', 'batch_size': 100, 'd...","[[236, 57], [12, 9]]",0.780255,0.206897,0.136364,0.428571,0.617016,12,110,7.589876,...,"[[555, 93], [396, 764]]",0.729535,0.757561,0.891482,0.658621,0.757551,12,110,9.341558,0.466919
9,"{'activation': 'linear', 'batch_size': 100, 'd...","[[243, 50], [14, 7]]",0.796178,0.179487,0.122807,0.333333,0.581342,12,110,7.039878,...,"[[580, 68], [561, 599]]",0.652102,0.65572,0.898051,0.516379,0.705721,12,110,12.016009,0.352291


In [88]:
df_scores

Unnamed: 0,level_0,index,hyperparam,train_cf_matrix,train_accuracy,train_f1,train_precision,train_recall,train_auc,train_type_1_error_FP,...,test_cf_matrix,test_accuracy,test_f1,test_precision,test_recall,test_auc,test_type_1_error_FP,test_type_2_error_FN,test_log_loss,test_cohen_kappa_score
25,25,25,"{'activation': 'relu', 'batch_size': 100, 'dro...","[[274, 19], [21, 0]]",0.872611,0.0,0.0,0.0,0.467577,12,...,"[[641, 7], [455, 705]]",0.744469,0.753205,0.990169,0.607759,0.798478,12,110,8.82573,0.517932
11,11,11,"{'activation': 'linear', 'batch_size': 100, 'd...","[[265, 28], [20, 1]]",0.847134,0.04,0.034483,0.047619,0.476028,12,...,"[[626, 22], [873, 287]]",0.504978,0.390742,0.928803,0.247414,0.606732,12,110,17.097468,0.165498
3,3,3,"{'activation': 'linear', 'batch_size': 100, 'd...","[[264, 29], [19, 2]]",0.847134,0.076923,0.064516,0.095238,0.498131,12,...,"[[617, 31], [753, 407]]",0.566372,0.509387,0.929224,0.350862,0.651511,12,110,14.977005,0.243217
15,15,15,"{'activation': 'softmax', 'batch_size': 100, '...","[[53, 240], [7, 14]]",0.213376,0.101818,0.055118,0.666667,0.423777,12,...,"[[146, 502], [89, 1071]]",0.673119,0.783754,0.680865,0.923276,0.574292,12,110,11.290276,0.172904
31,31,31,"{'activation': 'relu', 'batch_size': 100, 'dro...","[[260, 33], [18, 3]]",0.83758,0.105263,0.083333,0.142857,0.515115,12,...,"[[627, 21], [288, 872]]",0.829093,0.849489,0.976484,0.751724,0.859658,12,110,5.902931,0.65936
26,26,26,"{'activation': 'relu', 'batch_size': 100, 'dro...","[[106, 187], [9, 12]]",0.375796,0.109091,0.060302,0.571429,0.466602,12,...,"[[353, 295], [0, 1160]]",0.836836,0.887189,0.797251,1.0,0.772377,12,110,5.635606,0.605596
32,32,32,"{'activation': 'relu', 'batch_size': 100, 'dro...","[[217, 76], [15, 6]]",0.710191,0.116505,0.073171,0.285714,0.513164,12,...,"[[585, 63], [18, 1142]]",0.955199,0.965751,0.947718,0.984483,0.94363,12,110,1.547396,0.90107
22,22,22,"{'activation': 'softmax', 'batch_size': 100, '...","[[23, 270], [3, 18]]",0.130573,0.116505,0.0625,0.857143,0.467821,12,...,"[[82, 566], [11, 1149]]",0.680863,0.799304,0.669971,0.990517,0.55853,12,110,11.022858,0.144344
17,17,17,"{'activation': 'softmax', 'batch_size': 100, '...","[[0, 293], [0, 21]]",0.066879,0.125373,0.066879,1.0,0.5,12,...,"[[0, 648], [0, 1160]]",0.641593,0.781671,0.641593,1.0,0.5,12,110,12.379229,0.0
19,19,19,"{'activation': 'softmax', 'batch_size': 100, '...","[[0, 293], [0, 21]]",0.066879,0.125373,0.066879,1.0,0.5,12,...,"[[0, 648], [0, 1160]]",0.641593,0.781671,0.641593,1.0,0.5,12,110,12.379229,0.0


In [89]:
df_scores.to_csv('df_scores.csv')

In [72]:
df = pd.DataFrame(columns=['a','b'])
df.loc[0,'a']

KeyError: 0

In [28]:
from sklearn.metrics import make_scorer

In [None]:
#without FS
cfm [[181 112]
 [ 11  10]]
acc 0.60828025477707

#with FS
Best scores: 0.644106 (+-0.286622) using {'activation': 'linear', 'batch_size': 100, 'dropout_rate': 0, 'epochs': 100, 'neurons': 10}
cfm [[182 111]
 [  8  13]]
acc 0.6210191082802548
recall_score 0.6190476190476191