In [1]:
import pandas as pd
import numpy as np  
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

In [2]:
# For reproducibility of results
np.random.seed(0)

# Load the dataset if selected_features.csv does exist
if os.path.exists("selected_features.csv"):
    selected_features = pd.read_csv("selected_features.csv")
    print(selected_features.head())
else:
    print("The dataset is not present in the current directory.")

       2      7      8     11     14     15     17     18     21     22  ...  \
0 -0.684 -0.113  0.401 -0.251 -0.178  0.321  0.016 -0.003 -0.275 -0.162  ...   
1 -0.529 -0.066 -0.168 -0.205  0.020  0.031 -0.165 -0.026 -0.130  0.176  ...   
2 -0.240  0.031 -0.077 -0.036 -0.005  0.170  0.212  0.138 -0.300  0.689  ...   
3 -0.062 -0.143  0.068 -0.295 -0.188 -0.317 -0.113  0.018 -0.272 -0.531  ...   
4 -0.203 -0.168  0.008  0.166 -0.048 -0.291 -0.091  0.062  0.026 -0.036  ...   

      54     56     57     58     59     60     64     66     68  label  
0 -0.263 -0.075 -0.327 -0.198 -0.257 -0.205 -0.223 -0.136 -0.132      0  
1 -0.064  0.004 -0.305 -0.018 -0.230 -0.150 -0.151 -0.031 -0.097      0  
2 -0.149  0.102 -0.189 -0.112 -0.121  0.121 -0.172  0.033 -0.321      0  
3 -0.245 -0.153 -0.405 -0.102 -0.120 -0.141 -0.292 -0.078 -0.151      0  
4  0.043 -0.099 -0.188  0.042 -0.124 -0.097 -0.165 -0.112 -0.138      1  

[5 rows x 31 columns]


In [3]:
labels = selected_features.iloc[:, -1]
features = selected_features.iloc[:, :-1]

labels

0      0
1      0
2      0
3      0
4      1
      ..
391    1
392    1
393    1
394    1
395    1
Name: label, Length: 396, dtype: int64

## Min-Max

In [4]:
features

Unnamed: 0,2,7,8,11,14,15,17,18,21,22,...,53,54,56,57,58,59,60,64,66,68
0,-0.684000,-0.113000,0.401000,-0.251000,-0.178000,0.321000,0.016000,-0.003000,-0.275000,-0.162000,...,-0.334000,-0.263000,-0.075000,-0.327000,-0.198000,-0.257000,-0.205000,-0.223000,-0.136000,-0.132000
1,-0.529000,-0.066000,-0.168000,-0.205000,0.020000,0.031000,-0.165000,-0.026000,-0.130000,0.176000,...,0.216000,-0.064000,0.004000,-0.305000,-0.018000,-0.230000,-0.150000,-0.151000,-0.031000,-0.097000
2,-0.240000,0.031000,-0.077000,-0.036000,-0.005000,0.170000,0.212000,0.138000,-0.300000,0.689000,...,-0.027000,-0.149000,0.102000,-0.189000,-0.112000,-0.121000,0.121000,-0.172000,0.033000,-0.321000
3,-0.062000,-0.143000,0.068000,-0.295000,-0.188000,-0.317000,-0.113000,0.018000,-0.272000,-0.531000,...,0.389000,-0.245000,-0.153000,-0.405000,-0.102000,-0.120000,-0.141000,-0.292000,-0.078000,-0.151000
4,-0.203000,-0.168000,0.008000,0.166000,-0.048000,-0.291000,-0.091000,0.062000,0.026000,-0.036000,...,0.032000,0.043000,-0.099000,-0.188000,0.042000,-0.124000,-0.097000,-0.165000,-0.112000,-0.138000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
391,-0.240811,0.160049,-0.172790,0.431347,0.430861,-0.305451,-0.119770,-0.118006,0.160183,-0.901993,...,-0.306867,0.214024,0.228945,0.206469,-0.081721,0.104964,0.352103,0.319949,0.160157,-0.052955
392,-0.179418,-0.000023,-0.260852,0.237564,-0.080075,-0.354038,-0.004051,-0.026284,-0.041484,0.033320,...,-0.285406,0.022806,-0.073043,-0.207927,-0.091690,-0.157788,-0.137062,-0.001237,-0.015542,0.175237
393,-0.553610,-0.005744,-0.068708,-0.253044,0.017528,-0.070628,-0.172178,-0.100074,-0.106106,-0.846611,...,-0.366089,-0.098186,-0.101396,0.115763,-0.169348,-0.001617,0.531042,-0.260318,-0.063138,-0.627228
394,0.203276,-0.036268,-0.253699,0.382949,0.295444,0.049701,-0.245894,-0.095935,0.080918,-0.853218,...,-0.412803,0.245236,0.161970,0.013784,0.061287,-0.004070,0.369111,0.117017,0.084721,0.167364


In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
dataset_normalized = scaler.fit_transform(features)
dataset_normalized

array([[0.18304732, 0.49900892, 0.89076305, ..., 0.33932274, 0.34881517,
        0.52682455],
       [0.26365055, 0.52229931, 0.43373494, ..., 0.38908086, 0.44834123,
        0.54374094],
       [0.41393656, 0.5703667 , 0.50682731, ..., 0.37456807, 0.50900474,
        0.43547608],
       ...,
       [0.25085283, 0.55215857, 0.51348755, ..., 0.31353283, 0.41787867,
        0.28746834],
       [0.6444493 , 0.53703271, 0.3649004 , ..., 0.57430339, 0.55802938,
        0.67151474],
       [0.42376079, 0.77832656, 0.33272129, ..., 0.38708569, 0.42213365,
        0.63412131]])

# ANN DE SEBITAS

In [6]:
import numpy as np
from sklearn.model_selection import GridSearchCV, KFold
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasClassifier
import keras.backend as K
import tensorflow as tf


In [7]:
import tensorflow as tf

class CustomOptimizer(tf.keras.optimizers.legacy.Optimizer):
    def __init__(self, learning_rate=0.01, name="CustomOptimizer", **kwargs):
        super().__init__(name, **kwargs)
        self._set_hyper("learning_rate", kwargs.get("lr", learning_rate))

    def _create_slots(self, var_list):
        pass  # No additional slots are needed for our custom optimizer.

    def _resource_apply_dense(self, grad, var, apply_state=None):
        learning_rate = self._get_hyper("learning_rate")
        var.assign_sub(learning_rate * grad)
        return var

    def get_config(self):
        config = super(CustomOptimizer, self).get_config()
        config.update({
            "learning_rate": self._serialize_hyperparameter("learning_rate"),
        })
        return config


In [8]:
def create_model(learning_rate):
    # Define the model
    model = Sequential()
    model.add(Dense(12, input_dim=30, activation='relu'))  # Adjust input_dim to your dataset
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))  # Binary classification
    # Compile the model
    model.compile(loss='binary_crossentropy', optimizer=CustomOptimizer(learning_rate=learning_rate), metrics=['accuracy'])
    return model



In [9]:
model = KerasClassifier(build_fn=create_model, verbose=0, learning_rate = 0.01)


In [10]:

param_grid = {
    'learning_rate': [0.1, 0.3, 0.5],
    'epochs' : [1, 50, 100, 150, 200,250,300,350,400,450,500]
}


In [11]:
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=KFold(10, random_state=0, shuffle=True), scoring='roc_auc')
grid_result = grid.fit(dataset_normalized, labels)  # X and y are your data


  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)




  X, y = self._initialize(X, y)




  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y = self._initialize(X, y)
  X, y =

In [12]:
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.797496 using {'epochs': 200, 'learning_rate': 0.1}
0.537548 (0.060174) with: {'epochs': 1, 'learning_rate': 0.1}
0.530411 (0.146745) with: {'epochs': 1, 'learning_rate': 0.3}
0.650043 (0.097142) with: {'epochs': 1, 'learning_rate': 0.5}
0.762896 (0.081516) with: {'epochs': 50, 'learning_rate': 0.1}
0.786099 (0.086671) with: {'epochs': 50, 'learning_rate': 0.3}
0.714151 (0.138181) with: {'epochs': 50, 'learning_rate': 0.5}
0.765243 (0.068559) with: {'epochs': 100, 'learning_rate': 0.1}
0.774760 (0.086786) with: {'epochs': 100, 'learning_rate': 0.3}
0.770791 (0.090075) with: {'epochs': 100, 'learning_rate': 0.5}
0.787707 (0.106613) with: {'epochs': 150, 'learning_rate': 0.1}
0.764352 (0.085695) with: {'epochs': 150, 'learning_rate': 0.3}
0.735064 (0.086594) with: {'epochs': 150, 'learning_rate': 0.5}
0.797496 (0.074433) with: {'epochs': 200, 'learning_rate': 0.1}
0.781110 (0.101075) with: {'epochs': 200, 'learning_rate': 0.3}
0.756391 (0.073127) with: {'epochs': 200, 'learning_ra

In [24]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split
from keras.wrappers.scikit_learn import KerasClassifier
import numpy as np

# Define the input dimension based on your data
input_dim = 100

# Define a function to create a deep model with specified hyperparameters
def create_deep_model(learning_rate=0.01, n_hidden_layers=3, n_neurons=64, activation='relu'):
    model = Sequential()
    model.add(Dense(n_neurons, input_dim=input_dim, activation=activation))
    
    for _ in range(n_hidden_layers - 1):
        model.add(Dense(n_neurons, activation=activation))
    
    model.add(Dense(1, activation='sigmoid'))
    
    optimizer = SGD(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
    return model

# Specify the hyperparameter grid
param_grid = {
    'learning_rate': [0.1, 0.3, 0.5],
    'n_hidden_layers': [3, 4, 5],
    'n_neurons': [64, 128, 256]
}

# Create a KerasClassifier for use with GridSearchCV
model = KerasClassifier(build_fn=create_deep_model, verbose=0)

# Split your data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(dataset_normalized, labels, test_size=0.2)

# Create a GridSearchCV instance with cross-validation
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=KFold(n_splits=10, shuffle=True), scoring=make_scorer(roc_auc_score))

# Fit the grid search to your data
grid_result = grid.fit(X_train, y_train)

# Print the best hyperparameters and their corresponding score
print("Best Parameters: ", grid_result.best_params_)
print("Best ROC-AUC Score: ", grid_result.best_score_)


ModuleNotFoundError: No module named 'keras.wrappers'

# ANN using keras (no optimizer)

In [25]:
import keras
print(keras.__version__)


2.14.0


In [26]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV, KFold

In [27]:
# Function to create model with specified learning rate
def create_model(learning_rate=0.01, n_hidden_layers=1, n_neurons=10, activation='relu', **kwargs):
    model = Sequential()
    model.add(Dense(n_neurons, input_dim=features.shape[1], activation=activation))
    for _ in range(n_hidden_layers - 1):
        model.add(Dense(n_neurons, activation=activation))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = SGD(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [28]:
# Define parameter grid for grid search
param_grid = {
    'epochs': [50, 100, 150],
    'learning_rate': [0.1, 0.3, 0.5],
    'n_hidden_layers': [1, 2, 3],
    'n_neurons': [10, 20, 30],
    'build_fn__activation': ['relu', 'tanh']  # corrected parameter name
}

In [29]:
class CustomKerasClassifier(KerasClassifier):
    def set_params(self, **params):
        return super().set_params(**params)

# Create model
model = CustomKerasClassifier(build_fn=create_model, verbose=1)

In [30]:
# Create grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=KFold(10, random_state=0, shuffle=True), scoring='roc_auc')

# Fit grid search
grid_result = grid.fit(dataset_normalized, labels)

AttributeError: 'function' object has no attribute 'set_params'

In [31]:
# Display results
print("Best AUC: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best AUC: 0.797496 using {'epochs': 200, 'learning_rate': 0.1}
0.537548 (0.060174) with: {'epochs': 1, 'learning_rate': 0.1}
0.530411 (0.146745) with: {'epochs': 1, 'learning_rate': 0.3}
0.650043 (0.097142) with: {'epochs': 1, 'learning_rate': 0.5}
0.762896 (0.081516) with: {'epochs': 50, 'learning_rate': 0.1}
0.786099 (0.086671) with: {'epochs': 50, 'learning_rate': 0.3}
0.714151 (0.138181) with: {'epochs': 50, 'learning_rate': 0.5}
0.765243 (0.068559) with: {'epochs': 100, 'learning_rate': 0.1}
0.774760 (0.086786) with: {'epochs': 100, 'learning_rate': 0.3}
0.770791 (0.090075) with: {'epochs': 100, 'learning_rate': 0.5}
0.787707 (0.106613) with: {'epochs': 150, 'learning_rate': 0.1}
0.764352 (0.085695) with: {'epochs': 150, 'learning_rate': 0.3}
0.735064 (0.086594) with: {'epochs': 150, 'learning_rate': 0.5}
0.797496 (0.074433) with: {'epochs': 200, 'learning_rate': 0.1}
0.781110 (0.101075) with: {'epochs': 200, 'learning_rate': 0.3}
0.756391 (0.073127) with: {'epochs': 200, 'learnin

# ANN (with optimizers - this is not valid)

In [32]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, log_loss, confusion_matrix
import numpy as np
import pandas as pd

In [33]:
# Initialize StratifiedKFold with 10 folds and a random seed for reproducibility
skf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

# Define ANN topologies
topologies = [
    (1, 10, 'relu'),
    (2, 20, 'relu'),
    (3, 30, 'tanh'),
    (4, 40, 'tanh')
]

# Initialize lists to store metrics for each topology
conf_matrices = []
auc_scores = []
acc_scores = []
pre_scores = []
rec_scores = []
loss_scores = []

# Loop through each topology
for topology in topologies:
    # Initialize lists to store metrics for each fold
    conf_matrix_fold = []
    auc_scores_fold = []
    acc_scores_fold = []
    pre_scores_fold = []
    rec_scores_fold = []
    loss_scores_fold = []
    
    # Loop through each fold
    for train_index, test_index in skf.split(dataset_normalized, labels):
        # Split the dataset into training and testing sets
        X_train, X_test = dataset_normalized[train_index], dataset_normalized[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]
        
        # Define the ANN model
        model = MLPClassifier(hidden_layer_sizes=(topology[1],) * topology[0], activation=topology[2], random_state=0)
        
        # Fit the model and make predictions
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        
        # Calculate metrics and append to lists
        conf_matrix_fold.append(confusion_matrix(y_test, y_pred))
        auc_scores_fold.append(roc_auc_score(y_test, y_pred_proba))
        acc_scores_fold.append(accuracy_score(y_test, y_pred))
        pre_scores_fold.append(precision_score(y_test, y_pred))
        rec_scores_fold.append(recall_score(y_test, y_pred))
        loss_scores_fold.append(log_loss(y_test, y_pred_proba))
    
    # Append mean and standard deviation of metrics for each topology
    conf_matrices.append(np.mean(conf_matrix_fold, axis=0))
    auc_scores.append((np.mean(auc_scores_fold), np.std(auc_scores_fold)))
    acc_scores.append((np.mean(acc_scores_fold), np.std(acc_scores_fold)))
    pre_scores.append((np.mean(pre_scores_fold), np.std(pre_scores_fold)))
    rec_scores.append((np.mean(rec_scores_fold), np.std(rec_scores_fold)))
    loss_scores.append((np.mean(loss_scores_fold), np.std(loss_scores_fold)))

# Display metrics for each topology
for i, topology in enumerate(topologies):
    print(f"Topology: {topology[0]} hidden layers, {topology[1]} neurons per layer, Activation function: {topology[2]}")
    print(f"Confusion Matrix:\n{conf_matrices[i]}")
    print(f"AUC: Mean = {auc_scores[i][0]}, Std = {auc_scores[i][1]}")
    print(f"Accuracy: Mean = {acc_scores[i][0]}, Std = {acc_scores[i][1]}")
    print(f"Precision: Mean = {pre_scores[i][0]}, Std = {pre_scores[i][1]}")
    print(f"Recall: Mean = {rec_scores[i][0]}, Std = {rec_scores[i][1]}")
    print(f"Loss: Mean = {loss_scores[i][0]}, Std = {loss_scores[i][1]}")
    print("\n")



Topology: 1 hidden layers, 10 neurons per layer, Activation function: relu
Confusion Matrix:
[[12.3  7.1]
 [ 5.  15.2]]
AUC: Mean = 0.759515037593985, Std = 0.08152145023442492
Accuracy: Mean = 0.6941025641025641, Std = 0.0728718444937488
Precision: Mean = 0.6879810940394464, Std = 0.08080745630990398
Recall: Mean = 0.7535714285714286, Std = 0.09563755512409014
Loss: Mean = 0.5908325061769848, Std = 0.049209518978851396


Topology: 2 hidden layers, 20 neurons per layer, Activation function: relu
Confusion Matrix:
[[13.4  6. ]
 [ 5.1 15.1]]
AUC: Mean = 0.7762149122807017, Std = 0.07829145327057657
Accuracy: Mean = 0.7192307692307693, Std = 0.07661852102872081
Precision: Mean = 0.7266514739404766, Std = 0.09652886793894487
Recall: Mean = 0.748095238095238, Std = 0.07771393557344093
Loss: Mean = 0.580982283564691, Std = 0.09728851978142875


Topology: 3 hidden layers, 30 neurons per layer, Activation function: tanh
Confusion Matrix:
[[12.4  7. ]
 [ 5.5 14.7]]
AUC: Mean = 0.752249373433584

