In [24]:
import pandas as pd
import numpy as np  
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

In [25]:
# For reproducibility of results
np.random.seed(0)

# Load the dataset if selected_features.csv does exist
if os.path.exists("selected_features.csv"):
    selected_features = pd.read_csv("selected_features.csv")
    print(selected_features.head())
else:
    print("The dataset is not present in the current directory.")

       2      7      8     11     14     15     17     18     21     22  ...  \
0 -0.684 -0.113  0.401 -0.251 -0.178  0.321  0.016 -0.003 -0.275 -0.162  ...   
1 -0.529 -0.066 -0.168 -0.205  0.020  0.031 -0.165 -0.026 -0.130  0.176  ...   
2 -0.240  0.031 -0.077 -0.036 -0.005  0.170  0.212  0.138 -0.300  0.689  ...   
3 -0.062 -0.143  0.068 -0.295 -0.188 -0.317 -0.113  0.018 -0.272 -0.531  ...   
4 -0.203 -0.168  0.008  0.166 -0.048 -0.291 -0.091  0.062  0.026 -0.036  ...   

      54     56     57     58     59     60     64     66     68  label  
0 -0.263 -0.075 -0.327 -0.198 -0.257 -0.205 -0.223 -0.136 -0.132      0  
1 -0.064  0.004 -0.305 -0.018 -0.230 -0.150 -0.151 -0.031 -0.097      0  
2 -0.149  0.102 -0.189 -0.112 -0.121  0.121 -0.172  0.033 -0.321      0  
3 -0.245 -0.153 -0.405 -0.102 -0.120 -0.141 -0.292 -0.078 -0.151      0  
4  0.043 -0.099 -0.188  0.042 -0.124 -0.097 -0.165 -0.112 -0.138      1  

[5 rows x 31 columns]


In [26]:
labels = selected_features.iloc[:, -1]
features = selected_features.iloc[:, :-1]

features, labels

(            2         7         8        11        14        15        17  \
 0   -0.684000 -0.113000  0.401000 -0.251000 -0.178000  0.321000  0.016000   
 1   -0.529000 -0.066000 -0.168000 -0.205000  0.020000  0.031000 -0.165000   
 2   -0.240000  0.031000 -0.077000 -0.036000 -0.005000  0.170000  0.212000   
 3   -0.062000 -0.143000  0.068000 -0.295000 -0.188000 -0.317000 -0.113000   
 4   -0.203000 -0.168000  0.008000  0.166000 -0.048000 -0.291000 -0.091000   
 ..        ...       ...       ...       ...       ...       ...       ...   
 391 -0.240811  0.160049 -0.172790  0.431347  0.430861 -0.305451 -0.119770   
 392 -0.179418 -0.000023 -0.260852  0.237564 -0.080075 -0.354038 -0.004051   
 393 -0.553610 -0.005744 -0.068708 -0.253044  0.017528 -0.070628 -0.172178   
 394  0.203276 -0.036268 -0.253699  0.382949  0.295444  0.049701 -0.245894   
 395 -0.221108  0.450663 -0.293762  0.042793  0.153844 -0.268618 -0.083691   
 
            18        21        22  ...        53        54   

## Min-Max

In [27]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
dataset_normalized = scaler.fit_transform(features)
dataset_normalized

array([[0.18304732, 0.49900892, 0.89076305, ..., 0.33932274, 0.34881517,
        0.52682455],
       [0.26365055, 0.52229931, 0.43373494, ..., 0.38908086, 0.44834123,
        0.54374094],
       [0.41393656, 0.5703667 , 0.50682731, ..., 0.37456807, 0.50900474,
        0.43547608],
       ...,
       [0.25085283, 0.55215857, 0.51348755, ..., 0.31353283, 0.41787867,
        0.28746834],
       [0.6444493 , 0.53703271, 0.3649004 , ..., 0.57430339, 0.55802938,
        0.67151474],
       [0.42376079, 0.77832656, 0.33272129, ..., 0.38708569, 0.42213365,
        0.63412131]])

# ANN using keras (no optimizer)

In [28]:
import tensorflow as tf
print(tf.__version__)


2.14.0


In [37]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD
from scikeras.wrappers import KerasClassifier
from sklearn.model_selection import GridSearchCV, KFold

In [38]:
# Function to create model with specified learning rate
def create_model(learning_rate=0.01, n_hidden_layers=1, n_neurons=10, activation='relu', **kwargs):
    model = Sequential()
    model.add(Dense(n_neurons, input_dim=features.shape[1], activation=activation))
    for _ in range(n_hidden_layers - 1):
        model.add(Dense(n_neurons, activation=activation))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = SGD(learning_rate=learning_rate)
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

In [39]:
# Define parameter grid for grid search
param_grid = {
    'epochs': [50, 100, 150],
    'learning_rate': [0.1, 0.3, 0.5],
    'n_hidden_layers': [1, 2, 3],
    'n_neurons': [10, 20, 30],
    'build_fn__activation': ['relu', 'tanh']  # corrected parameter name
}

In [42]:
class CustomKerasClassifier(KerasClassifier):
    def set_params(self, **params):
        return super().set_params(**params)

# Create model
model = CustomKerasClassifier(build_fn=create_model, verbose=0)

In [43]:
# Create grid search
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=KFold(10, random_state=0, shuffle=True), scoring='roc_auc')

# Fit grid search
grid_result = grid.fit(dataset_normalized, labels)

AttributeError: 'function' object has no attribute 'set_params'

In [21]:
# Display results
print("Best AUC: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

AttributeError: 'function' object has no attribute 'set_params'

# ANN (with optimizers - this is not valid)

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, log_loss, confusion_matrix
import numpy as np
import pandas as pd

In [6]:
# Initialize StratifiedKFold with 10 folds and a random seed for reproducibility
skf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)

# Define ANN topologies
topologies = [
    (1, 10, 'relu'),
    (2, 20, 'relu'),
    (3, 30, 'tanh'),
    (4, 40, 'tanh')
]

# Initialize lists to store metrics for each topology
conf_matrices = []
auc_scores = []
acc_scores = []
pre_scores = []
rec_scores = []
loss_scores = []

# Loop through each topology
for topology in topologies:
    # Initialize lists to store metrics for each fold
    conf_matrix_fold = []
    auc_scores_fold = []
    acc_scores_fold = []
    pre_scores_fold = []
    rec_scores_fold = []
    loss_scores_fold = []
    
    # Loop through each fold
    for train_index, test_index in skf.split(dataset_normalized, labels):
        # Split the dataset into training and testing sets
        X_train, X_test = dataset_normalized[train_index], dataset_normalized[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]
        
        # Define the ANN model
        model = MLPClassifier(hidden_layer_sizes=(topology[1],) * topology[0], activation=topology[2], random_state=0)
        
        # Fit the model and make predictions
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        
        # Calculate metrics and append to lists
        conf_matrix_fold.append(confusion_matrix(y_test, y_pred))
        auc_scores_fold.append(roc_auc_score(y_test, y_pred_proba))
        acc_scores_fold.append(accuracy_score(y_test, y_pred))
        pre_scores_fold.append(precision_score(y_test, y_pred))
        rec_scores_fold.append(recall_score(y_test, y_pred))
        loss_scores_fold.append(log_loss(y_test, y_pred_proba))
    
    # Append mean and standard deviation of metrics for each topology
    conf_matrices.append(np.mean(conf_matrix_fold, axis=0))
    auc_scores.append((np.mean(auc_scores_fold), np.std(auc_scores_fold)))
    acc_scores.append((np.mean(acc_scores_fold), np.std(acc_scores_fold)))
    pre_scores.append((np.mean(pre_scores_fold), np.std(pre_scores_fold)))
    rec_scores.append((np.mean(rec_scores_fold), np.std(rec_scores_fold)))
    loss_scores.append((np.mean(loss_scores_fold), np.std(loss_scores_fold)))

# Display metrics for each topology
for i, topology in enumerate(topologies):
    print(f"Topology: {topology[0]} hidden layers, {topology[1]} neurons per layer, Activation function: {topology[2]}")
    print(f"Confusion Matrix:\n{conf_matrices[i]}")
    print(f"AUC: Mean = {auc_scores[i][0]}, Std = {auc_scores[i][1]}")
    print(f"Accuracy: Mean = {acc_scores[i][0]}, Std = {acc_scores[i][1]}")
    print(f"Precision: Mean = {pre_scores[i][0]}, Std = {pre_scores[i][1]}")
    print(f"Recall: Mean = {rec_scores[i][0]}, Std = {rec_scores[i][1]}")
    print(f"Loss: Mean = {loss_scores[i][0]}, Std = {loss_scores[i][1]}")
    print("\n")



In [None]:
import tensorflow as tf
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
import numpy as np


def build_model(topology, learning_rate):
    model = tf.keras.Sequential()
    for i in range(topology[0]):
        if i == 0:
            model.add(tf.keras.layers.Dense(topology[i + 1], activation=topology[i + 2], input_shape=(X.shape[1],)))
        else:
            model.add(tf.keras.layers.Dense(topology[i + 1], activation=topology[i + 2]))
    model.add(tf.keras.layers.Dense(3, activation='softmax'))

    model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    return model


In [None]:
def run_hyperparameter_search(X, y, topology, learning_rates, epochs):
    resultados = []

    for learning_rate in learning_rates:
        for num_epochs in epochs:
            fold_accuracies = []

            model = build_model(topology, learning_rate)

            skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                model.fit(X_train, y_train, epochs=num_epochs, verbose=0)

                _, accuracy = model.evaluate(X_test, y_test, verbose=0)
                fold_accuracies.append(accuracy)

            mean_accuracy = np.mean(fold_accuracies)
            resultados.append((topology, learning_rate, num_epochs, mean_accuracy))

    return resultados