In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
accuracy_score,
f1_score,
roc_auc_score
)

models_results = pd.DataFrame()

X_train = pd.read_csv("final_data/X_train.csv").values
y_train = pd.read_csv("final_data/y_train.csv").values.ravel()
X_test = pd.read_csv("final_data/X_test.csv").values
y_test = pd.read_csv("final_data/y_test.csv").values.ravel()

# Validation dataset

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
_, X_val, _, y_val = train_test_split(X_train, y_train, train_size = 0.9)

n_train = X_train.shape[0]
n_val = X_val.shape[0]
n_test = X_test.shape[0]

print("Train size: {}\nValidation size: {}\nTest size: {}".format(n_train, n_val, n_test))

Train size: 757
Validation size: 76
Test size: 134


# Random Forest Classifier

## Validation 

In [30]:
param_grid = {
    'n_estimators': [50, 100, 150, 200],       # Number of trees in the forest
    'max_depth': [None, 10, 20],       # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],      # Minimum number of samples required to split an internal node}
}
model = RandomForestClassifier()
kf = KFold(n_splits=4, shuffle=True, random_state=23)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=kf)
grid_search.fit(X_val, y_val)

rf_best_params = grid_search.best_params_
rf_best_score = grid_search.best_score_
rf_best_estimator = grid_search.best_estimator_
print("Best Hyperparameters:")
print(rf_best_params)
print("Best score:")
print(rf_best_score)

Best Hyperparameters:
{'max_depth': None, 'min_samples_split': 10, 'n_estimators': 50}
Best score:
0.75


## Train-Test

In [32]:
model = rf_best_estimator
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(accuracy)
print(f1)
print(auc)

models_results["RFC"] = [accuracy, f1, auc]

0.8208955223880597
0.7
0.7789855072463767


# Gaussian Naive Bayes

In [33]:
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(accuracy)
print(f1)
print(auc)

models_results["GNB"] = [accuracy, f1, auc]

0.8059701492537313
0.6666666666666666
0.755175983436853


# KNN

## Validation

In [34]:
params = {
    'n_neighbors': np.arange(1, 42, 2),           # Number of neighbors to consider
    'weights': ['uniform', 'distance'],        # Weight function used in prediction
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1, 2]                                # Power parameter for the Minkowski distance
}
model = KNeighborsClassifier()
kf = KFold(n_splits=4, shuffle=True, random_state=23)
grid_search = GridSearchCV(estimator=model, param_grid=params, cv=kf)
grid_search.fit(X_val, y_val)

knn_best_params = grid_search.best_params_
knn_best_score = grid_search.best_score_
knn_best_estimator = grid_search.best_estimator_
print("Best Hyperparameters:")
print(knn_best_params)
print("Best score:")
print(knn_best_score)

Best Hyperparameters:
{'algorithm': 'auto', 'n_neighbors': 17, 'p': 1, 'weights': 'uniform'}
Best score:
0.8026315789473684


## Train-Test

In [35]:
model = knn_best_estimator
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(accuracy)
print(f1)
print(auc)

models_results["KNN"] = [accuracy, f1, auc]

0.7910447761194029
0.6216216216216217
0.724896480331263


# MLP

## Validation

In [36]:
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

params = {
    'hidden_layer_sizes': [(20,), (35,), (50,), (20, 20)],   # Number of units in the hidden layers
    'activation': ['relu', 'tanh', 'logistic'],                    # Activation function for the hidden layers                              # L2 regularization parameter
    'learning_rate_init': [0.005, 0.01, 0.05],                      # Initial learning rate
    'max_iter': [500]
}
model = MLPClassifier()
kf = KFold(n_splits=4, shuffle=True, random_state=23)
grid_search = GridSearchCV(estimator=model, param_grid=params, cv=kf)
grid_search.fit(X_val, y_val)

mlp_best_params = grid_search.best_params_
mlp_best_score = grid_search.best_score_
mlp_best_estimator = grid_search.best_estimator_
print("Best Hyperparameters:")
print(mlp_best_params)
print("Best score:")
print(mlp_best_score)

Best Hyperparameters:
{'activation': 'logistic', 'hidden_layer_sizes': (20,), 'learning_rate_init': 0.005, 'max_iter': 500}
Best score:
0.75


## Train-Test

In [37]:
model = mlp_best_estimator
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(accuracy)
print(f1)
print(auc)

models_results["MLP"] = [accuracy, f1, auc]

0.8059701492537313
0.6388888888888888
0.7357660455486543


In [38]:
models_results.index = ["accuracy", "f1", "auc"]
print(models_results)
models_results.to_csv("final_data/models_results.csv", index = False)

               RFC       GNB       KNN       MLP
accuracy  0.820896  0.805970  0.791045  0.805970
f1        0.700000  0.666667  0.621622  0.638889
auc       0.778986  0.755176  0.724896  0.735766
