In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
accuracy_score,
f1_score,
roc_auc_score
)

X_train = pd.read_csv("final_data/X_train.csv").values
y_train = pd.read_csv("final_data/y_train.csv").values.ravel()
X_test = pd.read_csv("final_data/X_test.csv").values
y_test = pd.read_csv("final_data/y_test.csv").values.ravel()

# Validation dataset

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
_, X_val, _, y_val = train_test_split(X_train, y_train, train_size = 0.9)

n_train = X_train.shape[0]
n_val = X_val.shape[0]
n_test = X_test.shape[0]

print("Train size: {}\nValidation size: {}\nTest size: {}".format(n_train, n_val, n_test))

Train size: 435
Validation size: 44
Test size: 134


# Random Forest Classifier

## Validation 

In [19]:
param_grid = {
    'n_estimators': [50, 100, 150, 200],       # Number of trees in the forest
    'max_depth': [None, 10, 20],       # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],      # Minimum number of samples required to split an internal node}
}
model = RandomForestClassifier()
kf = KFold(n_splits=4, shuffle=True, random_state=23)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=kf)
grid_search.fit(X_val, y_val)

rf_best_params = grid_search.best_params_
rf_best_score = grid_search.best_score_
rf_best_estimator = grid_search.best_estimator_
print("Best Hyperparameters:")
print(rf_best_params)
print("Best score:")
print(rf_best_score)

Best Hyperparameters:
{'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 50}
Best score:
0.9090909090909092


## Train-Test

In [20]:
model = rf_best_estimator
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(accuracy)
print(f1)
print(auc)

0.8208955223880597
0.6842105263157895
0.7660455486542443


# Gaussian Naive Bayes

In [23]:
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(accuracy)
print(f1)
print(auc)

0.8134328358208955
0.6835443037974683
0.7670807453416149


# KNN

## Validation

In [36]:
params = {
    'n_neighbors': np.arange(1, 22, 2),           # Number of neighbors to consider
    'weights': ['uniform', 'distance'],        # Weight function used in prediction
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'metric': ['minkowski', 'mahalanobis', 'euclidean'],
    'p': [1, 2]                                # Power parameter for the Minkowski distance
}
model = KNeighborsClassifier()
kf = KFold(n_splits=4, shuffle=True, random_state=23)
grid_search = GridSearchCV(estimator=model, param_grid=params, cv=kf)
grid_search.fit(X_val, y_val)

knn_best_params = grid_search.best_params_
knn_best_score = grid_search.best_score_
knn_best_estimator = grid_search.best_estimator_
print("Best Hyperparameters:")
print(knn_best_params)
print("Best score:")
print(knn_best_score)

Traceback (most recent call last):
  File "C:\Users\João Paulo\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\model_selection\_validation.py", line 813, in _score
    scores = scorer(estimator, X_test, y_test)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\João Paulo\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\metrics\_scorer.py", line 527, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\João Paulo\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\sklearn\base.py", line 705, in score
    return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
                             ^^^^^^^^^^^^^^^
  File "C:\Users\João Paulo\AppData\Local\Packages\PythonS

Best Hyperparameters:
{'algorithm': 'auto', 'metric': 'minkowski', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
Best score:
0.8636363636363636


 0.81818182 0.86363636 0.86363636 0.86363636 0.86363636 0.86363636
 0.84090909 0.86363636 0.84090909 0.86363636 0.84090909 0.86363636
 0.84090909 0.86363636 0.81818182 0.86363636 0.81818182 0.86363636
 0.86363636 0.86363636 0.84090909 0.86363636 0.84090909 0.86363636
 0.84090909 0.86363636 0.79545455 0.86363636 0.79545455 0.86363636
 0.70454545 0.86363636 0.70454545 0.86363636 0.70454545 0.86363636
 0.70454545 0.86363636        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan 0.84090909 0.84090909
 0.84090909 0.84090909 0.81818182 0.86363636 0.81818182 0.8636

## Train-Test

In [33]:
model = knn_best_estimator
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(accuracy)
print(f1)
print(auc)

0.7686567164179104
0.5974025974025974
0.7085921325051759
