In [48]:
import sys
from pathlib import Path

# Make the project's ``src`` folder importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path on every execution.
SRC_DIR = str(Path("../src").resolve())
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [49]:
from evaluation import *

from classifiers import BayesianKNNClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from classifiers import BayesianGaussianClassifier, MajorityVoteClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

import numpy as np
import pandas as pd
import joblib
from collections import Counter

In [50]:
# Load the balanced, normalised train / validation / test splits.
# NOTE(review): joblib.load unpickles these files, so only trusted artefacts
# should ever be placed under ../data.
X_train, y_train = joblib.load("../data/train_balanced.pkl")
X_val, y_val = joblib.load("../data/val_balanced.pkl")
X_test, y_test = joblib.load("../data/test_balanced.pkl")

# Pool the three splits into a single dataset (no train/val/test separation).
# The original note says this is done for clustering; the pooled arrays are
# stacked along the first axis (np.concatenate's default).
X = np.concatenate((X_train, X_val, X_test))
y = np.concatenate((y_train, y_val, y_test))

# Quick sanity check on the pooled data: overall shape and per-class counts.
print("Shape dos dados:", X.shape)
print("Distribuição das classes:", Counter(y))


Shape dos dados: (424, 44)
Distribuição das classes: Counter({np.int64(0): 212, np.int64(1): 212})


In [None]:
# Protocol of the repeated stratified cross-validation experiment.
SEED = 42      # seed for the generator that draws the per-run shuffle seeds
N_FOLDS = 10   # number of stratified folds per run
N_RUNS = 30    # number of times the whole k-fold procedure is repeated

# Legacy RandomState is kept on purpose: switching generators would change
# the sequence of shuffle seeds and therefore the reported numbers.
rng = np.random.RandomState(SEED)

# Per-fold metric dictionaries, one independent accumulator per model.
results_knn, results_logistic, results_majority = [], [], []


In [None]:
def evaluate(model, X_eval=None, y_eval=None):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_eval, y_eval : array-like, optional
        Features and true labels to score against. When omitted, the current
        fold's ``X_test_cv`` / ``y_test_cv`` notebook globals are used, so the
        previous one-argument call ``evaluate(model)`` keeps working.

    Returns
    -------
    dict
        ``accuracy``, ``precision``, ``recall`` and ``f1`` for the predictions.
        Precision, recall and F1 use scikit-learn's default averaging and are
        reported as 0 (instead of warning) when their denominator is zero.
    """
    if X_eval is None:
        X_eval = X_test_cv
    if y_eval is None:
        y_eval = y_test_cv

    y_pred = model.predict(X_eval)
    return {
        "accuracy": accuracy_score(y_eval, y_pred),
        "precision": precision_score(y_eval, y_pred, zero_division=0),
        "recall": recall_score(y_eval, y_pred, zero_division=0),
        "f1": f1_score(y_eval, y_pred, zero_division=0)
    }


# Restart the seed stream and the accumulators inside this cell so that
# re-running it alone reproduces the same splits and never stacks a second
# batch of metrics on top of the first one. On a fresh Run-All the seeds
# drawn are the same as before.
rng = np.random.RandomState(SEED)
results_knn, results_logistic, results_majority = [], [], []

for run in range(N_RUNS):
    # A different shuffle seed per run gives a different fold assignment
    # while staying reproducible through SEED.
    fold_seed = rng.randint(0, 10000)
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=fold_seed)

    for train_idx, test_idx in skf.split(X, y):
        X_train_cv, X_test_cv = X[train_idx], X[test_idx]
        y_train_cv, y_test_cv = y[train_idx], y[test_idx]

        # Hyper-parameters are chosen from the training part of the fold only.
        best_knn_params = select_best_bayesian_knn(X_train_cv, y_train_cv)
        best_logistic_params = select_best_logistic(X_train_cv, y_train_cv)

        # Instantiate the models with the selected hyper-parameters.
        # NOTE(review): if best_logistic_params ever contains "solver", the
        # call below raises a duplicate-keyword TypeError - confirm it cannot.
        knn_model = BayesianKNNClassifier(**best_knn_params)
        logistic_model = LogisticRegression(**best_logistic_params, solver="liblinear")
        bayes_model = BayesianGaussianClassifier()

        knn_model.fit(X_train_cv, y_train_cv)
        logistic_model.fit(X_train_cv, y_train_cv)
        bayes_model.fit(X_train_cv, y_train_cv)

        # Voting ensemble built on top of the three base models.
        # NOTE(review): whether MajorityVoteClassifier.fit refits the already
        # fitted base models is not visible from this notebook - confirm.
        majority_model = MajorityVoteClassifier([bayes_model, knn_model, logistic_model])
        majority_model.fit(X_train_cv, y_train_cv)

        # Only KNN, logistic regression and the ensemble are recorded; the
        # Gaussian Bayes model takes part solely as a voter.
        results_knn.append(evaluate(knn_model, X_test_cv, y_test_cv))
        results_logistic.append(evaluate(logistic_model, X_test_cv, y_test_cv))
        results_majority.append(evaluate(majority_model, X_test_cv, y_test_cv))


In [53]:
# Turn each model's list of per-fold metric dictionaries into a table:
# one row per evaluated fold, one column per metric key.
df_knn, df_logistic, df_majority = map(
    pd.DataFrame, (results_knn, results_logistic, results_majority)
)

# Show the first rows of the KNN results as the cell output.
df_knn.head()

Unnamed: 0,accuracy,precision,recall,f1
0,0.72093,0.909091,0.47619,0.625
1,0.813953,1.0,0.619048,0.764706
2,0.790698,1.0,0.590909,0.742857
3,0.837209,1.0,0.681818,0.810811
4,0.833333,1.0,0.666667,0.8
