In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import (
    RandomForestClassifier,
    StackingClassifier,
    VotingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

from evaluation import get_cm_sklearn, print_metrics
from preprocessing import TrainingPreProcessor

RANDOM_STATE = 0

train_set = pd.read_csv("input/train.csv")

training_preprocessor = TrainingPreProcessor()
training_preprocessor.fit(train_set, ignore_columns=["CLIENTNUM"])

X, y = training_preprocessor.transform(train_set)
class_weight = training_preprocessor.class_weight

## Recuperação dos melhores modelos encontrados por gridsearch

In [12]:
logistic_regression_model = LogisticRegression(
    random_state=RANDOM_STATE,
    class_weight=class_weight,
    solver="saga",
    **{"C": 0.1, "max_iter": 1000, "penalty": "l2", "tol": 0.0001}
)

svc_model = SVC(
    random_state=RANDOM_STATE,
    class_weight=class_weight,
    **{"C": 1.0, "kernel": "rbf", "max_iter": 10000, "tol": 0.001}
)

rf_model = RandomForestClassifier(
    random_state=RANDOM_STATE,
    class_weight=class_weight,
    **{
        "criterion": "entropy",
        "max_depth": 7,
        "max_features": "sqrt",
        "n_estimators": 50,
    }
)

kfold = StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE)

## Treinando e avaliando um ensemble de votação

In [13]:
voting_ensemble = VotingClassifier(
    estimators=[
        ("logistic_regression", logistic_regression_model),
        ("svc", svc_model),
        ("random_forest", rf_model),
    ],
    voting="hard",
)

In [15]:
metrics = {
    'balanced_accuracy':[],
    'accuracy':[],
    'f1':[],
    'roc_auc':[],
}
for train_index, val_index in kfold.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    voting_ensemble.fit(X_train, y_train)
    y_val_pred = voting_ensemble.predict(X_val)
    metrics['balanced_accuracy'].append(balanced_accuracy_score(y_val, y_val_pred))
    metrics['accuracy'].append(accuracy_score(y_val, y_val_pred))
    metrics['f1'].append(f1_score(y_val, y_val_pred))
    metrics['roc_auc'].append(roc_auc_score(y_val, y_val_pred))
for metric, values in metrics.items():
    print(f"{metric}: {np.array(metrics[metric]).mean():.4f} +- {np.array(metrics[metric]).std():.4f}")
    

balanced_accuracy: 0.9154 +- 0.0060
accuracy: 0.9154 +- 0.0060
f1: 0.9481 +- 0.0037
roc_auc: 0.9052 +- 0.0136


## Treinando e avaliando (sobre validação) um ensemle de empilhamento

In [16]:
stacking_ensemble = StackingClassifier(
    estimators=[
        ("logistic_regression", logistic_regression_model),
        ("svc", svc_model),
        ("random_forest", rf_model),
    ],
    final_estimator=LogisticRegression(random_state=RANDOM_STATE),
    cv=StratifiedKFold(5, shuffle=True, random_state=RANDOM_STATE),
)

In [17]:
metrics = {
    'balanced_accuracy':[],
    'accuracy':[],
    'f1':[],
    'roc_auc':[],
}
for train_index, val_index in kfold.split(X, y):
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    stacking_ensemble.fit(X_train, y_train)
    y_val_pred = stacking_ensemble.predict(X_val)
    metrics['balanced_accuracy'].append(balanced_accuracy_score(y_val, y_val_pred))
    metrics['accuracy'].append(accuracy_score(y_val, y_val_pred))
    metrics['f1'].append(f1_score(y_val, y_val_pred))
    metrics['roc_auc'].append(roc_auc_score(y_val, y_val_pred))
for metric, values in metrics.items():
    print(f"{metric}: {np.array(metrics[metric]).mean():.4f} +- {np.array(metrics[metric]).std():.4f}")

balanced_accuracy: 0.9424 +- 0.0021
accuracy: 0.9424 +- 0.0021
f1: 0.9658 +- 0.0014
roc_auc: 0.8851 +- 0.0057
