## Pipeline Random Forest: modelo sem boosting, mais simples e com maior poder de generalização; bom para testes com quantidade limitada de dados

In [None]:
import pandas as pd
import numpy as np
from typing import Dict

from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
import category_encoders as ce
from james_stein_custom import _JamesSteinEncoder

from scipy import stats
from sklearn.metrics import roc_auc_score

def calcular_ks_scipy(y_true: np.ndarray, y_pred_proba: np.ndarray) -> float:
    """Return the two-sample Kolmogorov-Smirnov statistic between class scores.

    The scores of the rows labelled 1 are compared against the scores of the
    rows labelled 0 using ``scipy.stats.ks_2samp``.

    Args:
        y_true: Labels, one per score; rows equal to 1 and rows equal to 0
            form the two samples. Any array-like is accepted.
        y_pred_proba: Scores aligned positionally with ``y_true``. Any
            array-like is accepted.

    Returns:
        The KS statistic as a plain ``float``, or ``0.0`` when either of the
        two samples is empty.
    """
    # Coerce to arrays so boolean masking also works for lists/tuples and is
    # always positional, regardless of the container type passed in.
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred_proba)

    prob_bads = scores[labels == 1]
    prob_goods = scores[labels == 0]

    # ks_2samp cannot compare against an empty sample; report "no separation".
    if prob_bads.size == 0 or prob_goods.size == 0:
        return 0.0

    return float(stats.ks_2samp(prob_bads, prob_goods).statistic)

# Load the three CSV splits. Only the reads sit inside the `try`, because they
# are the only statements here that can raise FileNotFoundError.
try:
    treino_df = pd.read_csv("../data/dev/train.csv", index_col=0)
    validacao_df = pd.read_csv("../data/dev/val.csv", index_col=0)
    teste_oot_df = pd.read_csv("../data/dev/test.csv", index_col=0)
except FileNotFoundError as e:
    print(f"Problema com os dados: {e}")
    # `exit()` is injected by the `site` module for interactive sessions and
    # ends the process with status 0; raising SystemExit is always available
    # and reports the failure through a non-zero exit status.
    raise SystemExit(1) from e

# Train and validation rows are stacked (train first) into one development set;
# the search below does its own cross-validation splitting.
dev_df = pd.concat([treino_df, validacao_df], ignore_index=True)

TARGET = 'target'
# Every column except the target and these excluded columns is a model feature.
FEATURES = [col for col in dev_df.columns if col not in [
    TARGET, 'data_originacao', 'id_contrato', "custo_fn", "custo_fp"
]]

X_dev = dev_df[FEATURES]
y_dev = dev_df[TARGET]
X_oot = teste_oot_df[FEATURES]
y_oot = teste_oot_df[TARGET]

# Column groups, one per encoding strategy.
js_cols = ['estado', 'id_varejo']
ohe_cols = ['tipo_cliente']
# Every feature whose name ends in '_bin' is target-encoded.
target_enc_cols = list(filter(lambda name: name.endswith('_bin'), FEATURES))

# (step name, encoder, columns) triples consumed by the ColumnTransformer.
encoder_specs = [
    ('james_stein', _JamesSteinEncoder(min_samples=100), js_cols),
    (
        'one_hot',
        OneHotEncoder(min_frequency=5, sparse_output=False, handle_unknown='ignore'),
        ohe_cols,
    ),
    ('target_encoder', ce.TargetEncoder(), target_enc_cols),
]

# Columns not listed in any group above are forwarded unchanged.
preprocessor = ColumnTransformer(encoder_specs, remainder='passthrough')

# Encoding followed by the random forest, so the encoders are refit together
# with the model every time the pipeline is fit.
rf_model = RandomForestClassifier(n_jobs=-1, random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_model),
])

# Hyperparameter search space; the `classifier__` prefix routes each entry to
# the pipeline step named 'classifier'.
param_dist = dict(
    classifier__n_estimators=[100, 200, 300, 500],
    classifier__max_depth=[10, 20, 30, None],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
    classifier__max_features=['sqrt', 'log2'],
    classifier__class_weight=['balanced', 'balanced_subsample', None],
)

# NOTE(review): TimeSeriesSplit splits by row position, so this assumes the
# rows of X_dev are already in chronological order -- confirm.
tscv = TimeSeriesSplit(n_splits=5)

# RandomizedSearchCV samples 10 parameter combinations and scores each one by
# ROC AUC, using the TimeSeriesSplit above as the cross-validator.
random_search = RandomizedSearchCV(
    pipeline,
    param_dist,
    cv=tscv,
    scoring='roc_auc',
    n_iter=10,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

# Out-of-time metrics of the baseline (untuned) model, obtained in an earlier
# run. Kept in one place so the printed baseline and the computed gains cannot
# drift apart.
BASELINE_AUC = 0.6130
BASELINE_KS = 0.1720

# Run the randomized search on the development set.
random_search.fit(X_dev, y_dev)

print("Melhores parâmetros encontrados:")
print(random_search.best_params_)
print(f"\nMelhor AUC na validação cruzada: {random_search.best_score_:.4f}")

# Score the out-of-time set with the search's best estimator; column 1 of
# predict_proba is taken as the score of the positive class.
y_oot_pred_proba_tuned = random_search.predict_proba(X_oot)[:, 1]

auc_oot_tuned = roc_auc_score(y_oot, y_oot_pred_proba_tuned)
ks_oot_tuned = calcular_ks_scipy(y_oot.values, y_oot_pred_proba_tuned)

print("\n" + "---" * 10 + " RESULTADO FINAL - MODELO OTIMIZADO " + "---" * 10)
print(f">> Performance no Teste Out-of-Time: AUC = {auc_oot_tuned:.4f}, KS = {ks_oot_tuned:.4f}")

# Report the gain over the baseline figures defined at the top of this block.
print("\nComparação com o Baseline (não otimizado):")
print(f"AUC Baseline: {BASELINE_AUC:.4f} | KS Baseline: {BASELINE_KS:.4f}")
print(f"Ganho de AUC: {(auc_oot_tuned - BASELINE_AUC):.4f}")
print(f"Ganho de KS:  {(ks_oot_tuned - BASELINE_KS):.4f}")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
Melhores parâmetros encontrados:
{'classifier__n_estimators': 200, 'classifier__min_samples_split': 5, 'classifier__min_samples_leaf': 2, 'classifier__max_features': 'sqrt', 'classifier__max_depth': 10, 'classifier__class_weight': None}

Melhor AUC na validação cruzada: 0.6380

------------------------------ RESULTADO FINAL - MODELO OTIMIZADO ------------------------------
>> Performance no Teste Out-of-Time: AUC = 0.6551, KS = 0.2152

Comparação com o Baseline (não otimizado):
AUC Baseline: 0.6130 | KS Baseline: 0.1720
Ganho de AUC: 0.0421
Ganho de KS:  0.0432
