# Homogeneous Groups Demo

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from binary_performance_evaluator import BinaryPerformanceEvaluator

# --------------------------------------------------
# 1. CONFIGURAÇÕES
# --------------------------------------------------
FILE_PATH = "../../datasets/lending_club/accepted_2007_to_2018Q4.csv"
NROWS     = 1_000
TARGET_RAW = "loan_status"          # coluna original
TARGET     = "target"               # nome final binário

# --------------------------------------------------
# 2. LEITURA E CONVERSÃO DE TIPOS MISTOS
# --------------------------------------------------
def read_and_clean_csv_mixed_types(path, nrows=None, verbose=True):
    df = pd.read_csv(path, low_memory=False, nrows=nrows)

    # identificar colunas com tipos mistos
    for col in df.columns:
        types = df[col].dropna().map(type).value_counts()
        if len(types) > 1 and verbose:
            print(f"[!] '{col}' com múltiplos tipos: {dict(types)}")

        # tentativa de conversão automática p/ numérico
        if len(types) > 1:
            try:
                df[col] = pd.to_numeric(df[col], errors="coerce")
            except Exception:
                df[col] = df[col].astype(str)

    return df

df = read_and_clean_csv_mixed_types(FILE_PATH, nrows=NROWS)

# drop de colunas não usadas (caso existam)
df.drop(columns=[c for c in ["member_id"] if c in df.columns], inplace=True)

# --------------------------------------------------
# 3. TARGET BINÁRIO
# --------------------------------------------------
bad_status = ["Charged Off", "Default", "Late (31-120 days)"]
df[TARGET] = df[TARGET_RAW].isin(bad_status).astype(int)

# --------------------------------------------------
# 4. DATA E ID
# --------------------------------------------------
df["date"] = pd.to_datetime(df["issue_d"], format="%b-%Y", errors="coerce")
df.drop(columns=["issue_d", TARGET_RAW], inplace=True)

df.reset_index(drop=False, inplace=True)   # index ➜ nova coluna

# --------------------------------------------------
# 5. FEATURES NUMÉRICAS + IMPUTAÇÃO
# --------------------------------------------------
numeric_cols = df.select_dtypes(include=[np.number]).columns.drop(["id", TARGET])
na_threshold = 0.30
valid_cols = numeric_cols[df[numeric_cols].isna().mean() < na_threshold].tolist()
valid_cols.remove('index')

df[valid_cols] = df[valid_cols].fillna(df[valid_cols].median())

# --------------------------------------------------
# 6. DATAFRAME FINAL
# --------------------------------------------------
df_model = df[["id", "date", TARGET] + valid_cols].dropna().copy()

In [4]:
# --------------------------------------------------
# 7. SPLIT TREINO / TESTE
# --------------------------------------------------
train_idx, test_idx = train_test_split(
    df_model.index,
    test_size=0.30,
    stratify=df_model[TARGET],
    random_state=42,
)

df_train = df_model.loc[train_idx].copy()
df_test  = df_model.loc[test_idx].copy()

X_train, y_train = df_train[valid_cols], df_train[TARGET]
X_test , y_test  = df_test[valid_cols] , df_test[TARGET]


In [11]:
# --------------------------------------------------
# 8. MODELOS COM PARÂMETROS APRIMORADOS
# --------------------------------------------------
models = {
    "LogisticRegression": LogisticRegression(
        max_iter=2000,
        solver='lbfgs',       # bom para datasets médios
        class_weight='balanced',  # trata desbalanceamento
        C=1.0                 # regularização inversa (quanto menor, mais regularizado)
    ),
    
    "DecisionTree": DecisionTreeClassifier(
        max_depth=6,          # evita overfitting
        min_samples_leaf=50,  # tamanho mínimo das folhas
        class_weight='balanced',
        random_state=42
    ),

    "XGBoost": XGBClassifier(
        n_estimators=300,     
        learning_rate=0.05,   
        max_depth=4,          
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=1.0,     # pode ser ajustado com base no desbalanceamento
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=42,
        n_jobs=-1
    ),

    "LightGBM": LGBMClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        num_leaves=15,            # normalmente ≈ 2^max_depth
        min_child_samples=50,
        subsample=0.8,
        colsample_bytree=0.8,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    )
}


`use_label_encoder` is deprecated in 1.7.0.



In [None]:
# --------------------------------------------------
# 9. LOOP DE TREINO + AVALIAÇÃO COM BinaryPerformanceEvaluator
# --------------------------------------------------
results = {}
for name, model in models.items():
    print(f"\n🔹  Treinando {name}")
    model.fit(X_train, y_train)

    evaluator = BinaryPerformanceEvaluator(
        model=model,
        df_train=df_train,
        df_test=df_test,
        target_col=TARGET,
        id_cols=["id"],
        date_col="date",
        homogeneous_group="auto",
    )

    metrics = evaluator.compute_metrics()
    results[name] = metrics

    radar = evaluator.plot_group_radar()
    radar.update_layout(title=f"Radar – {name}")
    radar.show()

# (opcional) métricas comparativas
#all_metrics = pd.concat(results, names=["model"])
#display(all_metrics)

In [None]:
results = {}
for name, model in models.items():
    print(f"\n🚀 Treinando modelo: {name}")
    model.fit(X_train, y_train)

    evaluator = BinaryPerformanceEvaluator(
        model=model,
        df_train=df_train,
        df_test=df_test,
        target_col="target",
        id_cols=["id"],
        date_col="date",
        homogeneous_group="auto",  # usa Optimal Binning
    )

    print(f"📊 Avaliando modelo: {name}")
    metrics_df = evaluator.compute_metrics()
    results[name] = metrics_df

    radar_fig = evaluator.plot_group_radar()
    radar_fig.update_layout(title=f"Radar – {name}")
    radar_fig.show()

# Opcional: juntar todas as métricas em um DataFrame
df_results = pd.concat(results, names=["modelo"])
display(df_results)

In [None]:
evaluator = BinaryPerformanceEvaluator(
    model=models.get('LogisticRegression'),
    df_train=train,
    df_test=test,
    target_col=TARGET,
    id_cols=['id'],
    date_col='date',
    homogeneous_group='auto',
)
evaluator.compute_metrics()

radar = evaluator.plot_group_radar()
radar.show()
