# Homogeneous Groups Demo

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from binary_performance_evaluator import BinaryPerformanceEvaluator

# === SETTINGS ===
# Location of the Lending Club CSV. The path is relative, so it resolves
# against the current working directory of the kernel.
FILE_PATH = '../../datasets/lending_club/accepted_2007_to_2018Q4.csv'
# Maximum number of rows to load; passed as `nrows` when the CSV is read.
NROWS = 200_000
# Name of the binary target column that the script adds to the dataframe.
TARGET = 'target_risco_credito'
# === LEITURA E LIMPEZA DE TIPOS MISTOS ===
def read_and_clean_csv_mixed_types(path, nrows=None, verbose=True):
    """Read a CSV and normalise columns that hold more than one Python type.

    Every column is scanned (ignoring missing values) for the Python types
    of its values. Columns with more than one type are reported and then
    converted to numeric; if that conversion raises, the column is cast to
    ``str`` instead.

    Note that the numeric conversion uses ``errors='coerce'``, so values
    that cannot be parsed as numbers become NaN rather than raising.

    Args:
        path: Path (or any source accepted by ``pandas.read_csv``) of the CSV.
        nrows: Optional cap on the number of rows to read.
        verbose: When true, print a report for each mixed-type column.

    Returns:
        A ``(df, mixed_type_columns)`` tuple, where ``mixed_type_columns``
        maps each mixed-type column name to the value counts of the types
        found in it (as observed before conversion).
    """
    df = pd.read_csv(path, low_memory=False, nrows=nrows)
    mixed_type_columns = {}
    for col in df.columns:
        # Count the distinct Python types among the non-missing values.
        types_in_col = df[col].dropna().apply(type).value_counts()
        if len(types_in_col) > 1:
            mixed_type_columns[col] = types_in_col
            if verbose:
                print(f"[!] Coluna '{col}' com múltiplos tipos:\n{types_in_col}")
    for col in mixed_type_columns:
        try:
            df[col] = pd.to_numeric(df[col], errors='coerce')
        except (ValueError, TypeError):
            # Only conversion failures trigger the string fallback; anything
            # else (e.g. KeyboardInterrupt) must propagate.
            df[col] = df[col].astype(str)
    return df, mixed_type_columns

# Load the raw sample; the mixed-type report is not needed afterwards.
df, _ = read_and_clean_csv_mixed_types(FILE_PATH, nrows=NROWS)

# === BINARY TARGET ===
# A loan is labelled 1 when its status is one of the risk statuses below.
status_de_risco = ["Charged Off", "Default", "Late (31-120 days)"]
df[TARGET] = df["loan_status"].isin(status_de_risco).astype(int)

# === DATES ===
# Unparseable dates become NaT and are dropped further down.
df["issue_d"] = pd.to_datetime(df["issue_d"], format="%b-%Y", errors="coerce")
df["safra"] = df["issue_d"].dt.to_period("M")  # monthly cohort of the issue date

# === VALID NUMERIC FEATURES ===
# "id", "index", "date" and "target" are column names created below for the
# evaluation frame; a raw numeric column with one of those names would
# collide with them (duplicate column names), so they are never features.
reserved_cols = [TARGET, "id", "index", "date", "target"]
numeric_cols = df.select_dtypes(include=[np.number]).columns.drop(reserved_cols, errors='ignore')
na_threshold = 0.3
valid_numeric_cols = numeric_cols[df[numeric_cols].isna().mean() < na_threshold]
feature_cols = valid_numeric_cols.tolist()
# NOTE(review): the recorded scores for this notebook are close to perfect,
# which suggests some numeric columns may carry post-outcome information
# (target leakage) — confirm the feature list before trusting the metrics.

# === DATAFRAME FOR EVALUATION ===
# Only rows without a usable issue date are dropped here; missing feature
# values are imputed after the split so the test set cannot influence them.
df_model = df[feature_cols + ["issue_d", TARGET]].dropna(subset=["issue_d"]).copy()
df_model.rename(columns={"issue_d": "date", TARGET: "target"}, inplace=True)
# Keep the original row label as an explicit identifier column.
df_model.reset_index(drop=False, inplace=True)
df_model.rename(columns={"index": "id"}, inplace=True)

# === TRAIN/TEST SPLIT USING INDICES ===
train_idx, test_idx = train_test_split(
    df_model.index,
    test_size=0.3,
    stratify=df_model["target"],
    random_state=42
)
df_train = df_model.loc[train_idx].copy()
df_test = df_model.loc[test_idx].copy()

# === IMPUTATION (fitted on the training split only) ===
# Medians come from the training rows and are applied to both splits, so no
# statistic of the test set leaks into preprocessing.
train_medians = df_train[feature_cols].median()
df_train[feature_cols] = df_train[feature_cols].fillna(train_medians)
df_test[feature_cols] = df_test[feature_cols].fillna(train_medians)

# === FEATURES ===
X_train = df_train[feature_cols]
y_train = df_train["target"]
X_test = df_test[feature_cols]
y_test = df_test["target"]

# === MODELS ===
# NOTE(review): LogisticRegression is fitted on unscaled features; the
# recorded output shows a convergence warning. Scaling the inputs would
# likely help — confirm the evaluator accepts a pipeline before changing it.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "DecisionTree": DecisionTreeClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": LGBMClassifier()
}

# Per-model metrics, keyed by model name.
results = {}
for name, model in models.items():
    print(f"\n🚀 Treinando modelo: {name}")
    model.fit(X_train, y_train)

    evaluator = BinaryPerformanceEvaluator(
        model=model,
        df_train=df_train,
        df_test=df_test,
        target_col="target",
        id_cols=["id"],
        date_col="date",
        homogeneous_group="auto",  # original author's note: uses Optimal Binning
    )

    print(f"📊 Avaliando modelo: {name}")
    metrics_df = evaluator.compute_metrics()
    results[name] = metrics_df

    radar_fig = evaluator.plot_group_radar()
    radar_fig.update_layout(title=f"Radar – {name}")
    radar_fig.show()

# Optional: stack all metrics into a single DataFrame indexed by model name.
df_results = pd.concat(results, names=["modelo"])
display(df_results)





=== LogisticRegression ===


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0      0.982     0.996     0.989     49237
           1      0.982     0.916     0.948     10763

    accuracy                          0.982     60000
   macro avg      0.982     0.956     0.969     60000
weighted avg      0.982     0.982     0.982     60000


=== DecisionTree ===
              precision    recall  f1-score   support

           0      0.994     0.993     0.993     49237
           1      0.969     0.971     0.970     10763

    accuracy                          0.989     60000
   macro avg      0.981     0.982     0.982     60000
weighted avg      0.989     0.989     0.989     60000


=== XGBoost ===
              precision    recall  f1-score   support

           0      0.995     0.999     0.997     49237
           1      0.997     0.978     0.987     10763

    accuracy                          0.996     60000
   macro avg      0.996     0.989     0.992     60000
weighted avg      0.996     0.996  

Exception in thread Thread-5 (_readerthread):
Traceback (most recent call last):
  File "c:\Users\JM\AppData\Local\anaconda3\envs\ENV_STONE\lib\threading.py", line 1016, in _bootstrap_inner
    self.run()
  File "c:\Users\JM\AppData\Local\anaconda3\envs\ENV_STONE\lib\site-packages\ipykernel\ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\JM\AppData\Local\anaconda3\envs\ENV_STONE\lib\threading.py", line 953, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\JM\AppData\Local\anaconda3\envs\ENV_STONE\lib\subprocess.py", line 1515, in _readerthread
    buffer.append(fh.read())
  File "c:\Users\JM\AppData\Local\anaconda3\envs\ENV_STONE\lib\codecs.py", line 322, in decode
    (result, consumed) = self._buffer_decode(data, self.errors, final)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x87 in position 103: invalid start byte


[LightGBM] [Info] Number of positive: 25113, number of negative: 114887
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.036876 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 10094
[LightGBM] [Info] Number of data points in the train set: 140000, number of used features: 69
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.179379 -> initscore=-1.520563
[LightGBM] [Info] Start training from score -1.520563
              precision    recall  f1-score   support

           0      0.994     0.999     0.997     49237
           1      0.996     0.974     0.985     10763

    accuracy                          0.995     60000
   macro avg      0.995     0.987     0.991     60000
weighted avg      0.995     0.995     0.995     60000



In [None]:
# Re-evaluate the fitted logistic regression on its own.
# The splits are named `df_train` / `df_test`, and the target column was
# renamed to "target" when the evaluation frame was built, so those are the
# names passed here (TARGET no longer exists as a column in the splits).
evaluator = BinaryPerformanceEvaluator(
    # Index the dict directly so a missing model raises KeyError instead of
    # silently handing None to the evaluator.
    model=models['LogisticRegression'],
    df_train=df_train,
    df_test=df_test,
    target_col='target',
    id_cols=['id'],
    date_col='date',
    homogeneous_group='auto',
)
evaluator.compute_metrics()

radar = evaluator.plot_group_radar()
radar.show()
