```mermaid
flowchart LR
    A[entrada X,y] --> B[Woodwork init]
    B --> C{overrides\nuser}
    C -->|force\ncategorical| D1[coluna tratada\ncomo categórica]
    C -->|force\nnumeric| D2[coluna tratada\ncomo numérica]
    D1 --> E[ID & alta-unicidade\nremovidas]
    D2 --> E
    E --> F{tipo}
    F -->|numérico| G[binning numérico: Optimal / Unsupervised]
    F -->|categórico| H[tratamento cat • encoding WoE ou freq]
    F -->|ignorado| I[(Ignora)]
    G --> J[refine_bins + checagens]
    H --> J
    J --> K[concat summaries]
    K --> L[pivot + PSI]
    L --> M[atributos finais: iv_, iv_dict_, schema_, …]
```

In [1]:
# Imports and notebook-wide setup
import os
import sys

# Append the parent of the current working directory to sys.path so that the
# local package (nasabinning) can be imported without being installed.
# This must run before the `nasabinning` import below.
sys.path.append(os.path.abspath(".."))

import pandas as pd
import numpy as np
from nasabinning.binning_engine import NASABinner
import matplotlib.pyplot as plt

import warnings
# Silence FutureWarning messages so they do not clutter the cell outputs
warnings.filterwarnings("ignore", category=FutureWarning)

# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [2]:
# Load the dataset: semicolon-delimited CSV stored under ../data
df = pd.read_csv(
    '../data/data_science_credit_not_time_col.csv',
    sep=';',
)

# Report (rows, columns) as plain text, then preview the first rows
print(df.shape)
display(df.head())

(67463, 18)


Unnamed: 0,client_id,pf_ou_pj,grade,sub_grade,qtd_restritivos,verificacao_fonte_de_renda,razao_credito_tomado_vs_renda_informada,patrimonio_total,qtd_atrasos_ultimos_2a,valor_total_recuperacoes_ultimos_2a,contas_distintas_com_atraso,qtd_consultas_ultimos_6m,qtd_linhas_credito_abertas,saldo_rotativo_total,limite_rotativo_total,valor_total_emprestimos_tomados,taxa_juros_media_emprestimos_tomados,target
0,75521,PF,B,C4,0,Not Verified,16.284758,176346.6267,1,2.498291,0,0,13,24246,6619,10000,11.135007,0
1,28124,PF,C,D3,0,Source Verified,15.412409,39833.921,0,2.377215,0,0,12,812,20885,3609,12.237563,0
2,8420,PF,F,D4,0,Source Verified,28.137619,91506.69105,0,4.316277,0,0,14,1843,26155,28276,12.545884,0
3,22553,PF,C,C3,0,Source Verified,18.04373,108286.5759,1,0.10702,0,0,7,13819,60214,11170,16.731201,0
4,62952,PF,C,D4,1,Source Verified,17.209886,44234.82545,1,1294.818751,0,3,13,1544,22579,16890,15.0083,0


In [9]:
# Single-feature experiment: only "sub_grade" is fed to the binner.
# Commented-out alternative that uses every feature column instead:
#X = df.drop(columns=['target','client_id'])
X = df[['sub_grade']]
y = df["target"]

binner = NASABinner(
    strategy="supervised",
    max_bins=10,
    min_event_rate_diff=0.01,
    monotonic='descending',
    # No type overrides are active in this run. Commented-out candidates for
    # force_categorical: "verificacao_fonte_de_renda", 'qtd_restritivos',
    # 'qtd_atrasos_ultimos_2a'
    force_categorical=[],
    force_numeric=[],
    use_optuna=True,
)
binner.fit(X, y)

# Global IV as text, followed by the inferred schema and the per-bin summary
print("IV global:", binner.iv_)
display(binner.describe_schema())
display(binner.bin_summary)

IV global: 0.000429810926257144


Unnamed: 0,col,tipo
0,sub_grade,categorical


Unnamed: 0,variable,bin,count,event,non_event,event_rate
0,sub_grade,1,3250,498,2752,0.153231
1,sub_grade,2,64213,9071,55142,0.141264


In [10]:
# Keep only the rows of the bin summary that belong to "sub_grade"
bin_table = binner.bin_summary.loc[
    lambda summary: summary['variable'] == 'sub_grade'
]

# Number of rows in the filtered summary, followed by the table itself
print(len(bin_table))
display(bin_table)

2


Unnamed: 0,variable,bin,count,event,non_event,event_rate
0,sub_grade,1,3250,498,2752,0.153231
1,sub_grade,2,64213,9071,55142,0.141264


In [5]:
# Inspect which original categories ended up in each bin of "sub_grade".
#
# In the recorded run get_bin_mapping raised ValueError (the column did not go
# through CategoricalBinning). The error is caught and shown here so the
# notebook still completes under Restart & Run All; the grouping below only
# runs when a mapping is actually available.
try:
    mapping = binner.get_bin_mapping("sub_grade")
except ValueError as err:
    mapping = None
    print(f"ValueError: {err}")
else:
    # mapping is expected to have the columns ["categoria", "bin"] — TODO confirm
    # Collect the list of categories assigned to each bin
    agrupado = mapping.groupby("bin")["categoria"].apply(list).reset_index()
    agrupado.columns = ["bin", "lista_de_categorias"]
    display(agrupado)

ValueError: A coluna 'sub_grade' não passou por CategoricalBinning.

In [None]:
# Redo the "rare-merge" step by hand to find out which categories were rare:
# compute each category's relative frequency and keep those below the
# rare_threshold of the binner fitted for "sub_grade".
s = X["sub_grade"].astype("category")
freq = s.value_counts(normalize=True)
rare_categories = (
    freq.loc[freq < binner._per_feature_binners["sub_grade"].rare_threshold]
    .index
    .tolist()
)

# Show which original categories fall under the rare threshold
print(
    "Categorias originais que viraram '_RARE_' antes da binagem:",
    rare_categories,
)