```mermaid
flowchart LR
    A[entrada X,y] --> B[Woodwork init]
    B --> C{overrides\nuser}
    C -->|force\ncategorical| D1
    C -->|force\nnumeric| D2
    D1 --> E[ID & alta-unicidade\nremovidas]
    D2 --> E
    E --> F{tipo}
    F -->|numérico| G[binning numérico: Optimal / Unsupervised]
    F -->|categórico| H[tratamento cat • encoding WoE ou freq]
    F -->|ignorado| I[(Ignora)]
    G --> J[refine_bins + checagens]
    H --> J
    J --> K[concat summaries]
    K --> L[pivot + PSI]
    L --> M[atributos finais: iv_, iv_dict_, schema_, …]
```

In [1]:
# imports
import os
import sys

# Append the absolute path of the parent directory ("..") to sys.path so the
# local package (imported below as `nasabinning`) can be found.
# NOTE(review): assumes the notebook is run from a folder directly under the project root — confirm.
sys.path.append(os.path.abspath(".."))

import pandas as pd
import numpy as np
from nasabinning.binning_engine import NASABinner
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

%matplotlib inline

In [None]:
# Load the dataset: a semicolon-delimited CSV stored under ../data
df = pd.read_csv(
    '../data/data_science_credit_not_time_col.csv',
    sep=';',
)

# Print the (rows, columns) shape first, then render the leading rows
print(df.shape)
display(df.head())

(67463, 18)


Unnamed: 0,client_id,pf_ou_pj,grade,sub_grade,qtd_restritivos,verificacao_fonte_de_renda,razao_credito_tomado_vs_renda_informada,patrimonio_total,qtd_atrasos_ultimos_2a,valor_total_recuperacoes_ultimos_2a,contas_distintas_com_atraso,qtd_consultas_ultimos_6m,qtd_linhas_credito_abertas,saldo_rotativo_total,limite_rotativo_total,valor_total_emprestimos_tomados,taxa_juros_media_emprestimos_tomados,target
0,75521,PF,B,C4,0,Not Verified,16.284758,176346.6267,1,2.498291,0,0,13,24246,6619,10000,11.135007,0
1,28124,PF,C,D3,0,Source Verified,15.412409,39833.921,0,2.377215,0,0,12,812,20885,3609,12.237563,0
2,8420,PF,F,D4,0,Source Verified,28.137619,91506.69105,0,4.316277,0,0,14,1843,26155,28276,12.545884,0
3,22553,PF,C,C3,0,Source Verified,18.04373,108286.5759,1,0.10702,0,0,7,13819,60214,11170,16.731201,0
4,62952,PF,C,D4,1,Source Verified,17.209886,44234.82545,1,1294.818751,0,3,13,1544,22579,16890,15.0083,0


In [3]:
# Show the column labels of the loaded DataFrame
df.columns

Index(['client_id', 'pf_ou_pj', 'grade', 'sub_grade', 'qtd_restritivos',
       'verificacao_fonte_de_renda', 'razao_credito_tomado_vs_renda_informada',
       'patrimonio_total', 'qtd_atrasos_ultimos_2a',
       'valor_total_recuperacoes_ultimos_2a', 'contas_distintas_com_atraso',
       'qtd_consultas_ultimos_6m', 'qtd_linhas_credito_abertas',
       'saldo_rotativo_total', 'limite_rotativo_total',
       'valor_total_emprestimos_tomados',
       'taxa_juros_media_emprestimos_tomados', 'target'],
      dtype='object')

In [6]:
# Inputs: the three columns under study as the feature frame, and the
# "target" column as the response.
X = df.loc[:, [
    "verificacao_fonte_de_renda",
    "qtd_atrasos_ultimos_2a",
    "grade",
]]
y = df["target"]

# Binner configuration: two columns are forced to be treated as categorical
# and one as numeric, overriding whatever type would otherwise be assigned.
# NOTE(review): the exact meaning of max_bins / min_event_rate_diff / use_optuna
# is defined by NASABinner, which is not visible here — confirm against its docs.
binner = NASABinner(
    strategy="supervised",
    max_bins=10,
    min_event_rate_diff=0.0001,
    force_categorical=["verificacao_fonte_de_renda", "grade"],
    force_numeric=["qtd_atrasos_ultimos_2a"],
    use_optuna=False,
)

binner.fit(X, y)

# Report, in order: the per-column schema, the overall IV, and the
# first rows (up to 20) of the bin summary table.
display(binner.describe_schema())
print("IV global:", binner.iv_)
display(binner.bin_summary.head(20))

Unnamed: 0,col,tipo
0,qtd_atrasos_ultimos_2a,numeric
1,verificacao_fonte_de_renda,categorical
2,grade,categorical


IV global: 0.20644667755307297


Unnamed: 0,variable,bin,count,event,non_event,event_rate
0,qtd_atrasos_ultimos_2a,"(-inf, 0.50)",52054,6869,45185,0.131959
1,qtd_atrasos_ultimos_2a,"[0.50, 1.50)",11736,1538,10198,0.13105
2,qtd_atrasos_ultimos_2a,"[1.50, inf)",3673,1162,2511,0.316363
3,verificacao_fonte_de_renda,1,16349,2264,14085,0.138479
4,verificacao_fonte_de_renda,2,33036,4759,28277,0.144055
5,verificacao_fonte_de_renda,3,18078,2546,15532,0.140834
6,grade,1,18742,2161,16581,0.115303
7,grade,2,19085,2370,16715,0.124181
8,grade,3,2246,681,1565,0.303206
9,grade,4,12055,1429,10626,0.11854


In [None]:
# Category → bin mapping for the "verificacao_fonte_de_renda" column
binner.get_bin_mapping("verificacao_fonte_de_renda")