# Exemplo com Vassoura e dataset PD Behavior

Este notebook demonstra como carregar uma amostra do dataset Lending Club
(`accepted_2007_to_2018Q4.csv`), construir o target de risco de crédito e aplicar a classe `Vassoura`.

In [None]:
import pandas as pd
from vassoura.core import Vassoura
import vassoura as vs
from audittrail import AuditTrail

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

# Relative paths to the Lending Club CSV files (accepted and rejected loans).
# NOTE(review): FILE_PATH_1 climbs three directory levels while FILE_PATH_2 climbs
# only two — confirm which depth is correct for this notebook's location.
FILE_PATH_1 = '../../../datasets/lending_club/accepted_2007_to_2018Q4.csv'
FILE_PATH_2 = '../../datasets/lending_club/rejected_2007_to_2018Q4.csv'

In [None]:
# Carregar dataset de exemplo
def read_and_clean_csv_mixed_types(path, nrows=None, verbose=True):
    """
    Read a CSV file, detect columns that hold more than one Python type and
    normalise each of them to a single type.

    A mixed-type column is converted to numeric only when every non-missing
    value parses as a number. Otherwise the column is converted to string,
    with missing values kept as missing, so no original value is silently
    replaced by NaN.

    Parameters:
    - path: path to the CSV file (anything accepted by ``pd.read_csv``)
    - nrows: number of rows to read (None = all rows)
    - verbose: if True, print the mixed-type columns and the conversion applied

    Returns:
    - the cleaned DataFrame
    - a dict mapping each mixed-type column name to the value counts of the
      Python types found in it (computed before any conversion)
    """
    # Initial read; low_memory=False makes pandas infer dtypes over the whole file.
    df = pd.read_csv(path, low_memory=False, nrows=nrows)

    # Identify columns whose non-missing values span more than one Python type.
    mixed_type_columns = {}
    for col in df.columns:
        types_in_col = df[col].dropna().apply(type).value_counts()
        if len(types_in_col) > 1:
            mixed_type_columns[col] = types_in_col
            if verbose:
                print(f"\n[!] Coluna '{col}' tem múltiplos tipos:")
                print(types_in_col)

    # Normalise each mixed-type column.
    for col in mixed_type_columns:
        original = df[col]
        try:
            numeric = pd.to_numeric(original, errors='coerce')
        except (TypeError, ValueError):
            # Values that cannot even be inspected as numbers: fall back to string.
            numeric = None

        # errors='coerce' replaces unparseable values with NaN instead of raising,
        # so a failed conversion is detected by looking for values that were
        # present before and are missing afterwards.
        lossless = numeric is not None and not (numeric.isna() & original.notna()).any()

        if lossless:
            df[col] = numeric
            if verbose:
                print(f"[✓] Coluna '{col}' convertida para float.")
        else:
            # Stringify present values only; missing entries stay missing rather
            # than becoming the literal text 'nan'.
            df[col] = original.astype(str).where(original.notna())
            if verbose:
                print(f"[✓] Coluna '{col}' convertida para string.")

    return df, mixed_type_columns

# Load the first 200,000 rows of the accepted-loans file; problemas_1 receives
# the dict of mixed-type columns returned by the helper.
df, problemas_1 = read_and_clean_csv_mixed_types(FILE_PATH_1, nrows=200_000)
#loans_rejected, problemas_2 = read_and_clean_csv_mixed_types(FILE_PATH_2, nrows=100_000)

# Quick sanity check: dimensions and the first three rows.
print(df.shape)
display(df.head(3))

# print(loans_rejected.shape)
# display(loans_rejected.head(3))

### Análise de Target

In [None]:
# Percentage share of each loan_status value, with missing values counted too.
df['loan_status'].value_counts(dropna=False, normalize=True) * 100

In [None]:
# Name of the binary target column created below.
TARGET = 'target_risco_credito'

# loan_status values that are flagged as 1 in the target; every other status becomes 0.
status_de_risco = [
    "Charged Off",
    "Default",
    "Late (31-120 days)",
    #"Late (16-30 days)"
]
# isin() yields False for missing loan_status, so those rows also end up as 0.
df[TARGET] = df["loan_status"].isin(status_de_risco).astype(int)

# Class balance of the target, in percent.
df[TARGET].value_counts(dropna=False, normalize=True) * 100

### Safras

In [None]:
#date_col=['issue_d', 'earliest_cr_line','last_pymnt_d','last_credit_pull_d','next_pymnt_d']

# Columns parsed as dates below; the same list is later passed to Vassoura as date_cols.
temporal_columns = [
    "issue_d",                # Loan issue date
    "earliest_cr_line",       # Borrower's first credit line
    "last_pymnt_d",           # Last payment made
    "last_credit_pull_d",     # Last credit pull
    "next_pymnt_d",           # Next scheduled payment (if applicable)
    #"last_pymnt_amnt",             # (associated with the payment date)
    "debt_settlement_flag_date",   # Date on which a debt settlement occurred
    "settlement_date"              # Date on which the settlement was closed
]



# Parse each column with the abbreviated-month/year format "%b-%Y" (e.g. "Dec-2015").
for col in temporal_columns:

    df[col] = pd.to_datetime(df[col], format="%b-%Y")


# Vintage ("safra") = issue month of the loan, stored as text.
df["safra"] = df["issue_d"].dt.to_period("M").astype(str)  # 'YYYY-MM' format

# 'YYYY-MM' strings sort chronologically, so min/max give the first and last vintage.
print(df["safra"].min())
print(df["safra"].max())

### Captura Inicial

In [6]:
# Create the AuditTrail tracker, pointing it at the TARGET column and using "id"
# as the default key.
# NOTE(review): the exact effect of each flag is defined by the audittrail package — confirm there.
trail = AuditTrail(
    track_histograms=True,
    track_distributions=True,
    enable_logging=True,
    auto_detect_types=True,
    target_col=TARGET,
    default_keys=["id"]
)

# Record the current df as a snapshot named "original", used later for comparison.
trail.take_snapshot(df, name="original")

In [7]:
# Print the stored description of the "original" snapshot.
trail.describe_snapshot("original")


📄 Descrição do snapshot 'original':

▶️ Shape: (200000, 153)
▶️ Chaves de duplicação: ['id']
   • Duplicatas nas chaves: 0

🧱 Tipos de dados:


id                         int64
member_id                float64
loan_amnt                float64
funded_amnt              float64
funded_amnt_inv          float64
                          ...   
settlement_amount        float64
settlement_percentage    float64
settlement_term          float64
target_risco_credito       int32
safra                     object
Length: 153, dtype: object


🔎 Colunas detectadas automaticamente:
   • Numéricas (88): ['id', 'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp', 'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp', 'total_rec_int', 'total_rec_late_fee', 'recoveries', 'collection_recovery_fee', 'last_pymnt_amnt', 'last_fico_range_high', 'last_fico_range_low', 'collections_12_mths_ex_med', 'mths_since_last_major_derog', 'policy_code', 'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal', 'bc_open_to_buy', 'bc_util', 'chargeoff_withi

member_id                                     200000
revol_bal_joint                               200000
sec_app_fico_range_high                       200000
sec_app_earliest_cr_line                      200000
sec_app_inq_last_6mths                        200000
sec_app_mort_acc                              200000
sec_app_open_acc                              200000
sec_app_revol_util                            200000
sec_app_open_act_il                           200000
sec_app_num_rev_accts                         200000
sec_app_chargeoff_within_12_mths              200000
sec_app_collections_12_mths_ex_med            200000
sec_app_mths_since_last_major_derog           200000
sec_app_fico_range_low                        200000
desc                                          199989
dti_joint                                     199491
annual_inc_joint                              199489
verification_status_joint                     199489
orig_projected_additional_accrued_interest    


📊 Estatísticas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,200000.0,6.229632e+07,3.941141e+06,56705.00,59411728.75,62217530.00,65644567.00,68617057.00
member_id,0.0,,,,,,,
loan_amnt,200000.0,1.527816e+04,8.651134e+03,1000.00,8500.00,14000.00,20000.00,35000.00
funded_amnt,200000.0,1.527816e+04,8.651134e+03,1000.00,8500.00,14000.00,20000.00,35000.00
funded_amnt_inv,200000.0,1.526951e+04,8.646320e+03,900.00,8475.00,14000.00,20000.00,35000.00
...,...,...,...,...,...,...,...,...
hardship_last_payment_amount,1409.0,1.860614e+02,1.812045e+02,0.02,44.95,134.56,275.29,927.79
settlement_amount,5840.0,5.015406e+03,3.557532e+03,130.00,2192.49,4344.99,7000.00,26242.50
settlement_percentage,5840.0,4.710027e+01,5.753855e+00,20.00,45.00,45.00,50.00,97.66
settlement_term,5840.0,1.348545e+01,7.537550e+00,0.00,8.00,13.50,18.00,65.00



🏷️ Estatísticas categóricas:


Unnamed: 0,count,unique,top,freq
term,200000,2,36 months,134643
grade,200000,7,B,59333
sub_grade,200000,35,C1,12803
emp_title,187870,66136,Teacher,3823
emp_length,187897,11,10+ years,67516
home_ownership,200000,4,MORTGAGE,99618
verification_status,200000,3,Source Verified,84756
loan_status,200000,7,Fully Paid,140992
pymnt_plan,200000,2,n,199978
url,200000,200000,https://lendingclub.com/browse/loanDetail.acti...,1



📈 Histogramas (categorias apenas):
  term: 2 valores distintos (top 3: {' 36 months': 134643, ' 60 months': 65357})
  grade: 7 valores distintos (top 3: {'B': 59333, 'C': 57429, 'A': 34866})
  sub_grade: 35 valores distintos (top 3: {'C1': 12803, 'B3': 12497, 'B4': 12214})
  emp_length: 12 valores distintos (top 3: {'10+ years': 67516, '2 years': 17529, '< 1 year': 16736})
  home_ownership: 4 valores distintos (top 3: {'MORTGAGE': 99618, 'RENT': 77978, 'OWN': 22403})
  verification_status: 3 valores distintos (top 3: {'Source Verified': 84756, 'Verified': 60961, 'Not Verified': 54283})
  loan_status: 7 valores distintos (top 3: {'Fully Paid': 140992, 'Charged Off': 35090, 'Current': 22637})
  pymnt_plan: 2 valores distintos (top 3: {'n': 199978, 'y': 22})
  purpose: 13 valores distintos (top 3: {'debt_consolidation': 114750, 'credit_card': 50817, 'home_improvement': 12599})
  title: 15 valores distintos (top 3: {'Debt consolidation': 114692, 'Credit card refinancing': 50811, 'Home imp

### Limpeza com Vassoura

In [8]:
# Guard before feature selection: ignoring missing values, the target must contain exactly the values 0 and 1.
assert set(df[TARGET].dropna().unique()) == {0, 1}, "TARGET não é binário!"

In [9]:
# Configure and run the Vassoura feature-cleaning session on df.
# NOTE(review): the name `vs` rebinds the module alias created by `import vassoura as vs`
# in the import cell; after this cell `vs` is the Vassoura instance, not the package.
# NOTE(review): thresholds contains a "missing" entry although "missing" is not listed
# in `heuristics` — confirm how Vassoura treats it.
vs = Vassoura(
    df,
    id_cols = ['id','member_id'],
    date_cols=temporal_columns,
    ignore_cols=['url'],
    drop_ignored=True,
    target_col=TARGET,
    verbose='basic',
    
    engine='polars',
    
    adaptive_sampling=True,

    heuristics=[
        "iv",
        "graph_cut",
        "corr",
        "vif",

    ],
    thresholds={
        "missing": 0.60,
        "corr": 0.85,
        "vif": 5,
        "iv": 0.01,
        "graph_cut": 0.9,
    },
    n_steps=2,         # controls how many removals happen per iteration (per the original author's note)
    vif_n_steps=2      # NOTE(review): the earlier comment said VIF is always 1-step, yet 2 is passed here — confirm intent
)

# recompute=True is forwarded to run(); the returned DataFrame is kept as df_limpo.
df_limpo = vs.run(recompute=True)

[Vassoura] Missing heuristic (thr>0.6)
  → dropped ['desc', 'mths_since_last_record', 'mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint', 'verification_status_joint', 'open_acc_6m', 'open_act_il', 'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util', 'inq_fi', 'total_cu_tl', 'inq_last_12m', 'mths_since_recent_bc_dlq', 'mths_since_recent_revol_delinq', 'revol_bal_joint', 'sec_app_fico_range_low', 'sec_app_fico_range_high', 'sec_app_earliest_cr_line', 'sec_app_inq_last_6mths', 'sec_app_mort_acc', 'sec_app_open_acc', 'sec_app_revol_util', 'sec_app_open_act_il', 'sec_app_num_rev_accts', 'sec_app_chargeoff_within_12_mths', 'sec_app_collections_12_mths_ex_med', 'sec_app_mths_since_last_major_derog', 'hardship_type', 'hardship_reason', 'hardship_status', 'deferral_term', 'hardship_amount', 'hardship_start_date', 'hardship_end_date', 'payment_plan_start_date', 'hardship_length', 'hardship_dpd', 'hardsh

INFO | Tipos detectados: 34 numéricas, 12 categóricas
INFO | Método de correlação sugerido: spearman
INFO | Método spearman não suportado por engine 'polars'; utilizando pandas.
INFO | Matriz de correlação spearman calculada para 34 variáveis numéricas (engine=pandas)


  → dropped ['last_fico_range_low'] (corr>0.85)
  → dropped ['tot_hi_cred_lim'] (corr>0.85)


INFO | Tipos detectados: 32 numéricas, 12 categóricas
INFO | Método de correlação sugerido: spearman
INFO | Método spearman não suportado por engine 'polars'; utilizando pandas.
INFO | Matriz de correlação spearman calculada para 32 variáveis numéricas (engine=pandas)


  → dropped ['bc_util'] (corr>0.85)


INFO | Tipos detectados: 31 numéricas, 12 categóricas
INFO | Método de correlação sugerido: spearman
INFO | Método spearman não suportado por engine 'polars'; utilizando pandas.
INFO | Matriz de correlação spearman calculada para 31 variáveis numéricas (engine=pandas)


  → dropped ['total_rev_hi_lim'] (corr>0.85)


INFO | Tipos detectados: 30 numéricas, 12 categóricas
INFO | Método de correlação sugerido: spearman
INFO | Método spearman não suportado por engine 'polars'; utilizando pandas.
INFO | Matriz de correlação spearman calculada para 30 variáveis numéricas (engine=pandas)


[Vassoura] VIF heuristic (thr=5) vif_n_steps=2


INFO | Tipos detectados: 30 numéricas, 12 categóricas
INFO | Descartando 23505 linha(s) com NaN/inf para cálculo de VIF
INFO | VIF calculado para 30 variáveis


  → dropped ['policy_code'] (vif>5)
  → dropped ['total_rec_prncp'] (vif>5)
  → dropped ['total_bc_limit'] (vif>5)


INFO | Tipos detectados: 27 numéricas, 12 categóricas
INFO | Descartando 23505 linha(s) com NaN/inf para cálculo de VIF
INFO | VIF calculado para 27 variáveis


  → dropped ['fico_range_high'] (vif>5)
  → dropped ['last_fico_range_high'] (vif>5)
  → dropped ['num_op_rev_tl'] (vif>5)
  → dropped ['int_rate'] (vif>5)
  → dropped ['revol_util'] (vif>5)
  → dropped ['num_rev_tl_bal_gt_0'] (vif>5)
  → dropped ['installment'] (vif>5)


INFO | Tipos detectados: 20 numéricas, 12 categóricas
INFO | Descartando 23497 linha(s) com NaN/inf para cálculo de VIF
INFO | VIF calculado para 20 variáveis


  → dropped ['acc_open_past_24mths'] (vif>5)
  → dropped ['num_tl_op_past_12m'] (vif>5)


INFO | Tipos detectados: 18 numéricas, 12 categóricas
INFO | Descartando 23497 linha(s) com NaN/inf para cálculo de VIF
INFO | VIF calculado para 18 variáveis


  → dropped ['mo_sin_old_rev_tl_op'] (vif>5)


INFO | Tipos detectados: 17 numéricas, 12 categóricas
INFO | Descartando 23497 linha(s) com NaN/inf para cálculo de VIF
INFO | VIF calculado para 17 variáveis
INFO | Tipos detectados: 17 numéricas, 12 categóricas
INFO | Método de correlação sugerido: spearman
INFO | Método spearman não suportado por engine 'polars'; utilizando pandas.
INFO | Matriz de correlação spearman calculada para 17 variáveis numéricas (engine=pandas)


In [10]:
# Preview the first rows of the DataFrame returned by vs.run().
df_limpo.head()

Unnamed: 0,id,member_id,issue_d,earliest_cr_line,last_pymnt_d,last_credit_pull_d,next_pymnt_d,debt_settlement_flag_date,settlement_date,target_risco_credito,term,grade,sub_grade,emp_title,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,title,zip_code,addr_state,dti,inq_last_6mths,revol_bal,out_prncp_inv,total_rec_int,total_rec_late_fee,collection_recovery_fee,last_pymnt_amnt,avg_cur_bal,bc_open_to_buy,mo_sin_rcnt_rev_tl_op,mo_sin_rcnt_tl,mort_acc,mths_since_recent_bc,mths_since_recent_inq,percent_bc_gt_75,disbursement_method,debt_settlement_flag
0,56705,,2015-11-01,1997-11-01,2018-12-01,2019-03-01,NaT,NaT,NaT,0,36 months,B,B3,Academic Admin,10+ years,MORTGAGE,33500.0,Source Verified,Fully Paid,debt_consolidation,Debt consolidation,658xx,MO,18.38,0.0,10403.0,0.0,1793.23,0.0,0.0,295.72,6554.0,4288.0,13.0,10.0,0.0,18.0,14.0,50.0,Cash,N
1,67025,,2015-12-01,1993-02-01,2017-01-01,2019-03-01,NaT,NaT,NaT,0,36 months,A,A2,Technical Support Engineer,10+ years,MORTGAGE,65000.0,Source Verified,Fully Paid,credit_card,Credit card refinancing,064xx,CT,17.17,0.0,8951.0,0.0,596.41,0.0,0.0,6936.16,26658.0,13060.0,31.0,31.0,3.0,69.0,,50.0,Cash,N
2,302598,,2015-08-01,2000-04-01,2019-03-01,2019-03-01,2019-04-01,NaT,NaT,0,60 months,B,B3,IT Admin,10+ years,MORTGAGE,60000.0,Not Verified,Current,credit_card,Credit card refinancing,140xx,NY,25.56,0.0,42738.0,5903.1,4359.97,0.0,0.0,373.87,18924.0,6178.0,18.0,17.0,4.0,18.0,17.0,80.0,Cash,N
3,361774,,2015-12-01,1969-06-01,2017-09-01,2019-03-01,NaT,NaT,NaT,0,36 months,B,B1,Senior IT Consultant,3 years,MORTGAGE,115000.0,Not Verified,Fully Paid,debt_consolidation,Debt consolidation,631xx,MO,5.1,0.0,10168.0,0.0,567.32,0.0,0.0,586.68,49598.0,832.0,52.0,3.0,4.0,52.0,3.0,66.7,Cash,N
4,365090,,2015-11-01,2005-10-01,2018-07-01,2018-07-01,NaT,NaT,NaT,0,36 months,B,B5,Financial Associate,7 years,RENT,60000.0,Source Verified,Fully Paid,debt_consolidation,Debt consolidation,112xx,NY,26.58,0.0,18145.0,0.0,1089.53,0.0,0.0,977.86,3220.0,97.0,10.0,10.0,0.0,24.0,24.0,100.0,Cash,N


In [12]:
# Write the Vassoura report to an HTML file under ../reports.
vs.generate_report("../reports/vassoura_report_2.html")

INFO | Tipos detectados: 115 numéricas, 26 categóricas
INFO | Relatório gerado em ..\reports\vassoura_report_2.html


'..\\reports\\vassoura_report_2.html'

### Comparação após limpeza

In [None]:
# Record the cleaned DataFrame as a second snapshot named "limpo".
trail.take_snapshot(df_limpo, "limpo")

In [None]:
# Compare the "original" snapshot with the "limpo" snapshot.
trail.compare_snapshots("original", "limpo")

In [None]:


# `loans_accepted` was not defined by any earlier cell, so this cell raised a
# NameError on a fresh kernel. It is now derived from `df` (the accepted-loans
# sample loaded earlier). Rows are dropped into a new frame instead of in place,
# so `df` stays intact and the cell can be re-run safely.
print(len(df))  # row count before dropping
loans_accepted = df.dropna(axis=0)  # keep only rows with no missing value in any column
print(len(loans_accepted))  # row count after dropping

In [None]:
# Percentage share of each `term` value in loans_accepted (the frame handled in the
# previous cell), with missing values counted too.
loans_accepted.term.value_counts(dropna=False, normalize=True)*100

In [None]:
# Report columns whose non-missing values hold more than one Python type.
# Missing values are dropped first: NaN is a float, so counting it would flag
# every text column that merely has gaps. This matches the detection rule used
# inside read_and_clean_csv_mixed_types.
for col in df.columns:
    tipos = df[col].dropna().apply(type).value_counts()
    if len(tipos) > 1:
        print(f"{col}: {tipos}")

In [None]:
# Second, simpler Vassoura session using only the correlation and VIF heuristics.
# The target must be a column of `df`: the previous value 'ever90m12' is not created
# anywhere in this notebook, whereas TARGET ('target_risco_credito') is built from
# loan_status in the target-analysis section.
vsess = Vassoura(
    df,
    target_col=TARGET,
    heuristics=['corr', 'vif'],
    thresholds={'corr': 0.9, 'vif': 10},
)
df_clean = vsess.run()

# Preview the first rows of the cleaned frame.
df_clean.head()

In [None]:
# Generate the summary report
vsess.generate_report('example_report.html')