In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scaler import DynamicScaler

# 1. Load the dataset (semicolon-separated CSV)
df = pd.read_csv("./data/case_data_science_credit.csv", sep=";")
print(f"Shape do dataset original: {df.shape}")

TARGET = 'target'
# Columns that must never be scaled: the 'client_id' column and the label.
# Built from TARGET so the exclusion cannot drift if the label name changes.
NON_FEATURE_COLS = {'client_id', TARGET}

# 2. Keep only numeric columns (the scaling candidates), minus the exclusions
num_cols = [
    col
    for col in df.select_dtypes(include=["number"]).columns
    if col not in NON_FEATURE_COLS
]

print(f"Colunas numéricas para escalonamento: {num_cols}")

# 3. Train/test split (70/30, fixed seed for reproducibility)
df_train, df_test = train_test_split(df, test_size=0.3, random_state=42)
print(f"Treino: {df_train.shape}, Teste: {df_test.shape}")

y_train = df_train[TARGET]
y_test = df_test[TARGET]

Shape do dataset original: (67463, 18)
Colunas numéricas para escalonamento: ['qtd_restritivos', 'razao_credito_tomado_vs_renda_informada', 'patrimonio_total', 'qtd_atrasos_ultimos_2a', 'valor_total_recuperacoes_ultimos_2a', 'contas_distintas_com_atraso', 'qtd_consultas_ultimos_6m', 'qtd_linhas_credito_abertas', 'saldo_rotativo_total', 'limite_rotativo_total', 'valor_total_emprestimos_tomados', 'taxa_juros_media_emprestimos_tomados']
Treino: (47224, 18), Teste: (20239, 18)


In [3]:
# 4. Fit the DynamicScaler on the training set
scaler = DynamicScaler(
    strategy="auto",
    shapiro_p_val=0.01,  # raising it makes the choice of StandardScaler() more restrictive
    # Persist the fitted scalers: step 6 of this notebook loads "scalers.pkl"
    # into a fresh DynamicScaler, so the file has to be produced here.
    # NOTE(review): assumes `serialize` is the switch that writes to `save_path`
    # — confirm against scaler.py.
    serialize=True,
    save_path="scalers.pkl",  # file holding the saved scaling information
    random_state=42
)

scaler.fit(df_train[num_cols], y_train)

In [4]:
# Per-feature report of the scaling decisions.
# In the recorded output the report is indexed by feature name and has the
# columns: chosen_scaler, validation_stats, ignored, candidates_tried, reason.
print("\nResumo das decisões por feature:")
display(scaler.report_as_df())


Resumo das decisões por feature:


Unnamed: 0,chosen_scaler,validation_stats,ignored,candidates_tried,reason
qtd_restritivos,,"{'post_std': nan, 'post_iqr': nan, 'post_n_uni...",[],"[PowerTransformer, QuantileTransformer, Robust...",all_rejected
razao_credito_tomado_vs_renda_informada,,"{'post_std': nan, 'post_iqr': nan, 'post_n_uni...",[],"[PowerTransformer, QuantileTransformer, Robust...",all_rejected
patrimonio_total,PowerTransformer,"{'post_std': 1.0, 'post_iqr': 1.25796611169819...",[],[PowerTransformer],stats|skew|kurt
qtd_atrasos_ultimos_2a,,"{'post_std': nan, 'post_iqr': nan, 'post_n_uni...",[],"[PowerTransformer, QuantileTransformer, Robust...",all_rejected
valor_total_recuperacoes_ultimos_2a,PowerTransformer,"{'post_std': 1.0, 'post_iqr': 1.21328323139613...",[],[PowerTransformer],stats|skew|kurt
contas_distintas_com_atraso,,{},[],[],constante
qtd_consultas_ultimos_6m,,"{'post_std': nan, 'post_iqr': nan, 'post_n_uni...",[],"[PowerTransformer, QuantileTransformer, Robust...",all_rejected
qtd_linhas_credito_abertas,PowerTransformer,"{'post_std': 1.0, 'post_iqr': 1.23769385688904...",[],[PowerTransformer],stats|skew|kurt
saldo_rotativo_total,PowerTransformer,"{'post_std': 1.0, 'post_iqr': 1.31396458552683...",[],[PowerTransformer],stats|skew|kurt
limite_rotativo_total,PowerTransformer,"{'post_std': 1.0, 'post_iqr': 1.45938330005203...",[],[PowerTransformer],stats|skew|kurt


In [5]:
# Show the raw training features first, for side-by-side comparison with the
# scaled frame displayed at the end of this cell.
display(df_train[num_cols].head())

# 5. Transform the training set
# In the recorded output, columns with no chosen scaler (e.g. qtd_restritivos,
# valor_total_emprestimos_tomados) come back with their original values.
X_train_scaled = scaler.transform(df_train[num_cols], return_df=True)

print("\nExemplo de dados escalados (treino):")
display(X_train_scaled.head())

Unnamed: 0,qtd_restritivos,razao_credito_tomado_vs_renda_informada,patrimonio_total,qtd_atrasos_ultimos_2a,valor_total_recuperacoes_ultimos_2a,contas_distintas_com_atraso,qtd_consultas_ultimos_6m,qtd_linhas_credito_abertas,saldo_rotativo_total,limite_rotativo_total,valor_total_emprestimos_tomados,taxa_juros_media_emprestimos_tomados
10823,0,31.904233,49591.35011,0,7.037046,0,0,14,9126,7697,5784,11.154593
56039,0,20.329191,58654.77591,1,3075.908451,0,0,8,15137,56296,19462,9.203992
32457,0,32.012076,71289.34266,0,3.766088,0,0,10,604,25222,30348,10.323713
43496,0,20.265629,66101.82345,0,7.138711,0,0,19,10079,62191,19156,8.711736
50204,1,20.363201,70478.22719,1,2.111264,0,0,11,11949,10611,18358,9.587092



Exemplo de dados escalados (treino):


Unnamed: 0,qtd_restritivos,razao_credito_tomado_vs_renda_informada,patrimonio_total,qtd_atrasos_ultimos_2a,valor_total_recuperacoes_ultimos_2a,contas_distintas_com_atraso,qtd_consultas_ultimos_6m,qtd_linhas_credito_abertas,saldo_rotativo_total,limite_rotativo_total,valor_total_emprestimos_tomados,taxa_juros_media_emprestimos_tomados
10823,0,31.904233,-0.728688,0,0.86317,0,0,0.215087,0.529706,-0.776316,5784,11.154593
56039,0,20.329191,-0.36418,1,2.822869,0,0,-1.334301,1.120289,1.434187,19462,9.203992
32457,0,32.012076,0.045879,0,0.282778,0,0,-0.687966,-1.605296,0.458558,30348,10.323713
43496,0,20.265629,-0.111229,0,0.875453,0,0,0.95899,0.63965,1.563984,19156,8.711736
50204,1,20.363201,0.02222,1,-0.308651,0,0,-0.423312,0.834692,-0.464617,18358,9.587092


In [7]:
# Compare per-feature AUC before and after scaling.
# In the recorded output the result has columns feature, auc_before, auc_after
# and gain, where gain matches (auc_after - auc_before) / auc_before.
# NOTE(review): gain_thr presumably filters out features whose gain is below
# the threshold, and top_n presumably caps the number of features — confirm.
scaler.plot_information_gain_logreg(
        gain_thr=0.0001,
        top_n=10,
        title = 'DynamicScaler - Impacto no Ganho de Informação'
)

Unnamed: 0,feature,auc_before,auc_after,gain
1,valor_total_recuperacoes_ultimos_2a,0.496898,0.498204,0.002629
2,qtd_linhas_credito_abertas,0.493854,0.510822,0.034358
3,saldo_rotativo_total,0.493644,0.504945,0.022894


In [None]:
# Visualize: histograms from the raw training frame and its scaled counterpart,
# for every column in num_cols.
scaler.plot_histograms(
    df_train,
    X_train_scaled,
    features=num_cols)
# To inspect only a subset, pass an explicit list of column names as `features`,
# e.g. ["patrimonio_total", "qtd_restritivos", "valor_total_emprestimos_tomados"].

In [None]:
# 6. Load the saved scaler and apply it to the test set
# Requires "scalers.pkl" to already exist on disk — presumably written by the
# DynamicScaler fitted earlier when serialization is enabled; verify before running.
# NOTE(review): if load() unpickles the file, only load files from a trusted source.
scaler_test = DynamicScaler()
scaler_test.load("scalers.pkl")

# Raw test features, for comparison with the scaled frame shown below
display(df_test[num_cols].head())

X_test_scaled = scaler_test.transform(df_test[num_cols], return_df=True)

print("\nExemplo de dados escalados (teste):")
display(X_test_scaled.head())

In [None]:
# Second scaler configuration: linear evaluation mode with coefficient-based
# importance. It is fit on the training split only and then applied to both splits.
scaler_cv = DynamicScaler(
    strategy="auto",
    evaluation_mode='linear',
    importance_metric='coef',       # or shap
    importance_gain_thr=0.01,
    extra_validation=True,          # enables CV for all
    allow_minmax=True,              # lets MinMax be considered
    random_state=42
)

scaler_cv.fit(df_train[num_cols], y_train)
# NOTE: X_train_scaled and X_test_scaled are reassigned here, replacing the
# frames produced by the earlier `scaler` / `scaler_test` cells.
X_train_scaled = scaler_cv.transform(df_train[num_cols], return_df=True)

X_test_scaled = scaler_cv.transform(df_test[num_cols], return_df=True)

In [None]:
# Keep only the features for which a scaler was actually chosen.
# "No scaler" may be stored as a missing value (None/NaN) or as the literal
# string 'None'; comparing against 'None' alone would let missing values through.
report_scaler = scaler_cv.report_as_df()
has_scaler = report_scaler['chosen_scaler'].notna() & (report_scaler['chosen_scaler'] != 'None')
report_scaler[has_scaler]

In [None]:
# Full per-feature report of the decisions made by scaler_cv
print("\nResumo das decisões por feature:")
display(scaler_cv.report_as_df())

In [None]:
# Plot only the features that actually went through a scaler.
# Missing values (None/NaN) and the literal string 'None' both count as
# "no scaler chosen", so both are excluded from the plotted features.
scaled_mask = report_scaler['chosen_scaler'].notna() & (report_scaler['chosen_scaler'] != 'None')
scaled_features = report_scaler.loc[scaled_mask].index.tolist()

scaler_cv.plot_histograms(
    df_test,
    X_test_scaled,
    features=scaled_features)