Modelos Preditivos:
-

In [121]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline as SkPipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report, ConfusionMatrixDisplay

In [122]:

# Caminho relativo correto a partir da pasta 'notebooks'
df = pd.read_pickle('../data/df_unique.pkl')

# Cria uma cópia independente
df_copy = df.copy()

# Salva a cópia em um novo arquivo .pkl
df_copy.to_pickle('../data/df_unique_copy.pkl')

# Exibindo as primeiras linhas do DataFrame
df_copy.head()


Unnamed: 0,age,amt_active_loan,channel,ext_score_1,ext_score_2,ext_score_3,ext_score_4,flag_document_A,gender,ids,income,occupation_type,score_checks,score_date,start_hour,default,credit_card_initial_line,payment,date,credit_line,spends,score_month,score_days,date_ordinal
0,37,4560.0,NCqL3QBx0pscDnx3ixKwXg==,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,,m,648384b9-f932-d221-45d7-85d0aa0a412c,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,2017-01-15,9.0,0,1300.0,198.4,2016-09-30,2180.0,198.4,2017-01,14,736237
1,37,4560.0,NCqL3QBx0pscDnx3ixKwXg==,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,,m,648384b9-f932-d221-45d7-85d0aa0a412c,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,2017-01-15,9.0,0,1300.0,149.94,2016-10-31,2368.0,145.49,2017-01,14,736268
2,37,4560.0,NCqL3QBx0pscDnx3ixKwXg==,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,,m,648384b9-f932-d221-45d7-85d0aa0a412c,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,2017-01-15,9.0,0,1300.0,10.02,2016-11-30,3103.0,10.02,2017-01,14,736298
3,37,4560.0,NCqL3QBx0pscDnx3ixKwXg==,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,,m,648384b9-f932-d221-45d7-85d0aa0a412c,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,2017-01-15,9.0,0,1300.0,38.48,2016-12-31,2430.0,278.05,2017-01,14,736329
4,37,4560.0,NCqL3QBx0pscDnx3ixKwXg==,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,,m,648384b9-f932-d221-45d7-85d0aa0a412c,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,2017-01-15,9.0,0,1300.0,2888.77,2017-01-31,2650.0,2888.77,2017-01,14,736360


Tratamento da variável target:
-

In [123]:
print("Contagem de valores únicos na coluna 'default':")
print(df_copy['default'].value_counts(dropna=False))


Contagem de valores únicos na coluna 'default':
default
0    214627
1     44877
Name: count, dtype: int64


In [124]:
df_copy['default'] = df_copy['default'].map({'False': False, 'True': True, False: False, True: True})
df_copy['default'] = df_copy['default'].map({False: 0, True: 1})
df_copy = df_copy[df_copy['default'].notna()]

Tratamento das demais variáveis:
-

In [125]:
# Garantindo que as colunas estão no formato datetime
df_copy['score_date'] = pd.to_datetime(df_copy['score_date'])
df_copy['date'] = pd.to_datetime(df_copy['date'])

# Extraindo features de score_date
df_copy['score_year'] = df_copy['score_date'].dt.year
df_copy['score_month'] = df_copy['score_date'].dt.month
df_copy['score_day'] = df_copy['score_date'].dt.day
df_copy['score_dayofweek'] = df_copy['score_date'].dt.dayofweek
df_copy['score_weekofyear'] = df_copy['score_date'].dt.isocalendar().week
df_copy['score_quarter'] = df_copy['score_date'].dt.quarter

# Extraindo features de date
df_copy['date_year'] = df_copy['date'].dt.year
df_copy['date_month'] = df_copy['date'].dt.month
df_copy['date_day'] = df_copy['date'].dt.day
df_copy['date_dayofweek'] = df_copy['date'].dt.dayofweek
df_copy['date_weekofyear'] = df_copy['date'].dt.isocalendar().week
df_copy['date_quarter'] = df_copy['date'].dt.quarter

# (Opcional) Criando feature de diferença em dias entre as duas datas
df_copy['days_diff'] = (df_copy['score_date'] - df_copy['date']).dt.days


In [126]:
# Drop  columns
df_copy.drop(['channel', 'ids', 'score_date', 'date'], axis=1, inplace=True)

# Preencher valores nulos de flag_document_A com a moda
df_copy['flag_document_A'].fillna(df_copy['flag_document_A'].mode()[0], inplace=True)

# Mapear True/False para 1/0
df_copy['flag_document_A'] = df_copy['flag_document_A'].astype(bool).astype(int)

# Mapear os valores da variável gender: 'm' → 1 e 'f' → 0
df_copy['gender'] = df_copy['gender'].map({'m': 1, 'f': 0})

df_copy.head(20)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_copy['flag_document_A'].fillna(df_copy['flag_document_A'].mode()[0], inplace=True)
  df_copy['flag_document_A'].fillna(df_copy['flag_document_A'].mode()[0], inplace=True)


Unnamed: 0,age,amt_active_loan,ext_score_1,ext_score_2,ext_score_3,ext_score_4,flag_document_A,gender,income,occupation_type,score_checks,start_hour,default,credit_card_initial_line,payment,credit_line,spends,score_month,score_days,date_ordinal,score_year,score_day,score_dayofweek,score_weekofyear,score_quarter,date_year,date_month,date_day,date_dayofweek,date_weekofyear,date_quarter,days_diff
0,37,4560.0,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,1,1.0,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,9.0,0,1300.0,198.4,2180.0,198.4,1,14,736237,2017,15,6,2,1,2016,9,30,4,39,3,107
1,37,4560.0,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,1,1.0,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,9.0,0,1300.0,149.94,2368.0,145.49,1,14,736268,2017,15,6,2,1,2016,10,31,0,44,4,76
2,37,4560.0,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,1,1.0,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,9.0,0,1300.0,10.02,3103.0,10.02,1,14,736298,2017,15,6,2,1,2016,11,30,2,48,4,46
3,37,4560.0,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,1,1.0,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,9.0,0,1300.0,38.48,2430.0,278.05,1,14,736329,2017,15,6,2,1,2016,12,31,5,52,4,15
4,37,4560.0,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,1,1.0,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,9.0,0,1300.0,2888.77,2650.0,2888.77,1,14,736360,2017,15,6,2,1,2017,1,31,1,5,1,-16
5,37,4560.0,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,1,1.0,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,9.0,0,1300.0,715.24,2828.0,715.24,1,14,736388,2017,15,6,2,1,2017,2,28,1,9,1,-44
6,37,4560.0,467.0,DGCQep2AE5QRkNCshIAlFQ==,LCak332j+TYFqHC3NDwiqg==,61.311861,1,1.0,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,9.0,0,1300.0,13.73,2480.0,667.83,1,14,736419,2017,15,6,2,1,2017,3,31,4,13,1,-75
23,46,47720.0,324.0,fyrlulOiZ+5hoFqLa6UbDQ==,Fv28Bz0YRTVAT5kl1bAV6g==,71.497943,1,0.0,191517.4,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,20.0,11.0,0,2200.0,797.25,1996.0,868.18,1,12,736237,2017,13,4,2,1,2016,9,30,4,39,3,105
24,46,47720.0,324.0,fyrlulOiZ+5hoFqLa6UbDQ==,Fv28Bz0YRTVAT5kl1bAV6g==,71.497943,1,0.0,191517.4,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,20.0,11.0,0,2200.0,527.27,1881.0,344.11,1,12,736268,2017,13,4,2,1,2016,10,31,0,44,4,74
25,46,47720.0,324.0,fyrlulOiZ+5hoFqLa6UbDQ==,Fv28Bz0YRTVAT5kl1bAV6g==,71.497943,1,0.0,191517.4,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,20.0,11.0,0,2200.0,339.88,1788.0,339.88,1,12,736298,2017,13,4,2,1,2016,11,30,2,48,4,44


In [127]:
# Remover espaços e verificar valores únicos
df_copy['ext_score_2_clean'] = df_copy['ext_score_2'].astype(str).str.strip()

df_copy['ext_score_3_clean'] = df_copy['ext_score_3'].astype(str).str.strip()

df_copy['occupation_type_clean'] = df_copy['occupation_type'].astype(str).str.strip()

In [128]:
# Criar categorias fixas com pd.Categorical
ext_score_2_categories = df_copy['ext_score_2_clean'].unique()
df_copy['ext_score_2_cat'] = pd.Categorical(df_copy['ext_score_2_clean'], categories=ext_score_2_categories)

ext_score_3_categories = df_copy['ext_score_3_clean'].unique()
df_copy['ext_score_3_cat'] = pd.Categorical(df_copy['ext_score_3_clean'], categories=ext_score_3_categories)

occupation_categories = df_copy['occupation_type_clean'].unique()
df_copy['occupation_type_cat'] = pd.Categorical(df_copy['occupation_type_clean'], categories=occupation_categories)

In [129]:
ext_score_2_encoded = pd.get_dummies(df_copy['ext_score_2_cat'], prefix='ext_score_2').astype(int)
print(ext_score_2_encoded.head())


   ext_score_2_DGCQep2AE5QRkNCshIAlFQ==  ext_score_2_fyrlulOiZ+5hoFqLa6UbDQ==  \
0                                     1                                     0   
1                                     1                                     0   
2                                     1                                     0   
3                                     1                                     0   
4                                     1                                     0   

   ext_score_2_1Rk8w4Ucd5yR3KcqZzLdow==  ext_score_2_8k8UDR4Yx0qasAjkGrUZLw==  \
0                                     0                                     0   
1                                     0                                     0   
2                                     0                                     0   
3                                     0                                     0   
4                                     0                                     0   

   ext_score_2_4DLlLW62jRe

In [130]:
ext_score_3_encoded = pd.get_dummies(df_copy['ext_score_3_cat'], prefix='ext_score_3').astype(int)
print(ext_score_3_encoded.head())

   ext_score_3_LCak332j+TYFqHC3NDwiqg==  ext_score_3_Fv28Bz0YRTVAT5kl1bAV6g==  \
0                                     1                                     0   
1                                     1                                     0   
2                                     1                                     0   
3                                     1                                     0   
4                                     1                                     0   

   ext_score_3_w1miZqhB5+RSamEQJa0rqg==  ext_score_3_O4i7FxcROACMVTCgI0WXuA==  \
0                                     0                                     0   
1                                     0                                     0   
2                                     0                                     0   
3                                     0                                     0   
4                                     0                                     0   

   ext_score_3_rJZgTmANW3P

In [132]:
valor_linha0 = df_copy.loc[0, 'ext_score_2_cat']
coluna_dummy = f'ext_score_2_{valor_linha0}'

if coluna_dummy in ext_score_2_encoded.columns:
    print(f"Dummy correta na linha 0: {ext_score_2_encoded.loc[0, coluna_dummy]}")
else:
    print(f"A coluna {coluna_dummy} não foi criada.")


Dummy correta na linha 0: 1


In [133]:
# Verifique a linha 0, por exemplo:
valor_linha0 = df_copy.loc[0, 'ext_score_3_cat']
coluna_dummy = f'ext_score_3_{valor_linha0}'

if coluna_dummy in ext_score_3_encoded.columns:
    print(f"Dummy correta na linha 0: {ext_score_3_encoded.loc[0, coluna_dummy]}")
else:
    print(f"A coluna {coluna_dummy} não foi criada.")


Dummy correta na linha 0: 1


In [134]:
valor_linha0 = df_copy.loc[0, 'occupation_type_cat']
coluna_dummy = f'occupation_{valor_linha0}'

if coluna_dummy in occupation_encoded.columns:
    print(f"Dummy correta na linha 0: {occupation_encoded.loc[0, coluna_dummy]}")
else:
    print(f"A coluna {coluna_dummy} não foi criada.")

Dummy correta na linha 0: 1


In [137]:
df_copy_encoded = pd.concat([
    df_copy.drop(columns=['ext_score_2', 'ext_score_2_clean', 'ext_score_2_cat']),
    ext_score_2_encoded
], axis=1)

df_copy_encoded = pd.concat([
    df_copy_encoded.drop(columns=['ext_score_3', 'ext_score_3_clean', 'ext_score_3_cat']),
    ext_score_3_encoded
], axis=1)


pd.set_option('display.max_columns', None)


df_copy_encoded.head(1)


Unnamed: 0,age,amt_active_loan,ext_score_1,ext_score_4,flag_document_A,gender,income,occupation_type,score_checks,start_hour,default,credit_card_initial_line,payment,credit_line,spends,score_month,score_days,date_ordinal,score_year,score_day,score_dayofweek,score_weekofyear,score_quarter,date_year,date_month,date_day,date_dayofweek,date_weekofyear,date_quarter,days_diff,occupation_type_clean,occupation_type_cat,ext_score_2_DGCQep2AE5QRkNCshIAlFQ==,ext_score_2_fyrlulOiZ+5hoFqLa6UbDQ==,ext_score_2_1Rk8w4Ucd5yR3KcqZzLdow==,ext_score_2_8k8UDR4Yx0qasAjkGrUZLw==,ext_score_2_4DLlLW62jReXaqbPaHp1vQ==,ext_score_2_e4NYDor1NOw6XKGE60AWFw==,ext_score_2_smzX0nxh5QlePvtVf6EAeg==,ext_score_3_LCak332j+TYFqHC3NDwiqg==,ext_score_3_Fv28Bz0YRTVAT5kl1bAV6g==,ext_score_3_w1miZqhB5+RSamEQJa0rqg==,ext_score_3_O4i7FxcROACMVTCgI0WXuA==,ext_score_3_rJZgTmANW3PjOCQLCcp4iQ==,ext_score_3_pAzpxkhjPsjWldgSX21+zg==,ext_score_3_7h+tk4z7O9brtBSe1rNjxA==,ext_score_3_mX2VRRG38RPiHX+MfjefRw==,ext_score_3_tQUTfUyeuGkhRotd+6WjVg==,ext_score_3_OlDYtdljgSSYM/M1L2CRaQ==,ext_score_3_RO7MTL+j4PH2gNzbhNTq/A==,ext_score_3_emS9xH8CLoRNie2uSmaDAQ==,ext_score_3_vJyc9xom9v7hwFMPTIpmKw==,ext_score_3_wjdj2vxjWoDsEIk0l09ynw==,ext_score_3_ky19q4V1ZqgL3jnHX0wKDw==,ext_score_3_dCm9hFKfdRm7ej3jW+gyxw==,ext_score_3_d/7Hedyz7ovK9Pn1CYN4+A==,ext_score_3_YLGMUI9hObSh6wD/xfanGg==,ext_score_3_osCzpM4hJrxugqWWuZmMWw==,ext_score_3_wkeCdGeu5sEv4/fjwR0aDg==,ext_score_3_NLvAOzzmJba/0zolQnWF5Q==,ext_score_3_/tdlnWjXoZ3OjdtBXzdOJQ==,ext_score_3_dWJRASUFMejk3AHZ1p1Gkg==,ext_score_3_6J1ZMTzN5GKHXnhM4J1JbA==,ext_score_3_tHpS8e9F8d9zg3iOQM9tsA==,ext_score_3_IOVu8au3ISbo6+zmfnYwMg==,ext_score_3_55UK234RR1d7HIWJjmq9tw==,ext_score_3_SaamrHMo23l/3TwXOWgVzw==,ext_score_3_cdpgyOyZS04uXerMNu7uCw==,ext_score_3_A+QuW1n/ABeiVVe/9CRZ9Q==,ext_score_3_7h8PTkrlTWUPP3yuyP4rUg==,ext_score_3_+CxEO4w7jv3QPI/BQbyqAA==,ext_score_3_+2hzpeP1RWr8PEvL1WTUdw==,ext_score_3_5/uMrqKj3OL/Xk5OrGx9fg==,ext_score_3_bopP0NxW3+r8tn9xIHTaOw==
0,37,4560.0,467.0,61.311861,1,1.0,86301.53,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,8.0,9.0,0,1300.0,198.4,2180.0,198.4,1,14,736237,2017,15,6,2,1,2016,9,30,4,39,3,107,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Treino e teste:
- 

In [None]:
# Seleção das variáveis
features = [
        'age', 'amt_active_loan', 'ext_score_1', 'ext_score_2',
       'ext_score_3', 'ext_score_4', 'flag_document_A', 'gender',
       'income', 'occupation_type', 'score_checks', 'score_date', 'start_hour',
       'credit_card_initial_line', 'payment', 'date', 'credit_line',
       'spends'
]

In [None]:
# Remover linhas onde a variável alvo é nula
df = df[df['default'].notna()]

In [None]:
# Definir X e y
X = df[features]
y = df['default']

In [None]:
# Dividir treino e teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

Pré-processamento:
-

In [None]:
# Definir colunas por tipo

# Numéricas
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Categóricas
cat_cols = X.select_dtypes(include=['object']).columns.tolist()

In [None]:
# Transformação e Extração de partes da data (score_date)

for df_ in [X_train, X_test]:
    df_['score_year'] = df_['score_date'].dt.year
    df_['score_month'] = df_['score_date'].dt.month
    df_['score_day'] = df_['score_date'].dt.day

X_train.drop(columns='score_date', inplace=True)
X_test.drop(columns='score_date', inplace=True)

num_cols.extend(['score_year', 'score_month', 'score_day'])


In [None]:
# Criar Pré-processadores

# Pipeline para colunas numéricas
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Pipeline para colunas categóricas
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combinando pré-processadores
preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])


In [None]:
 # Pipeline com modelo baseline, testando a Regressão Logística


baseline_model = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

baseline_model.fit(X_train, y_train)


        ColumnTransformer

Trata variáveis numéricas e categóricas separadamente:

- num (numéricas):

SimpleImputer: imputa valores ausentes (geralmente com média ou mediana).

StandardScaler: padroniza os dados (z-score).

- cat (categóricas):

SimpleImputer: imputa valores ausentes (geralmente com a moda).

OneHotEncoder: transforma categorias em variáveis dummies.


        LogisticRegression: 

Modelo de regressão logística com os seguintes parâmetros:

- max_iter=1000: aumenta o número máximo de iterações para convergência.

- class_weight='balanced': ajusta automaticamente o peso das classes com base em sua frequência, útil em datasets desbalanceados.

In [None]:
# Treinando o modelo de Random Forest

rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced'))
])

rf_pipeline.fit(X_train, y_train)

        Preprocessor (pré-processador) | ColumnTransformer:

- Para variáveis numéricas (num):

SimpleImputer: preenche valores nulos.

StandardScaler: padroniza as variáveis.

- Para variáveis categóricas (cat):

SimpleImputer: preenche valores ausentes.

OneHotEncoder: transforma variáveis categóricas em dummies.

classifier:

        Modelo RandomForestClassifier:

- n_estimators=100: utiliza 100 árvores na floresta.

- random_state=42: garante reprodutibilidade dos resultados.

- class_weight='balanced': ajusta pesos das classes inversamente à sua frequência, lidando com desbalanceamento de forma automática.



In [None]:
# Avaliação dos modelos

y_pred_logreg = baseline_model.predict(X_test)
y_pred_rf = rf_pipeline.predict(X_test)

y_proba_logreg = baseline_model.predict_proba(X_test)[:, 1]
y_proba_rf = rf_pipeline.predict_proba(X_test)[:, 1]

print("=== Regressão Logística ===")
print(classification_report(y_test, y_pred_logreg))
print("ROC AUC:", roc_auc_score(y_test, y_proba_logreg))

print("\n=== Random Forest ===")
print(classification_report(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, y_proba_rf))

        Regressão Logística
- Desempenho muito consistente entre precisão, recall e F1 para ambas as classes.

- Alta ROC AUC (0.99437) indica excelente separação entre as classes.

- Leve vantagem em recall da classe minoritária (1.0), capturando mais inadimplentes.

        Random Forest
- Precision da classe 1 perfeita (1.00): todos os positivos previstos realmente eram positivos.

- Porém, recall da classe 1 caiu para 0.91: perdeu parte dos verdadeiros inadimplentes.

- ROC AUC ligeiramente melhor (0.99445), mas a diferença é insignificante na prática.


        Insights:

Ambos os modelos tiveram boa performace com ROC AUC acima de 0.99.

Se detectar inadimplentes é prioridade, por exemplo, para evitar concessão de crédito arriscado, a Regressão Logística é preferível por ter maior recall (0.96 vs 0.91).

Se o custo de falsos positivos, como por exemplo, negar crédito a bons pagadores for mais crítico, a Random Forest, com sua precision de 1.00, pode ser uma boa escolha.




In [None]:
# Curva ROC

fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba_logreg)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_proba_rf)

plt.figure(figsize=(10, 6))
plt.plot(fpr_lr, tpr_lr, label=f"Logistic Regression (AUC = {roc_auc_score(y_test, y_proba_logreg):.2f})")
plt.plot(fpr_rf, tpr_rf, label=f"Random Forest (AUC = {roc_auc_score(y_test, y_proba_rf):.2f})")
plt.plot([0, 1], [0, 1], "k--", label="No Skill")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.grid(True)
plt.show()

        Insights:

- As curvas da Regressão Logística (azul) e da Random Forest (laranja) estão praticamente sobrepostas, ambas muito próximas do canto superior esquerdo. Isso confirma que ambos os modelos têm desempenho excelente na separação das classes.

- A leve superioridade visual da Random Forest na parte inicial da curva é consistente com o leve ganho no ROC AUC (0.994448 vs. 0.994372), mas essa diferença é insignificante na prática.

- A curva ROC não favorece claramente nenhum dos dois modelos.

In [None]:
# Matriz de Confusão - Comparação direta entre os dois modelos em termos de erros de classificação

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

ConfusionMatrixDisplay.from_predictions(y_test, y_pred_logreg, ax=axes[0])
axes[0].set_title("Logistic Regression")

ConfusionMatrixDisplay.from_predictions(y_test, y_pred_rf, ax=axes[1])
axes[1].set_title("Random Forest")

plt.tight_layout()
plt.show()

Cada célula representa:

- Verdadeiros negativos (TN): [0,0] — classe 0 corretamente prevista. RG = 92.874  | RF = 93.671

- Falsos positivos (FP): [0,1] — classe 0 prevista como 1 (erro). RG = 801  | RF = 4

- Falsos negativos (FN): [1,0] — classe 1 prevista como 0 (erro). RG = 821  | RF = 1.715

- Verdadeiros positivos (TP): [1,1] — classe 1 corretamente prevista. RG = 18.154  | RF = 17.260

        Regressãso Logística:
- Falsos negativos (FN): 821 → Casos de inadimplência que o modelo deixou passar.

        Random Forest:
Falsos negativos (FN): 1.715 → Quase dobro da Regressão Logística.


        Insights:
        
- Se seu objetivo é não liberar crédito para quem vai dar calote (evitar FP):

➤ Random Forest é muito mais conservador.

- Se o foco é não deixar bons clientes de fora (evitar FN):

➤ Regressão Logística pode ser preferida, pois rejeita menos inadimplentes por engano.


In [None]:
# Feature Importance (Random Forest)
rf_model = rf_pipeline.named_steps["classifier"]
importances = rf_model.feature_importances_
feature_names = rf_pipeline.named_steps["preprocessor"].get_feature_names_out()

importances_df = pd.DataFrame({
    "Feature": feature_names,
    "Importance": importances
}).sort_values("Importance", ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(data=importances_df.head(15), x="Importance", y="Feature")
plt.title("Random Forest - Top 15 Feature Importances")
plt.tight_layout()
plt.show()

In [None]:
# Precision-Recall Curve (Para dados desbalanceados):

from sklearn.metrics import precision_recall_curve, average_precision_score
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 6))

# Regressão Logística
y_scores_lr = baseline_model.predict_proba(X_test)[:, 1]
precision_lr, recall_lr, _ = precision_recall_curve(y_test, y_scores_lr)
ap_lr = average_precision_score(y_test, y_scores_lr)
plt.plot(recall_lr, precision_lr, label=f'Logistic Regression (AP = {ap_lr:.2f})')

        Insights:

Essa curva Precision-Recall indica que seu modelo está com desempenho muito bom.
- Alta Precisão (Precision): poucos falsos positivos.

- Alta Revocação (Recall): poucos falsos negativos.        