# Classificação de Aprovação de Crédito
Este notebook implementa um pipeline simples de classificação supervisionada, como proposto por Han et al. (2012) e Géron (2019).

In [8]:
# %% [markdown]
# # 1️⃣ Configurações iniciais

import warnings
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

from rf_classifier.config import RAW_DATA_DIR, PROCESSED_DATA_DIR
from rf_classifier.dataset import main as download_dataset_main
from rf_classifier.features import main as preprocess_features_main

# Paths
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

dataset_path = PROCESSED_DATA_DIR / "dataset.csv"
features_path = PROCESSED_DATA_DIR / "features.csv"


In [9]:

# %% [markdown]
# # 2️⃣ Baixar dataset CRX

# Baixar dataset e salvar em dataset_path
download_dataset_main(
    url="https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data",
    output_path=dataset_path
)

# %% [markdown]
# # 3️⃣ Ajustar nomes de colunas (CRX dataset)

CRX_COLUMNS = [
    "A1","A2","A3","A4","A5","A6","A7","A8","A9","A10",
    "A11","A12","A13","A14","A15","A16"
]

# Ler CSV sem header e atribuir nomes corretos
df = pd.read_csv(dataset_path, header=None, names=CRX_COLUMNS, na_values='?')

# Salvar novamente com cabeçalhos corretos
df.to_csv(dataset_path, index=False)

# Validar
print("Colunas do dataset:", df.columns.tolist())
print("Número de linhas:", len(df))


[32m2025-10-06 21:22:16.292[0m | [1mINFO    [0m | [36mrf_classifier.dataset[0m:[36mdownload_dataset[0m:[36m16[0m - [1mBaixando dataset de https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data ...[0m
[32m2025-10-06 21:22:16.968[0m | [1mINFO    [0m | [36mrf_classifier.dataset[0m:[36mdownload_dataset[0m:[36m20[0m - [1mDataset baixado com shape (689, 16)[0m


Processing dataset: 100%|██████████| 689/689 [00:00<?, ?it/s]

[32m2025-10-06 21:22:16.968[0m | [1mINFO    [0m | [36mrf_classifier.dataset[0m:[36mmain[0m:[36m42[0m - [1mMeio do processo alcançado...[0m
[32m2025-10-06 21:22:16.984[0m | [32m[1mSUCCESS [0m | [36mrf_classifier.dataset[0m:[36mmain[0m:[36m46[0m - [32m[1mDataset salvo em C:\Users\leona\Documents\dev\mesc-ia-codes\MESC-IA-Projetos-de-IA\basic_01_credit_approval_random_forest_ccds\basic_01_rf_classifier\data\processed\dataset.csv[0m
Colunas do dataset: ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
Número de linhas: 690





In [10]:
df.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.83,0.0,u,g,w,v,1.25,t,t.1,1,f,g.1,202.0,0.1,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560.0,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824.0,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3.0,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0.0,+


In [11]:

# %% [markdown]
# # 4️⃣ Pré-processar features usando features.py

target_col = "A16"

preprocess_features_main(
    input_path=dataset_path,
    output_path=features_path,
    target_col=target_col
)


[32m2025-10-06 21:22:17.061[0m | [1mINFO    [0m | [36mrf_classifier.features[0m:[36mmain[0m:[36m28[0m - [1mCarregando dataset de C:\Users\leona\Documents\dev\mesc-ia-codes\MESC-IA-Projetos-de-IA\basic_01_credit_approval_random_forest_ccds\basic_01_rf_classifier\data\processed\dataset.csv...[0m
[32m2025-10-06 21:22:17.064[0m | [1mINFO    [0m | [36mrf_classifier.features[0m:[36mmain[0m:[36m31[0m - [1mPré-processando features...[0m
[32m2025-10-06 21:22:17.089[0m | [32m[1mSUCCESS [0m | [36mrf_classifier.features[0m:[36mmain[0m:[36m38[0m - [32m[1mFeatures salvas em C:\Users\leona\Documents\dev\mesc-ia-codes\MESC-IA-Projetos-de-IA\basic_01_credit_approval_random_forest_ccds\basic_01_rf_classifier\data\processed\features.csv[0m


In [12]:

# Carregar features processadas
df_features = pd.read_csv(features_path)
print("Shape do dataframe de features:", df_features.shape)
print("Colunas:", df_features.columns.tolist())


Shape do dataframe de features: (691, 16)
Colunas: ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']


In [13]:

# %% [markdown]
# # 5️⃣ Separar features e target

X = df_features.drop(columns=[target_col])
y = df_features[target_col]

# Validar dados
print("Número de missing values em X:", X.isna().sum().sum())
print("Número de missing values em y:", y.isna().sum())


Número de missing values em X: 0
Número de missing values em y: 0


In [14]:

# %% [markdown]
# # 6️⃣ Treinar modelo RandomForest (exemplo)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Treinar modelo
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Avaliar no conjunto de teste
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8633093525179856
              precision    recall  f1-score   support

           +       0.85      0.87      0.86        67
           -       0.87      0.86      0.87        72

    accuracy                           0.86       139
   macro avg       0.86      0.86      0.86       139
weighted avg       0.86      0.86      0.86       139

