# Classificação de Aprovação de Crédito
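Este notebook implementa um pipeline simples de classificação supervisionada — download do dataset CRX (credit-screening, UCI), pré-processamento das features e treino de um RandomForest — seguindo a abordagem descrita por Han et al. (2012) e Géron (2019).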

In [6]:
# %% [markdown]
# # 1️⃣ Configurações iniciais

from pathlib import Path
import pandas as pd
from rf_classifier.dataset import main as download_dataset_main
from rf_classifier.features import main as preprocess_features_main
from rf_classifier.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

# Ensure both data directories exist before any file is written into them.
for data_dir in (RAW_DATA_DIR, PROCESSED_DATA_DIR):
    data_dir.mkdir(parents=True, exist_ok=True)

# Locations of the downloaded dataset and of the derived feature table;
# both are kept under the processed-data directory.
features_path = PROCESSED_DATA_DIR / "features.csv"
dataset_path = PROCESSED_DATA_DIR / "dataset.csv"

# %% [markdown]
# # 2️⃣ Baixar dataset CRX

# Location of the CRX (credit-screening) data file on the UCI archive.
CRX_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data"

# Download the dataset and store it at dataset_path.
download_dataset_main(url=CRX_URL, output_path=dataset_path)

# %% [markdown]
# # 3️⃣ Ajustar nomes de colunas (CRX dataset)

# Names assigned to the 16 comma-separated fields of the CRX file: "A1" .. "A16".
CRX_COLUMNS = [f"A{i}" for i in range(1, 17)]

# This section rewrites dataset_path in place with a header row. If it is run
# again without a fresh download, parsing with header=None would ingest that
# header line as a data row, so inspect the first line before choosing how to
# parse the file.
with open(dataset_path, encoding="utf-8") as fh:
    first_line = fh.readline().strip()
already_has_header = first_line.split(",") == CRX_COLUMNS

# '?' entries are parsed as missing values. header=0 combined with names=
# makes pandas replace an existing header row instead of reading it as data.
df = pd.read_csv(
    dataset_path,
    header=0 if already_has_header else None,
    names=CRX_COLUMNS,
    na_values="?",
)

# Persist the dataset with the column names as its header (overwrites the file).
df.to_csv(dataset_path, index=False)

# Quick sanity check of what was loaded.
print("Colunas do dataset:", df.columns.tolist())
print("Número de linhas:", len(df))

# %% [markdown]
# # 4️⃣ Pré-processar features usando features.py

# Name of the label column; it is handed to the preprocessing step and reused
# afterwards when features and target are separated.
target_col = "A16"

# Run the project's feature pipeline on the dataset file, writing the result
# to features_path.
preprocess_kwargs = {
    "input_path": dataset_path,
    "output_path": features_path,
    "target_col": target_col,
}
preprocess_features_main(**preprocess_kwargs)

# Read the processed features back and report their dimensions and columns.
df_features = pd.read_csv(features_path)
feature_shape = df_features.shape
feature_columns = df_features.columns.tolist()
print("Shape do dataframe de features:", feature_shape)
print("Colunas:", feature_columns)

# %% [markdown]
# # 5️⃣ Separar features e target

# The target is the single label column; every other column is a model input.
y = df_features[target_col]
X = df_features.drop(columns=[target_col])

# Report how many missing values remain in the inputs and in the target.
missing_in_X = X.isna().sum().sum()
missing_in_y = y.isna().sum()
print("Número de missing values em X:", missing_in_X)
print("Número de missing values em y:", missing_in_y)

# %% [markdown]
# # 6️⃣ Treinar modelo RandomForest (exemplo)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split/model settings kept together so they are easy to find and tune.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Hold out a fraction of the rows for evaluation. stratify=y keeps the class
# proportions of the target the same in the train and test partitions, so the
# reported metrics are not skewed by an unlucky random split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# Fit a random forest with default hyperparameters; the fixed seed makes the
# fitted model reproducible across runs.
model = RandomForestClassifier(random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# Evaluate on the held-out test set.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Colunas do dataset: ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
Número de linhas: 690
[32m2025-10-06 15:21:56.976[0m | [1mINFO    [0m | [36mrf_classifier.features[0m:[36mmain[0m:[36m35[0m - [1mCarregando dataset de C:\Users\leona\Documents\dev\mesc-ia-codes\MESC-IA-Projetos-de-IA\basic_01_credit_approval_random_forest_ccds\basic_01_rf_classifier\data\processed\dataset.csv...[0m
[32m2025-10-06 15:21:56.976[0m | [1mINFO    [0m | [36mrf_classifier.features[0m:[36mmain[0m:[36m38[0m - [1mPré-processando features...[0m
[32m2025-10-06 15:21:56.994[0m | [32m[1mSUCCESS [0m | [36mrf_classifier.features[0m:[36mmain[0m:[36m45[0m - [32m[1mFeatures salvas em C:\Users\leona\Documents\dev\mesc-ia-codes\MESC-IA-Projetos-de-IA\basic_01_credit_approval_random_forest_ccds\basic_01_rf_classifier\data\processed\features.csv[0m
Shape do dataframe de features: (690, 16)
Colunas: ['A1', 'A2', 'A3', 'A4', 'A5', 'A6