# Classificação de Aprovação de Crédito
Este notebook implementa um pipeline simples de classificação supervisionada, como proposto por Han et al. (2012) e Géron (2019).

In [1]:
# %% [markdown]
# # 1️⃣ Configurações iniciais

from pathlib import Path
import pandas as pd
from knn_classifier.dataset import main as download_dataset_main
from knn_classifier.features import main as preprocess_features_main
from knn_classifier.config import RAW_DATA_DIR, PROCESSED_DATA_DIR

# Paths
RAW_DATA_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

dataset_path = PROCESSED_DATA_DIR / "dataset.csv"
features_path = PROCESSED_DATA_DIR / "features.csv"

# %% [markdown]
# # 2️⃣ Baixar dataset CRX

# Baixar dataset e salvar em dataset_path
download_dataset_main(
    url="https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data",
    output_path=dataset_path
)

# %% [markdown]
# # 3️⃣ Ajustar nomes de colunas (CRX dataset)

CRX_COLUMNS = [
    "A1","A2","A3","A4","A5","A6","A7","A8","A9","A10",
    "A11","A12","A13","A14","A15","A16"
]

# Ler CSV sem header e atribuir nomes corretos
df = pd.read_csv(dataset_path, header=None, names=CRX_COLUMNS, na_values='?')

# Salvar novamente com cabeçalhos corretos
df.to_csv(dataset_path, index=False)

# Validar
print("Colunas do dataset:", df.columns.tolist())
print("Número de linhas:", len(df))

# %% [markdown]
# # 4️⃣ Pré-processar features usando features.py

target_col = "A16"

preprocess_features_main(
    input_path=dataset_path,
    output_path=features_path,
    target_col=target_col
)

# Carregar features processadas
df_features = pd.read_csv(features_path)
print("Shape do dataframe de features:", df_features.shape)
print("Colunas:", df_features.columns.tolist())

# %% [markdown]
# # 5️⃣ Separar features e target

X = df_features.drop(columns=[target_col])
y = df_features[target_col]

# Validar dados
print("Número de missing values em X:", X.isna().sum().sum())
print("Número de missing values em y:", y.isna().sum())

# %% [markdown]
# # 6️⃣ Treinar modelo KNN (K-Nearest Neighbors)

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Separar features e target
X = df_features.drop(columns=[target_col])
y = df_features[target_col]

# Dividir dataset em treino e teste
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Instanciar modelo KNN
knn_model = KNeighborsClassifier(
    n_neighbors=5,   # número de vizinhos
    weights='distance',  # ponderar pelos vizinhos mais próximos
    metric='minkowski',  # métrica padrão (equivale à Euclidiana para p=2)
    p=2
)

# Treinar modelo
knn_model.fit(X_train, y_train)

# Avaliar no conjunto de teste
y_pred = knn_model.predict(X_test)

print("🔸 Accuracy:", accuracy_score(y_test, y_pred))
print("🔸 Classification Report:")
print(classification_report(y_test, y_pred))


[32m2025-10-06 19:06:40.659[0m | [1mINFO    [0m | [36mknn_classifier.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Users\leona\Documents\dev\mesc-ia-codes\MESC-IA-Projetos-de-IA\basic_02_credit_approval_knn_ccds\basic_01_knn_classifier[0m


[32m2025-10-06 19:06:41.439[0m | [1mINFO    [0m | [36mknn_classifier.dataset[0m:[36mdownload_dataset[0m:[36m16[0m - [1mBaixando dataset de https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data ...[0m
[32m2025-10-06 19:06:42.131[0m | [1mINFO    [0m | [36mknn_classifier.dataset[0m:[36mdownload_dataset[0m:[36m20[0m - [1mDataset baixado com shape (689, 16)[0m


Processing dataset: 100%|██████████| 689/689 [00:00<?, ?it/s]

[32m2025-10-06 19:06:42.131[0m | [1mINFO    [0m | [36mknn_classifier.dataset[0m:[36mmain[0m:[36m42[0m - [1mMeio do processo alcançado...[0m
[32m2025-10-06 19:06:42.146[0m | [32m[1mSUCCESS [0m | [36mknn_classifier.dataset[0m:[36mmain[0m:[36m46[0m - [32m[1mDataset salvo em C:\Users\leona\Documents\dev\mesc-ia-codes\MESC-IA-Projetos-de-IA\basic_02_credit_approval_knn_ccds\basic_01_knn_classifier\data\processed\dataset.csv[0m
Colunas do dataset: ['A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'A16']
Número de linhas: 690
[32m2025-10-06 19:06:42.146[0m | [1mINFO    [0m | [36mknn_classifier.features[0m:[36mmain[0m:[36m74[0m - [1mLendo dataset de C:\Users\leona\Documents\dev\mesc-ia-codes\MESC-IA-Projetos-de-IA\basic_02_credit_approval_knn_ccds\basic_01_knn_classifier\data\processed\dataset.csv ...[0m
[32m2025-10-06 19:06:42.146[0m | [1mINFO    [0m | [36mknn_classifier.features[0m:[36mmain[0m:[36m7




[32m2025-10-06 19:06:42.462[0m | [32m[1mSUCCESS [0m | [36mknn_classifier.features[0m:[36mmain[0m:[36m84[0m - [32m[1mFeatures salvas em C:\Users\leona\Documents\dev\mesc-ia-codes\MESC-IA-Projetos-de-IA\basic_02_credit_approval_knn_ccds\basic_01_knn_classifier\data\processed\features.csv[0m
Shape do dataframe de features: (691, 1195)
Colunas: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '10

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.