# Model Training
## Credit Default Prediction

This notebook trains multiple machine learning models
using a consistent preprocessing and evaluation pipeline.

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

In [3]:
import sys
import os

# Resolve the parent of the current working directory and make it importable,
# so packages that live there (e.g. `src`) can be imported by later cells.
# The count check keeps re-runs of this cell from appending duplicates.
PROJECT_ROOT = os.path.abspath("..")
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.append(PROJECT_ROOT)

In [4]:
from src.preprocessing import (
    load_data, clean_data, encode_features, split_features_target
)
from src.models import get_models

In [6]:
# Read the credit-risk CSV and pass it through the project's cleaning and
# encoding helpers in one expression (load -> clean -> encode).
df = encode_features(
    clean_data(
        load_data("../data/credit_risk_dataset.csv")
    )
)

# Separate the prepared frame into the model inputs (X) and the target (y).
X, y = split_features_target(df)

## Train-Test Split

In [7]:
# Split settings gathered in one place:
#   test_size=0.2    -> 20% of the rows are held out for evaluation
#   stratify=y       -> split is stratified on the target labels
#   random_state=42  -> fixed seed so the split is reproducible
split_options = {"test_size": 0.2, "stratify": y, "random_state": 42}

X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Model Training and Initial Evaluation

In [8]:
# Fit each candidate model on the training split and record its ROC AUC on
# the held-out test split, keyed by the model's display name.
models = get_models()
results = {}

for name in models:
    model = models[name]
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is used as the positive-class score
    # (assumes a binary target -- TODO confirm against split_features_target).
    probs = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, probs)
    results[name] = auc

results

{'Logistic Regression': np.float64(0.8614435794192445),
 'Random Forest': np.float64(0.9279665833807291),
 'Gradient Boosting': np.float64(0.9231729876999846),
 'SVM': np.float64(0.8893236562891778),
 'XGBoost': np.float64(0.9434591865083606)}

### Initial Results
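- Ensemble models (test ROC AUC ≈ 0.92–0.94) outperform Logistic Regression (≈ 0.86) and the SVM (≈ 0.89)
- XGBoost shows the strongest performance (≈ 0.943), followed by Random Forest (≈ 0.928) and Gradient Boosting (≈ 0.923)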
