In [7]:
# ================== TRAIN_MODEL.IPYNB ==================

import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

# ---------------- LOAD DATA ----------------
TARGET = "Default"   # label column name; the lookup is case-sensitive

# LoanID is only a row identifier, so it is discarded when the file has it;
# errors="ignore" makes the drop a no-op when the column is absent.
df = pd.read_csv("Loan_default.csv").drop(columns=["LoanID"], errors="ignore")

# Split the frame into the label (y) and every remaining column (X).
y = df[TARGET]
X = df.drop(columns=[TARGET])

# ---------------- COLUMN TYPES ----------------
# Partition the features into two complementary groups so that every column
# of X reaches the model. Matching on exact dtypes ("int64", "float64",
# "object") would leave out int32/float32/bool/category/string columns, and
# a ColumnTransformer drops any column it was not told about by default.
#   num_cols: every numeric dtype (scaled downstream)
#   cat_cols: everything else (one-hot encoded downstream)
num_cols = X.select_dtypes(include="number").columns
cat_cols = X.select_dtypes(exclude="number").columns

print("Numerical columns:", list(num_cols))
print("Categorical columns:", list(cat_cols))

# ---------------- PREPROCESSOR ----------------
# Numeric features are standardized.
numeric_step = ("num", StandardScaler(), num_cols)

# Categorical features are one-hot encoded into a dense array; categories not
# seen during fit are ignored at transform time instead of raising.
categorical_step = (
    "cat",
    OneHotEncoder(handle_unknown="ignore", sparse_output=False),
    cat_cols,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


# ---------------- MODEL ----------------
# XGBoost hyperparameters, collected in one place.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 5,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "logloss",
    "random_state": 42,
}
model = XGBClassifier(**xgb_params)

# Chain the preprocessing and the classifier into a single estimator.
pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

# ---------------- TRAIN / TEST ----------------
# Hold out 20% of the rows for testing. The split is stratified on y and
# uses a fixed seed so it is repeatable.
split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Fit the preprocessing and the classifier on the training part only.
pipeline.fit(X_train, y_train)

# ---------------- EVALUATE ----------------
# Score the held-out rows using column index 1 of the predicted
# probabilities (presumably the Default == 1 class; confirm via
# pipeline.classes_).
probs = pipeline.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, probs)
print("ROC-AUC:", auc)

# ---------------- SAVE ----------------
# Serialize the whole pipeline object (preprocessor + model) to one file.
model_path = "credit_model.pkl"
joblib.dump(pipeline, model_path)
print("✅ Pipeline model saved as credit_model.pkl")


Numerical columns: ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']
Categorical columns: ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']
ROC-AUC: 0.7587327260236107
✅ Pipeline model saved as credit_model.pkl
