# 03 — Modeling & Evaluation

Train baseline models and generate plots.

In [None]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve,
    precision_recall_curve, average_precision_score,
    confusion_matrix
)

# Plot styling shared by every figure in this notebook.
plt.rcParams.update({
    "figure.figsize": (8, 5),
    "axes.grid": True,
    "font.size": 11,
})

# Artifacts are read from ../outputs (presumably written by an earlier
# notebook in this series -- confirm). joblib.load unpickles its input, so
# only point it at files you trust.
OUT = Path("..", "outputs")
X_train, X_test, y_train, y_test = joblib.load(OUT / "data_splits.joblib")
preprocessor = joblib.load(OUT / "preprocessor.joblib")


In [None]:
def plot_confusion_matrix(cm, title="Confusion Matrix", labels=("No", "Yes")):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn on the "Actual" axis and columns on
        the "Predicted" axis.
    title : str
        Figure title.
    labels : sequence of str
        Tick labels in class order. They are used on an axis only when their
        count matches that axis' size; otherwise the class indices are shown,
        so a non-2x2 matrix is never mislabeled.
    """
    cm = np.asarray(cm)
    n_rows, n_cols = cm.shape

    def tick_names(count):
        # Fall back to plain indices when the labels cannot cover the axis.
        if len(labels) == count:
            return list(labels)
        return [str(k) for k in range(count)]

    plt.figure()
    plt.imshow(cm, interpolation="nearest")
    # The notebook enables axes.grid globally; on a heatmap those grid lines
    # run straight through the cells and the annotations, so disable them here.
    plt.grid(False)
    plt.title(title)
    plt.colorbar()
    plt.xticks(np.arange(n_cols), tick_names(n_cols))
    plt.yticks(np.arange(n_rows), tick_names(n_rows))
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    # x is the column (predicted) and y is the row (actual).
    for (i, j), count in np.ndenumerate(cm):
        plt.text(j, i, count, ha="center", va="center")
    plt.show()

def plot_roc_curve(y_true, y_proba, title="ROC Curve"):
    """Plot the ROC curve for ``y_proba`` against ``y_true`` and show it.

    The legend carries the ROC-AUC value; a dashed diagonal marks the
    reference line labeled "Random".
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_proba)
    area = roc_auc_score(y_true, y_proba)

    ax = plt.figure().gca()
    ax.plot(false_pos_rate, true_pos_rate, label=f"AUC = {area:.4f}")
    ax.plot([0, 1], [0, 1], linestyle="--", label="Random")
    ax.set_title(title)
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend()
    plt.show()

def plot_pr_curve(y_true, y_proba, title="Precision-Recall Curve"):
    """Plot precision against recall for ``y_proba`` and show the figure.

    The legend carries the average-precision (AP) score.
    """
    precisions, recalls, _thresholds = precision_recall_curve(y_true, y_proba)
    avg_precision = average_precision_score(y_true, y_proba)

    ax = plt.figure().gca()
    ax.plot(recalls, precisions, label=f"AP = {avg_precision:.4f}")
    ax.set_title(title)
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.legend()
    plt.show()

def run_model(model, model_name):
    """Fit ``model`` behind the preprocessor and report test-set metrics.

    A ``Pipeline`` of (preprocessor, model) is fitted on the notebook-level
    ``X_train``/``y_train`` and evaluated on ``X_test``/``y_test``. The
    metrics are printed, the confusion matrix is plotted, and, when the
    fitted pipeline can produce continuous scores, the ROC and
    precision-recall curves are plotted as well.

    Parameters
    ----------
    model : estimator
        Scikit-learn compatible classifier; it is fitted by this call.
    model_name : str
        Label used in the printed report, the plot titles and the returned
        record.

    Returns
    -------
    dict
        Keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc``. ``roc_auc`` is ``None`` when the model exposes neither
        ``predict_proba`` nor ``decision_function``.
    """
    # Function-scope import keeps this helper self-contained in its cell.
    from sklearn.base import clone

    pipe = Pipeline(steps=[
        # Give each model its own unfitted copy: fitting the pipeline would
        # otherwise re-fit the shared module-level preprocessor in place.
        ("preprocess", clone(preprocessor)),
        ("model", model),
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Continuous scores for ranking metrics. Prefer probabilities; fall back
    # to decision_function margins, which roc_auc_score and the curve helpers
    # accept just as well.
    if hasattr(pipe, "predict_proba"):
        # Column 1 is the second class in classes_ order (the positive class
        # for 0/1 labels).
        y_score = pipe.predict_proba(X_test)[:, 1]
    elif hasattr(pipe, "decision_function"):
        y_score = pipe.decision_function(X_test)
    else:
        y_score = None

    acc = accuracy_score(y_test, y_pred)
    # zero_division=0 reports 0 instead of warning when a class is never
    # predicted.
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, y_score) if y_score is not None else None
    cm = confusion_matrix(y_test, y_pred)

    print("\n" + "="*60)
    print(f"MODEL: {model_name}")
    print("="*60)
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")
    if auc is not None:
        print(f"ROC-AUC  : {auc:.4f}")

    plot_confusion_matrix(cm, title=f"{model_name} - Confusion Matrix")
    if y_score is not None:
        plot_roc_curve(y_test, y_score, title=f"{model_name} - ROC Curve")
        plot_pr_curve(y_test, y_score, title=f"{model_name} - PR Curve")

    return {
        "model": model_name,
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "roc_auc": auc,
    }


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# XGBoost
try:
    from xgboost import XGBClassifier
except Exception:
    !pip -q install xgboost
    from xgboost import XGBClassifier

# One metrics record (dict) per model, in the order the models are run.
results = []

# scikit-learn baselines, evaluated one after another.
baseline_models = [
    (
        "Logistic Regression",
        LogisticRegression(max_iter=4000, class_weight="balanced", random_state=42),
    ),
    (
        "Decision Tree",
        DecisionTreeClassifier(random_state=42, class_weight="balanced"),
    ),
    (
        "Random Forest",
        RandomForestClassifier(
            n_estimators=400, random_state=42, class_weight="balanced", n_jobs=-1
        ),
    ),
    (
        "Gradient Boosting",
        GradientBoostingClassifier(random_state=42),
    ),
]
for label, estimator in baseline_models:
    results.append(run_model(estimator, label))

# Negative-to-positive ratio of the training labels, handed to XGBoost as
# scale_pos_weight.
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
spw = float(neg) / float(pos)

xgb_params = dict(
    n_estimators=600,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    random_state=42,
    eval_metric="logloss",
    n_jobs=-1,
    scale_pos_weight=spw,
)
results.append(run_model(XGBClassifier(**xgb_params), "XGBoost"))

# Best ROC-AUC first; the bare expression on the last line renders the table.
results_df = pd.DataFrame(results).sort_values(by="roc_auc", ascending=False)
results_df


In [None]:
# Persist the comparison table next to the other notebook outputs.
metrics_path = OUT / "model_metrics.csv"
results_df.to_csv(metrics_path, index=False)
print("Saved:", metrics_path)

# One bar chart per headline metric: (column, y-axis label, chart title).
comparison_charts = (
    ("roc_auc", "ROC-AUC", "Model Comparison - ROC-AUC"),
    ("f1", "F1 Score", "Model Comparison - F1 Score"),
)
for column, axis_label, chart_title in comparison_charts:
    plt.figure()
    plt.bar(results_df["model"], results_df[column])
    plt.title(chart_title)
    plt.xlabel("Model")
    plt.ylabel(axis_label)
    plt.xticks(rotation=15)
    plt.show()
