1: Imports & Load Splits

In [None]:
# %% Cell 1: Imports & Load Splits

import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

import pickle

# Restore the train/test splits that an earlier run serialised to disk.
# NOTE: pickle executes arbitrary code while loading, so these files must be
# ones produced by this project, never files from an untrusted source.
def _read_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


X_train = _read_pickle("X_train.pkl")
X_test = _read_pickle("X_test.pkl")
y_train = _read_pickle("y_train.pkl")
y_test = _read_pickle("y_test.pkl")

# Echo the shapes as a quick sanity check that features and labels line up.
print("✅ Loaded train/test splits (pruned run).")
print(f"  • X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"  • X_test : {X_test.shape},  y_test : {y_test.shape}")


2: Compute Cost-Complexity Pruning (CCP) Path

In [None]:
# %% Cell 2: Compute CCP Path

# 1. Train a full (unpruned) decision tree on the training set.
#    random_state is pinned so the resulting pruning path is reproducible.
dt_full = DecisionTreeClassifier(random_state=42)
dt_full.fit(X_train, y_train)

# 2. Extract the cost-complexity pruning path: the effective alphas at which
#    subtrees are pruned away, and the impurity reported for each alpha.
path = dt_full.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

# 3. Plot Impurity vs. CCP Alpha (to see how impurity grows with alpha)
plt.figure(figsize=(8, 5))
plt.plot(ccp_alphas, impurities, marker="o", drawstyle="steps-post")
plt.xlabel("ccp_alpha")
plt.ylabel("Impurity (Total Cost-Complexity)")
plt.title("Impurity vs. CCP Alpha (Training Data)")
plt.show()

# 4. Select up to 15 α candidates for grid search (evenly spaced indices).
#    - When the path has fewer than 15 entries, the integer-truncated linspace
#      repeats indices; np.unique drops the repeats so the grid search does not
#      refit identical configurations.
#    - The pruning path has been reported to contain tiny negative alphas caused
#      by floating-point round-off, and DecisionTreeClassifier rejects a
#      negative ccp_alpha, so candidates are clipped at 0 before de-duplicating.
n_alpha_candidates = 15
candidate_idx = np.unique(
    np.linspace(0, len(ccp_alphas) - 1, n_alpha_candidates, dtype=int)
)
alpha_candidates = np.unique(np.clip(ccp_alphas[candidate_idx], 0.0, None))
print("▶️ Selected CCP Alpha candidates:", np.round(alpha_candidates, 5))


3: Hyperparameter Grid Search

In [None]:
# %% Cell 3: Hyperparameter Grid Search

from sklearn.model_selection import GridSearchCV

# Base estimator: a decision tree with a fixed seed so every candidate is
# fitted reproducibly.
dt = DecisionTreeClassifier(random_state=42)

# Search space: tree depth, minimum split size, and the pruning strengths
# chosen from the cost-complexity path in the previous cell.
param_grid = {
    "max_depth": [5, 10, 15, None],
    "min_samples_split": [2, 10, 20],
    "ccp_alpha": [*alpha_candidates],
}

# 5-fold cross-validation scored by ROC-AUC, run on all available cores.
# Train scores are retained so train and validation curves can be compared later.
search_options = {
    "cv": 5,
    "scoring": "roc_auc",
    "n_jobs": -1,
    "return_train_score": True,
}
grid_search = GridSearchCV(dt, param_grid, **search_options)

# Fit every candidate on the training split only; the test split stays unseen.
grid_search.fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated ROC-AUC.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print("▶️ Best Params:", best_params)
print(f"▶️ Best CV ROC-AUC: {best_score:.4f}")


4: Plot CV Results vs. CCP Alpha

In [None]:
# %% Cell 4: CV Curves for CCP Alpha

import pandas as pd
import matplotlib.pyplot as plt

# 1. Convert the grid search results into a DataFrame (one row per candidate)
results = pd.DataFrame(grid_search.cv_results_)

# 2. Filter rows for the best max_depth and min_samples_split.
#    The grid includes max_depth=None, and an element-wise `== None` in pandas
#    is False for every row, which would leave the subset (and the plot) empty.
#    An unlimited depth therefore has to be matched with isna().
best_depth = best_params["max_depth"]
if best_depth is None:
    depth_mask = results["param_max_depth"].isna()
else:
    depth_mask = results["param_max_depth"] == best_depth
mask = depth_mask & (
    results["param_min_samples_split"] == best_params["min_samples_split"]
)
subset = results[mask].sort_values(by="param_ccp_alpha")

# The alpha column may be stored as generic objects; cast once for plotting.
alpha_values = subset["param_ccp_alpha"].astype(float)

# 3. Plot mean_train_score (ROC-AUC) vs. ccp_alpha
plt.figure(figsize=(8, 6))
plt.plot(
    alpha_values,
    subset["mean_train_score"],
    label="Train ROC-AUC",
    marker="o"
)

# 4. Plot mean_test_score (ROC-AUC) vs. ccp_alpha
plt.plot(
    alpha_values,
    subset["mean_test_score"],
    label="Validation ROC-AUC",
    marker="o"
)

# A pure log axis cannot display ccp_alpha = 0 (the unpruned tree, normally part
# of the candidate list), so that point would silently vanish. A symlog axis is
# linear up to the smallest positive alpha and logarithmic beyond it, which
# keeps the zero point visible. With no positive alpha the axis stays linear.
positive_alphas = alpha_values[alpha_values > 0]
if positive_alphas.empty:
    plt.xlabel("ccp_alpha")
else:
    plt.xscale("symlog", linthresh=positive_alphas.min())
    plt.xlabel("ccp_alpha (symlog scale)")
plt.ylabel("ROC-AUC")
plt.title("Train vs. Validation ROC-AUC vs. CCP Alpha")
plt.legend()
plt.show()

# 5. Print the subset DataFrame for reference (showing the columns of interest)
print("\n▶️ Subset of CV results (for best max_depth & min_samples_split):")
print(subset[[
    "param_ccp_alpha", "mean_train_score", "std_train_score",
    "mean_test_score", "std_test_score"
]])


5: Train Final Pruned Tree & Evaluate on Test Set

In [None]:
# %% Cell 5: Train Final Pruned Tree & Evaluate on Test Set

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

# Unpack the winning hyperparameters from the grid search into named values.
alpha_best, depth_best, split_best = (
    best_params[key] for key in ("ccp_alpha", "max_depth", "min_samples_split")
)

# Build the final pruned tree with those settings (same seed as the search)
# and fit it on the complete training split.
dt_pruned = DecisionTreeClassifier(
    ccp_alpha=alpha_best,
    min_samples_split=split_best,
    max_depth=depth_best,
    random_state=42,
)
dt_pruned.fit(X_train, y_train)

# Hard class predictions drive accuracy.
y_pred_pruned = dt_pruned.predict(X_test)
test_acc = accuracy_score(y_test, y_pred_pruned)

# ROC-AUC is computed from the scores in column 1 of predict_proba.
y_proba_pruned = dt_pruned.predict_proba(X_test)[:, 1]
test_roc_auc = roc_auc_score(y_test, y_proba_pruned)

# Report held-out performance.
print("▶️ Pruned Decision Tree Performance on Test Set:")
print(f"  • Test Accuracy: {test_acc:.4f}")
print(f"  • Test ROC-AUC : {test_roc_auc:.4f}")
