1: Imports & Load Train/Test Splits

In [None]:
# Imports & Load Train/Test Splits

import pickle
import pandas as pd
import numpy as np

# Models for comparison
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Metrics
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

# Visualization (for tree)
from sklearn import tree as sktree
import matplotlib.pyplot as plt

# 1. Read the four pickled train/test splits from the working directory.
#    X_train.pkl, X_test.pkl, y_train.pkl and y_test.pkl must already exist;
#    they are read in exactly that order.
split_objects = []
for split_name in ("X_train", "X_test", "y_train", "y_test"):
    with open(f"{split_name}.pkl", "rb") as fh:
        split_objects.append(pickle.load(fh))
X_train, X_test, y_train, y_test = split_objects

# Report the shapes so a mismatched or stale split is noticed immediately.
for report_line in (
    "✅ Reloaded train/test splits:",
    f"  • X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"  • X_test : {X_test.shape},  y_test : {y_test.shape}",
):
    print(report_line)


2: Retrain Final Pruned Decision Tree

In [None]:
# Retrain & Evaluate Final Pruned Decision Tree

# Best hyperparameters from Member 3: the pruning alpha (passed as ccp_alpha),
# the depth cap (max_depth) and the split threshold (min_samples_split).
alpha_best, depth_best, split_best = 9.679368290900915e-05, 10, 20

# 1-2. Build the pruned tree with those settings and fit it on the training
#      split in one step (fit() returns the fitted estimator itself).
dt_pruned = DecisionTreeClassifier(
    ccp_alpha=alpha_best,
    min_samples_split=split_best,
    max_depth=depth_best,
    random_state=42,
).fit(X_train, y_train)

# 3. Score on the held-out test split. Column 1 of predict_proba is used as
#    the positive-class score for ROC-AUC.
y_pred_pruned = dt_pruned.predict(X_test)
y_proba_pruned = dt_pruned.predict_proba(X_test)[:, 1]
pruned_acc = accuracy_score(y_test, y_pred_pruned)
pruned_roc_auc = roc_auc_score(y_test, y_proba_pruned)

# Emit the whole report in one call; one line per entry.
print(
    "▶️ Pruned Decision Tree Performance on Test Set:",
    f"  • Test Accuracy: {pruned_acc:.4f}",
    f"  • Test ROC-AUC : {pruned_roc_auc:.4f}",
    "\n▶️ Pruned Tree Sample Counts (sanity check):",
    f"  • Number of nodes: {dt_pruned.tree_.node_count}",
    sep="\n",
)


3: Train & Evaluate Random Forest

In [None]:
# Train & Evaluate Random Forest

# 1-2. A 100-tree forest with a fixed seed, using every available core
#      (n_jobs=-1), fitted on the same training split as the pruned tree.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# 3. Hard predictions plus column 1 of predict_proba as the ROC-AUC score.
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

# 4. Test-set metrics.
rf_acc = accuracy_score(y_test, y_pred_rf)
rf_roc_auc = roc_auc_score(y_test, y_proba_rf)
rf_cm = confusion_matrix(y_test, y_pred_rf)

# 5. Report: metrics first, then the confusion matrix under its own header.
print(
    "▶️ Random Forest Performance on Test Set:",
    f"  • Test Accuracy: {rf_acc:.4f}",
    f"  • Test ROC-AUC : {rf_roc_auc:.4f}",
    "\n▶️ Random Forest Confusion Matrix:",
    rf_cm,
    sep="\n",
)


4: Visualize & Interpret the Pruned Tree

In [None]:
# Visualize & Interpret the Pruned Tree

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Depth limit for the drawing only (the fitted model is untouched).
# Adjust to show more or fewer levels; the title below follows this value.
plot_max_depth = 3

plt.figure(figsize=(18, 10))
plot_tree(
    dt_pruned,
    # Pass a plain list rather than the pandas Index: plot_tree documents a
    # list of str here, and this matches how feature names are built for the
    # decision-path printout further down.
    feature_names=list(X_train.columns),
    # NOTE(review): assumes dt_pruned.classes_ is ordered (negative, positive)
    # so that "No"/"Yes" line up with the class indices — confirm.
    class_names=["No", "Yes"],
    filled=True,
    rounded=True,
    max_depth=plot_max_depth,
    fontsize=12,
)
plt.title(f"Pruned Decision Tree (first {plot_max_depth} levels)", fontsize=16)
plt.show()


Business Interpretation

In [None]:
from sklearn.tree import _tree
import numpy as np
from collections import Counter, defaultdict



def print_decision_paths_with_counts(tree_model, X_train, y_train, feature_names):
    """Print every root-to-leaf rule of a fitted tree with training counts.

    For each leaf, prints the conjunction of split conditions leading to it,
    followed by how many rows of ``X_train`` land in that leaf, how many of
    those have label ``1`` ("Yes") and how many do not ("No").  The printed
    prediction is the majority of those counts, with ties reported as "No".

    Args:
        tree_model: Fitted tree exposing ``apply(X)`` (leaf id per row) and a
            ``tree_`` attribute with ``feature``, ``threshold``,
            ``children_left`` and ``children_right`` arrays.
        X_train: Rows passed to ``tree_model.apply``.
        y_train: Labels aligned with ``X_train``; iterating it must yield one
            label per row (e.g. a 1-D array or Series), positive class == 1.
        feature_names: Sequence mapping a feature index to its display name.

    Returns:
        None. The rules are written to standard output.
    """
    # Leaf id reached by each training row.
    leaf_indices = tree_model.apply(X_train)

    # Rows per leaf, and positive-label rows per leaf.
    total_counts = Counter(leaf_indices)
    yes_counts = Counter(
        leaf for leaf, label in zip(leaf_indices, y_train) if label == 1
    )

    structure = tree_model.tree_
    children_left = structure.children_left
    children_right = structure.children_right

    def recurse(node, path):
        left, right = children_left[node], children_right[node]

        # A split node has two distinct children; a leaf stores the same
        # sentinel in both slots.  Using the public child arrays avoids
        # depending on constants from the private sklearn.tree._tree module.
        if left != right:
            name = feature_names[structure.feature[node]]
            threshold = structure.threshold[node]
            # Left branch: feature ≤ threshold; right branch: feature > threshold.
            recurse(left, path + [f"{name} ≤ {threshold:.3f}"])
            recurse(right, path + [f"{name} > {threshold:.3f}"])
            return

        # Leaf: look up the training counts gathered above (0 if no row
        # reached this leaf).
        total = total_counts.get(node, 0)
        yes_count = yes_counts.get(node, 0)
        no_count = total - yes_count
        prediction = 'Yes' if yes_count > no_count else 'No'

        print("→ " + " AND ".join(path))
        print(f"  → Leaf Node: Predict = {prediction} "
              f"(samples = {total}, No = {no_count}, Yes = {yes_count})\n")

    # Start at the root node (index 0) with an empty rule.
    recurse(0, [])

# === Usage ===
# dt_pruned must already be fitted on X_train / y_train before this runs.
feature_list = list(X_train.columns)
print_decision_paths_with_counts(
    tree_model=dt_pruned,
    X_train=X_train,
    y_train=y_train,
    feature_names=feature_list,
)
