# Decision Trees Category - Kaggle Playground Series S5E8

**Category**: Decision Trees  
**Sub-models**: DecisionTreeClassifier variants (balanced class weights, gini, entropy, log_loss), each tuned over the same max_depth / min_samples_leaf grid  
**Split Strategy**: 70/30 stratified split  
**Cross-Validation**: 5-fold StratifiedKFold  
**Random Seed**: 42  
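**Artifact Paths**: `../outputs/trees/<model_name>/` (with `logs/`, `models/`, `figures/` subfolders) and `../outputs/trees/summary.csv`  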

This notebook compares different decision tree variants using the same data preprocessing and evaluation protocol.

In [None]:
# Bootstrap installation and imports
%pip install numpy pandas scikit-learn matplotlib --quiet

import json
import os
import pickle
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# calibration_curve is defined in sklearn.calibration; sklearn.metrics does not
# export it, so importing it from there fails with ImportError.
from sklearn.calibration import calibration_curve
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    roc_auc_score, average_precision_score, f1_score, accuracy_score,
    precision_score, recall_score, log_loss, roc_curve, precision_recall_curve,
    confusion_matrix,
)
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# NOTE(review): this creates 'outputs' under the current working directory,
# while the later cells write to '../outputs/trees' — confirm which location
# is intended.
os.makedirs('outputs', exist_ok=True)

# Set random seeds (NumPy's global generator and the stdlib generator)
np.random.seed(42)
random.seed(42)

print("Decision Trees Category - Setup Complete")

In [None]:
# Read the competition CSVs and carve a stratified hold-out set out of train
train_df = pd.read_csv('../playground-series-s5e8/train.csv')
test_df = pd.read_csv('../playground-series-s5e8/test.csv')

# Every column other than the row id and the label is used as a model input.
# NOTE(review): assumes train.csv carries the label in a column named
# 'target' — confirm against the file.
feature_cols = [c for c in train_df.columns if c not in ('id', 'target')]
X = train_df[feature_cols]
y = train_df['target']

# 70% goes to the training pool (used for grid search / CV), 30% is held out;
# stratify=y keeps the class proportions equal in both parts.
X_train_pool, X_test_holdout, y_train_pool, y_test_holdout = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Shared 5-fold splitter reused by the grid search and the per-fold analysis
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(f"Data loaded: {X_train_pool.shape} train, {X_test_holdout.shape} test")
print(f"Features: {len(feature_cols)}")

In [None]:
# Decision Tree variants to compare. Each entry pairs an unfitted estimator
# with the hyper-parameter grid searched for it; the grid keys carry the
# 'classifier__' prefix because the tree is the 'classifier' step of the
# pipeline built by create_pipeline().
# NOTE(review): scikit-learn documents 'entropy' and 'log_loss' as the same
# Shannon information-gain criterion — confirm both variants are wanted.
models_config = {
    f'DecisionTree_{variant}': {
        'estimator': DecisionTreeClassifier(random_state=42, **tree_kwargs),
        # A fresh grid dict is built for every variant (no shared lists).
        'param_grid': {
            'classifier__max_depth': [4, 8, 12],
            'classifier__min_samples_leaf': [1, 5, 20],
        },
    }
    for variant, tree_kwargs in (
        ('balanced', {'class_weight': 'balanced'}),
        ('gini', {'criterion': 'gini'}),
        ('entropy', {'criterion': 'entropy'}),
        ('log_loss', {'criterion': 'log_loss'}),
    )
}

print(f"Configured {len(models_config)} Decision Tree variants:")
for name in models_config:
    print(f"  - {name}")

In [None]:
# Helper functions
def create_pipeline(estimator):
    """Wrap *estimator* in a two-step pipeline: median imputation, then the model.

    No scaling step is included (tree splits do not depend on feature scale).
    The estimator object is placed in the pipeline as-is, not copied.
    """
    steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('classifier', estimator),
    ]
    return Pipeline(steps)

def get_probabilities(estimator, X):
    """Return column 1 of ``estimator.predict_proba(X)``.

    NOTE(review): column 1 is presumably the positive class of a binary
    problem — confirm the label encoding.
    """
    class_probabilities = estimator.predict_proba(X)
    return class_probabilities[:, 1]

def compute_metrics(y_true, y_prob, threshold=0.5):
    """Score predicted probabilities against the true labels.

    Args:
        y_true: true labels.
        y_prob: predicted scores; must support ``>=`` broadcasting and
            ``.astype`` (e.g. a NumPy array).
        threshold: scores greater than or equal to this value are labelled 1.

    Returns:
        dict with keys ``roc_auc``, ``average_precision``, ``f1``,
        ``accuracy``, ``precision``, ``recall`` and ``logloss``.
    """
    hard_labels = (y_prob >= threshold).astype(int)

    # Ranking metrics are computed from the raw scores ...
    scores = {
        'roc_auc': roc_auc_score(y_true, y_prob),
        'average_precision': average_precision_score(y_true, y_prob),
    }

    # ... the classification metrics from the thresholded labels ...
    label_metrics = (
        ('f1', f1_score),
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )
    for key, metric in label_metrics:
        scores[key] = metric(y_true, hard_labels)

    # ... and log loss again from the raw scores.
    scores['logloss'] = log_loss(y_true, y_prob)
    return scores

def find_best_threshold(y_true, y_prob):
    """Return the score cut-off that maximises Youden's J (TPR - FPR).

    Args:
        y_true: true binary labels.
        y_prob: predicted scores for the positive class.

    Returns:
        The entry of ``roc_curve``'s threshold array with the largest
        ``tpr - fpr``, never the leading sentinel entry.
    """
    fpr, tpr, thresholds = roc_curve(y_true, y_prob)
    j_scores = tpr - fpr

    # roc_curve prepends a sentinel threshold (infinity in recent scikit-learn,
    # max score + 1 in older releases) for the point (0, 0) where nothing is
    # predicted positive. Its J is 0, so when no real cut-off beats chance
    # argmax would select it and return a non-usable threshold. Search only the
    # real thresholds; the result is unchanged whenever some cut-off has J > 0.
    if len(thresholds) > 1:
        best_idx = 1 + np.argmax(j_scores[1:])
    else:
        best_idx = 0
    return thresholds[best_idx]

print("Helper functions defined")

In [None]:
# Main evaluation loop: for every configured variant, tune hyper-parameters by
# grid search, re-run the CV folds with the best parameters to collect per-fold
# metrics/curves/thresholds, score the hold-out set, and persist the artifacts.
from sklearn.base import clone  # local import: only this cell needs it

results = {}

for model_name, config in models_config.items():
    print(f"\nEvaluating: {model_name}")

    model_dir = f"../outputs/trees/{model_name}"
    for subdir in ('logs', 'models', 'figures'):
        os.makedirs(f"{model_dir}/{subdir}", exist_ok=True)

    pipeline = create_pipeline(config['estimator'])

    grid_search = GridSearchCV(
        pipeline, config['param_grid'], cv=cv,
        scoring='roc_auc', n_jobs=-1, verbose=1
    )

    grid_search.fit(X_train_pool, y_train_pool)
    best_pipeline = grid_search.best_estimator_

    print(f"Best params: {grid_search.best_params_}")
    print(f"Best CV AUC: {grid_search.best_score_:.4f}")

    # CV analysis
    cv_metrics = []
    cv_roc_curves = []
    cv_pr_curves = []
    cv_thresholds = []

    for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X_train_pool, y_train_pool)):
        X_fold_train = X_train_pool.iloc[train_idx]
        X_fold_val = X_train_pool.iloc[val_idx]
        y_fold_train = y_train_pool.iloc[train_idx]
        y_fold_val = y_train_pool.iloc[val_idx]

        # clone() returns an unfitted copy that carries the tuned
        # hyper-parameters. Fitting it per fold therefore leaves both the
        # final model and the estimator stored in models_config untouched
        # (previously the shared config estimator was re-parameterised and
        # refitted in place on every fold).
        fold_pipeline = clone(best_pipeline)
        fold_pipeline.fit(X_fold_train, y_fold_train)

        y_val_prob = get_probabilities(fold_pipeline, X_fold_val)
        # The threshold is chosen on the same validation fold it is scored on.
        best_threshold = find_best_threshold(y_fold_val, y_val_prob)
        cv_thresholds.append(best_threshold)

        fold_metrics = compute_metrics(y_fold_val, y_val_prob, best_threshold)
        fold_metrics['fold'] = fold_idx + 1
        fold_metrics['threshold'] = best_threshold
        cv_metrics.append(fold_metrics)

        fpr, tpr, _ = roc_curve(y_fold_val, y_val_prob)
        precision, recall, _ = precision_recall_curve(y_fold_val, y_val_prob)
        cv_roc_curves.append((fpr, tpr))
        cv_pr_curves.append((precision, recall))

    # Test evaluation: apply the average of the per-fold thresholds to the
    # hold-out predictions of the refitted best pipeline.
    mean_threshold = np.mean(cv_thresholds)
    y_test_prob = get_probabilities(best_pipeline, X_test_holdout)
    test_metrics = compute_metrics(y_test_holdout, y_test_prob, mean_threshold)
    test_metrics['chosen_threshold'] = mean_threshold
    test_metrics['confusion_matrix'] = confusion_matrix(
        y_test_holdout, (y_test_prob >= mean_threshold).astype(int)
    ).tolist()  # plain nested lists so the dict stays JSON-serialisable

    print(f"Test AUC: {test_metrics['roc_auc']:.4f}")

    # Store results
    results[model_name] = {
        'cv_metrics': cv_metrics,
        'test_metrics': test_metrics,
        'cv_roc_curves': cv_roc_curves,
        'cv_pr_curves': cv_pr_curves,
        'best_params': grid_search.best_params_,
        'model_dir': model_dir
    }

    # Save artifacts
    cv_df = pd.DataFrame(cv_metrics)
    cv_df.to_csv(f"{model_dir}/logs/cv_metrics.csv", index=False)

    with open(f"{model_dir}/logs/test_metrics.json", 'w', encoding='utf-8') as f:
        json.dump(test_metrics, f, indent=2)

    with open(f"{model_dir}/models/final_model.pkl", 'wb') as f:
        pickle.dump(best_pipeline, f)

print("\nAll Decision Tree models evaluated!")

In [None]:
# Generate figures with tree-specific plots: per model, a feature-importance
# bar chart, a depth-limited tree diagram and the cross-validated ROC curve.
for model_name, model_results in results.items():
    model_dir = model_results['model_dir']

    print(f"Generating figures for {model_name}...")

    # Load final model. pickle.load executes code embedded in the file, so
    # only load pickles from a trusted location.
    with open(f"{model_dir}/models/final_model.pkl", 'rb') as f:
        final_model = pickle.load(f)

    tree_estimator = final_model.named_steps['classifier']

    # 1. Feature importance
    plt.figure(figsize=(10, 6))

    importances = tree_estimator.feature_importances_
    indices = np.argsort(importances)[-20:]  # Top 20 (ascending order)

    # NOTE(review): labelling by feature_cols assumes the imputer step kept
    # every input column in its original position — confirm.
    plt.barh(range(len(indices)), importances[indices], alpha=0.7)
    plt.yticks(range(len(indices)), [feature_cols[i] for i in indices])
    plt.xlabel('Feature Importance (Gini/Entropy Reduction)')
    plt.title(f'Top 20 Feature Importances - {model_name}')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/feature_importance.png", dpi=200, bbox_inches='tight')
    plt.close()

    # 2. Tree visualization (simplified for top levels)
    plt.figure(figsize=(15, 10))

    try:
        # Plot only first few levels to avoid overcrowding
        plot_tree(tree_estimator,
                  feature_names=feature_cols,
                  class_names=['Negative', 'Positive'],
                  filled=True,
                  max_depth=3,  # Limit depth for readability
                  fontsize=8)
        plt.title(f'Decision Tree Structure (Top 3 Levels) - {model_name}')
    except Exception as exc:
        # Best-effort figure: keep going with a placeholder, but report the
        # actual error and let KeyboardInterrupt/SystemExit propagate (a bare
        # except would swallow those as well).
        print(f"  Tree visualization failed for {model_name}: {exc!r}")
        plt.text(0.5, 0.5, 'Tree visualization failed\n(likely too complex)',
                 ha='center', va='center', transform=plt.gca().transAxes)
        plt.title(f'Decision Tree Structure - {model_name}')

    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/tree_structure.png", dpi=200, bbox_inches='tight')
    plt.close()

    # 3. ROC Curve: interpolate every fold onto a common FPR grid, then plot
    #    the mean curve with a +/- 1 std band.
    plt.figure(figsize=(8, 6))
    mean_fpr = np.linspace(0, 1, 100)
    tprs = []
    for fpr, tpr in model_results['cv_roc_curves']:
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        # Several ROC points can share fpr == 0, which makes the interpolated
        # value there ambiguous; pin the curve to the origin.
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)

    mean_tpr = np.mean(tprs, axis=0)
    std_tpr = np.std(tprs, axis=0)
    # A true-positive rate cannot leave [0, 1]; clip the band accordingly.
    band_lower = np.maximum(mean_tpr - std_tpr, 0.0)
    band_upper = np.minimum(mean_tpr + std_tpr, 1.0)

    mean_cv_auc = np.mean([fold["roc_auc"] for fold in model_results["cv_metrics"]])

    plt.plot(mean_fpr, mean_tpr, 'b-',
             label=f'Mean ROC (AUC = {mean_cv_auc:.3f})')
    plt.fill_between(mean_fpr, band_lower, band_upper, alpha=0.2)
    plt.plot([0, 1], [0, 1], 'k--', label='Random')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {model_name}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f"{model_dir}/figures/roc_cv.png", dpi=200, bbox_inches='tight')
    plt.close()

print("All figures generated!")

In [None]:
# Collect one summary row per model, rank by hold-out AUC, save and report
summary_data = []
for model_name, model_results in results.items():
    test_metrics = model_results['test_metrics']
    cv_metrics = model_results['cv_metrics']

    # Per-fold validation AUCs, reused for both the mean and the spread
    fold_aucs = [fold['roc_auc'] for fold in cv_metrics]

    row = {
        'model': model_name,
        'test_auc': test_metrics['roc_auc'],
        'test_ap': test_metrics['average_precision'],
        'test_f1': test_metrics['f1'],
        'cv_auc_mean': np.mean(fold_aucs),
        'cv_auc_std': np.std(fold_aucs),
        'best_params': str(model_results['best_params']),
        'artifacts_path': model_results['model_dir'],
    }
    summary_data.append(row)

# Best hold-out AUC first
summary_df = pd.DataFrame(summary_data).sort_values('test_auc', ascending=False)
os.makedirs('../outputs/trees', exist_ok=True)
summary_df.to_csv('../outputs/trees/summary.csv', index=False)

print("\nDECISION TREES CATEGORY - FINAL RESULTS")
print("=" * 50)
for model, test_auc, test_ap in zip(
    summary_df['model'], summary_df['test_auc'], summary_df['test_ap']
):
    print(f"{model:25s} | AUC: {test_auc:.4f} | AP: {test_ap:.4f}")

# The frame is sorted, so the first row is the top-ranked model
best_row = summary_df.iloc[0]
print(f"\nBest Model: {best_row['model']} (AUC: {best_row['test_auc']:.4f})")
print("Summary saved to: ../outputs/trees/summary.csv")