# 03: SHAP Explanations

**Student**: Keisuke Nishioka (Matrikelnummer: 10081049)  
**Project**: Stability and Faithfulness Analysis of SHAP Explanations

In [None]:
import sys
import os
# Add the project's source folder to the import search path. The path is
# relative, so it resolves against the current working directory
# (presumably the notebook's own folder -- verify when running elsewhere).
sys.path.append('../src')

import pandas as pd
import numpy as np
# Project-local modules (presumably found through the '../src' entry above).
from models import load_model
from shap_analysis import (
    compute_shap_for_model, compute_shap_multiple_seeds,
    save_shap_values
)
import config

## Load Data and Models

In [None]:
# Read the train and test feature tables from the processed-data directory.
X_train, X_test = (
    pd.read_csv('../data/processed/X_train.csv'),
    pd.read_csv('../data/processed/X_test.csv'),
)

# How many test samples to explain, as set in the stability configuration.
n_samples = config.STABILITY_CONFIG['n_test_samples']
print(f"Will explain {n_samples} test samples")

## Compute SHAP for XGBoost (TreeSHAP)

In [None]:
# Restore the saved XGBoost model for every configured seed, keyed by seed.
xgboost_models = {
    seed: load_model(f'../results/models/xgboost_seed_{seed}.pkl')
    for seed in config.RANDOM_SEEDS
}

# Hand all models to the multi-seed SHAP helper in a single call; the
# helper is given '../results/shap_values' as its save directory.
xgboost_shap_results = compute_shap_multiple_seeds(
    xgboost_models,
    X_train,
    X_test,
    model_type='xgboost',
    random_seeds=config.RANDOM_SEEDS,
    n_samples=n_samples,
    save_dir='../results/shap_values',
)

# Report how many entries the helper returned.
print(f"Computed SHAP for {len(xgboost_shap_results)} XGBoost models")

## Compute SHAP for Random Forest (TreeSHAP)

In [None]:
# Restore the saved Random Forest model for every configured seed, keyed by seed.
rf_models = {
    seed: load_model(f'../results/models/random_forest_seed_{seed}.pkl')
    for seed in config.RANDOM_SEEDS
}

# Hand all models to the multi-seed SHAP helper in a single call; the
# helper is given '../results/shap_values' as its save directory.
rf_shap_results = compute_shap_multiple_seeds(
    rf_models,
    X_train,
    X_test,
    model_type='random_forest',
    random_seeds=config.RANDOM_SEEDS,
    n_samples=n_samples,
    save_dir='../results/shap_values',
)

# Report how many entries the helper returned.
print(f"Computed SHAP for {len(rf_shap_results)} Random Forest models")

## Compute SHAP for Logistic Regression (KernelSHAP)

In [None]:
# Restore the saved Logistic Regression model for every configured seed.
lr_models = {
    seed: load_model(f'../results/models/logistic_regression_seed_{seed}.pkl')
    for seed in config.RANDOM_SEEDS
}

# KernelSHAP is the slower explainer, so only the first five configured
# seeds are explained here.
# NOTE(review): every model above is still loaded although only these five
# are used below -- confirm whether loading just `lr_seeds` would suffice.
lr_seeds = config.RANDOM_SEEDS[:5]
lr_models_subset = {s: lr_models[s] for s in lr_seeds}

# Hand the reduced model set to the multi-seed SHAP helper; the helper is
# given '../results/shap_values' as its save directory.
lr_shap_results = compute_shap_multiple_seeds(
    lr_models_subset,
    X_train,
    X_test,
    model_type='logistic_regression',
    random_seeds=lr_seeds,
    n_samples=n_samples,
    save_dir='../results/shap_values',
)

# Report how many entries the helper returned.
print(f"Computed SHAP for {len(lr_shap_results)} Logistic Regression models")

## Summary

In [None]:
# Emit the whole run summary in one call; each argument becomes one output
# line, and the embedded leading "\n" characters produce the blank lines.
print(
    "SHAP computation completed!",
    f"\nXGBoost: {len(xgboost_shap_results)} runs",
    f"Random Forest: {len(rf_shap_results)} runs",
    f"Logistic Regression: {len(lr_shap_results)} runs",
    "\nAll SHAP values saved to ../results/shap_values/",
    sep="\n",
)