In [7]:
import numpy as np
import pandas as pd
import shap
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance

RANDOM_STATE = 42

# -----------------------------
# Load Adult dataset (already numeric)
# -----------------------------
X, y = shap.datasets.adult()

# -----------------------------
# Train / test split
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# -----------------------------
# Baseline model
# -----------------------------
model = xgb.XGBClassifier(
    eval_metric="logloss",
    random_state=RANDOM_STATE,
    n_jobs=-1
)

model.fit(X_train, y_train)

# -----------------------------
# SHAP on TEST set (baseline)
# -----------------------------
explainer = shap.TreeExplainer(model, X_train)
shap_values = explainer(X_test)

mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
shap_importance = pd.Series(mean_abs_shap, index=X.columns)

print("Baseline SHAP importance (test set):")
print(shap_importance.sort_values(ascending=False))

# -----------------------------
# Stress test: noise + redundancy
# -----------------------------
X_stress = X.copy()

# Inject pure noise (Dummy axiom)
X_stress["X_noise"] = np.random.normal(0, 1, size=len(X_stress))

# Inject redundancy (Symmetry axiom)
# Feature fixed a priori
X_stress["X_Age_copy"] = X_stress["Age"]

# -----------------------------
# Re-split stressed dataset
# -----------------------------
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_stress, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# -----------------------------
# Retrain model on stressed data
# -----------------------------
model.fit(X_train_s, y_train_s)

# -----------------------------
# SHAP on TEST set (stress test)
# -----------------------------
explainer_s = shap.TreeExplainer(model)
shap_values_s = explainer_s(X_test_s)

mean_abs_shap_s = np.abs(shap_values_s.values).mean(axis=0)
shap_importance_s = pd.Series(mean_abs_shap_s, index=X_stress.columns)

print("\nSHAP importance (stress test, test set):")
print(shap_importance_s.sort_values(ascending=False))

# -----------------------------
# Permutation Feature Importance (stress test)
# -----------------------------
pfi = permutation_importance(
    model,
    X_test_s,
    y_test_s,
    n_repeats=30,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

pfi_importance = pd.Series(pfi.importances_mean, index=X_stress.columns)

print("\nPFI importance (stress test):")
print(pfi_importance.sort_values(ascending=False))




Baseline SHAP importance (test set):
Age               0.870599
Relationship      0.796658
Marital Status    0.515516
Capital Gain      0.484936
Education-Num     0.465093
Hours per week    0.458493
Occupation        0.444062
Sex               0.227185
Capital Loss      0.222908
Workclass         0.090925
Race              0.062655
Country           0.060435
dtype: float64

SHAP importance (stress test, test set):
Relationship      1.135210
Age               0.869688
Capital Gain      0.586664
Education-Num     0.527595
Occupation        0.403870
Hours per week    0.397316
Marital Status    0.290548
Capital Loss      0.171448
X_noise           0.169013
Sex               0.135609
Workclass         0.098739
Race              0.062480
Country           0.048639
X_Age_copy        0.000000
dtype: float32

PFI importance (stress test):
Capital Gain      0.051558
Relationship      0.029224
Education-Num     0.028098
Age               0.018578
Occupation        0.017974
Capital Loss      0.014