# In [None]:
import numpy as np
import pandas as pd
import shap

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.inspection import permutation_importance

# Single seed shared by the split, the classifier and later analysis steps.
RANDOM_STATE = 42

# -----------------------------
# Load dataset
# -----------------------------
# as_frame=True yields pandas objects, so column names are available later.
data = load_breast_cancer(as_frame=True)
X, y = data.data.copy(), data.target

# -----------------------------
# Baseline model
# -----------------------------
# Stratified 80/20 split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Standardise the features, then fit a logistic regression on them.
model = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        (
            "clf",
            LogisticRegression(max_iter=1000, random_state=RANDOM_STATE),
        ),
    ]
)
model.fit(X_train, y_train)

# -----------------------------
# SHAP baseline importance
# -----------------------------
# A scikit-learn Pipeline is neither a model type SHAP can introspect nor a
# plain callable, so passing it to shap.Explainer fails. Explain the fitted
# linear classifier on the scaled features instead: StandardScaler transforms
# each column independently, so every scaled column still corresponds to
# exactly one original feature and the attributions keep their meaning.
scaler = model.named_steps["scaler"]
clf = model.named_steps["clf"]
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)

# The scaled training data doubles as the background distribution.
explainer = shap.Explainer(clf, X_train_scaled)
shap_values = explainer(X_train_scaled)

# Global importance = mean absolute attribution per feature over the rows.
mean_abs_shap = np.abs(shap_values.values).mean(axis=0)
shap_importance = pd.Series(mean_abs_shap, index=X_train.columns)

# The top feature is duplicated later to test robustness to redundancy.
top_feature = shap_importance.idxmax()
print(f"Top SHAP feature (baseline): {top_feature}")

# -----------------------------
# Stress test: noise + redundancy
# -----------------------------
X_stress = X.copy()

# Inject noise: a standard-normal column unrelated to the target. Drawn from
# a seeded generator so the stress test is reproducible, like the rest of
# the script (the global np.random state is unseeded).
rng = np.random.default_rng(RANDOM_STATE)
X_stress["X_noise"] = rng.normal(0, 1, size=len(X_stress))

# Inject redundancy: an exact duplicate of the top baseline SHAP feature.
X_stress[f"X_{top_feature}_copy"] = X_stress[top_feature]

# Re-split with the same seed and stratification as the baseline split.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_stress, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# NOTE: this refits the same Pipeline object in place, so the baseline fit is
# overwritten; from here on `model` is the stress-test model.
model.fit(X_train_s, y_train_s)

# -----------------------------
# SHAP after stress
# -----------------------------
# The model is a StandardScaler + LogisticRegression pipeline, not a tree
# ensemble, so shap.TreeExplainer cannot be used here. Explain the refitted
# linear classifier on the scaled stress features; scaling is per column, so
# each attribution still maps to exactly one input column.
scaler_s = model.named_steps["scaler"]
clf_s = model.named_steps["clf"]
X_train_s_scaled = pd.DataFrame(
    scaler_s.transform(X_train_s),
    columns=X_train_s.columns,
    index=X_train_s.index,
)

# The scaled stress training data doubles as the background distribution.
explainer_s = shap.Explainer(clf_s, X_train_s_scaled)
shap_values_s = explainer_s(X_train_s_scaled)

# Global importance = mean absolute attribution per feature over the rows.
mean_abs_shap_s = np.abs(shap_values_s.values).mean(axis=0)
shap_importance_s = pd.Series(mean_abs_shap_s, index=X_train_s.columns)

print("\nSHAP importance (stress test):")
print(shap_importance_s.sort_values(ascending=False))

# -----------------------------
# PFI after stress
# -----------------------------
# Permutation importance of the stress-test model on the held-out stress
# split: 10 shuffles per feature, seeded, using all available cores.
pfi_settings = {"n_repeats": 10, "random_state": RANDOM_STATE, "n_jobs": -1}
pfi = permutation_importance(model, X_test_s, y_test_s, **pfi_settings)

# Label each mean importance with its column name.
pfi_importance = pd.Series(data=pfi.importances_mean, index=X_stress.columns)
pfi_ranked = pfi_importance.sort_values(ascending=False)

print("\nPFI importance (stress test):")
print(pfi_ranked)
