In [None]:
# === Load saved XGB model + scaler, compute SHAP feature importance (no plots) ===
import pickle
import numpy as np
import pandas as pd
import shap
import xgboost as xgb  # required so pickle can deserialize XGBRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler  # for type checking (optional)

In [None]:
# 0) Paths (edit as needed)
#    NOTE: these are absolute, machine-specific locations; adjust them before
#    running the notebook on another machine.
# -------------------------
# Pickled training artifact; opened in binary mode and unpickled in step 1.
model_path = r"C:/Users/hangang/Desktop/best_xgb_model_full.pkl"
# Input CSV read with UTF-8 encoding in step 2; must contain a "Chl-a" column.
data_path  = r"C:/Users/hangang/Desktop/01_data_full.csv"
# Destination CSV for the feature-importance table written in step 5.
out_path   = r"C:/Users/hangang/Desktop/feature_importance_full.csv"

In [None]:
# 1) Load the saved training artifact
#    - Required layout: a dict holding "model" and "scaler", optionally "features"
#    - Anything else (e.g. a bare model object) is rejected, because the scaler
#      has to come from the same artifact as the model
# -------------------------
# NOTE: unpickling can execute arbitrary code; only point model_path at trusted files.
with open(model_path, "rb") as fh:
    obj = pickle.load(fh)

# Guard clause: bail out early unless we got a dict that at least carries a model.
is_bundle = isinstance(obj, dict) and "model" in obj
if not is_bundle:
    raise ValueError(
        "The saved file does not contain a scaler. Please save the model together with the scaler "
        "({'model','scaler','features'}) during training."
    )

model = obj["model"]
scaler = obj.get("scaler")
columns_saved = obj.get("features")  # None when the artifact carries no feature list

# The dict may exist without a scaler entry; that is still unusable here.
if scaler is None:
    raise ValueError("No scaler found in the saved artifact. Unable to transform X consistently.")

In [None]:
# 2) Load data and align columns to the features the model was trained on
# -------------------------
df = pd.read_csv(data_path, encoding="utf-8")
if "Chl-a" not in df.columns:
    raise KeyError("'Chl-a' column not found in the CSV.")

# Everything except the target column is a candidate feature.
X = df.drop("Chl-a", axis=1)

# Prefer the feature list saved with the artifact. If none was saved, fall back to
# the column names the scaler recorded when it was fitted on a DataFrame
# (scikit-learn's `feature_names_in_`, absent when it was fitted on a plain array).
# Only when neither is available are the CSV columns used as they are.
expected_features = columns_saved
if expected_features is None:
    expected_features = getattr(scaler, "feature_names_in_", None)

if expected_features is not None:
    # Materialise as a list: a tuple would be treated by DataFrame.__getitem__ as a
    # single key rather than a selection of columns, and a list also gives the
    # importance table a plain sequence of names.
    feature_names = list(expected_features)
    missing = [c for c in feature_names if c not in X.columns]
    if missing:
        raise ValueError(f"The following features expected by the model are missing in the CSV: {missing}")
    # Subset and reorder so the column order matches what the scaler/model expect.
    X = X[feature_names]
else:
    feature_names = X.columns.tolist()

In [None]:
# 3) Apply the scaler loaded from the artifact (it is not re-fitted here)
# -------------------------
_transform_error = (
    "Failed to transform with the saved scaler. "
    "Ensure the scaler in the pickle was already fitted during training."
)
try:
    X_scaled = scaler.transform(X)
except Exception as exc:
    # Re-raise with a hint while keeping the underlying failure chained as the cause.
    raise RuntimeError(_transform_error) from exc

In [None]:
# 4) SHAP values -> global importance as the mean absolute SHAP value per feature
# -------------------------
explainer = shap.TreeExplainer(model)
# The additivity sanity check is disabled. The result is expected to be an array of
# shape (n_samples, n_features) -- TODO confirm for the installed shap version/model.
shap_values = explainer.shap_values(X_scaled, check_additivity=False)

# Collapse the first (sample) axis so one score remains per feature.
importance = np.abs(shap_values).mean(axis=0)
importance_df = pd.DataFrame(
    {"feature": feature_names, "importance": importance}
).sort_values(by="importance", ascending=False)

In [None]:
# 5) Write the ranked importance table to disk and report the location
# -------------------------
# index=False keeps the DataFrame's row index out of the CSV.
importance_df.to_csv(out_path, index=False)
saved_msg = f"SHAP feature importance saved to:\n{out_path}"
print(saved_msg)