In [35]:
import shap
import pickle
import xgboost as xgb
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from pdpbox import pdp, info_plots
import os

In [None]:
# Output folder for the figures written by the plotting cells.
save_dir = (
    "C:/Users/hangang/Desktop/sci/02. LC-OCD for algal bloom/"
    "4_그림 모음/optimalXGBSHAP"
)
# Create the folder (and any missing parents); an already existing folder is fine.
os.makedirs(name=save_dir, exist_ok=True)

In [None]:
# ✅ 1. Load the trained model
model_path = (
    "C:/Users/hangang/Desktop/sci/02. LC-OCD for algal bloom/"
    "3_모델 개발 중/best_xgb_model_19_features.pkl"
)
# NOTE(review): pickle.load can execute arbitrary code while deserializing,
# so this file must come from a trusted source.
with open(model_path, mode="rb") as model_file:
    model = pickle.load(model_file)

# Confirmation message (Korean: "model load complete") followed by the path.
print(f"✅ 모델 로드 완료: {model_path}")

In [None]:
# ✅ 2. Load the data
# Read the full CSV, decoded as UTF-8, into a DataFrame.
data_path = (
    "C:/Users/hangang/Desktop/sci/02. LC-OCD for algal bloom/"
    "1_raw data/01_data_full.csv"
)
data = pd.read_csv(data_path, encoding="utf-8")

In [None]:
# ✅ 3. Select the 19 input features and the target column
# NOTE(review): presumably these are the columns, in the order, that the pickled
# model was trained on — confirm against the training notebook.
shap_top_features = [
    'TN', 'NO3-N', 'BOD', 'BODTOC ratio', 'Molecularity',
    'SR', 'TOC', 'WT', 'DO', 'SS',
    'HS-N', 'NH3-N', 'HS', 'COD', 'PO4-P',
    'pH', 'S275-295', 'Aromaticity', 'EC',
]

# X keeps only the listed columns (in list order); y is the 'Chl-a' column.
X = data.loc[:, shap_top_features]
y = data['Chl-a']

In [40]:
# ✅ 4. Train-Test Split
# Hold out 30% of the rows as a test set; the fixed random_state makes the
# partition repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)


In [None]:
# ✅ 5. TreeSHAP
# Wrap the loaded model in a tree explainer, then evaluate it on every row of X
# (the full feature table, not only the test split).
explainer = shap.TreeExplainer(model=model)
shap_values = explainer(X)

In [None]:
# ✅ SHAP Summary Plot (dot)
# Draw the default summary plot for the SHAP values of X and save it as a PNG.
plt.figure(figsize=(10, 6))
# plot_size=None leaves the 10x6 figure created above untouched. With the
# default plot_size="auto", summary_plot resizes the current figure itself and
# the figsize requested here would be silently discarded.
shap.summary_plot(shap_values, X, plot_size=None, show=False)
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "shap_summary_dot.png"), dpi=300)
plt.show()

In [None]:
# ✅ SHAP Summary Plot (bar)
# Draw the bar-type summary plot for the SHAP values of X and save it as a PNG.
plt.figure(figsize=(10, 6))
# plot_size=None leaves the 10x6 figure created above untouched. With the
# default plot_size="auto", summary_plot resizes the current figure itself and
# the figsize requested here would be silently discarded.
shap.summary_plot(shap_values, X, plot_type="bar", plot_size=None, show=False)
plt.tight_layout()
plt.savefig(os.path.join(save_dir, "shap_summary_bar.png"), dpi=300)
plt.show()

In [None]:
# ✅ SHAP dependence plots (top 6 variables from the optimized XGB model's SHAP analysis)
# One panel per feature, laid out on a grid that is num_cols wide, saved as a single PNG.
top6_features = ['TN', 'TOC', 'BODTOC ratio', 'SR', 'BOD', 'Molecularity']
num_features = len(top6_features)
num_cols = 3
num_rows = (num_features + num_cols - 1) // num_cols  # ceiling division

# squeeze=False always returns a 2-D array of Axes, so flatten() works for any
# grid shape (including a single row or a single column).
fig, axes = plt.subplots(num_rows, num_cols, figsize=(18, num_rows * 5), squeeze=False)
axes = axes.flatten()

for ax, feature in zip(axes, top6_features):
    # interaction_index=None: plot the feature on its own, without picking a
    # second feature for colouring.
    shap.dependence_plot(feature, shap_values.values, X, ax=ax, interaction_index=None, show=False)
    ax.tick_params(axis='both', which='major', labelsize=14)
    # Replace the axis labels set by dependence_plot with larger, uniform ones.
    ax.set_xlabel(feature, fontsize=16)
    ax.set_ylabel("SHAP value", fontsize=16)

# Remove the unused panels. Slicing by num_features does not depend on a
# variable leaked from the loop above, so it also works when no panel was drawn.
for unused_ax in axes[num_features:]:
    fig.delaxes(unused_ax)

# These are SHAP dependence plots (not partial dependence plots); the count in
# the title follows the length of the feature list.
fig.suptitle(f"SHAP Dependence Plots (Top {num_features} Features)", fontsize=20)
# Leave room at the top of the figure for the suptitle.
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
fig.savefig(os.path.join(save_dir, "shap_dependence_top6_allinone.png"), dpi=300)
plt.show()