In [None]:
import os
import ast
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
import joblib
import warnings
from sklearn.exceptions import InconsistentVersionWarning

# =========================
# Path configuration
# =========================

# Training-set spreadsheet (read with pd.read_excel further down).
TRAIN_PATH = r"C:\Fauzan\Manuscripts QSAR-RA 2\q-RASAR\AMES Mutagenicity\Train set.xlsx"

# One serialized model file per descriptor family (loaded via load_model).
MODEL_MACCS  = r"C:\Fauzan\Manuscripts QSAR-RA 2\q-RASAR\AMES Mutagenicity\QSAR\AMES_rf_macckeys.pkl"
MODEL_MORGAN = r"C:\Fauzan\Manuscripts QSAR-RA 2\q-RASAR\AMES Mutagenicity\QSAR\AMES_rf_morgan.pkl"
MODEL_APF    = r"C:\Fauzan\Manuscripts QSAR-RA 2\q-RASAR\AMES Mutagenicity\QSAR\AMES_xgb_apf.pkl"
MODEL_PHYS   = r"C:\Fauzan\Manuscripts QSAR-RA 2\q-RASAR\AMES Mutagenicity\QSAR\AMES_rf_rdkitcdk.pkl"

# All plots and tables of this cell are written to <BASE_OUT>/<ENDPOINT_NAME>;
# the directory is created up front (no error if it already exists).
BASE_OUT = r"C:\Fauzan\Manuskrip QSAR 3\SHAP"
ENDPOINT_NAME = "AMES_Mutagenicity"
OUT_DIR = os.path.join(BASE_OUT, ENDPOINT_NAME)
os.makedirs(OUT_DIR, exist_ok=True)

# Optional: hide the scikit-learn version-mismatch warning
warnings.filterwarnings("ignore", category=InconsistentVersionWarning)

# =========================
# Helper functions
# =========================

def load_model(model_path):
    """Load a serialized model from ``model_path`` with joblib.

    Returns whatever object ``joblib.load`` deserializes from that file.
    """
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load model files from a trusted source.
    return joblib.load(model_path)

def get_model_type(model):
    """Classify a model as "tree", "svm" or "other" from its class name.

    The decision is a case-insensitive substring match on
    ``model.__class__.__name__``; the tree markers are checked first, so a
    name containing both kinds of marker is reported as "tree".
    """
    class_name = model.__class__.__name__.lower()
    tree_markers = ("xgb", "forest", "tree", "boost")
    svm_markers = ("svc", "svm")

    if any(marker in class_name for marker in tree_markers):
        return "tree"
    if any(marker in class_name for marker in svm_markers):
        return "svm"
    return "other"

def subsample_X(X, n=1000, random_state=42):
    """Return at most ``n`` randomly sampled rows of ``X`` to speed up SHAP.

    ``X`` is returned unchanged (the same object) when it has ``n`` rows or
    fewer; otherwise ``X.sample`` draws ``n`` rows with ``random_state``.
    """
    if len(X) <= n:
        return X
    return X.sample(n, random_state=random_state)

def compute_shap_values(model, X, model_type, n_background=500, class_index=1):
    """
    Compute SHAP values for a single class (class_index, e.g. 1 = toxic).

    model_type == "tree" uses shap.TreeExplainer (interventional
    perturbation, probability output); any other model_type uses
    shap.KernelExplainer on model.predict_proba.  The background set is X
    itself, or a fixed-seed sample of n_background rows when X is larger.

    Returns (explainer, shap_values) where shap_values is expected to have
    shape (n_samples, n_features).
    """
    # Background data is chosen the same way for both explainer kinds.
    background = X.sample(n_background, random_state=42) if len(X) > n_background else X

    if model_type == "tree":
        explainer = shap.TreeExplainer(
            model,
            data=background,
            feature_perturbation="interventional",
            model_output="probability",
        )
        raw = explainer.shap_values(X, check_additivity=False)
    else:
        explainer = shap.KernelExplainer(model.predict_proba, background)
        raw = explainer.shap_values(X)

    # Normalise the layout: the explainer output is handled both as a
    # per-class list and as an array with a trailing class axis.
    if isinstance(raw, list):
        sv = np.array(raw[class_index])  # (n_samples, n_features)
    else:
        sv = np.array(raw)
        if sv.ndim == 3:  # (n_samples, n_features, n_classes)
            sv = sv[:, :, class_index]

    # Anything still above 2-D is flattened along the feature axes.
    if sv.ndim > 2:
        sv = sv.reshape(sv.shape[0], -1)

    return explainer, sv  # sv: (n_samples, n_features)

def save_shap_summary(shap_values, X, out_path_png, max_display=20):
    """Draw the SHAP summary plot (global importance) for one class and save it.

    shap_values is converted to an array and plotted against X, labelled
    with X's column names; max_display is passed through to
    shap.summary_plot.  The current figure is saved to out_path_png at
    300 dpi and then closed.

    NOTE(review): shap.summary_plot may resize the figure itself (its
    plot_size option), so the (8, 6) size is possibly not final -- confirm.
    """
    values = np.array(shap_values)  # expected (n_samples, n_features)
    column_names = X.columns.tolist()

    plt.figure(figsize=(8, 6))
    shap.summary_plot(
        values,
        X,
        feature_names=column_names,
        max_display=max_display,
        show=False,
    )
    plt.tight_layout()
    plt.savefig(out_path_png, dpi=300, bbox_inches="tight")
    plt.close()

def expand_list_column(df, col_name, prefix):
    """
    Expand a column of list-like fingerprints into one column per position.

    Each cell of df[col_name] is a list or its string form such as
    '[0, 1, 0, ...]'; every cell is parsed with ast.literal_eval(str(cell)).
    The number of output columns is the length of the first row.

    Return: DataFrame indexed like df with columns prefix_0, prefix_1, ...
    """
    rows = [ast.literal_eval(str(cell)) for cell in df[col_name]]
    width = len(rows[0])
    matrix = np.vstack(rows)
    names = [f"{prefix}_{pos}" for pos in range(width)]
    return pd.DataFrame(matrix, columns=names, index=df.index)

def get_top_shap_features(shap_values, X, top_n=20):
    """
    Rank features by mean(|SHAP|) for one class and return the top_n.

    shap_values is expected to be (n_samples, n_features); inputs with more
    than two dimensions are flattened to (n_samples, -1).  If the number of
    SHAP columns differs from X.shape[1], a warning is printed and only the
    first min(n_shap, n_X) columns of each are used.

    Returns a DataFrame with the columns "rank" (1 = most important),
    "feature" and "mean_abs_shap", ordered by decreasing importance.
    top_n=0 yields an empty table.
    """
    sv = np.array(shap_values)
    if sv.ndim > 2:
        sv = sv.reshape(sv.shape[0], -1)

    mean_abs = np.mean(np.abs(sv), axis=0)   # (n_features,)

    # Consistency check against X: fall back to the shared leading columns.
    n_feat_X = X.shape[1]
    cols = np.array(X.columns)
    if mean_abs.shape[0] != n_feat_X:
        n_common = min(mean_abs.shape[0], n_feat_X)
        print(f"[WARNING] n_features SHAP ({mean_abs.shape[0]}) != n_features X ({n_feat_X}), "
              f"pakai {n_common} pertama.")
        mean_abs = mean_abs[:n_common]
        cols = cols[:n_common]

    # Reverse before slicing: a "[-top_n:]" slice would return *every*
    # feature for top_n == 0, because -0 == 0.
    idx = np.argsort(mean_abs)[::-1][:top_n]
    idx = np.asarray(idx).ravel().astype(int)

    data = {
        "rank": np.arange(1, len(idx) + 1),
        "feature": cols[idx],
        "mean_abs_shap": mean_abs[idx],
    }
    return pd.DataFrame(data)

def save_shap_dependence(shap_values, X, feature_name, out_path_png):
    """Save the SHAP dependence plot of one feature (one class) as a PNG.

    shap_values is converted to an array (expected (n_samples, n_features))
    and plotted against X for the column feature_name.  The figure is
    written to out_path_png at 300 dpi and is always closed afterwards, also
    when plotting or saving raises.
    """
    sv = np.array(shap_values)  # (n_samples, n_features)
    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        # Draw on our own axes: without ``ax`` shap.dependence_plot opens a
        # figure of its own, and the one created here would stay open
        # (one leaked figure per call) with its figsize never applied.
        shap.dependence_plot(
            feature_name,
            sv,
            X,
            ax=ax,
            show=False
        )
        fig.tight_layout()
        fig.savefig(out_path_png, dpi=300, bbox_inches="tight")
    finally:
        plt.close(fig)

# =========================
# 1. Load TRAIN set
# =========================

# Read the training spreadsheet and print its shape and column names as a
# quick sanity check.
df_all = pd.read_excel(TRAIN_PATH)
print("AMES train shape:", df_all.shape)
print("Columns:", df_all.columns.tolist())

# =========================
# 2. Expand fingerprint columns (MACCS, Morgan, APF)
# =========================

# Names of the raw fingerprint columns; each cell holds a whole fingerprint
# as a list / list-like string (parsed by expand_list_column).
FP_COL_MORGAN = "Morgan_Descriptors"
FP_COL_MACCS  = "MACCS_Descriptors"
FP_COL_APF    = "APF_Descriptors"

# One DataFrame per fingerprint, with columns named "<prefix>_<position>".
maccs_bits  = expand_list_column(df_all, FP_COL_MACCS,  "MACCS")
morgan_bits = expand_list_column(df_all, FP_COL_MORGAN, "Morgan")
apf_bits    = expand_list_column(df_all, FP_COL_APF,    "APF")

# Append the expanded columns to the original table (column-wise).
df_all = pd.concat([df_all, maccs_bits, morgan_bits, apf_bits], axis=1)

# Drop the raw fingerprint columns
df_all = df_all.drop(columns=[FP_COL_MACCS, FP_COL_MORGAN, FP_COL_APF])

# =========================
# 3. Define the feature block of each descriptor
# =========================

# Columns that are never used as model inputs.
NON_FEATURE_COLS = [
    "SMILES",
    "Outcome",
    "RDK_Descriptors",
]

# Fingerprint columns are recognised by the prefixes assigned in step 2.
maccs_cols  = [c for c in df_all.columns if c.startswith("MACCS_")]
morgan_cols = [c for c in df_all.columns if c.startswith("Morgan_")]
apf_cols    = [c for c in df_all.columns if c.startswith("APF_")]

# Every remaining column is treated as a physicochemical descriptor.  The
# exclusions are gathered in one set so each membership test is a hash
# lookup instead of a scan over three (potentially very long) lists.
_excluded_cols = set(NON_FEATURE_COLS).union(maccs_cols, morgan_cols, apf_cols)
physchem_cols = [c for c in df_all.columns if c not in _excluded_cols]

print("Num MACCS features:", len(maccs_cols))
print("Num Morgan features:", len(morgan_cols))
print("Num APF features:", len(apf_cols))
print("Num physchem features:", len(physchem_cols))

# =========================
# 4. Build X per descriptor + force numeric dtypes
# =========================

# pd.to_numeric(errors="raise") is applied column by column, so any
# non-numeric cell aborts the run here.  DataFrame.apply returns a new
# frame, so no explicit copy of the df_all slice is needed.
X_maccs  = df_all[maccs_cols].apply(pd.to_numeric, errors="raise")
X_morgan = df_all[morgan_cols].apply(pd.to_numeric, errors="raise")
X_apf    = df_all[apf_cols].apply(pd.to_numeric, errors="raise")
X_phys   = df_all[physchem_cols].apply(pd.to_numeric, errors="raise")

# =========================
# 4b. Subsample for SHAP (speed-up)
# =========================

N_SHAP_SAMPLE = 1000  # can be lowered to 500 if this is still too heavy

X_maccs_sub  = subsample_X(X_maccs,  N_SHAP_SAMPLE)
X_morgan_sub = subsample_X(X_morgan, N_SHAP_SAMPLE)
X_apf_sub    = subsample_X(X_apf,    N_SHAP_SAMPLE)
X_phys_sub   = subsample_X(X_phys,   N_SHAP_SAMPLE)

# =========================
# 5. SHAP for each model/descriptor
# =========================
# Every section below follows the same steps: load the model, compute SHAP
# values on the subsampled X, save the summary plot, save the top-20 table
# as CSV, then save one dependence plot per top-TOP_DEP feature.

TOP_DEP = 5  # number of top-ranked features that get a dependence plot

# --- MACCS ---
print("AMES (train) - MACCS (RF) SHAP")
model_maccs = load_model(MODEL_MACCS)
type_maccs = get_model_type(model_maccs)
_, shap_maccs = compute_shap_values(model_maccs, X_maccs_sub, type_maccs)

out_maccs = os.path.join(OUT_DIR, "AMES_train_MACCS_SHAP_summary.png")
save_shap_summary(shap_maccs, X_maccs_sub, out_maccs, max_display=20)

df_top_maccs = get_top_shap_features(shap_maccs, X_maccs_sub, top_n=20)
csv_maccs = os.path.join(OUT_DIR, "AMES_train_MACCS_SHAP_top20.csv")
df_top_maccs.to_csv(csv_maccs, index=False)

# The feature name becomes part of the output file name.
for feat in df_top_maccs["feature"].iloc[:TOP_DEP]:
    dep_maccs = os.path.join(OUT_DIR, f"AMES_train_MACCS_SHAP_dependence_{feat}.png")
    save_shap_dependence(shap_maccs, X_maccs_sub, feat, dep_maccs)

# --- Morgan ---
print("AMES (train) - Morgan (RF) SHAP")
model_morgan = load_model(MODEL_MORGAN)
type_morgan = get_model_type(model_morgan)
_, shap_morgan = compute_shap_values(model_morgan, X_morgan_sub, type_morgan)

out_morgan = os.path.join(OUT_DIR, "AMES_train_Morgan_SHAP_summary.png")
save_shap_summary(shap_morgan, X_morgan_sub, out_morgan, max_display=20)

df_top_morgan = get_top_shap_features(shap_morgan, X_morgan_sub, top_n=20)
csv_morgan = os.path.join(OUT_DIR, "AMES_train_Morgan_SHAP_top20.csv")
df_top_morgan.to_csv(csv_morgan, index=False)

for feat in df_top_morgan["feature"].iloc[:TOP_DEP]:
    dep_morgan = os.path.join(OUT_DIR, f"AMES_train_Morgan_SHAP_dependence_{feat}.png")
    save_shap_dependence(shap_morgan, X_morgan_sub, feat, dep_morgan)

# --- APF (XGB) ---
print("AMES (train) - APF (XGB) SHAP")
model_apf = load_model(MODEL_APF)
type_apf = get_model_type(model_apf)
_, shap_apf = compute_shap_values(model_apf, X_apf_sub, type_apf)

out_apf = os.path.join(OUT_DIR, "AMES_train_APF_SHAP_summary.png")
save_shap_summary(shap_apf, X_apf_sub, out_apf, max_display=20)

df_top_apf = get_top_shap_features(shap_apf, X_apf_sub, top_n=20)
csv_apf = os.path.join(OUT_DIR, "AMES_train_APF_SHAP_top20.csv")
df_top_apf.to_csv(csv_apf, index=False)

for feat in df_top_apf["feature"].iloc[:TOP_DEP]:
    dep_apf = os.path.join(OUT_DIR, f"AMES_train_APF_SHAP_dependence_{feat}.png")
    save_shap_dependence(shap_apf, X_apf_sub, feat, dep_apf)

# --- Physicochemical (RF) ---
print("AMES (train) - Physchem (RF) SHAP")
model_phys = load_model(MODEL_PHYS)
type_phys = get_model_type(model_phys)
_, shap_phys = compute_shap_values(model_phys, X_phys_sub, type_phys)

out_phys = os.path.join(OUT_DIR, "AMES_train_Physchem_SHAP_summary.png")
save_shap_summary(shap_phys, X_phys_sub, out_phys, max_display=20)

df_top_phys = get_top_shap_features(shap_phys, X_phys_sub, top_n=20)
csv_phys = os.path.join(OUT_DIR, "AMES_train_Physchem_SHAP_top20.csv")
df_top_phys.to_csv(csv_phys, index=False)

for feat in df_top_phys["feature"].iloc[:TOP_DEP]:
    dep_phys = os.path.join(OUT_DIR, f"AMES_train_Physchem_SHAP_dependence_{feat}.png")
    save_shap_dependence(shap_phys, X_phys_sub, feat, dep_phys)

print("Done. SHAP train-set (subsample) plots and tables saved to:", OUT_DIR)


In [None]:
import os
import ast
import numpy as np
import pandas as pd
import shap
import matplotlib.pyplot as plt
import joblib
import warnings
from sklearn.exceptions import InconsistentVersionWarning

# =========================
# Path configuration
# =========================

# Training-set spreadsheet (read with pd.read_excel further down).
TRAIN_PATH = r"C:\Fauzan\Manuscripts QSAR-RA 2\q-RASAR\Development Toxicity\Dev_Train set_with_fingerprints_sorted_with_RDKit_and_CDK_features.xlsx"

# One serialized model file per descriptor family (loaded via load_model).
MODEL_MACCS  = r"C:\Fauzan\Manuscripts QSAR-RA 2\q-RASAR\Development Toxicity\QSAR\Dev_SVM_MACCS.pkl"
MODEL_MORGAN = r"C:\Fauzan\Manuscripts QSAR-RA 2\q-RASAR\Development Toxicity\QSAR\Dev_xgb_morgan.pkl"
MODEL_APF    = r"C:\Fauzan\Manuscripts QSAR-RA 2\q-RASAR\Development Toxicity\QSAR\Dev_rf_apf.pkl"
MODEL_PHYS   = r"C:\Fauzan\Manuscripts QSAR-RA 2\q-RASAR\Development Toxicity\QSAR\Dev_rf_rdkitcdk.pkl"

# All plots and tables of this cell are written to <BASE_OUT>/<ENDPOINT_NAME>;
# the directory is created up front (no error if it already exists).
BASE_OUT = r"C:\Fauzan\Manuskrip QSAR 3\SHAP"
ENDPOINT_NAME = "Development_Toxicity"
OUT_DIR = os.path.join(BASE_OUT, ENDPOINT_NAME)
os.makedirs(OUT_DIR, exist_ok=True)

# Optional: hide the scikit-learn version-mismatch warning
warnings.filterwarnings("ignore", category=InconsistentVersionWarning)

# =========================
# Helper functions
# =========================

def load_model(model_path):
    """Load a serialized model from ``model_path`` with joblib.

    Returns whatever object ``joblib.load`` deserializes from that file.
    """
    # NOTE(review): joblib.load unpickles the file, which can execute
    # arbitrary code -- only load model files from a trusted source.
    return joblib.load(model_path)

def get_model_type(model):
    """Classify a model as "tree", "svm" or "other" from its class name.

    The decision is a case-insensitive substring match on
    ``model.__class__.__name__``; the tree markers are checked first, so a
    name containing both kinds of marker is reported as "tree".
    """
    class_name = model.__class__.__name__.lower()
    tree_markers = ("xgb", "forest", "tree", "boost")
    svm_markers = ("svc", "svm")

    if any(marker in class_name for marker in tree_markers):
        return "tree"
    if any(marker in class_name for marker in svm_markers):
        return "svm"
    return "other"

def subsample_X(X, n=1000, random_state=42):
    """Return at most ``n`` randomly sampled rows of ``X`` to speed up SHAP.

    ``X`` is returned unchanged (the same object) when it has ``n`` rows or
    fewer; otherwise ``X.sample`` draws ``n`` rows with ``random_state``.
    """
    if len(X) <= n:
        return X
    return X.sample(n, random_state=random_state)

def compute_shap_values(model, X, model_type, n_background=500, class_index=1):
    """
    Compute SHAP values for a single class (class_index, e.g. 1 = toxic).

    model_type == "tree" uses shap.TreeExplainer (interventional
    perturbation, probability output); any other model_type uses
    shap.KernelExplainer on model.predict_proba.  The background set is X
    itself, or a fixed-seed sample of n_background rows when X is larger.

    Returns (explainer, shap_values) where shap_values is expected to have
    shape (n_samples, n_features).
    """
    # Background data is chosen the same way for both explainer kinds.
    background = X.sample(n_background, random_state=42) if len(X) > n_background else X

    if model_type == "tree":
        explainer = shap.TreeExplainer(
            model,
            data=background,
            feature_perturbation="interventional",
            model_output="probability",
        )
        raw = explainer.shap_values(X, check_additivity=False)
    else:
        explainer = shap.KernelExplainer(model.predict_proba, background)
        raw = explainer.shap_values(X)

    # Normalise the layout: the explainer output is handled both as a
    # per-class list and as an array with a trailing class axis.
    if isinstance(raw, list):
        sv = np.array(raw[class_index])  # (n_samples, n_features)
    else:
        sv = np.array(raw)
        if sv.ndim == 3:  # (n_samples, n_features, n_classes)
            sv = sv[:, :, class_index]

    # Anything still above 2-D is flattened along the feature axes.
    if sv.ndim > 2:
        sv = sv.reshape(sv.shape[0], -1)

    return explainer, sv  # sv: (n_samples, n_features)

def save_shap_summary(shap_values, X, out_path_png, max_display=20):
    """Draw the SHAP summary plot (global importance) for one class and save it.

    shap_values is converted to an array and plotted against X, labelled
    with X's column names; max_display is passed through to
    shap.summary_plot.  The current figure is saved to out_path_png at
    300 dpi and then closed.

    NOTE(review): shap.summary_plot may resize the figure itself (its
    plot_size option), so the (8, 6) size is possibly not final -- confirm.
    """
    values = np.array(shap_values)  # expected (n_samples, n_features)
    column_names = X.columns.tolist()

    plt.figure(figsize=(8, 6))
    shap.summary_plot(
        values,
        X,
        feature_names=column_names,
        max_display=max_display,
        show=False,
    )
    plt.tight_layout()
    plt.savefig(out_path_png, dpi=300, bbox_inches="tight")
    plt.close()

def expand_list_column(df, col_name, prefix):
    """
    Expand a column of list-like fingerprints into one column per position.

    Each cell of df[col_name] is a list or its string form such as
    '[0, 1, 0, ...]'; every cell is parsed with ast.literal_eval(str(cell)).
    The number of output columns is the length of the first row.

    Return: DataFrame indexed like df with columns prefix_0, prefix_1, ...
    """
    rows = [ast.literal_eval(str(cell)) for cell in df[col_name]]
    width = len(rows[0])
    matrix = np.vstack(rows)
    names = [f"{prefix}_{pos}" for pos in range(width)]
    return pd.DataFrame(matrix, columns=names, index=df.index)

def get_top_shap_features(shap_values, X, top_n=20):
    """
    Rank features by mean(|SHAP|) for one class and return the top_n.

    shap_values is expected to be (n_samples, n_features); inputs with more
    than two dimensions are flattened to (n_samples, -1).  If the number of
    SHAP columns differs from X.shape[1], a warning is printed and only the
    first min(n_shap, n_X) columns of each are used.

    Returns a DataFrame with the columns "rank" (1 = most important),
    "feature" and "mean_abs_shap", ordered by decreasing importance.
    top_n=0 yields an empty table.
    """
    sv = np.array(shap_values)
    if sv.ndim > 2:
        sv = sv.reshape(sv.shape[0], -1)

    mean_abs = np.mean(np.abs(sv), axis=0)   # (n_features,)

    # Consistency check against X: fall back to the shared leading columns.
    n_feat_X = X.shape[1]
    cols = np.array(X.columns)
    if mean_abs.shape[0] != n_feat_X:
        n_common = min(mean_abs.shape[0], n_feat_X)
        print(f"[WARNING] n_features SHAP ({mean_abs.shape[0]}) != n_features X ({n_feat_X}), "
              f"pakai {n_common} pertama.")
        mean_abs = mean_abs[:n_common]
        cols = cols[:n_common]

    # Reverse before slicing: a "[-top_n:]" slice would return *every*
    # feature for top_n == 0, because -0 == 0.
    idx = np.argsort(mean_abs)[::-1][:top_n]
    idx = np.asarray(idx).ravel().astype(int)

    data = {
        "rank": np.arange(1, len(idx) + 1),
        "feature": cols[idx],
        "mean_abs_shap": mean_abs[idx],
    }
    return pd.DataFrame(data)

def save_shap_dependence(shap_values, X, feature_name, out_path_png):
    """Save the SHAP dependence plot of one feature (one class) as a PNG.

    shap_values is converted to an array (expected (n_samples, n_features))
    and plotted against X for the column feature_name.  The figure is
    written to out_path_png at 300 dpi and is always closed afterwards, also
    when plotting or saving raises.
    """
    sv = np.array(shap_values)  # (n_samples, n_features)
    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        # Draw on our own axes: without ``ax`` shap.dependence_plot opens a
        # figure of its own, and the one created here would stay open
        # (one leaked figure per call) with its figsize never applied.
        shap.dependence_plot(
            feature_name,
            sv,
            X,
            ax=ax,
            show=False
        )
        fig.tight_layout()
        fig.savefig(out_path_png, dpi=300, bbox_inches="tight")
    finally:
        plt.close(fig)

# =========================
# 1. Load TRAIN set
# =========================

# Read the training spreadsheet and print its shape and column names as a
# quick sanity check.
df_all = pd.read_excel(TRAIN_PATH)
print("Dev train shape:", df_all.shape)
print("Columns:", df_all.columns.tolist())

# =========================
# 2. Expand fingerprint columns (MACCS, Morgan, APF)
# =========================

# Names of the raw fingerprint columns; each cell holds a whole fingerprint
# as a list / list-like string (parsed by expand_list_column).
FP_COL_MORGAN = "Morgan_Descriptors"
FP_COL_MACCS  = "MACCS_Descriptors"
FP_COL_APF    = "APF_Descriptors"

# One DataFrame per fingerprint, with columns named "<prefix>_<position>".
maccs_bits  = expand_list_column(df_all, FP_COL_MACCS,  "MACCS")
morgan_bits = expand_list_column(df_all, FP_COL_MORGAN, "Morgan")
apf_bits    = expand_list_column(df_all, FP_COL_APF,    "APF")

# Append the expanded columns to the original table (column-wise).
df_all = pd.concat([df_all, maccs_bits, morgan_bits, apf_bits], axis=1)

# Drop the raw fingerprint columns
df_all = df_all.drop(columns=[FP_COL_MACCS, FP_COL_MORGAN, FP_COL_APF])

# =========================
# 3. Define the feature block of each descriptor
# =========================

# Columns that are never used as model inputs.
NON_FEATURE_COLS = [
    "SMILES",
    "Outcome",
    "RDK_Descriptors",
]

# Fingerprint columns are recognised by the prefixes assigned in step 2.
maccs_cols  = [c for c in df_all.columns if c.startswith("MACCS_")]
morgan_cols = [c for c in df_all.columns if c.startswith("Morgan_")]
apf_cols    = [c for c in df_all.columns if c.startswith("APF_")]

# Every remaining column is treated as a physicochemical descriptor.  The
# exclusions are gathered in one set so each membership test is a hash
# lookup instead of a scan over three (potentially very long) lists.
_excluded_cols = set(NON_FEATURE_COLS).union(maccs_cols, morgan_cols, apf_cols)
physchem_cols = [c for c in df_all.columns if c not in _excluded_cols]

print("Num MACCS features:", len(maccs_cols))
print("Num Morgan features:", len(morgan_cols))
print("Num APF features:", len(apf_cols))
print("Num physchem features:", len(physchem_cols))

# =========================
# 4. Build X per descriptor + force numeric dtypes
# =========================

# pd.to_numeric(errors="raise") is applied column by column, so any
# non-numeric cell aborts the run here.  DataFrame.apply returns a new
# frame, so no explicit copy of the df_all slice is needed.
X_maccs  = df_all[maccs_cols].apply(pd.to_numeric, errors="raise")
X_morgan = df_all[morgan_cols].apply(pd.to_numeric, errors="raise")
X_apf    = df_all[apf_cols].apply(pd.to_numeric, errors="raise")
X_phys   = df_all[physchem_cols].apply(pd.to_numeric, errors="raise")

# =========================
# 4b. Subsample for SHAP (speed-up)
# =========================

N_SHAP_SAMPLE = 1000  # can be lowered to 500 if this is still too heavy

X_maccs_sub  = subsample_X(X_maccs,  N_SHAP_SAMPLE)
X_morgan_sub = subsample_X(X_morgan, N_SHAP_SAMPLE)
X_apf_sub    = subsample_X(X_apf,    N_SHAP_SAMPLE)
X_phys_sub   = subsample_X(X_phys,   N_SHAP_SAMPLE)

# =========================
# 5. SHAP for each model/descriptor
# =========================
# Every section below follows the same steps: load the model, compute SHAP
# values on the subsampled X, save the summary plot, save the top-20 table
# as CSV, then save one dependence plot per top-TOP_DEP feature.

TOP_DEP = 5  # number of top-ranked features that get a dependence plot

# --- MACCS (SVM) ---
# NOTE(review): presumably an SVM (per the model file name); get_model_type
# would then not return "tree", so compute_shap_values takes its
# KernelExplainer / predict_proba branch -- confirm the model exposes
# predict_proba.
print("Dev (train) - MACCS (SVM) SHAP")
model_maccs = load_model(MODEL_MACCS)
type_maccs = get_model_type(model_maccs)
_, shap_maccs = compute_shap_values(model_maccs, X_maccs_sub, type_maccs)

out_maccs = os.path.join(OUT_DIR, "Dev_train_MACCS_SHAP_summary.png")
save_shap_summary(shap_maccs, X_maccs_sub, out_maccs, max_display=20)

df_top_maccs = get_top_shap_features(shap_maccs, X_maccs_sub, top_n=20)
csv_maccs = os.path.join(OUT_DIR, "Dev_train_MACCS_SHAP_top20.csv")
df_top_maccs.to_csv(csv_maccs, index=False)

# The feature name becomes part of the output file name.
for feat in df_top_maccs["feature"].iloc[:TOP_DEP]:
    dep_maccs = os.path.join(OUT_DIR, f"Dev_train_MACCS_SHAP_dependence_{feat}.png")
    save_shap_dependence(shap_maccs, X_maccs_sub, feat, dep_maccs)

# --- Morgan (XGB) ---
print("Dev (train) - Morgan (XGB) SHAP")
model_morgan = load_model(MODEL_MORGAN)
type_morgan = get_model_type(model_morgan)
_, shap_morgan = compute_shap_values(model_morgan, X_morgan_sub, type_morgan)

out_morgan = os.path.join(OUT_DIR, "Dev_train_Morgan_SHAP_summary.png")
save_shap_summary(shap_morgan, X_morgan_sub, out_morgan, max_display=20)

df_top_morgan = get_top_shap_features(shap_morgan, X_morgan_sub, top_n=20)
csv_morgan = os.path.join(OUT_DIR, "Dev_train_Morgan_SHAP_top20.csv")
df_top_morgan.to_csv(csv_morgan, index=False)

for feat in df_top_morgan["feature"].iloc[:TOP_DEP]:
    dep_morgan = os.path.join(OUT_DIR, f"Dev_train_Morgan_SHAP_dependence_{feat}.png")
    save_shap_dependence(shap_morgan, X_morgan_sub, feat, dep_morgan)

# --- APF (RF) ---
print("Dev (train) - APF (RF) SHAP")
model_apf = load_model(MODEL_APF)
type_apf = get_model_type(model_apf)
_, shap_apf = compute_shap_values(model_apf, X_apf_sub, type_apf)

out_apf = os.path.join(OUT_DIR, "Dev_train_APF_SHAP_summary.png")
save_shap_summary(shap_apf, X_apf_sub, out_apf, max_display=20)

df_top_apf = get_top_shap_features(shap_apf, X_apf_sub, top_n=20)
csv_apf = os.path.join(OUT_DIR, "Dev_train_APF_SHAP_top20.csv")
df_top_apf.to_csv(csv_apf, index=False)

for feat in df_top_apf["feature"].iloc[:TOP_DEP]:
    dep_apf = os.path.join(OUT_DIR, f"Dev_train_APF_SHAP_dependence_{feat}.png")
    save_shap_dependence(shap_apf, X_apf_sub, feat, dep_apf)

# --- Physicochemical (RF) ---
print("Dev (train) - Physchem (RF) SHAP")
model_phys = load_model(MODEL_PHYS)
type_phys = get_model_type(model_phys)
_, shap_phys = compute_shap_values(model_phys, X_phys_sub, type_phys)

out_phys = os.path.join(OUT_DIR, "Dev_train_Physchem_SHAP_summary.png")
save_shap_summary(shap_phys, X_phys_sub, out_phys, max_display=20)

df_top_phys = get_top_shap_features(shap_phys, X_phys_sub, top_n=20)
csv_phys = os.path.join(OUT_DIR, "Dev_train_Physchem_SHAP_top20.csv")
df_top_phys.to_csv(csv_phys, index=False)

for feat in df_top_phys["feature"].iloc[:TOP_DEP]:
    dep_phys = os.path.join(OUT_DIR, f"Dev_train_Physchem_SHAP_dependence_{feat}.png")
    save_shap_dependence(shap_phys, X_phys_sub, feat, dep_phys)

print("Done. SHAP train-set (subsample) plots and tables saved to:", OUT_DIR)


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Top-20 SHAP tables; each one gets a horizontal bar chart saved next to it.
files = [
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Development_Toxicity\Dev_train_Physchem_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Development_Toxicity\Dev_train_APF_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Development_Toxicity\Dev_train_MACCS_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Development_Toxicity\Dev_train_Morgan_SHAP_top20.csv",
]

# Column names expected in the CSVs; adjust here if they differ.
feat_col = "feature"
shap_col = "mean_abs_shap"

for path in files:
    # The CSV's base name (without extension) is both the chart title and
    # the stem of the output PNG, which is written beside the CSV.
    folder, base = os.path.split(path)
    title = os.path.splitext(base)[0]
    out_path = os.path.join(folder, f"{title}_SHAP_barh.png")

    # Rows are plotted in ascending order of mean |SHAP|.
    df_plot = pd.read_csv(path).sort_values(shap_col, ascending=True)

    plt.figure(figsize=(6, 8))
    plt.barh(df_plot[feat_col], df_plot[shap_col], color="red")
    plt.xlabel("Mean |SHAP| value")
    plt.ylabel("Feature")
    plt.title(title)
    plt.tight_layout()

    plt.savefig(out_path, dpi=300)
    plt.close()

    print("Saved:", out_path)


In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Panel images in row-major order for the 2x4 grid; every descriptor
# contributes a (bar chart, summary plot) pair.
paths = [
    # Row 1: Morgan (barh, summary)
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Neurotoxicity\Neurotox_train_Morgan_SHAP_top20_SHAP_barh.png",
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Neurotoxicity\Neurotox_train_Morgan_SHAP_summary.png",

    # Row 1: MACCS (barh, summary)
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Neurotoxicity\Neurotox_train_MACCS_SHAP_top20_SHAP_barh.png",
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Neurotoxicity\Neurotox_train_MACCS_SHAP_summary.png",

    # Row 2: APF (barh, summary)
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Neurotoxicity\Neurotox_train_APF_SHAP_top20_SHAP_barh.png",
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Neurotoxicity\Neurotox_train_APF_SHAP_summary.png",

    # Row 2: Physchem (barh, summary)
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Neurotoxicity\Neurotox_train_Physchem_SHAP_top20_SHAP_barh.png",
    r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Neurotoxicity\Neurotox_train_Physchem_SHAP_summary.png",
]

fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(16, 8))

# Fill the grid left-to-right, top-to-bottom; each panel shows one image
# with its axis turned off.
for panel, image_path in zip(axes.flat, paths):
    panel.imshow(mpimg.imread(image_path))
    panel.axis("off")

# Zero margins and zero spacing between the subplots.
fig.subplots_adjust(
    left=0.0, right=1.0, top=1.0, bottom=0.0,
    wspace=0.0, hspace=0.0,
)

out_path = r"C:\Fauzan\Manuskrip QSAR 3\SHAP\Neurotoxicity\Neurotox_SHAP_Morgan_MACCS_APF_Physchem_2x4.png"
fig.savefig(out_path, dpi=300)
plt.close(fig)

print("Saved:", out_path)


In [None]:
import pandas as pd
from pathlib import Path

# ------------------------------------------------------------------
# List of SHAP top-20 files
# ------------------------------------------------------------------
# File names follow "<endpoint>_train_<descriptor>_SHAP_top20.csv"; the
# endpoint and descriptor are later parsed from the file name.
files = [
    # AMES
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\AMES_Mutagenicity\AMES_train_Physchem_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\AMES_Mutagenicity\AMES_train_APF_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\AMES_Mutagenicity\AMES_train_MACCS_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\AMES_Mutagenicity\AMES_train_Morgan_SHAP_top20.csv",

    # Cardiotoxicity
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Cardiotoxicity_hERG\Cardiotox_train_Physchem_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Cardiotoxicity_hERG\Cardiotox_train_APF_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Cardiotoxicity_hERG\Cardiotox_train_MACCS_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Cardiotoxicity_hERG\Cardiotox_train_Morgan_SHAP_top20.csv",

    # Developmental toxicity
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Development_Toxicity\Dev_train_Morgan_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Development_Toxicity\Dev_train_Physchem_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Development_Toxicity\Dev_train_APF_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Development_Toxicity\Dev_train_MACCS_SHAP_top20.csv",

    # DIN
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Drug_Induced_Nephrotoxicity_DIN\DIN_train_Morgan_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Drug_Induced_Nephrotoxicity_DIN\DIN_train_Physchem_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Drug_Induced_Nephrotoxicity_DIN\DIN_train_APF_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Drug_Induced_Nephrotoxicity_DIN\DIN_train_MACCS_SHAP_top20.csv",

    # Hepatotoxicity
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Hepatotoxicity\Hepato_train_Morgan_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Hepatotoxicity\Hepato_train_Physchem_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Hepatotoxicity\Hepato_train_APF_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Hepatotoxicity\Hepato_train_MACCS_SHAP_top20.csv",

    # Neurotoxicity
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Neurotoxicity\Neurotox_train_Morgan_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Neurotoxicity\Neurotox_train_Physchem_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Neurotoxicity\Neurotox_train_APF_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Neurotoxicity\Neurotox_train_MACCS_SHAP_top20.csv",

    # Respiratory irritation
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Respiratory_Irritation\Resp_train_Morgan_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Respiratory_Irritation\Resp_train_Physchem_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Respiratory_Irritation\Resp_train_APF_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Respiratory_Irritation\Resp_train_MACCS_SHAP_top20.csv",

    # Skin sensitization
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Skin_Sensitization\SkinSen_train_Morgan_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Skin_Sensitization\SkinSen_train_Physchem_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Skin_Sensitization\SkinSen_train_APF_SHAP_top20.csv",
    r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\Skin_Sensitization\SkinSen_train_MACCS_SHAP_top20.csv",
]

# ------------------------------------------------------------------
# Read & combine
# ------------------------------------------------------------------
dfs = []

for csv_path in files:
    # Metadata comes from the file name: the stem is split on "_" and the
    # first and third tokens are used as endpoint and descriptor.
    tokens = Path(csv_path).stem.split("_")

    # Keep only the rank/feature columns and tag every row with its origin.
    table = pd.read_csv(csv_path)[["rank", "feature"]].assign(
        endpoint=tokens[0],
        descriptor=tokens[2],  # Physchem / APF / MACCS / Morgan
    )
    dfs.append(table)

# One long table, ordered by endpoint, then descriptor, then rank.
shap_summary = (
    pd.concat(dfs, ignore_index=True)
    .sort_values(by=["endpoint", "descriptor", "rank"])
    .reset_index(drop=True)
)

# ------------------------------------------------------------------
# Save the result
# ------------------------------------------------------------------
out_file = r"C:\Fauzan\Manuskrip QSAR 3\Revisi 1\SHAP\SHAP_top20_rank_feature_summary.csv"
shap_summary.to_csv(out_file, index=False)

print("Ringkasan SHAP berhasil dibuat:")
print(shap_summary.head())
