# 1. Import Module

In [None]:
# =========================
# Standard Libraries
# =========================
import sys
import os
import csv
import bz2
import pickle
import _pickle as cPickle
import multiprocessing
import warnings
from glob import glob

# Silence all warnings for the rest of the session.
def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Drop-in replacement for ``warnings.warn`` so that calls to it are
    discarded instead of emitting a warning.
    """
    return None


# Swap the real emitter for the no-op above and add a blanket "ignore"
# filter for warnings raised through other code paths.
warnings.warn = warn
warnings.filterwarnings("ignore")

# =========================
# IPython Extensions
# =========================
%reload_ext autoreload
%autoreload 2

# =========================
# Data Handling
# =========================
import numpy as np
import pandas as pd
from tqdm import tqdm

# =========================
# RDKit
# =========================
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, PandasTools, Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Descriptors import MolLogP
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.DataStructs import ExplicitBitVect
from rdkit.Chem import MACCSkeys
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem import rdMolDescriptors

# =========================
# Standardiser
# =========================
from standardiser import break_bonds, neutralise, rules, unsalt
from standardiser.utils import StandardiseException, sanity_check

# =========================
# Machine Learning / Scikit-learn
# =========================
from sklearn.metrics import (
    accuracy_score, roc_auc_score, confusion_matrix, precision_score,
    recall_score, f1_score, cohen_kappa_score, make_scorer
)
from sklearn.model_selection import (
    ShuffleSplit, RepeatedStratifiedKFold, StratifiedShuffleSplit,
    GridSearchCV, StratifiedKFold
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
import joblib


In [None]:
import pandas as pd
from rdkit.Chem.Scaffolds import MurckoScaffold

# ==========================
# Load the dataset
# ==========================
# NOTE(review): this path points at an "FDA MDD" training set, while the model
# cells of this notebook write to an "Acute Dermal Toxicity" folder — confirm
# that the intended dataset is the one being inspected here.
train_df = pd.read_excel(r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\FDA MDD (manual split)\Dataset\Train_set_FDAMMD_with_fingerprints_sorted_with_RDKit_and_CDK_features.xlsx")

# ==========================
# 1) Dataset size and class distribution
# ==========================
print("=== Dataset Info ===")
# Print the row count and the feature count (column count minus 2, for SMILES and Outcome)
print(f"Total rows: {train_df.shape[0]}, Total features (excluding SMILES/target): {train_df.shape[1]-2}")

# Print the absolute class counts
print("\nClass distribution:")
print(train_df['Outcome'].value_counts())

# Print the class proportions
print("\nClass ratio:")
print(train_df['Outcome'].value_counts(normalize=True))

# ==========================
# 2) Duplicate SMILES check
# ==========================
if 'SMILES' in train_df.columns:
    dup_count = train_df['SMILES'].duplicated().sum()
    print(f"\nNumber of duplicated SMILES: {dup_count}")
else:
    print("\nSMILES column not found for duplicate check.")  # reported when there is no SMILES column

# ==========================
# 3) Missing values (NaN) and feature dtypes
# ==========================
# Drop SMILES and Outcome (if present), then keep only int64/float64 columns
numeric_features = train_df.drop(columns=['SMILES','Outcome'], errors='ignore').select_dtypes(include=['int64','float64'])

print(f"\nNumber of numeric features: {numeric_features.shape[1]}")  # number of numeric features
print(f"Any missing values: {numeric_features.isna().sum().sum()}")   # total count of NaN cells

print("\nFeature types:")
print(train_df.dtypes.value_counts())  # number of columns per dtype

# ==========================
# 4) Correlation between features and target
# ==========================
# Correlation of every numeric feature with Outcome
corr = numeric_features.corrwith(train_df['Outcome'])
# Top 10 features ranked by absolute correlation
top_corr = corr.abs().sort_values(ascending=False).head(10)
print("\nTop 10 features most correlated with target:")
print(top_corr)

# ==========================
# 5) Murcko scaffold uniqueness check (optional)
# ==========================
if 'SMILES' in train_df.columns:
    try:
        # Derive the Murcko scaffold SMILES for every row
        scaffolds = train_df['SMILES'].apply(lambda s: MurckoScaffold.MurckoScaffoldSmiles(smiles=s))
        # Print the number of distinct scaffolds
        print(f"\nNumber of unique Murcko scaffolds: {scaffolds.nunique()}")
    except Exception as e:
        # Best-effort check: any failure on any row skips the count and is only reported
        print(f"Error computing scaffolds: {e}")


# RANDOM FOREST

In [None]:
# ================================
# Import libraries
# ================================
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score,
    confusion_matrix, precision_score, recall_score, f1_score
)
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import joblib
import os

# ================================
# Bemis–Murcko scaffold-based 10-fold
# ================================
def get_bemis_murcko_scaffold(smiles):
    """Return the scaffold SMILES for ``smiles``, or ``None`` if unavailable.

    The input is parsed with ``Chem.MolFromSmiles``; the scaffold comes from
    ``MurckoScaffold.GetScaffoldForMol`` and is written back out with
    ``Chem.MolToSmiles``. ``None`` is returned when parsing yields no molecule
    or when no scaffold object is produced.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None

    core = MurckoScaffold.GetScaffoldForMol(parsed)
    if core is None:
        return None
    return Chem.MolToSmiles(core)

def scaffold_kfold_indices(smiles_list, n_splits=10, random_state=42):
    """Build K-fold index pairs in which a scaffold group is never split.

    Rows are grouped by the value of ``get_bemis_murcko_scaffold`` (rows whose
    scaffold is ``None`` form one group of their own). The distinct scaffolds
    are shuffled with a ``RandomState`` seeded by ``random_state`` and then
    partitioned by an unshuffled ``KFold``; each fold's row indices are the
    concatenation of the rows of its scaffolds.

    Args:
        smiles_list: Iterable of SMILES strings, one per row.
        n_splits: Number of folds passed to ``KFold``.
        random_state: Seed for the scaffold shuffle.

    Returns:
        List of ``(train_indices, validation_indices)`` tuples of integer
        numpy arrays, one tuple per fold.
    """
    # Row positions per scaffold; dict order is first-seen scaffold order.
    groups = {}
    for position, smi in enumerate(smiles_list):
        key = get_bemis_murcko_scaffold(smi)
        if key not in groups:
            groups[key] = []
        groups[key].append(position)

    shuffled_keys = list(groups)
    np.random.RandomState(random_state).shuffle(shuffled_keys)
    key_array = np.array(shuffled_keys)

    def _rows_for(key_positions):
        # Expand scaffold positions into the row indices they own.
        rows = []
        for k in key_positions:
            rows.extend(groups[key_array[k]])
        return np.array(rows, dtype=int)

    splitter = KFold(n_splits=n_splits, shuffle=False)
    return [
        (_rows_for(train_keys), _rows_for(val_keys))
        for train_keys, val_keys in splitter.split(key_array)
    ]

def compute_bacc_from_preds(y_true, y_pred):
    """Return balanced accuracy, sensitivity and specificity for 0/1 labels.

    Both inputs are cast to integer arrays; class 1 is treated as positive and
    class 0 as negative. The four outcome counts are taken directly from the
    label arrays rather than by unpacking a confusion matrix, because that
    matrix collapses to 1x1 when only one class is present and a four-way
    unpack of it raises ``ValueError``.

    Args:
        y_true: Array-like of true labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1).

    Returns:
        Tuple ``(bacc, sens, spec)`` of floats. A rate whose denominator is
        zero (no positives, or no negatives, in ``y_true``) is reported as 0.0.
    """
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.asarray(y_pred, dtype=int)

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    tp = int(np.count_nonzero(actual_pos & (y_pred == 1)))
    fn = int(np.count_nonzero(actual_pos & (y_pred == 0)))
    tn = int(np.count_nonzero(actual_neg & (y_pred == 0)))
    fp = int(np.count_nonzero(actual_neg & (y_pred == 1)))

    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)
    return bacc, sens, spec

def calculate_ccr(sensitivity, specificity):
    """Return the arithmetic mean of ``sensitivity`` and ``specificity``."""
    combined = sensitivity + specificity
    return combined / 2.0

# ================================
# Load dataset
# ================================
# NOTE(review): `train_path` is not assigned in any cell visible in this
# notebook; it must already hold the path of the training-set .xlsx when this
# cell runs — confirm where it is defined.
data_file = train_path  # path to the dataset file
df = pd.read_excel(data_file)

# SMILES strings, used only to derive scaffolds for the CV split
smiles_all = df['SMILES'].astype(str).values

# ================================
# Features and target
# ================================
drop_cols = ['SMILES', 'Morgan_Descriptors', 'MACCS_Descriptors', 'APF_Descriptors', 'Outcome']
X_values = df.drop(columns=drop_cols).values  # RDKit+CDK physchem features
y_values = df['Outcome'].astype(int).values   # target variable

print("Dataset loaded:", X_values.shape, "features,", len(y_values), "samples")

# ================================
# Hyperparameter grid
# ================================
n_features = X_values.shape[1]
paramgrid = {
    # Integer fractions of the feature count, in the original search order.
    # Clamped to 1 because n_features // d is 0 for narrow tables and
    # max_features=0 is not a valid setting.
    "max_features": [max(1, n_features // d) for d in (1, 2, 4, 12, 10, 7, 5, 3)],
    "n_estimators": [10, 100, 300, 500],
}

# Model selection inside GridSearchCV is driven by Cohen's kappa.
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# ================================
# 10-fold scaffold-CV
# ================================
folds = scaffold_kfold_indices(smiles_all, n_splits=10, random_state=42)

# Per-fold metric accumulators (`recalls` mirrors `sensitivity_scores`).
accuracies, auc_scores, precisions, recalls = [], [], [], []
f1_scores, specificities, sensitivity_scores = [], [], []
ppvs, npvs, ccrs, bacc_scores = [], [], [], []

for train_idx, test_idx in tqdm(folds, total=len(folds), desc="Scaffold-CV folds (RDKit+CDK RF)"):
    X_train, X_test = X_values[train_idx], X_values[test_idx]
    y_train, y_test = y_values[train_idx], y_values[test_idx]

    # Inner 5-fold grid search on the training part of this scaffold fold.
    # NOTE(review): no random_state is passed to RandomForestClassifier, so
    # repeated runs are not expected to reproduce the same numbers — confirm
    # this is intended.
    grid = GridSearchCV(
        estimator=RandomForestClassifier(class_weight='balanced'),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        n_jobs=-1,
        verbose=0
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    try:
        auc = roc_auc_score(y_test, y_proba)
    except ValueError:
        # AUC cannot be computed for this fold; excluded later by nanmean.
        auc = np.nan

    prec = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # One confusion matrix pinned to labels [0, 1]: it stays 2x2 even when a
    # held-out scaffold fold contains a single class, so the unpacking below
    # cannot fail the way an inferred-label matrix would.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    ccr = calculate_ccr(sens, spec)

    accuracies.append(acc)
    auc_scores.append(auc)
    precisions.append(prec)
    recalls.append(sens)
    f1_scores.append(f1)
    specificities.append(spec)
    sensitivity_scores.append(sens)
    ppvs.append(ppv)
    npvs.append(npv)
    ccrs.append(ccr)
    bacc_scores.append(bacc)

    print(f"Fold: AUC={auc:.4f}, BACC={bacc:.4f}, ACC={acc:.4f}")

# ================================
# CV summary (mean over folds) + Sm
# ================================
mean_acc = np.nanmean(accuracies)
mean_auc = np.nanmean(auc_scores)
mean_bacc = np.nanmean(bacc_scores)
# Sm is the average of the mean AUC and the mean balanced accuracy.
Sm = 0.5 * (mean_auc + mean_bacc)

metrics_summary = {
    "Accuracy": mean_acc,
    "AUC": mean_auc,
    "BACC": mean_bacc,
    "Precision": np.nanmean(precisions),
    "Recall (Sensitivity)": np.nanmean(sensitivity_scores),
    "F1": np.nanmean(f1_scores),
    "Specificity": np.nanmean(specificities),
    "PPV": np.nanmean(ppvs),
    "NPV": np.nanmean(npvs),
    "CCR": np.nanmean(ccrs),
    "Sm": Sm
}

print("\n===== Scaffold-CV Results (RDKit+CDK RF) =====")
for k, v in metrics_summary.items():
    print(f"CV {k}: {v:.4f}")

# ================================
# Train the final model on the full dataset
# ================================
grid_final = GridSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced'),
    param_grid=paramgrid,
    scoring=kappa_scorer,
    cv=5,
    n_jobs=-1,
    verbose=0
)
grid_final.fit(X_values, y_values)
final_model = grid_final.best_estimator_

# ================================
# Save model and metrics (+ Sm)
# ================================
output_dir = r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model"
os.makedirs(output_dir, exist_ok=True)

model_path = os.path.join(output_dir, "Dermal_rf_rdkitcdk.pkl")
joblib.dump(final_model, model_path, compress=9)
print(f"\nFinal model saved: {model_path}")

metrics_path = os.path.join(output_dir, "Dermal_rf_rdkitcdk_metrics_with_Sm.xlsx")
pd.DataFrame([metrics_summary]).to_excel(metrics_path, index=False)
print(f"CV metrics (incl. Sm) report saved: {metrics_path}")

# Sm for the consensus QSAR step
sm_csv_path = os.path.join(output_dir, "Sm_Physchem_RF.csv")
pd.DataFrame([{"Descriptor": "Physchem", "Algorithm": "RF", "Sm": Sm}]).to_csv(sm_csv_path, index=False)
print(f"Sm for Physchem RF saved at: {sm_csv_path}")


# XGBOOST

In [None]:
# ================================
# Import libraries
# ================================
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score,
    confusion_matrix, precision_score, recall_score, f1_score
)
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBClassifier
from tqdm import tqdm
import joblib
import os

# ================================
# Scaffold-based 10-fold utils
# ================================
def get_bemis_murcko_scaffold(smiles):
    """Return the scaffold SMILES for ``smiles``, or ``None`` if unavailable.

    The input is parsed with ``Chem.MolFromSmiles``; the scaffold comes from
    ``MurckoScaffold.GetScaffoldForMol`` and is written back out with
    ``Chem.MolToSmiles``. ``None`` is returned when parsing yields no molecule
    or when no scaffold object is produced.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None

    core = MurckoScaffold.GetScaffoldForMol(parsed)
    if core is None:
        return None
    return Chem.MolToSmiles(core)

def scaffold_kfold_indices(smiles_list, n_splits=10, random_state=42):
    """Build K-fold index pairs in which a scaffold group is never split.

    Rows are grouped by the value of ``get_bemis_murcko_scaffold`` (rows whose
    scaffold is ``None`` form one group of their own). The distinct scaffolds
    are shuffled with a ``RandomState`` seeded by ``random_state`` and then
    partitioned by an unshuffled ``KFold``; each fold's row indices are the
    concatenation of the rows of its scaffolds.

    Args:
        smiles_list: Iterable of SMILES strings, one per row.
        n_splits: Number of folds passed to ``KFold``.
        random_state: Seed for the scaffold shuffle.

    Returns:
        List of ``(train_indices, validation_indices)`` tuples of integer
        numpy arrays, one tuple per fold.
    """
    # Row positions per scaffold; dict order is first-seen scaffold order.
    groups = {}
    for position, smi in enumerate(smiles_list):
        key = get_bemis_murcko_scaffold(smi)
        if key not in groups:
            groups[key] = []
        groups[key].append(position)

    shuffled_keys = list(groups)
    np.random.RandomState(random_state).shuffle(shuffled_keys)
    key_array = np.array(shuffled_keys)

    def _rows_for(key_positions):
        # Expand scaffold positions into the row indices they own.
        rows = []
        for k in key_positions:
            rows.extend(groups[key_array[k]])
        return np.array(rows, dtype=int)

    splitter = KFold(n_splits=n_splits, shuffle=False)
    return [
        (_rows_for(train_keys), _rows_for(val_keys))
        for train_keys, val_keys in splitter.split(key_array)
    ]

def compute_bacc_from_preds(y_true, y_pred):
    """Return balanced accuracy, sensitivity and specificity for 0/1 labels.

    Both inputs are cast to integer arrays; class 1 is treated as positive and
    class 0 as negative. The four outcome counts are taken directly from the
    label arrays rather than by unpacking a confusion matrix, because that
    matrix collapses to 1x1 when only one class is present and a four-way
    unpack of it raises ``ValueError``.

    Args:
        y_true: Array-like of true labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1).

    Returns:
        Tuple ``(bacc, sens, spec)`` of floats. A rate whose denominator is
        zero (no positives, or no negatives, in ``y_true``) is reported as 0.0.
    """
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.asarray(y_pred, dtype=int)

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    tp = int(np.count_nonzero(actual_pos & (y_pred == 1)))
    fn = int(np.count_nonzero(actual_pos & (y_pred == 0)))
    tn = int(np.count_nonzero(actual_neg & (y_pred == 0)))
    fp = int(np.count_nonzero(actual_neg & (y_pred == 1)))

    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)
    return bacc, sens, spec

def calculate_ccr(sensitivity, specificity):
    """Return the arithmetic mean of ``sensitivity`` and ``specificity``."""
    combined = sensitivity + specificity
    return combined / 2.0

# ================================
# Load dataset
# ================================
# NOTE(review): `train_path` is not assigned in any cell visible in this
# notebook; it must already hold the path of the training-set .xlsx when this
# cell runs — confirm where it is defined.
data_file = train_path
df = pd.read_excel(data_file)

# SMILES strings, used only to derive scaffolds for the CV split
smiles_all = df['SMILES'].astype(str).values

# ================================
# Features & Target (physchem only)
# ================================
drop_cols = ['SMILES', 'Morgan_Descriptors', 'MACCS_Descriptors', 'APF_Descriptors', 'Outcome']
X_values = df.drop(columns=drop_cols).values
y_values = df['Outcome'].astype(int).values

print("Dataset loaded:", X_values.shape, "features,", len(y_values), "samples")

# ================================
# Hyperparameter grid for XGBoost
# ================================
paramgrid = {
    "max_depth": [3, 5, 7, 10],
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.2]
}

# Model selection inside GridSearchCV is driven by Cohen's kappa.
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# ================================
# 10-fold scaffold-CV
# ================================
folds = scaffold_kfold_indices(smiles_all, n_splits=10, random_state=42)

# Per-fold metric accumulators (`recalls` mirrors `sensitivity_scores`).
accuracies, auc_scores, precisions, recalls = [], [], [], []
f1_scores, specificities, sensitivity_scores = [], [], []
ppvs, npvs, ccrs, bacc_scores = [], [], [], []

for train_idx, test_idx in tqdm(folds, total=len(folds), desc="Scaffold-CV folds (RDKit+CDK XGB)"):
    X_train, X_test = X_values[train_idx], X_values[test_idx]
    y_train, y_test = y_values[train_idx], y_values[test_idx]

    # Inner 5-fold grid search on the training part of this scaffold fold.
    # NOTE(review): `use_label_encoder` is believed to be deprecated in recent
    # XGBoost releases — confirm against the installed version.
    grid = GridSearchCV(
        estimator=XGBClassifier(
            objective='binary:logistic',
            use_label_encoder=False,
            eval_metric='logloss'
        ),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        verbose=0,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    try:
        auc = roc_auc_score(y_test, y_proba)
    except ValueError:
        # AUC cannot be computed for this fold; excluded later by nanmean.
        auc = np.nan

    prec = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # One confusion matrix pinned to labels [0, 1]: it stays 2x2 even when a
    # held-out scaffold fold contains a single class, so the unpacking below
    # cannot fail the way an inferred-label matrix would.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    ccr = calculate_ccr(sens, spec)

    accuracies.append(acc)
    auc_scores.append(auc)
    precisions.append(prec)
    recalls.append(sens)
    f1_scores.append(f1)
    specificities.append(spec)
    sensitivity_scores.append(sens)
    ppvs.append(ppv)
    npvs.append(npv)
    ccrs.append(ccr)
    bacc_scores.append(bacc)

    print(f"Fold: AUC={auc:.4f}, BACC={bacc:.4f}, ACC={acc:.4f}")

# ================================
# CV results + Sm
# ================================
mean_acc = np.nanmean(accuracies)
mean_auc = np.nanmean(auc_scores)
mean_bacc = np.nanmean(bacc_scores)
# Sm is the average of the mean AUC and the mean balanced accuracy.
Sm = 0.5 * (mean_auc + mean_bacc)

metrics_summary = {
    "Accuracy": mean_acc,
    "AUC": mean_auc,
    "BACC": mean_bacc,
    "Precision": np.nanmean(precisions),
    "Recall (Sensitivity)": np.nanmean(sensitivity_scores),
    "F1": np.nanmean(f1_scores),
    "Specificity": np.nanmean(specificities),
    "PPV": np.nanmean(ppvs),
    "NPV": np.nanmean(npvs),
    "CCR": np.nanmean(ccrs),
    "Sm": Sm
}

print("\n===== Scaffold-CV Results (RDKit+CDK XGB) =====")
for k, v in metrics_summary.items():
    print(f"CV {k}: {v:.4f}")

# ================================
# Train final model on full dataset
# ================================
print("\nTraining final XGBoost model on full dataset with GridSearchCV...")
grid_final = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        eval_metric='logloss'
    ),
    param_grid=paramgrid,
    scoring=kappa_scorer,
    cv=5,
    verbose=0,
    n_jobs=-1
)
grid_final.fit(X_values, y_values)
final_model = grid_final.best_estimator_

# ================================
# Save model & metrics
# ================================
output_dir = r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model"
os.makedirs(output_dir, exist_ok=True)

model_path = os.path.join(output_dir, "Dermal_xgb_rdkitcdk.pkl")
joblib.dump(final_model, model_path, compress=9)
print(f"\nFinal XGBoost model saved: {model_path}")

metrics_path = os.path.join(output_dir, "Dermal_xgb_rdkitcdk_metrics_with_Sm.xlsx")
pd.DataFrame([metrics_summary]).to_excel(metrics_path, index=False)
print(f"CV metrics (incl. Sm) report saved: {metrics_path}")

# Sm for the consensus QSAR step
sm_csv_path = os.path.join(output_dir, "Sm_Physchem_XGB.csv")
pd.DataFrame([{"Descriptor": "Physchem", "Algorithm": "XGB", "Sm": Sm}]).to_csv(sm_csv_path, index=False)
print(f"Sm for Physchem XGB saved at: {sm_csv_path}")


# SVM

In [None]:
# ================================
# Import libraries
# ================================
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score,
    confusion_matrix, precision_score, recall_score, f1_score
)
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tqdm import tqdm
import joblib
import os

# ================================
# Scaffold-based 10-fold utils
# ================================
def get_bemis_murcko_scaffold(smiles):
    """Return the scaffold SMILES for ``smiles``, or ``None`` if unavailable.

    The input is parsed with ``Chem.MolFromSmiles``; the scaffold comes from
    ``MurckoScaffold.GetScaffoldForMol`` and is written back out with
    ``Chem.MolToSmiles``. ``None`` is returned when parsing yields no molecule
    or when no scaffold object is produced.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None

    core = MurckoScaffold.GetScaffoldForMol(parsed)
    if core is None:
        return None
    return Chem.MolToSmiles(core)

def scaffold_kfold_indices(smiles_list, n_splits=10, random_state=42):
    """Build K-fold index pairs in which a scaffold group is never split.

    Rows are grouped by the value of ``get_bemis_murcko_scaffold`` (rows whose
    scaffold is ``None`` form one group of their own). The distinct scaffolds
    are shuffled with a ``RandomState`` seeded by ``random_state`` and then
    partitioned by an unshuffled ``KFold``; each fold's row indices are the
    concatenation of the rows of its scaffolds.

    Args:
        smiles_list: Iterable of SMILES strings, one per row.
        n_splits: Number of folds passed to ``KFold``.
        random_state: Seed for the scaffold shuffle.

    Returns:
        List of ``(train_indices, validation_indices)`` tuples of integer
        numpy arrays, one tuple per fold.
    """
    # Row positions per scaffold; dict order is first-seen scaffold order.
    groups = {}
    for position, smi in enumerate(smiles_list):
        key = get_bemis_murcko_scaffold(smi)
        if key not in groups:
            groups[key] = []
        groups[key].append(position)

    shuffled_keys = list(groups)
    np.random.RandomState(random_state).shuffle(shuffled_keys)
    key_array = np.array(shuffled_keys)

    def _rows_for(key_positions):
        # Expand scaffold positions into the row indices they own.
        rows = []
        for k in key_positions:
            rows.extend(groups[key_array[k]])
        return np.array(rows, dtype=int)

    splitter = KFold(n_splits=n_splits, shuffle=False)
    return [
        (_rows_for(train_keys), _rows_for(val_keys))
        for train_keys, val_keys in splitter.split(key_array)
    ]

def compute_bacc_from_preds(y_true, y_pred):
    """Return balanced accuracy, sensitivity and specificity for 0/1 labels.

    Both inputs are cast to integer arrays; class 1 is treated as positive and
    class 0 as negative. The four outcome counts are taken directly from the
    label arrays rather than by unpacking a confusion matrix, because that
    matrix collapses to 1x1 when only one class is present and a four-way
    unpack of it raises ``ValueError``.

    Args:
        y_true: Array-like of true labels (0 or 1).
        y_pred: Array-like of predicted labels (0 or 1).

    Returns:
        Tuple ``(bacc, sens, spec)`` of floats. A rate whose denominator is
        zero (no positives, or no negatives, in ``y_true``) is reported as 0.0.
    """
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.asarray(y_pred, dtype=int)

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    tp = int(np.count_nonzero(actual_pos & (y_pred == 1)))
    fn = int(np.count_nonzero(actual_pos & (y_pred == 0)))
    tn = int(np.count_nonzero(actual_neg & (y_pred == 0)))
    fp = int(np.count_nonzero(actual_neg & (y_pred == 1)))

    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)
    return bacc, sens, spec

def calculate_ccr(sensitivity, specificity):
    """Return the arithmetic mean of ``sensitivity`` and ``specificity``."""
    combined = sensitivity + specificity
    return combined / 2.0

# ================================
# Load dataset
# ================================
# NOTE(review): `train_path` is not assigned in any cell visible in this
# notebook; it must already hold the path of the training-set .xlsx when this
# cell runs — confirm where it is defined.
file_path = train_path
df = pd.read_excel(file_path)

# SMILES strings, used only to derive scaffolds for the CV split
smiles_all = df['SMILES'].astype(str).values

# Drop the non-feature columns (physchem descriptors only)
drop_cols = ['SMILES', 'Morgan_Descriptors', 'MACCS_Descriptors', 'APF_Descriptors', 'Outcome']
X_values = df.drop(columns=drop_cols).values
y_values = df['Outcome'].astype(int).values

print("Dataset loaded:", X_values.shape, "features,", len(y_values), "samples")

# ================================
# Hyperparameter grid
# ================================
# Keys use the "svc__" prefix so they address the "svc" step of the pipeline.
param_grid = {
    "svc__C": [0.1, 1, 10, 100],
    "svc__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "svc__gamma": ['scale', 'auto']
}

# Median imputation -> standardisation -> SVC with probability outputs enabled
# (needed for predict_proba below).
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("svc", SVC(probability=True, class_weight="balanced"))
])

# Model selection inside GridSearchCV is driven by Cohen's kappa.
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# ================================
# 10-fold scaffold-CV
# ================================
folds = scaffold_kfold_indices(smiles_all, n_splits=10, random_state=42)

# Per-fold metric accumulators (`recalls` mirrors `sensitivity_scores`).
accuracies, auc_scores, precisions, recalls = [], [], [], []
f1_scores, specificities, sensitivity_scores = [], [], []
ppvs, npvs, ccrs, bacc_scores = [], [], [], []

for train_idx, test_idx in tqdm(folds, total=len(folds), desc="Scaffold-CV folds (Physchem SVM)"):
    X_train, X_test = X_values[train_idx], X_values[test_idx]
    y_train, y_test = y_values[train_idx], y_values[test_idx]

    # Inner 5-fold grid search on the training part of this scaffold fold.
    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=kappa_scorer,
        cv=5,
        n_jobs=-1,
        verbose=0
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    acc = accuracy_score(y_test, y_pred)
    try:
        auc = roc_auc_score(y_test, y_proba)
    except ValueError:
        # AUC cannot be computed for this fold; excluded later by nanmean.
        auc = np.nan

    prec = precision_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # One confusion matrix pinned to labels [0, 1]: it stays 2x2 even when a
    # held-out scaffold fold contains a single class, so the unpacking below
    # cannot fail the way an inferred-label matrix would.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    ccr = calculate_ccr(sens, spec)

    accuracies.append(acc)
    auc_scores.append(auc)
    precisions.append(prec)
    recalls.append(sens)
    f1_scores.append(f1)
    specificities.append(spec)
    sensitivity_scores.append(sens)
    ppvs.append(ppv)
    npvs.append(npv)
    ccrs.append(ccr)
    bacc_scores.append(bacc)

    print(f"Fold: AUC={auc:.4f}, BACC={bacc:.4f}, ACC={acc:.4f}")

# ================================
# CV results + Sm
# ================================
mean_acc = np.nanmean(accuracies)
mean_auc = np.nanmean(auc_scores)
mean_bacc = np.nanmean(bacc_scores)
# Sm is the average of the mean AUC and the mean balanced accuracy.
Sm = 0.5 * (mean_auc + mean_bacc)

metrics_summary = {
    "Accuracy": mean_acc,
    "AUC": mean_auc,
    "BACC": mean_bacc,
    "Precision": np.nanmean(precisions),
    "Recall (Sensitivity)": np.nanmean(sensitivity_scores),
    "F1": np.nanmean(f1_scores),
    "Specificity": np.nanmean(specificities),
    "PPV": np.nanmean(ppvs),
    "NPV": np.nanmean(npvs),
    "CCR": np.nanmean(ccrs),
    "Sm": Sm
}

print("\n===== Scaffold-CV Results (Physchem SVM) =====")
for k, v in metrics_summary.items():
    print(f"CV {k}: {v:.4f}")

# ================================
# Train final model on the full dataset
# ================================
grid_final = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=kappa_scorer,
    cv=5,
    n_jobs=-1,
    verbose=0
)
grid_final.fit(X_values, y_values)
final_model = grid_final.best_estimator_

# ================================
# Save model & metrics & Sm
# ================================
output_dir = r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model"
os.makedirs(output_dir, exist_ok=True)

model_path = os.path.join(output_dir, "Dermal_svm_rdkitcdk.pkl")
joblib.dump(final_model, model_path, compress=9)
print(f"\nFinal SVM model saved: {model_path}")

metrics_path = os.path.join(output_dir, "Dermal_svm_rdkitcdk_metrics_with_Sm.xlsx")
pd.DataFrame([metrics_summary]).to_excel(metrics_path, index=False)
print(f"CV metrics (incl. Sm) report saved: {metrics_path}")

# Sm for the consensus QSAR step
sm_csv_path = os.path.join(output_dir, "Sm_Physchem_SVM.csv")
pd.DataFrame([{"Descriptor": "Physchem", "Algorithm": "SVM", "Sm": Sm}]).to_csv(sm_csv_path, index=False)
print(f"Sm for Physchem SVM saved at: {sm_csv_path}")
