# 1. Import Module

In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Descriptors import MolLogP
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from rdkit.DataStructs import ExplicitBitVect
import sys
import multiprocessing
from standardiser import break_bonds, neutralise, rules, unsalt
from standardiser.utils import StandardiseException, sanity_check
%reload_ext autoreload
%autoreload 2
def warn(*args, **kwargs):
    """Accept any arguments and do nothing (later installed as ``warnings.warn``)."""
    return None
import warnings
warnings.filterwarnings("ignore")
warnings.warn = warn
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import sys
from sklearn.metrics import cohen_kappa_score
import csv
from rdkit.Chem import MACCSkeys
from sklearn.model_selection import ShuffleSplit
import _pickle as cPickle
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit    
import bz2
from glob import glob
import _pickle as cPickle
import pickle
# Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
# Draw.DrawingOptions.atomLabelFontSize = 18




# LOAD TRAINING DATASET


In [None]:
import pandas as pd
from rdkit import Chem

# Function to read Excel file into DataFrame
def load_excel_to_df(filename):
    """Read the Excel workbook at ``filename`` via ``pd.read_excel`` and return the DataFrame."""
    return pd.read_excel(filename)

# Convert strings back to lists of integers
def string_to_list(bit_string):
    """Convert a stringified integer list such as ``"[1, 0, 1]"`` back to a list.

    Args:
        bit_string: Cell value. Strings are parsed; any other value is
            returned unchanged.

    Returns:
        A list of ints for string input (``[]`` for an empty list string),
        otherwise ``bit_string`` itself.

    Raises:
        ValueError: If a comma-separated token is not an integer literal.
    """
    if not isinstance(bit_string, str):
        return bit_string
    inner = bit_string.strip().strip('[]')
    # Split on bare commas: int() tolerates surrounding whitespace, so both
    # "1, 0" and "1,0" parse, and blank tokens (e.g. from "[]") are skipped.
    return [int(token) for token in inner.split(',') if token.strip()]

# Read the training set from the Excel workbook
train_df = load_excel_to_df(r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Train_set_Dermal_balanced_with_fingerprints_sorted_with_RDKit_and_CDK_features.xlsx")

# Decode every fingerprint column that is present; report the ones that are not
fingerprint_columns = ['Morgan_Descriptors', 'MACCS_Descriptors', 'APF_Descriptors']
for col in fingerprint_columns:
    if col not in train_df.columns:
        print(f"Warning: Column {col} not found in Excel file!")
        continue
    train_df[col] = train_df[col].apply(string_to_list)

# combined_df is another name for the same DataFrame object as train_df
combined_df = train_df

print("Combined DataFrame:")
print(combined_df.head())


In [None]:
# Show the column labels of the loaded DataFrame (cell output).
combined_df.keys()

In [None]:
# NOTE(review): the sorted frame is bound to `combine_df` (no "d"), a name that
# no other visible cell uses, so `combined_df` itself stays unsorted. Rebinding
# `combined_df` here would also detach it from `train_df`; confirm the intended
# behaviour before changing this line.
combine_df= combined_df.sort_values(['Outcome'], ascending=True)
# Keep the current index as an explicit column.
combined_df['RowID'] = combined_df.index
# Preview the first 100 rows (cell output).
combined_df.head(100)

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Encode the 'Outcome' labels as integer class codes
le = LabelEncoder()
outcomes = np.unique(combined_df['Outcome'])
y = le.fit(outcomes).transform(combined_df['Outcome'])

# Results: distinct class codes and the number of compounds in each
classes, counts = np.unique(y, return_counts=True)
n_compounds = len(y)

print("Classes                          : ", classes)
print("Number of cpds in each class     : ", counts)
print("Total number of cpds             : ", n_compounds)

In [None]:
S = pd.Series(le.transform(combined_df['Outcome']))  # same encoded values as y

In [None]:
# Histogram of the encoded classes; bin edges -0.5, 0.5, ..., 4.5 centre each
# bar on an integer code from 0 to 4.
ax = S.hist(bins=np.arange(-0.5,5))
ax.set_xticks(range(0,5))
# info

In [None]:
# Target vector as int32 (S must already be defined)
y = np.int32(S)

# One NumPy array per fingerprint column
x_maccs = np.array(combined_df['MACCS_Descriptors'].tolist())
x_morgan = np.array(combined_df['Morgan_Descriptors'].tolist())
x_apf = np.array(combined_df['APF_Descriptors'].tolist())  # APF added


# MORGAN_RF

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, accuracy_score,
    roc_auc_score, confusion_matrix, precision_score,
    recall_score, f1_score
)
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
import joblib
import os

# =========================================================
# Fungsi: Bemis–Murcko scaffold-based KFold (10-fold)
# =========================================================
def get_bemis_murcko_scaffold(smiles):
    """Return the Bemis-Murcko scaffold of ``smiles`` as a SMILES string.

    Returns ``None`` when RDKit cannot parse ``smiles`` or produces no scaffold.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None
    core = MurckoScaffold.GetScaffoldForMol(molecule)
    if core is None:
        return None
    return Chem.MolToSmiles(core)

def scaffold_kfold_indices(smiles_list, n_splits=10, random_state=42):
    """Build K-fold (train, validation) index pairs that keep scaffolds together.

    Compounds are grouped by Bemis-Murcko scaffold; the distinct scaffolds are
    shuffled with a seeded RNG and partitioned by an unshuffled ``KFold``, so
    all compounds sharing a scaffold fall on the same side of every split.

    Args:
        smiles_list: Iterable of SMILES strings; positions are the returned indices.
        n_splits: Number of folds.
        random_state: Seed for the scaffold shuffle.

    Returns:
        List of ``(train_idx, val_idx)`` tuples of integer NumPy arrays.
    """
    groups = {}
    for position, smi in enumerate(smiles_list):
        groups.setdefault(get_bemis_murcko_scaffold(smi), []).append(position)

    scaffold_order = list(groups)
    np.random.RandomState(random_state).shuffle(scaffold_order)
    scaffold_array = np.array(scaffold_order)

    def gather(scaffold_positions):
        # Concatenate the compound indices of the selected scaffolds, in order.
        members = [idx for i in scaffold_positions for idx in groups[scaffold_array[i]]]
        return np.array(members, dtype=int)

    splitter = KFold(n_splits=n_splits, shuffle=False)
    return [(gather(tr), gather(va)) for tr, va in splitter.split(scaffold_array)]

def compute_bacc_from_preds(y_true, y_pred):
    """Compute balanced accuracy, sensitivity and specificity for 0/1 labels.

    The four counts are taken directly from the label arrays, so a fold whose
    labels and predictions contain a single class still yields all of them
    (a label-inferred confusion matrix collapses to 1x1 in that case and
    cannot be unpacked into four values).

    Args:
        y_true: Ground-truth labels (0 = negative, 1 = positive).
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        Tuple ``(bacc, sens, spec)`` of floats; a rate whose denominator is
        zero is reported as 0.0.

    Raises:
        ValueError: If the inputs differ in shape or hold labels other than 0/1.
    """
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.asarray(y_pred, dtype=int)
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    if not (np.isin(y_true, (0, 1)).all() and np.isin(y_pred, (0, 1)).all()):
        raise ValueError("compute_bacc_from_preds expects binary 0/1 labels")

    actual_pos = y_true == 1
    predicted_pos = y_pred == 1
    tp = int(np.count_nonzero(actual_pos & predicted_pos))
    fn = int(np.count_nonzero(actual_pos & ~predicted_pos))
    fp = int(np.count_nonzero(~actual_pos & predicted_pos))
    tn = int(np.count_nonzero(~actual_pos & ~predicted_pos))

    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    return 0.5 * (sens + spec), sens, spec

# =========================================================
# 1️⃣ Data, features, target (x_morgan, y, SMILES)
# =========================================================
# Assumptions:
#  - x_morgan: np.ndarray (n_samples, n_features)
#  - y: np.ndarray (n_samples,)
#  - smiles_all: list/array of training-set SMILES used for scaffold-CV
# Make sure these are defined beforehand, e.g. from train_df['SMILES'].
# NOTE(review): no visible cell assigns smiles_all; it has to be created
# (in the same row order as x_morgan and y) before this cell runs.

# example:
# smiles_all = train_df['SMILES'].astype(str).values

# =========================================================
# 2️⃣ Hyperparameter grid
# =========================================================
# max_features candidates: the fingerprint length divided by each divisor
n_morgan_bits = x_morgan.shape[1]
paramgrid = {
    "max_features": [n_morgan_bits // divisor for divisor in (1, 2, 4, 12, 10, 7, 5, 3)],
    "n_estimators": [10, 100, 300, 500],
}

# Primary GridSearch score (as before): quadratic-weighted Cohen's kappa
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# =========================================================
# 3️⃣ 10-fold scaffold-CV to obtain Sm
# =========================================================
folds = scaffold_kfold_indices(smiles_all, n_splits=10, random_state=42)

# Per-fold metric accumulators
accuracies = []
auc_scores = []
precisions = []
recalls = []
f1_scores = []
specificities = []
sensitivity_scores = []
ppvs = []
npvs = []
bacc_scores = []
ccrs = []

def calculate_ccr(sensitivity, specificity):
    """Return the correct classification rate: the mean of sensitivity and specificity."""
    return (sensitivity + specificity) / 2.0

for fold_id, (train_idx, val_idx) in enumerate(folds, start=1):
    X_train, X_val = x_morgan[train_idx], x_morgan[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Inner 5-fold grid search on the training part of this outer fold.
    # random_state is fixed so reruns rebuild the same forests.
    grid = GridSearchCV(
        estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,              # inner CV (plain, not scaffold-based)
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(X_val)
    y_proba = best_model.predict_proba(X_val)[:, 1]

    acc = accuracy_score(y_val, y_pred)
    try:
        auc = roc_auc_score(y_val, y_proba)
    except ValueError:
        # AUC could not be computed for this fold (e.g. a single class in
        # y_val); NaN is skipped by np.nanmean below.
        auc = np.nan

    prec = precision_score(y_val, y_pred, zero_division=0)
    f1 = f1_score(y_val, y_pred, zero_division=0)

    # labels=[0, 1] keeps the matrix 2x2 even when this fold's labels and
    # predictions contain one class, so the 4-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=[0, 1]).ravel()
    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)

    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    ccr = calculate_ccr(sens, spec)

    accuracies.append(acc)
    auc_scores.append(auc)
    precisions.append(prec)
    recalls.append(sens)
    f1_scores.append(f1)
    sensitivity_scores.append(sens)
    specificities.append(spec)
    ppvs.append(ppv)
    npvs.append(npv)
    bacc_scores.append(bacc)
    ccrs.append(ccr)

    print(f"Fold {fold_id}: AUC={auc:.4f}, BACC={bacc:.4f}, ACC={acc:.4f}")

# Mean CV metrics (NaN folds ignored)
mean_acc = np.nanmean(accuracies)
mean_auc = np.nanmean(auc_scores)
mean_bacc = np.nanmean(bacc_scores)

Sm = 0.5 * (mean_auc + mean_bacc)    # Sm as defined in the manuscript

cv_metrics = {
    "Accuracy": mean_acc,
    "AUC": mean_auc,
    "BACC": mean_bacc,
    "Precision": np.nanmean(precisions),
    "Recall (Sensitivity)": np.nanmean(sensitivity_scores),
    "F1": np.nanmean(f1_scores),
    "Specificity": np.nanmean(specificities),
    "PPV": np.nanmean(ppvs),
    "NPV": np.nanmean(npvs),
    "CCR": np.nanmean(ccrs),
    "Sm": Sm
}

print("\n=== QSAR 10-fold scaffold-CV (Morgan RF) ===")
for k, v in cv_metrics.items():
    print(f'CV {k}: {v:.4f}')

# =========================================================
# 4️⃣ Train final model on full dataset (as before)
# =========================================================
grid_final = GridSearchCV(
    # Fixed seed so rerunning this cell rebuilds the same persisted forest.
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid=paramgrid,
    scoring=kappa_scorer,
    cv=5,
    n_jobs=-1
)
grid_final.fit(x_morgan, y)
final_model = grid_final.best_estimator_

# =========================================================
# 5️⃣ Save final model + CV metrics (including Sm)
# =========================================================
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
os.makedirs(model_folder, exist_ok=True)

model_path = os.path.join(model_folder, 'Dermal_rf_morgan.pkl')
joblib.dump(final_model, model_path, compress=9)
print("\nFinal Random Forest model saved successfully at:")
print(model_path)

# One-row table holding the mean CV metrics
metrics_df = pd.DataFrame([cv_metrics])
metrics_path = os.path.join(model_folder, 'Dermal_rf_morgan_CV_metrics_with_Sm.xlsx')
metrics_df.to_excel(metrics_path, index=False)
print("CV metrics (incl. Sm) saved successfully at:")
print(metrics_path)

# Optional: store Sm alone as CSV for the consensus QSAR step
sm_csv_path = os.path.join(model_folder, 'Sm_Morgan_RF.csv')
pd.DataFrame([{"Descriptor": "Morgan", "Algorithm": "RF", "Sm": Sm}]).to_csv(sm_csv_path, index=False)
print("Sm saved for consensus QSAR weighting at:")
print(sm_csv_path)


# MACCS_RF

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, accuracy_score,
    roc_auc_score, confusion_matrix, precision_score,
    recall_score, f1_score
)
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import joblib
import os

# =========================================================
# Fungsi: Bemis–Murcko scaffold-based KFold (10-fold)
# =========================================================
def get_bemis_murcko_scaffold(smiles):
    """Return the Bemis-Murcko scaffold of ``smiles`` as a SMILES string.

    Returns ``None`` when RDKit cannot parse ``smiles`` or produces no scaffold.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None
    core = MurckoScaffold.GetScaffoldForMol(molecule)
    if core is None:
        return None
    return Chem.MolToSmiles(core)

def scaffold_kfold_indices(smiles_list, n_splits=10, random_state=42):
    """Build K-fold (train, validation) index pairs that keep scaffolds together.

    Compounds are grouped by Bemis-Murcko scaffold; the distinct scaffolds are
    shuffled with a seeded RNG and partitioned by an unshuffled ``KFold``, so
    all compounds sharing a scaffold fall on the same side of every split.

    Args:
        smiles_list: Iterable of SMILES strings; positions are the returned indices.
        n_splits: Number of folds.
        random_state: Seed for the scaffold shuffle.

    Returns:
        List of ``(train_idx, val_idx)`` tuples of integer NumPy arrays.
    """
    groups = {}
    for position, smi in enumerate(smiles_list):
        groups.setdefault(get_bemis_murcko_scaffold(smi), []).append(position)

    scaffold_order = list(groups)
    np.random.RandomState(random_state).shuffle(scaffold_order)
    scaffold_array = np.array(scaffold_order)

    def gather(scaffold_positions):
        # Concatenate the compound indices of the selected scaffolds, in order.
        members = [idx for i in scaffold_positions for idx in groups[scaffold_array[i]]]
        return np.array(members, dtype=int)

    splitter = KFold(n_splits=n_splits, shuffle=False)
    return [(gather(tr), gather(va)) for tr, va in splitter.split(scaffold_array)]

def compute_bacc_from_preds(y_true, y_pred):
    """Compute balanced accuracy, sensitivity and specificity for 0/1 labels.

    The four counts are taken directly from the label arrays, so a fold whose
    labels and predictions contain a single class still yields all of them
    (a label-inferred confusion matrix collapses to 1x1 in that case and
    cannot be unpacked into four values).

    Args:
        y_true: Ground-truth labels (0 = negative, 1 = positive).
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        Tuple ``(bacc, sens, spec)`` of floats; a rate whose denominator is
        zero is reported as 0.0.

    Raises:
        ValueError: If the inputs differ in shape or hold labels other than 0/1.
    """
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.asarray(y_pred, dtype=int)
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    if not (np.isin(y_true, (0, 1)).all() and np.isin(y_pred, (0, 1)).all()):
        raise ValueError("compute_bacc_from_preds expects binary 0/1 labels")

    actual_pos = y_true == 1
    predicted_pos = y_pred == 1
    tp = int(np.count_nonzero(actual_pos & predicted_pos))
    fn = int(np.count_nonzero(actual_pos & ~predicted_pos))
    fp = int(np.count_nonzero(~actual_pos & predicted_pos))
    tn = int(np.count_nonzero(~actual_pos & ~predicted_pos))

    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)
    return bacc, sens, spec

# =========================================================
# 1️⃣ Data, features, target (x_maccs, y, SMILES)
# =========================================================
# Assumptions:
#   - x_maccs: np.ndarray (n_samples, n_features)
#   - y: np.ndarray (n_samples,)
#   - smiles_all: array/list of training-set SMILES (same order as x_maccs, y)
# NOTE(review): no visible cell assigns smiles_all; define it before running.

# example:
# smiles_all = train_df['SMILES'].astype(str).values

# =========================================================
# 2️⃣ Hyperparameter grid
# =========================================================
# max_features candidates: the fingerprint length divided by each divisor
n_maccs_bits = x_maccs.shape[1]
paramgrid = {
    "max_features": [n_maccs_bits // divisor for divisor in (1, 2, 4, 12, 10, 7, 5, 3)],
    "n_estimators": [10, 100, 300, 500],
}

# GridSearch score: quadratic-weighted Cohen's kappa
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# =========================================================
# 3️⃣ 10-fold scaffold-CV to obtain Sm (MACCS + RF)
# =========================================================
folds = scaffold_kfold_indices(smiles_all, n_splits=10, random_state=42)

# Per-fold metric accumulators
accuracies, auc_scores, precisions, recalls = [], [], [], []
f1_scores, specificities, sensitivity_scores = [], [], []
ppvs, npvs, ccrs, bacc_scores = [], [], [], []

def calculate_ccr(sensitivity, specificity):
    """Return the correct classification rate: the mean of sensitivity and specificity."""
    return (sensitivity + specificity) / 2.0

for fold_id, (train_idx, val_idx) in enumerate(
    tqdm(folds, total=len(folds), desc="Outer scaffold-CV folds (MACCS RF)")
):
    X_train, X_val = x_maccs[train_idx], x_maccs[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Inner 5-fold grid search on the training part of this outer fold.
    # random_state is fixed so reruns rebuild the same forests.
    grid = GridSearchCV(
        estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,          # plain inner CV
        n_jobs=-1,
        verbose=0
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(X_val)
    y_proba = best_model.predict_proba(X_val)[:, 1]

    acc = accuracy_score(y_val, y_pred)
    try:
        auc = roc_auc_score(y_val, y_proba)
    except ValueError:
        # AUC could not be computed for this fold (e.g. a single class in
        # y_val); NaN is skipped by np.nanmean below.
        auc = np.nan

    prec = precision_score(y_val, y_pred, zero_division=0)
    f1 = f1_score(y_val, y_pred, zero_division=0)

    # Build the confusion matrix once per fold. labels=[0, 1] keeps it 2x2
    # even when the fold's labels and predictions contain one class, so every
    # rate below can be derived from the same four counts.
    tn, fp, fn, tp = confusion_matrix(y_val, y_pred, labels=[0, 1]).ravel()
    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)

    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    ccr = calculate_ccr(sens, spec)

    accuracies.append(acc)
    auc_scores.append(auc)
    precisions.append(prec)
    recalls.append(sens)
    f1_scores.append(f1)
    specificities.append(spec)
    sensitivity_scores.append(sens)
    ppvs.append(ppv)
    npvs.append(npv)
    ccrs.append(ccr)
    bacc_scores.append(bacc)

    print(f"Fold {fold_id+1}: AUC={auc:.4f}, BACC={bacc:.4f}, ACC={acc:.4f}")

# Mean CV metrics (NaN folds ignored)
mean_acc = np.nanmean(accuracies)
mean_auc = np.nanmean(auc_scores)
mean_bacc = np.nanmean(bacc_scores)

Sm = 0.5 * (mean_auc + mean_bacc)

cv_metrics = {
    "Accuracy": mean_acc,
    "AUC": mean_auc,
    "BACC": mean_bacc,
    "Precision": np.nanmean(precisions),
    "Recall (Sensitivity)": np.nanmean(sensitivity_scores),
    "F1": np.nanmean(f1_scores),
    "Specificity": np.nanmean(specificities),
    "PPV": np.nanmean(ppvs),
    "NPV": np.nanmean(npvs),
    "CCR": np.nanmean(ccrs),
    "Sm": Sm
}

print("\n=== QSAR 10-fold scaffold-CV (MACCS RF) ===")
for k, v in cv_metrics.items():
    print(f'CV {k}: {v:.4f}')

# =========================================================
# 4️⃣ Train final model on the whole dataset (as before)
# =========================================================
print("\nTraining final MACCS RF model on full dataset...")
grid_final = GridSearchCV(
    # Fixed seed so rerunning this cell rebuilds the same persisted forest.
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid=paramgrid,
    scoring=kappa_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2
)
grid_final.fit(x_maccs, y)
final_model = grid_final.best_estimator_

# =========================================================
# 5️⃣ Save final model + CV metrics + Sm
# =========================================================
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
os.makedirs(model_folder, exist_ok=True)

model_path = os.path.join(model_folder, 'Dermal_rf_macckeys.pkl')
joblib.dump(final_model, model_path, compress=9)
print("Final Random Forest model saved successfully at:")
print(model_path)

# One-row table holding the mean CV metrics
metrics_df = pd.DataFrame([cv_metrics])
metrics_path = os.path.join(model_folder, 'Dermal_rf_macckeys_CV_metrics_with_Sm.xlsx')
metrics_df.to_excel(metrics_path, index=False)
print(f"CV metrics (incl. Sm) saved successfully at: {metrics_path}")

# Sm stored separately for the consensus QSAR step
sm_csv_path = os.path.join(model_folder, 'Sm_MACCS_RF.csv')
pd.DataFrame([{"Descriptor": "MACCS", "Algorithm": "RF", "Sm": Sm}]).to_csv(sm_csv_path, index=False)
print("Sm saved for consensus QSAR weighting at:")
print(sm_csv_path)


# APF_RF

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, accuracy_score,
    roc_auc_score, confusion_matrix, precision_score,
    recall_score, f1_score
)
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import joblib
import os

# =========================================================
# Fungsi: Bemis–Murcko scaffold-based KFold (10-fold)
# =========================================================
def get_bemis_murcko_scaffold(smiles):
    """Return the Bemis-Murcko scaffold of ``smiles`` as a SMILES string.

    Returns ``None`` when RDKit cannot parse ``smiles`` or produces no scaffold.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None
    core = MurckoScaffold.GetScaffoldForMol(molecule)
    if core is None:
        return None
    return Chem.MolToSmiles(core)

def scaffold_kfold_indices(smiles_list, n_splits=10, random_state=42):
    """Build K-fold (train, validation) index pairs that keep scaffolds together.

    Compounds are grouped by Bemis-Murcko scaffold; the distinct scaffolds are
    shuffled with a seeded RNG and partitioned by an unshuffled ``KFold``, so
    all compounds sharing a scaffold fall on the same side of every split.

    Args:
        smiles_list: Iterable of SMILES strings; positions are the returned indices.
        n_splits: Number of folds.
        random_state: Seed for the scaffold shuffle.

    Returns:
        List of ``(train_idx, val_idx)`` tuples of integer NumPy arrays.
    """
    groups = {}
    for position, smi in enumerate(smiles_list):
        groups.setdefault(get_bemis_murcko_scaffold(smi), []).append(position)

    scaffold_order = list(groups)
    np.random.RandomState(random_state).shuffle(scaffold_order)
    scaffold_array = np.array(scaffold_order)

    def gather(scaffold_positions):
        # Concatenate the compound indices of the selected scaffolds, in order.
        members = [idx for i in scaffold_positions for idx in groups[scaffold_array[i]]]
        return np.array(members, dtype=int)

    splitter = KFold(n_splits=n_splits, shuffle=False)
    return [(gather(tr), gather(va)) for tr, va in splitter.split(scaffold_array)]

def compute_bacc_from_preds(y_true, y_pred):
    """Compute balanced accuracy, sensitivity and specificity for 0/1 labels.

    The four counts are taken directly from the label arrays, so a fold whose
    labels and predictions contain a single class still yields all of them
    (a label-inferred confusion matrix collapses to 1x1 in that case and
    cannot be unpacked into four values).

    Args:
        y_true: Ground-truth labels (0 = negative, 1 = positive).
        y_pred: Predicted labels, same length as ``y_true``.

    Returns:
        Tuple ``(bacc, sens, spec)`` of floats; a rate whose denominator is
        zero is reported as 0.0.

    Raises:
        ValueError: If the inputs differ in shape or hold labels other than 0/1.
    """
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.asarray(y_pred, dtype=int)
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must have the same shape")
    if not (np.isin(y_true, (0, 1)).all() and np.isin(y_pred, (0, 1)).all()):
        raise ValueError("compute_bacc_from_preds expects binary 0/1 labels")

    actual_pos = y_true == 1
    predicted_pos = y_pred == 1
    tp = int(np.count_nonzero(actual_pos & predicted_pos))
    fn = int(np.count_nonzero(actual_pos & ~predicted_pos))
    fp = int(np.count_nonzero(~actual_pos & predicted_pos))
    tn = int(np.count_nonzero(~actual_pos & ~predicted_pos))

    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)
    return bacc, sens, spec

# =========================================================
# 1️⃣ Data, features, target (x_apf, y, SMILES)
# =========================================================
# Assumptions:
#   - x_apf: np.ndarray (n_samples, n_features)
#   - y: np.ndarray (n_samples,)
#   - smiles_all: array/list of training-set SMILES, same order as x_apf & y
# NOTE(review): no visible cell assigns smiles_all; define it before running.
# example:
# smiles_all = train_df['SMILES'].astype(str).values

# =========================================================
# 2️⃣ Hyperparameter grid
# =========================================================
# max_features candidates: the fingerprint length divided by each divisor
n_apf_bits = x_apf.shape[1]
paramgrid = {
    "max_features": [n_apf_bits // divisor for divisor in (1, 2, 4, 12, 10, 7, 5, 3)],
    "n_estimators": [10, 100, 300, 500],
}

# GridSearch score: quadratic-weighted Cohen's kappa
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# =========================================================
# 3️⃣ 10-fold scaffold-CV to obtain Sm (APF + RF)
# =========================================================
folds = scaffold_kfold_indices(smiles_all, n_splits=10, random_state=42)

# Per-fold metric accumulators
accuracies, auc_scores, precisions, recalls = [], [], [], []
f1_scores, specificities, sensitivity_scores = [], [], []
ppvs, npvs, ccrs, bacc_scores = [], [], [], []

def calculate_ccr(sensitivity, specificity):
    """Return the correct classification rate: the mean of sensitivity and specificity."""
    return (sensitivity + specificity) / 2.0

for fold_id, (train_idx, val_idx) in enumerate(
    tqdm(folds, total=len(folds), desc="Outer scaffold-CV folds (APF RF)")
):
    X_train, X_val = x_apf[train_idx], x_apf[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Inner 5-fold grid search on the training part of this outer fold.
    # random_state is fixed so reruns rebuild the same forests.
    grid = GridSearchCV(
        estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,          # plain inner CV
        n_jobs=-1,
        verbose=0
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(X_val)
    y_proba = best_model.predict_proba(X_val)[:, 1]

    acc = accuracy_score(y_val, y_pred)
    try:
        auc = roc_auc_score(y_val, y_proba)
    except ValueError:
        # AUC could not be computed for this fold (e.g. a single class in
        # y_val); NaN is skipped by np.nanmean below.
        auc = np.nan

    prec = precision_score(y_val, y_pred, zero_division=0)
    f1 = f1_score(y_val, y_pred, zero_division=0)

    # labels=[0, 1] keeps the matrix 2x2 even when the fold's labels and
    # predictions contain one class, so cm[1, 1] etc. always exist and every
    # rate below is derived from the same four counts.
    cm = confusion_matrix(y_val, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    ccr = calculate_ccr(sens, spec)

    accuracies.append(acc)
    auc_scores.append(auc)
    precisions.append(prec)
    recalls.append(sens)
    f1_scores.append(f1)
    specificities.append(spec)
    sensitivity_scores.append(sens)
    ppvs.append(ppv)
    npvs.append(npv)
    ccrs.append(ccr)
    bacc_scores.append(bacc)

    print(f"Fold {fold_id+1}: AUC={auc:.4f}, BACC={bacc:.4f}, ACC={acc:.4f}")

# Mean CV metrics (NaN folds ignored)
mean_acc = np.nanmean(accuracies)
mean_auc = np.nanmean(auc_scores)
mean_bacc = np.nanmean(bacc_scores)

Sm = 0.5 * (mean_auc + mean_bacc)

cv_metrics = {
    "Accuracy": mean_acc,
    "AUC": mean_auc,
    "BACC": mean_bacc,
    "Precision": np.nanmean(precisions),
    "Recall (Sensitivity)": np.nanmean(sensitivity_scores),
    "F1": np.nanmean(f1_scores),
    "Specificity": np.nanmean(specificities),
    "PPV": np.nanmean(ppvs),
    "NPV": np.nanmean(npvs),
    "CCR": np.nanmean(ccrs),
    "Sm": Sm
}

print("\n=== QSAR 10-fold scaffold-CV (APF RF) ===")
for k, v in cv_metrics.items():
    print(f'CV {k}: {v:.4f}')

# =========================================================
# 4️⃣ Train final model on the whole dataset
# =========================================================
print("\nTraining final APF RF model on full dataset...")
grid_final = GridSearchCV(
    # Fixed seed so rerunning this cell rebuilds the same persisted forest.
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid=paramgrid,
    scoring=kappa_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2
)
grid_final.fit(x_apf, y)
final_model = grid_final.best_estimator_

# =========================================================
# 5️⃣ Save final model + CV metrics + Sm
# =========================================================
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
os.makedirs(model_folder, exist_ok=True)

model_path = os.path.join(model_folder, 'Dermal_rf_apf.pkl')
joblib.dump(final_model, model_path, compress=9)
print("Final Random Forest model saved successfully at:")
print(model_path)

# One-row table holding the mean CV metrics
metrics_df = pd.DataFrame([cv_metrics])
metrics_path = os.path.join(model_folder, 'Dermal_rf_apf_CV_metrics_with_Sm.xlsx')
metrics_df.to_excel(metrics_path, index=False)
print(f"CV metrics (incl. Sm) saved successfully at: {metrics_path}")

# Sm stored separately for the consensus QSAR step
sm_csv_path = os.path.join(model_folder, 'Sm_APF_RF.csv')
pd.DataFrame([{"Descriptor": "APF", "Algorithm": "RF", "Sm": Sm}]).to_csv(sm_csv_path, index=False)
print("Sm saved for consensus QSAR weighting at:")
print(sm_csv_path)


# MORGAN_XGBOOST

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm
import os

# ==========================
# Features and target
# ==========================
# x_morgan and y are defined in earlier cells
# (use y = np.int32(S) if y needs to be rebuilt)

# ==========================
# Hyperparameter grid for XGBoost
# ==========================
paramgrid = dict(
    max_depth=[3, 5, 7, 10],
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
)

# ==========================
# Scoring function: quadratic-weighted Cohen's kappa
# ==========================
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# ==========================
# 10-fold CV
# ==========================
# NOTE(review): this cell uses a stratified random split and reports no
# BACC/Sm, unlike the scaffold-CV cells for the other models in this file;
# confirm whether that difference is intended.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold metric accumulators
accuracies, auc_scores, precisions, recalls, f1_scores = [], [], [], [], []
specificities, sensitivity_scores, ppvs, npvs, ccrs = [], [], [], [], []
confusion_matrices = []

def calculate_ccr(sensitivity, specificity):
    """Return the correct classification rate: the mean of sensitivity and specificity."""
    return (sensitivity + specificity) / 2

# ==========================
# Outer CV loop with a progress bar
# ==========================
print("Starting 10-fold cross-validation...")

for fold, (train_idx, test_idx) in enumerate(
    # total is read from the splitter so the bar stays right if n_splits changes
    tqdm(cv.split(x_morgan, y), total=cv.get_n_splits(), desc="CV Folds")
):
    X_train, X_test = x_morgan[train_idx], x_morgan[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Inner 5-fold grid search on the training part of this outer fold
    grid = GridSearchCV(
        estimator=XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss'),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        verbose=1,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:,1]

    # Metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_proba))
    precisions.append(precision_score(y_test, y_pred, zero_division=0))
    recalls.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

    # labels=[0, 1] keeps the matrix 2x2 so the 4-way unpacking cannot fail
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    confusion_matrices.append(cm)

    tn, fp, fn, tp = cm.ravel()
    # Same zero-denominator guards as the other CV cells in this file
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    sensitivity_scores.append(sensitivity)
    specificities.append(specificity)

    ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
    npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)
    ccrs.append(calculate_ccr(sensitivity, specificity))

    print(f"Fold {fold+1} done. Best params: {grid.best_params_}")

# ==========================
# Report metrics
# ==========================
cv_metrics = {
    "Accuracy": np.mean(accuracies),
    "AUC": np.mean(auc_scores),
    "Precision": np.mean(precisions),
    "Recall (Sensitivity)": np.mean(sensitivity_scores),
    "F1": np.mean(f1_scores),
    "Specificity": np.mean(specificities),
    "PPV": np.mean(ppvs),
    "NPV": np.mean(npvs),
    "CCR": np.mean(ccrs)
}

print("\n=== 10-Fold CV Metrics ===")
for k, v in cv_metrics.items():
    print(f'{k}: {v:.4f}')

# ==========================
# Fit the final model on the whole dataset
# ==========================
print("\nTraining final XGBoost model on full dataset with GridSearchCV...")
grid_final = GridSearchCV(
    XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss'),
    paramgrid,
    scoring=kappa_scorer,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
# fit() returns the fitted search object, so the best estimator can be read off directly
final_model = grid_final.fit(x_morgan, y).best_estimator_

# ==========================
# Save final model & metrics
# ==========================
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
os.makedirs(model_folder, exist_ok=True)
model_path = os.path.join(model_folder, 'Dermal_xgb_morgan.pkl')
metrics_path = os.path.join(model_folder, 'Dermal_xgb_morgan_CV_metrics.xlsx')

# Persist the model
joblib.dump(final_model, model_path, compress=9)
print("Final XGBoost model saved successfully!")

# Persist the mean CV metrics as a one-row table
metrics_df = pd.DataFrame([cv_metrics])
metrics_df.to_excel(metrics_path, index=False)
print(f"CV metrics report saved successfully at: {metrics_path}")


# MACCS_XGBOOST

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, accuracy_score,
    roc_auc_score, confusion_matrix, precision_score,
    recall_score, f1_score
)
from sklearn.model_selection import GridSearchCV, KFold
from xgboost import XGBClassifier
from tqdm import tqdm
import joblib
import os

# =========================================================
# Fungsi: Bemis–Murcko scaffold-based KFold (10-fold)
# =========================================================
def get_bemis_murcko_scaffold(smiles):
    """Return the Bemis-Murcko scaffold of ``smiles`` as a SMILES string.

    Returns ``None`` when RDKit cannot parse ``smiles`` or produces no scaffold.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        return None
    core = MurckoScaffold.GetScaffoldForMol(molecule)
    if core is None:
        return None
    return Chem.MolToSmiles(core)

def scaffold_kfold_indices(smiles_list, n_splits=10, random_state=42):
    """Build K-fold (train, validation) index pairs that keep scaffolds together.

    Compounds are grouped by Bemis-Murcko scaffold; the distinct scaffolds are
    shuffled with a seeded RNG and partitioned by an unshuffled ``KFold``, so
    all compounds sharing a scaffold fall on the same side of every split.

    Args:
        smiles_list: Iterable of SMILES strings; positions are the returned indices.
        n_splits: Number of folds.
        random_state: Seed for the scaffold shuffle.

    Returns:
        List of ``(train_idx, val_idx)`` tuples of integer NumPy arrays.
    """
    groups = {}
    for position, smi in enumerate(smiles_list):
        groups.setdefault(get_bemis_murcko_scaffold(smi), []).append(position)

    scaffold_order = list(groups)
    np.random.RandomState(random_state).shuffle(scaffold_order)
    scaffold_array = np.array(scaffold_order)

    def gather(scaffold_positions):
        # Concatenate the compound indices of the selected scaffolds, in order.
        members = [idx for i in scaffold_positions for idx in groups[scaffold_array[i]]]
        return np.array(members, dtype=int)

    splitter = KFold(n_splits=n_splits, shuffle=False)
    return [(gather(tr), gather(va)) for tr, va in splitter.split(scaffold_array)]

def compute_bacc_from_preds(y_true, y_pred):
    """Compute balanced accuracy, sensitivity and specificity for 0/1 labels.

    Counts are taken against the fixed label set {0, 1} (equivalent to
    ``confusion_matrix(y_true, y_pred, labels=[0, 1])``). This keeps the
    function working when a fold and its predictions contain only one class,
    where an unlabelled confusion matrix collapses to 1x1 and cannot be
    unpacked into four counts.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; cast to int, 1 is treated as the positive class.
    y_pred : array-like
        Predicted labels; cast to int.

    Returns
    -------
    tuple of float
        ``(bacc, sens, spec)``. A rate whose denominator is zero (class
        absent from ``y_true``) is reported as 0.0.
    """
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.asarray(y_pred, dtype=int)

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    tp = int(np.count_nonzero(actual_pos & (y_pred == 1)))
    fn = int(np.count_nonzero(actual_pos & (y_pred == 0)))
    tn = int(np.count_nonzero(actual_neg & (y_pred == 0)))
    fp = int(np.count_nonzero(actual_neg & (y_pred == 1)))

    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)
    return bacc, sens, spec

# =========================================================
# 1) Data, features, target (x_maccs, y, SMILES)
# =========================================================
# Expected to be defined by earlier cells:
#   - x_maccs: np.ndarray (n_samples, n_features)
#   - y: np.ndarray (n_samples,)
#   - smiles_all: array/list of training-set SMILES (same row order)
# for example:
# smiles_all = train_df['SMILES'].astype(str).values

# =========================================================
# 2) XGBoost hyperparameter grid
# =========================================================
paramgrid = dict(
    max_depth=[3, 5, 7, 10],
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
)

# Quadratic-weighted Cohen's kappa is the model-selection score.
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# =========================================================
# 3) 10-fold scaffold CV used to obtain Sm (MACCS + XGB)
# =========================================================
folds = scaffold_kfold_indices(smiles_all, n_splits=10, random_state=42)

# Per-fold metric accumulators; the CV loop appends one value per fold.
accuracies = []
auc_scores = []
precisions = []
recalls = []
f1_scores = []
specificities = []
sensitivity_scores = []
ppvs = []
npvs = []
ccrs = []
bacc_scores = []

def calculate_ccr(sensitivity, specificity):
    """Return the correct classification rate: the mean of sensitivity and specificity."""
    combined = sensitivity + specificity
    return combined / 2.0

print("Starting 10-fold scaffold-based CV for MACCS + XGBoost...")

# Outer loop: each scaffold-based fold is held out once for validation while
# the hyperparameters are tuned by an inner 5-fold grid search on the rest.
for fold_id, (train_idx, val_idx) in enumerate(
    tqdm(folds, total=len(folds), desc="Outer scaffold-CV folds (MACCS XGB)")
):
    X_train, X_val = x_maccs[train_idx], x_maccs[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    grid = GridSearchCV(
        estimator=XGBClassifier(
            objective='binary:logistic',
            use_label_encoder=False,
            eval_metric='logloss'
        ),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        verbose=1,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(X_val)
    # Second probability column; presumably the positive class (label 1).
    y_proba = best_model.predict_proba(X_val)[:, 1]

    acc = accuracy_score(y_val, y_pred)
    try:
        auc = roc_auc_score(y_val, y_proba)
    except ValueError:
        # AUC is undefined for this fold (e.g. a single class in y_val);
        # NaN keeps it out of the nanmean aggregation below.
        auc = np.nan

    prec = precision_score(y_val, y_pred, zero_division=0)
    # zero_division=0 matches precision_score above: a fold without positive
    # predictions scores 0 instead of emitting an undefined-metric warning.
    f1 = f1_score(y_val, y_pred, zero_division=0)
    bacc, sens, spec = compute_bacc_from_preds(y_val, y_pred)

    # labels=[0, 1] pins the matrix to 2x2; without it a fold whose labels and
    # predictions hold a single class gives a 1x1 matrix and cm[1, 1] raises
    # IndexError. Assumes binary labels encoded as 0/1.
    cm = confusion_matrix(y_val, y_pred, labels=[0, 1])
    tp = cm[1, 1]
    fp = cm[0, 1]
    tn = cm[0, 0]
    fn = cm[1, 0]
    ppv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
    ccr = calculate_ccr(sens, spec)

    accuracies.append(acc)
    auc_scores.append(auc)
    precisions.append(prec)
    recalls.append(sens)
    f1_scores.append(f1)
    specificities.append(spec)
    sensitivity_scores.append(sens)
    ppvs.append(ppv)
    npvs.append(npv)
    ccrs.append(ccr)
    bacc_scores.append(bacc)

    print(f"Fold {fold_id+1}: AUC={auc:.4f}, BACC={bacc:.4f}, ACC={acc:.4f}, Best params: {grid.best_params_}")

# Aggregate over folds; nanmean skips folds whose AUC could not be computed.
mean_acc = np.nanmean(accuracies)
mean_auc = np.nanmean(auc_scores)
mean_bacc = np.nanmean(bacc_scores)

# Sm: average of the mean AUC and the mean balanced accuracy.
Sm = 0.5 * (mean_auc + mean_bacc)

cv_metrics = {
    "Accuracy": mean_acc,
    "AUC": mean_auc,
    "BACC": mean_bacc,
    "Precision": np.nanmean(precisions),
    "Recall (Sensitivity)": np.nanmean(sensitivity_scores),
    "F1": np.nanmean(f1_scores),
    "Specificity": np.nanmean(specificities),
    "PPV": np.nanmean(ppvs),
    "NPV": np.nanmean(npvs),
    "CCR": np.nanmean(ccrs),
    "Sm": Sm
}

print("\n=== 10-Fold scaffold-CV Metrics for MACCS + XGBoost ===")
for k, v in cv_metrics.items():
    print(f'{k}: {v:.4f}')

# =========================================================
# 4) Fit the final model on the full dataset
# =========================================================
print("\nTraining final XGBoost model on full dataset (MACCS features)...")
# Same grid search as in the CV loop, now fitted on every training sample.
grid_final = GridSearchCV(
    XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        eval_metric='logloss',
    ),
    paramgrid,
    scoring=kappa_scorer,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
final_model = grid_final.fit(x_maccs, y).best_estimator_

# =========================================================
# 5) Save final model & metrics (incl. Sm)
# =========================================================
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
os.makedirs(model_folder, exist_ok=True)

# Serialized best estimator.
model_path = os.path.join(model_folder, 'Dermal_xgb_maccs.pkl')
joblib.dump(final_model, model_path, compress=9)
print("Final XGBoost model saved successfully at:")
print(model_path)

# One-row table holding the aggregated CV metrics.
metrics_path = os.path.join(model_folder, 'Dermal_xgb_maccs_CV_metrics_with_Sm.xlsx')
metrics_df = pd.DataFrame([cv_metrics])
metrics_df.to_excel(metrics_path, index=False)
print(f"CV metrics (incl. Sm) saved successfully at: {metrics_path}")

# Sm for the consensus QSAR
sm_csv_path = os.path.join(model_folder, 'Sm_MACCS_XGB.csv')
sm_row = {"Descriptor": "MACCS", "Algorithm": "XGB", "Sm": Sm}
pd.DataFrame([sm_row]).to_csv(sm_csv_path, index=False)
print("Sm saved for consensus QSAR weighting at:")
print(sm_csv_path)


# APF_XGBOOST

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm
import os

# ==========================
# Features and target
# ==========================
# x_apf and y are expected to be defined by an earlier cell.

# ==========================
# Hyperparameter grid for XGBoost
# ==========================
paramgrid = dict(
    max_depth=[3, 5, 7, 10],
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
)

# ==========================
# Scoring function
# ==========================
# Quadratic-weighted Cohen's kappa is the model-selection score.
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# ==========================
# 10-fold CV
# ==========================
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold metric accumulators; the CV loop appends one value per fold.
accuracies = []
auc_scores = []
precisions = []
recalls = []
f1_scores = []
specificities = []
sensitivity_scores = []
ppvs = []
npvs = []
ccrs = []
confusion_matrices = []

def calculate_ccr(sensitivity, specificity):
    """Return the correct classification rate: the mean of sensitivity and specificity."""
    combined = sensitivity + specificity
    return combined / 2

# ==========================
# Outer CV loop with progress bar
# ==========================
print("Starting 10-fold cross-validation for APF features...")

# Each stratified fold is held out once for testing while the hyperparameters
# are tuned by an inner 5-fold grid search on the remaining folds.
for fold, (train_idx, test_idx) in enumerate(
    # Take the fold count from the splitter so the progress bar cannot drift
    # from the configured n_splits.
    tqdm(cv.split(x_apf, y), total=cv.get_n_splits(), desc="CV Folds")
):
    X_train, X_test = x_apf[train_idx], x_apf[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    grid = GridSearchCV(
        estimator=XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss'),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        verbose=1,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(X_test)
    # Second probability column; presumably the positive class (label 1).
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    try:
        fold_auc = roc_auc_score(y_test, y_proba)
    except ValueError:
        # AUC is undefined for this fold (e.g. a single class in y_test);
        # NaN keeps it out of the nanmean aggregation below.
        fold_auc = np.nan
    auc_scores.append(fold_auc)
    precisions.append(precision_score(y_test, y_pred, zero_division=0))
    recalls.append(recall_score(y_test, y_pred, zero_division=0))
    f1_scores.append(f1_score(y_test, y_pred, zero_division=0))

    # labels=[0, 1] pins the matrix to 2x2 so the 4-way unpack below cannot
    # fail when a fold and its predictions contain a single class.
    # Assumes binary labels encoded as 0/1.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    confusion_matrices.append(cm)

    tn, fp, fn, tp = cm.ravel()
    # Guard every ratio against an empty denominator, as PPV/NPV already do.
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    sensitivity_scores.append(sensitivity)
    specificities.append(specificity)

    ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
    npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)
    ccrs.append(calculate_ccr(sensitivity, specificity))

    print(f"Fold {fold+1} done. Best params: {grid.best_params_}")

# ==========================
# Report metrics
# ==========================
# nanmean equals mean when no fold produced NaN and skips undefined AUC folds.
cv_metrics = {
    "Accuracy": np.nanmean(accuracies),
    "AUC": np.nanmean(auc_scores),
    "Precision": np.nanmean(precisions),
    "Recall (Sensitivity)": np.nanmean(sensitivity_scores),
    "F1": np.nanmean(f1_scores),
    "Specificity": np.nanmean(specificities),
    "PPV": np.nanmean(ppvs),
    "NPV": np.nanmean(npvs),
    "CCR": np.nanmean(ccrs)
}

print("\n=== 10-Fold CV Metrics for APF ===")
for k, v in cv_metrics.items():
    print(f'{k}: {v:.4f}')

# ==========================
# Fit the final model on the full dataset
# ==========================
print("\nTraining final XGBoost model on full dataset (APF features)...")
# Same grid search as in the CV loop, now fitted on every training sample.
grid_final = GridSearchCV(
    XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        eval_metric='logloss',
    ),
    paramgrid,
    scoring=kappa_scorer,
    cv=5,
    verbose=1,
    n_jobs=-1,
)
final_model = grid_final.fit(x_apf, y).best_estimator_

# ==========================
# Save final model & metrics
# ==========================
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
os.makedirs(model_folder, exist_ok=True)

# Serialized best estimator.
model_path = os.path.join(model_folder, 'Dermal_xgb_apf.pkl')
joblib.dump(final_model, model_path, compress=9)
print("Final XGBoost model saved successfully!")

# One-row table holding the aggregated CV metrics.
metrics_path = os.path.join(model_folder, 'Dermal_xgb_apf_CV_metrics.xlsx')
metrics_df = pd.DataFrame([cv_metrics])
metrics_df.to_excel(metrics_path, index=False)
print(f"CV metrics report saved successfully at: {metrics_path}")


# MORGAN_MACCS_APF_SVM

In [None]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, accuracy_score,
    roc_auc_score, confusion_matrix, precision_score,
    recall_score, f1_score
)
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.svm import SVC
from tqdm import tqdm
import joblib
import os

# =========================================================
# Functions: Bemis–Murcko scaffold-based KFold (10-fold)
# =========================================================
def get_bemis_murcko_scaffold(smiles):
    """Return the Bemis-Murcko scaffold of a molecule as a SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    str or None
        SMILES of the scaffold returned by
        ``MurckoScaffold.GetScaffoldForMol``; ``None`` when RDKit cannot
        parse ``smiles`` or when no scaffold object is returned.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        core = MurckoScaffold.GetScaffoldForMol(parsed)
        if core is not None:
            return Chem.MolToSmiles(core)
    return None

def scaffold_kfold_indices(smiles_list, n_splits=10, random_state=42):
    """Build K-fold (train, validation) index pairs grouped by scaffold.

    Samples sharing the same Bemis-Murcko scaffold key (including the
    ``None`` key used for unparsable SMILES) are kept together, so a
    scaffold never appears on both sides of a split.

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings; positions in this iterable are the returned indices.
    n_splits : int, default 10
        Number of folds, applied to the list of unique scaffolds.
    random_state : int, default 42
        Seed used to shuffle the unique scaffolds before splitting.

    Returns
    -------
    list of (np.ndarray, np.ndarray)
        One ``(train_idx, val_idx)`` pair of integer arrays per fold.
    """
    # Group sample positions by scaffold key, in order of first appearance.
    members_by_scaffold = {}
    for position, smi in enumerate(smiles_list):
        key = get_bemis_murcko_scaffold(smi)
        if key not in members_by_scaffold:
            members_by_scaffold[key] = []
        members_by_scaffold[key].append(position)

    # Shuffle the unique scaffolds reproducibly; the folds are cut on them.
    scaffold_keys = list(members_by_scaffold)
    np.random.RandomState(random_state).shuffle(scaffold_keys)
    scaffold_array = np.array(scaffold_keys)

    def gather(scaffold_positions):
        # Expand scaffold positions back into the sample indices they hold.
        collected = [
            sample
            for pos in scaffold_positions
            for sample in members_by_scaffold[scaffold_array[pos]]
        ]
        return np.array(collected, dtype=int)

    # shuffle=False: the scaffolds were already shuffled above.
    splitter = KFold(n_splits=n_splits, shuffle=False)
    return [
        (gather(train_part), gather(val_part))
        for train_part, val_part in splitter.split(scaffold_array)
    ]

def compute_bacc_from_preds(y_true, y_pred):
    """Compute balanced accuracy, sensitivity and specificity for 0/1 labels.

    Counts are taken against the fixed label set {0, 1} (equivalent to
    ``confusion_matrix(y_true, y_pred, labels=[0, 1])``). This keeps the
    function working when a fold and its predictions contain only one class,
    where an unlabelled confusion matrix collapses to 1x1 and cannot be
    unpacked into four counts.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels; cast to int, 1 is treated as the positive class.
    y_pred : array-like
        Predicted labels; cast to int.

    Returns
    -------
    tuple of float
        ``(bacc, sens, spec)``. A rate whose denominator is zero (class
        absent from ``y_true``) is reported as 0.0.
    """
    y_true = np.asarray(y_true, dtype=int)
    y_pred = np.asarray(y_pred, dtype=int)

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    tp = int(np.count_nonzero(actual_pos & (y_pred == 1)))
    fn = int(np.count_nonzero(actual_pos & (y_pred == 0)))
    tn = int(np.count_nonzero(actual_neg & (y_pred == 0)))
    fp = int(np.count_nonzero(actual_neg & (y_pred == 1)))

    sens = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    spec = tn / (tn + fp) if (tn + fp) > 0 else 0.0
    bacc = 0.5 * (sens + spec)
    return bacc, sens, spec

def calculate_ccr(sensitivity, specificity):
    """Return the correct classification rate: the mean of sensitivity and specificity."""
    combined = sensitivity + specificity
    return combined / 2.0

# =========================================================
# Features, target, SMILES
# =========================================================
# x_morgan, x_maccs, x_apf, y and smiles_all must already be defined.
# The loop below processes the feature sets in this insertion order.
feature_sets = dict(
    Morgan=x_morgan,
    MACCS=x_maccs,
    APF=x_apf,
)
# for example:
# smiles_all = train_df['SMILES'].astype(str).values

# =========================================================
# SVM hyperparameter grid
# =========================================================
paramgrid = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
)

# Quadratic-weighted Cohen's kappa is the model-selection score.
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# =========================================================
# Output folder
# =========================================================
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
os.makedirs(model_folder, exist_ok=True)

# =========================================================
# Loop over each feature set
# =========================================================
# The scaffold folds depend only on smiles_all and the fixed seed, so they are
# computed once and shared by every feature set instead of being rebuilt
# (and every SMILES re-parsed) on each iteration.
folds = scaffold_kfold_indices(smiles_all, n_splits=10, random_state=42)

for name, X in feature_sets.items():
    print(f"\n===== Processing {name} features (SVM) =====")

    # Fresh per-fold metric accumulators for this feature set.
    accuracies, auc_scores, precisions, recalls = [], [], [], []
    f1_scores, specificities, sensitivity_scores = [], [], []
    ppvs, npvs, ccrs, bacc_scores = [], [], [], []

    # Outer loop: each scaffold-based fold is held out once for validation
    # while the hyperparameters are tuned by an inner 5-fold grid search.
    for fold_id, (train_idx, val_idx) in enumerate(
        tqdm(folds, total=len(folds), desc=f"{name} scaffold-CV folds (SVM)")
    ):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        grid = GridSearchCV(
            # probability=True is required for predict_proba below.
            estimator=SVC(probability=True),
            param_grid=paramgrid,
            scoring=kappa_scorer,
            cv=5,
            verbose=1,
            n_jobs=-1
        )
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_

        y_pred = best_model.predict(X_val)
        # Second probability column; presumably the positive class (label 1).
        y_proba = best_model.predict_proba(X_val)[:, 1]

        acc = accuracy_score(y_val, y_pred)
        try:
            auc = roc_auc_score(y_val, y_proba)
        except ValueError:
            # AUC is undefined for this fold (e.g. a single class in y_val);
            # NaN keeps it out of the nanmean aggregation below.
            auc = np.nan

        prec = precision_score(y_val, y_pred, zero_division=0)
        # zero_division=0 matches precision_score above: a fold without
        # positive predictions scores 0 instead of emitting a warning.
        f1 = f1_score(y_val, y_pred, zero_division=0)
        bacc, sens, spec = compute_bacc_from_preds(y_val, y_pred)

        # labels=[0, 1] pins the matrix to 2x2; without it a fold whose labels
        # and predictions hold a single class gives a 1x1 matrix and cm[1, 1]
        # raises IndexError. Assumes binary labels encoded as 0/1.
        cm = confusion_matrix(y_val, y_pred, labels=[0, 1])
        tp = cm[1, 1]
        fp = cm[0, 1]
        tn = cm[0, 0]
        fn = cm[1, 0]
        ppv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
        ccr = calculate_ccr(sens, spec)

        accuracies.append(acc)
        auc_scores.append(auc)
        precisions.append(prec)
        recalls.append(sens)
        f1_scores.append(f1)
        specificities.append(spec)
        sensitivity_scores.append(sens)
        ppvs.append(ppv)
        npvs.append(npv)
        ccrs.append(ccr)
        bacc_scores.append(bacc)

        print(f"Fold {fold_id+1} ({name}): AUC={auc:.4f}, BACC={bacc:.4f}, ACC={acc:.4f}, Best params: {grid.best_params_}")

    # Aggregate over folds; nanmean skips folds whose AUC was undefined.
    mean_acc = np.nanmean(accuracies)
    mean_auc = np.nanmean(auc_scores)
    mean_bacc = np.nanmean(bacc_scores)
    # Sm: average of the mean AUC and the mean balanced accuracy.
    Sm = 0.5 * (mean_auc + mean_bacc)

    cv_metrics = {
        "Accuracy": mean_acc,
        "AUC": mean_auc,
        "BACC": mean_bacc,
        "Precision": np.nanmean(precisions),
        "Recall (Sensitivity)": np.nanmean(sensitivity_scores),
        "F1": np.nanmean(f1_scores),
        "Specificity": np.nanmean(specificities),
        "PPV": np.nanmean(ppvs),
        "NPV": np.nanmean(npvs),
        "CCR": np.nanmean(ccrs),
        "Sm": Sm
    }

    print(f"\n=== 10-Fold scaffold-CV Metrics for {name} (SVM) ===")
    for k, v in cv_metrics.items():
        print(f'{k}: {v:.4f}')

    # ==========================
    # Train final model on the full dataset
    # ==========================
    print(f"\nTraining final SVM model on full dataset ({name} features)...")
    grid_final = GridSearchCV(
        estimator=SVC(probability=True),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        verbose=1,
        n_jobs=-1
    )
    grid_final.fit(X, y)
    final_model = grid_final.best_estimator_

    # ==========================
    # Save model & metrics & Sm
    # ==========================
    model_path = os.path.join(model_folder, f'Dermal_SVM_{name}.pkl')
    joblib.dump(final_model, model_path, compress=9)

    metrics_df = pd.DataFrame([cv_metrics])
    metrics_path = os.path.join(model_folder, f'Dermal_SVM_{name}_CV_metrics_with_Sm.xlsx')
    metrics_df.to_excel(metrics_path, index=False)

    # One-row CSV with this model's Sm (descriptor, algorithm, score).
    sm_csv_path = os.path.join(model_folder, f'Sm_{name}_SVM.csv')
    pd.DataFrame([{"Descriptor": name, "Algorithm": "SVM", "Sm": Sm}]).to_csv(sm_csv_path, index=False)

    print(f"Final SVM model for {name} saved at:\n{model_path}")
    print(f"CV metrics (incl. Sm) saved at:\n{metrics_path}")
    print(f"Sm for {name} SVM saved at:\n{sm_csv_path}")
