# 1. Import Module

In [1]:
# =========================
# Standard Libraries
# =========================
import sys
import os
import csv
import bz2
import pickle
import _pickle as cPickle
import multiprocessing
import warnings
from glob import glob

# Suppress warnings
def warn(*args, **kwargs):
    pass
warnings.filterwarnings("ignore")
warnings.warn = warn

# =========================
# IPython Extensions
# =========================
%reload_ext autoreload
%autoreload 2

# =========================
# Data Handling
# =========================
import numpy as np
import pandas as pd
from tqdm import tqdm

# =========================
# RDKit
# =========================
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, PandasTools, Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.Descriptors import MolLogP
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.DataStructs import ExplicitBitVect
from rdkit.Chem import MACCSkeys
from rdkit.Chem.AtomPairs import Pairs
from rdkit.Chem import rdMolDescriptors

# =========================
# Standardiser
# =========================
from standardiser import break_bonds, neutralise, rules, unsalt
from standardiser.utils import StandardiseException, sanity_check

# =========================
# Machine Learning / Scikit-learn
# =========================
from sklearn.metrics import (
    accuracy_score, roc_auc_score, confusion_matrix, precision_score,
    recall_score, f1_score, cohen_kappa_score, make_scorer
)
from sklearn.model_selection import (
    ShuffleSplit, RepeatedStratifiedKFold, StratifiedShuffleSplit,
    GridSearchCV, StratifiedKFold
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
import joblib


In [2]:
import pandas as pd
from rdkit.Chem.Scaffolds import MurckoScaffold

# ==========================
# Load dataset
# ==========================
train_df = pd.read_excel(r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\FDA MDD (manual split)\Dataset\Train_set_FDAMMD_with_fingerprints_sorted_with_RDKit_and_CDK_features.xlsx")

# ==========================
# 1️⃣ Ukuran dataset dan distribusi kelas
# ==========================
print("=== Dataset Info ===")
print(f"Total rows: {train_df.shape[0]}, Total features (excluding SMILES/target): {train_df.shape[1]-2}")
print("\nClass distribution:")
print(train_df['Outcome'].value_counts())
print("\nClass ratio:")
print(train_df['Outcome'].value_counts(normalize=True))

# ==========================
# 2️⃣ Cek duplikasi SMILES
# ==========================
if 'SMILES' in train_df.columns:
    dup_count = train_df['SMILES'].duplicated().sum()
    print(f"\nNumber of duplicated SMILES: {dup_count}")
else:
    print("\nSMILES column not found for duplicate check.")

# ==========================
# 3️⃣ Cek NaN dan tipe fitur
# ==========================
numeric_features = train_df.drop(columns=['SMILES','Outcome'], errors='ignore').select_dtypes(include=['int64','float64'])
print(f"\nNumber of numeric features: {numeric_features.shape[1]}")
print(f"Any missing values: {numeric_features.isna().sum().sum()}")

print("\nFeature types:")
print(train_df.dtypes.value_counts())

# ==========================
# 4️⃣ Korelasi fitur dengan target
# ==========================
corr = numeric_features.corrwith(train_df['Outcome'])
top_corr = corr.abs().sort_values(ascending=False).head(10)
print("\nTop 10 features most correlated with target:")
print(top_corr)

# ==========================
# 5️⃣ Scaffold uniqueness (opsional)
# ==========================
if 'SMILES' in train_df.columns:
    try:
        scaffolds = train_df['SMILES'].apply(lambda s: MurckoScaffold.MurckoScaffoldSmiles(smiles=s))
        print(f"\nNumber of unique Murcko scaffolds: {scaffolds.nunique()}")
    except Exception as e:
        print(f"Error computing scaffolds: {e}")


=== Dataset Info ===
Total rows: 642, Total features (excluding SMILES/target): 54

Class distribution:
Outcome
1    353
0    289
Name: count, dtype: int64

Class ratio:
Outcome
1    0.549844
0    0.450156
Name: proportion, dtype: float64

Number of duplicated SMILES: 0

Number of numeric features: 51
Any missing values: 51

Feature types:
float64    51
object      4
int64       1
Name: count, dtype: int64

Top 10 features most correlated with target:
MLogP         0.467188
Ring Count    0.407969
Chi4n         0.405020
Chi3n         0.392677
ALogP         0.381363
logP          0.377029
Chi2n         0.365699
VAdjMat       0.362352
Chi1n         0.353342
XLogP         0.334022
dtype: float64
Error computing scaffolds: No molecule provided


[20:56:11] Explicit valence for atom # 16 N, 3, is greater than permitted


In [2]:
import pandas as pd
from rdkit.Chem.Scaffolds import MurckoScaffold

# ==========================
# 데이터셋 불러오기
# ==========================
train_df = pd.read_excel(r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\FDA MDD (manual split)\Dataset\Train_set_FDAMMD_with_fingerprints_sorted_with_RDKit_and_CDK_features.xlsx")

# ==========================
# 1️⃣ 데이터셋 크기와 클래스 분포 확인
# ==========================
print("=== Dataset Info ===")
# 데이터 행(row) 수와 특징(feature) 수 출력 (SMILES와 Outcome 제외)
print(f"Total rows: {train_df.shape[0]}, Total features (excluding SMILES/target): {train_df.shape[1]-2}")

# 클래스 분포 출력
print("\nClass distribution:")
print(train_df['Outcome'].value_counts())

# 클래스 비율 출력
print("\nClass ratio:")
print(train_df['Outcome'].value_counts(normalize=True))

# ==========================
# 2️⃣ SMILES 중복 확인
# ==========================
if 'SMILES' in train_df.columns:
    dup_count = train_df['SMILES'].duplicated().sum()
    print(f"\nNumber of duplicated SMILES: {dup_count}")
else:
    print("\nSMILES column not found for duplicate check.")  # SMILES 컬럼이 없을 경우 메시지 출력

# ==========================
# 3️⃣ 결측치(NaN) 및 특징 타입 확인
# ==========================
# SMILES와 Outcome 컬럼 제외 후 숫자형(numeric) 특징 선택
numeric_features = train_df.drop(columns=['SMILES','Outcome'], errors='ignore').select_dtypes(include=['int64','float64'])

print(f"\nNumber of numeric features: {numeric_features.shape[1]}")  # 숫자형 feature 수
print(f"Any missing values: {numeric_features.isna().sum().sum()}")   # 결측치 확인

print("\nFeature types:")
print(train_df.dtypes.value_counts())  # 각 컬럼 타입 개수 확인

# ==========================
# 4️⃣ 특징과 타겟 간 상관관계 확인
# ==========================
# 숫자형 특징과 Outcome 간 상관계수 계산
corr = numeric_features.corrwith(train_df['Outcome'])
# 절댓값 기준 상위 10개 특징 출력
top_corr = corr.abs().sort_values(ascending=False).head(10)
print("\nTop 10 features most correlated with target:")
print(top_corr)

# ==========================
# 5️⃣ Murcko Scaffold 고유성 확인 (옵션)
# ==========================
if 'SMILES' in train_df.columns:
    try:
        # 각 SMILES에 대해 Murcko Scaffold 추출
        scaffolds = train_df['SMILES'].apply(lambda s: MurckoScaffold.MurckoScaffoldSmiles(smiles=s))
        # 고유한 Scaffold 수 출력
        print(f"\nNumber of unique Murcko scaffolds: {scaffolds.nunique()}")
    except Exception as e:
        print(f"Error computing scaffolds: {e}")  # 에러 발생 시 메시지 출력


# RANDOM FOREST

In [3]:
# ================================
# 라이브러리 임포트
# ================================
import pandas as pd
import numpy as np
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score, 
    confusion_matrix, precision_score, recall_score, f1_score
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
import joblib
import os

# ================================
# 데이터셋 불러오기
# ================================
data_file = train_path  # 데이터셋 파일 경로
df = pd.read_excel(data_file)

# ================================
# 특징(feature)과 타겟(target) 설정
# ================================
drop_cols = ['SMILES', 'Morgan_Descriptors', 'MACCS_Descriptors', 'APF_Descriptors', 'Outcome']
X_values = df.drop(columns=drop_cols).values  # 입력 특징
y_values = df['Outcome'].astype(int).values   # 타겟 변수

print("Dataset loaded:", X_values.shape, "features,", len(y_values), "samples")  # 데이터 크기 확인

# ================================
# 하이퍼파라미터 그리드 설정
# ================================
paramgrid = {
    "max_features": [
        X_values.shape[1],
        X_values.shape[1] // 2,
        X_values.shape[1] // 4,
        X_values.shape[1] // 12,
        X_values.shape[1] // 10,
        X_values.shape[1] // 7,
        X_values.shape[1] // 5,
        X_values.shape[1] // 3
    ],
    "n_estimators": [10, 100, 300, 500],
}

# 사용자 정의 스코어 (quadratic weighted kappa)
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)  # 10-fold CV

# ================================
# 평가 지표 저장용 컨테이너
# ================================
accuracies, auc_scores, precisions, recalls, f1_scores = [], [], [], [], []
specificities, sensitivity_scores, ppvs, npvs, ccrs = [], [], [], [], []
confusion_matrices = []

# CCR 계산 함수 정의
def calculate_ccr(sensitivity, specificity):
    return (sensitivity + specificity) / 2

# ================================
# 교차검증(CV) 루프
# ================================
for train_idx, test_idx in tqdm(cv.split(X_values, y_values), total=cv.get_n_splits(), desc="CV folds"):
    # 학습/검증 데이터 분리
    X_train, X_test = X_values[train_idx], X_values[test_idx]
    y_train, y_test = y_values[train_idx], y_values[test_idx]
    
    # GridSearchCV를 이용한 RF 모델 학습
    grid = GridSearchCV(
        estimator=RandomForestClassifier(class_weight='balanced'),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        n_jobs=-1,
        verbose=0
    )
    
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_  # 최적 모델 선택
    
    # 예측 수행
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:,1]
    
    # 평가 지표 계산
    accuracies.append(accuracy_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_proba))
    precisions.append(precision_score(y_test, y_pred, zero_division=0))
    recalls.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    
    cm = confusion_matrix(y_test, y_pred)
    confusion_matrices.append(cm)
    
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    sensitivity_scores.append(sensitivity)
    specificities.append(specificity)
    
    ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)  # 양성 예측도(PPV)
    npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)  # 음성 예측도(NPV)
    ccrs.append(calculate_ccr(sensitivity, specificity))  # CCR 계산

# ================================
# CV 결과 요약 (fold 평균)
# ================================
metrics_summary = {
    "Accuracy": np.mean(accuracies),
    "AUC": np.mean(auc_scores),
    "Precision": np.mean(precisions),
    "Recall (Sensitivity)": np.mean(sensitivity_scores),
    "F1": np.mean(f1_scores),
    "Specificity": np.mean(specificities),
    "PPV": np.mean(ppvs),
    "NPV": np.mean(npvs),
    "CCR": np.mean(ccrs)
}

print("\n===== CV Results =====")
for k, v in metrics_summary.items():
    print(f"CV {k}: {v:.4f}")

# ================================
# 전체 데이터셋으로 최종 모델 학습
# ================================
grid_final = GridSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced'),
    param_grid=paramgrid,
    scoring=kappa_scorer,
    cv=5,
    n_jobs=-1,
    verbose=0
)
grid_final.fit(X_values, y_values)
final_model = grid_final.best_estimator_

# ================================
# 모델 및 평가 지표 저장
# ================================
output_dir = r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model"
os.makedirs(output_dir, exist_ok=True)

# 모델 저장
model_path = os.path.join(output_dir, "Dermal_rf_rdkitcdk.pkl")
joblib.dump(final_model, model_path, compress=9)
print(f"\nFinal model saved: {model_path}")

# 평가 지표 저장
metrics_path = os.path.join(output_dir, "Dermal_rf_rdkitcdk_metrics.xlsx")
pd.DataFrame([metrics_summary]).to_excel(metrics_path, index=False)
print(f"CV metrics report saved: {metrics_path}")


Dataset loaded: (611, 51) features, 611 samples


CV folds: 100%|████████████████████████████████████████████████████████████████████████| 10/10 [04:49<00:00, 28.90s/it]



===== CV Results =====
CV Accuracy: 0.6988
CV AUC: 0.7816
CV Precision: 0.7034
CV Recall (Sensitivity): 0.6927
CV F1: 0.6966
CV Specificity: 0.7043
CV PPV: 0.7034
CV NPV: 0.6968
CV CCR: 0.6985

Final model saved: C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model\Dermal_rf_rdkitcdk.pkl
CV metrics report saved: C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model\Dermal_rf_rdkitcdk_metrics.xlsx


# XGBOOST

In [4]:
# ================================
# Import libraries
# ================================
import pandas as pd
import numpy as np
from sklearn.metrics import (
    make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score, 
    confusion_matrix, precision_score, recall_score, f1_score
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from tqdm import tqdm
import joblib
import os

# ================================
# Load dataset
# ================================
data_file = train_path
df = pd.read_excel(data_file)

# ================================
# Features & Target
# ================================
drop_cols = ['SMILES', 'Morgan_Descriptors', 'MACCS_Descriptors', 'APF_Descriptors', 'Outcome']
X_values = df.drop(columns=drop_cols).values
y_values = df['Outcome'].astype(int).values

print("Dataset loaded:", X_values.shape, "features,", len(y_values), "samples")

# ================================
# Hyperparameter grid for XGBoost
# ================================
paramgrid = {
    "max_depth": [3, 5, 7, 10],
    "n_estimators": [100, 200, 300],
    "learning_rate": [0.01, 0.1, 0.2]
}

kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# ================================
# Evaluation containers
# ================================
accuracies, auc_scores, precisions, recalls, f1_scores = [], [], [], [], []
specificities, sensitivity_scores, ppvs, npvs, ccrs = [], [], [], [], []
confusion_matrices = []

def calculate_ccr(sensitivity, specificity):
    return (sensitivity + specificity) / 2

# ================================
# Cross-validation loop
# ================================
for train_idx, test_idx in tqdm(cv.split(X_values, y_values), total=cv.get_n_splits(), desc="CV folds"):
    X_train, X_test = X_values[train_idx], X_values[test_idx]
    y_train, y_test = y_values[train_idx], y_values[test_idx]
    
    grid = GridSearchCV(
        estimator=XGBClassifier(
            objective='binary:logistic', 
            use_label_encoder=False, 
            eval_metric='logloss'
        ),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        verbose=0,
        n_jobs=-1
    )
    
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    
    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:,1]
    
    # Metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_proba))
    precisions.append(precision_score(y_test, y_pred, zero_division=0))
    recalls.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))
    
    cm = confusion_matrix(y_test, y_pred)
    confusion_matrices.append(cm)
    
    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    sensitivity_scores.append(sensitivity)
    specificities.append(specificity)
    
    ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
    npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)
    ccrs.append(calculate_ccr(sensitivity, specificity))

# ================================
# Report metrics (mean across folds)
# ================================
metrics_summary = {
    "Accuracy": np.mean(accuracies),
    "AUC": np.mean(auc_scores),
    "Precision": np.mean(precisions),
    "Recall (Sensitivity)": np.mean(sensitivity_scores),
    "F1": np.mean(f1_scores),
    "Specificity": np.mean(specificities),
    "PPV": np.mean(ppvs),
    "NPV": np.mean(npvs),
    "CCR": np.mean(ccrs)
}

print("\n===== CV Results =====")
for k, v in metrics_summary.items():
    print(f"CV {k}: {v:.4f}")

# ================================
# Train final model on full dataset
# ================================
print("\nTraining final XGBoost model on full dataset with GridSearchCV...")
grid_final = GridSearchCV(
    estimator=XGBClassifier(
        objective='binary:logistic', 
        use_label_encoder=False, 
        eval_metric='logloss'
    ),
    param_grid=paramgrid,
    scoring=kappa_scorer,
    cv=5,
    verbose=0,
    n_jobs=-1
)
grid_final.fit(X_values, y_values)
final_model = grid_final.best_estimator_

# ================================
# Save model & metrics
# ================================
output_dir = r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model"
os.makedirs(output_dir, exist_ok=True)

# Save model
model_path = os.path.join(output_dir, "Dermal_xgb_rdkitcdk.pkl")
joblib.dump(final_model, model_path, compress=9)
print(f"\nFinal XGBoost model saved: {model_path}")

# Save metrics
metrics_path = os.path.join(output_dir, "Dermal_xgb_rdkitcdk_metrics.xlsx")
pd.DataFrame([metrics_summary]).to_excel(metrics_path, index=False)
print(f"CV metrics report saved: {metrics_path}")


Dataset loaded: (611, 51) features, 611 samples


CV folds: 100%|████████████████████████████████████████████████████████████████████████| 10/10 [04:54<00:00, 29.50s/it]



===== CV Results =====
CV Accuracy: 0.7037
CV AUC: 0.7694
CV Precision: 0.6985
CV Recall (Sensitivity): 0.7323
CV F1: 0.7117
CV Specificity: 0.6751
CV PPV: 0.6985
CV NPV: 0.7178
CV CCR: 0.7037

Training final XGBoost model on full dataset with GridSearchCV...

Final XGBoost model saved: C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model\Dermal_xgb_rdkitcdk.pkl
CV metrics report saved: C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model\Dermal_xgb_rdkitcdk_metrics.xlsx


# SVM

In [4]:
# ================================
# Data
# ================================
X_values = x_rdkitcdk.values
y_values = y

# ================================
# Hyperparameter grid SVM (lebih kecil dan cepat)
# ================================
paramgrid = {
    "C": [0.1, 1, 10],
    "kernel": ['linear', 'rbf'],
    "gamma": ['scale']
}

kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')

# ================================
# RandomizedSearchCV
# ================================
rand_search = RandomizedSearchCV(
    estimator=SVC(probability=True, class_weight='balanced'),
    param_distributions=paramgrid,
    n_iter=5,          # cukup 5 kombinasi random
    scoring=kappa_scorer,
    cv=3,              # internal CV kecil
    n_jobs=-1,
    verbose=1,
    random_state=42
)
rand_search.fit(X_values, y_values)
best_svm = rand_search.best_estimator_
print(f"Best params: {rand_search.best_params_}")

# ================================
# Fit final model di seluruh dataset
# ================================
best_svm.fit(X_values, y_values)

# ================================
# Save model
# ================================
save_path = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Carcinogencity (manual split)\Model\Carcino_rdkitcdk.pkl'
joblib.dump(best_svm, save_path, compress=9)
print(f"Final SVM model saved: {save_path}")


Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best params: {'kernel': 'linear', 'gamma': 'scale', 'C': 0.1}
Final SVM model saved: C:\Fauzan\Manuskrip QSAR 1\Major Revision\Respiratory Irritation New\Model\final_svm_rdkitcdk.pkl


In [5]:
# ================================
# Import libraries
# ================================
# ================================
# Load dataset
# ================================
file_path = train_path
df = pd.read_excel(file_path)

# Drop kolom non-fitur
drop_cols = ['SMILES', 'Morgan_Descriptors', 'MACCS_Descriptors', 'APF_Descriptors', 'Outcome']
X_values = df.drop(columns=drop_cols).values
y_values = df['Outcome'].astype(int).values

print("Dataset loaded:", X_values.shape, "features,", len(y_values), "samples")

# ================================
# Hyperparameter grid
# ================================
param_grid = {
    "svc__C": [0.1, 1, 10, 100],
    "svc__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "svc__gamma": ['scale', 'auto']
}

# Pipeline: imputasi -> scaling -> SVM
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
    ("svc", SVC(probability=True, class_weight="balanced"))
])

# Custom scorer pakai Cohen’s Kappa
kappa_scorer = make_scorer(cohen_kappa_score, weights='quadratic')
cv_outer = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# ================================
# Cross-validation loop
# ================================
accuracies, auc_scores, precisions, recalls, f1_scores = [], [], [], [], []
specificities, sensitivity_scores, ppvs, npvs, ccrs = [], [], [], [], []
confusion_matrices = []

def calculate_ccr(sensitivity, specificity):
    return (sensitivity + specificity) / 2

for train_idx, test_idx in tqdm(cv_outer.split(X_values, y_values), total=cv_outer.get_n_splits(), desc="CV folds"):
    X_train, X_test = X_values[train_idx], X_values[test_idx]
    y_train, y_test = y_values[train_idx], y_values[test_idx]

    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring=kappa_scorer,
        cv=5,
        n_jobs=-1,
        verbose=0
    )

    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    y_pred = best_model.predict(X_test)
    y_proba = best_model.predict_proba(X_test)[:, 1]

    accuracies.append(accuracy_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_proba))
    precisions.append(precision_score(y_test, y_pred, zero_division=0))
    recalls.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    confusion_matrices.append(cm)

    tn, fp, fn, tp = cm.ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    sensitivity_scores.append(sensitivity)
    specificities.append(specificity)

    ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
    npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)
    ccrs.append(calculate_ccr(sensitivity, specificity))

# ================================
# Report metrics (mean across folds)
# ================================
metrics_summary = {
    "Accuracy": np.mean(accuracies),
    "AUC": np.mean(auc_scores),
    "Precision": np.mean(precisions),
    "Recall (Sensitivity)": np.mean(sensitivity_scores),
    "F1": np.mean(f1_scores),
    "Specificity": np.mean(specificities),
    "PPV": np.mean(ppvs),
    "NPV": np.mean(npvs),
    "CCR": np.mean(ccrs)
}

print("\n===== CV Results =====")
for k, v in metrics_summary.items():
    print(f"CV {k}: {v:.4f}")

# ================================
# Train final model di seluruh dataset
# ================================
grid_final = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring=kappa_scorer,
    cv=5,
    n_jobs=-1,
    verbose=0
)
grid_final.fit(X_values, y_values)
final_model = grid_final.best_estimator_

# ================================
# Save model & metrics
# ================================
output_dir = r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model"
os.makedirs(output_dir, exist_ok=True)

# Save model
model_path = os.path.join(output_dir, "Dermal_svm_rdkitcdk.pkl")
joblib.dump(final_model, model_path, compress=9)
print(f"\nFinal SVM model saved: {model_path}")

# Save metrics
metrics_path = os.path.join(output_dir, "Dermal_svm_rdkitcdk_metrics.xlsx")
pd.DataFrame([metrics_summary]).to_excel(metrics_path, index=False)
print(f"CV metrics report saved: {metrics_path}")


Dataset loaded: (611, 51) features, 611 samples


CV folds: 100%|████████████████████████████████████████████████████████████████████████| 10/10 [04:29<00:00, 26.97s/it]



===== CV Results =====
CV Accuracy: 0.6645
CV AUC: 0.7575
CV Precision: 0.6527
CV Recall (Sensitivity): 0.7090
CV F1: 0.6780
CV Specificity: 0.6194
CV PPV: 0.6527
CV NPV: 0.6835
CV CCR: 0.6642

Final SVM model saved: C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model\Dermal_svm_rdkitcdk.pkl
CV metrics report saved: C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model\Dermal_svm_rdkitcdk_metrics.xlsx
