# 1. Import Module

In [None]:
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Descriptors import MolLogP
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import Descriptors
from rdkit.Chem import PandasTools
from rdkit.DataStructs import ExplicitBitVect
import sys
import multiprocessing
from standardiser import break_bonds, neutralise, rules, unsalt
from standardiser.utils import StandardiseException, sanity_check
%reload_ext autoreload
%autoreload 2
def warn(*args, **kwargs):
    """Accept any arguments and do nothing.

    Used further down as a replacement for ``warnings.warn`` so that every
    warning raised through that function is silently discarded.
    """
    return None
import warnings
# Silence warnings in two ways: replace warnings.warn with the no-op `warn`
# defined earlier in this cell, and install a blanket "ignore" filter.
warnings.warn = warn
warnings.filterwarnings("ignore")
from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
import sys
from sklearn.metrics import cohen_kappa_score
import csv
from rdkit.Chem import MACCSkeys
from sklearn.model_selection import ShuffleSplit
import _pickle as cPickle
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedShuffleSplit    
import bz2
from glob import glob
import _pickle as cPickle
import pickle
# Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
# Draw.DrawingOptions.atomLabelFontSize = 18




# LOAD TRAINING DATASET


In [None]:
import pandas as pd
from rdkit import Chem

# Function to read Excel file into DataFrame
def load_excel_to_df(filename):
    """Read the Excel workbook at ``filename`` and return it as a DataFrame.

    The call is forwarded to ``pandas.read_excel`` with default options.
    """
    return pd.read_excel(filename)

# Convert strings back to lists of integers
def string_to_list(bit_string):
    """Parse a serialised integer list such as ``"[1, 0, 1]"`` into ``[1, 0, 1]``.

    Parameters
    ----------
    bit_string : str or any
        Text representation of a list of integers. Surrounding whitespace,
        the enclosing brackets and the exact separator spacing are all
        optional: ``"[1,0]"``, ``" [1, 0] "`` and ``"1 0"`` are accepted,
        and ``"[]"`` yields an empty list.

    Returns
    -------
    list of int
        When ``bit_string`` is a string. Any non-string value (for example
        an already-parsed list or a missing cell) is returned unchanged.

    Raises
    ------
    ValueError
        If a token inside the string is not a valid integer literal.
    """
    if not isinstance(bit_string, str):
        return bit_string
    inner = bit_string.strip().strip('[]')
    # Treat commas as whitespace so the separator may be ", ", "," or " ";
    # str.split() with no argument also drops empty tokens (handles "[]").
    return [int(token) for token in inner.replace(',', ' ').split()]

# Load Excel file
# Read the training workbook into a DataFrame
train_df = load_excel_to_df(r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Train_set_Dermal_balanced_with_fingerprints_sorted_with_RDKit_and_CDK_features.xlsx")

# Fingerprint cells that come back as strings are parsed into lists of ints;
# a missing column is only reported, not treated as an error here.
fingerprint_columns = ['Morgan_Descriptors', 'MACCS_Descriptors', 'APF_Descriptors']
for col in fingerprint_columns:
    if col not in train_df.columns:
        print(f"Warning: Column {col} not found in Excel file!")
        continue
    train_df[col] = train_df[col].apply(string_to_list)

# combined_df is a second name for the same DataFrame object (no copy is made)
combined_df = train_df

print("Combined DataFrame:")
print(combined_df.head())


In [None]:
# Show the column labels of the loaded table
combined_df.columns

In [None]:
# Sort the rows by class label. The result used to be assigned to the
# misspelled name `combine_df`, so `combined_df` itself was never sorted.
# A stable sort (mergesort) keeps the original relative order inside each class.
combined_df = combined_df.sort_values(['Outcome'], ascending=True, kind='mergesort')
# Keep the old (misspelled) name bound in case a later cell refers to it.
combine_df = combined_df
# The index still carries the pre-sort row labels, so RowID records each
# row's original position in the loaded sheet.
combined_df['RowID'] = combined_df.index
combined_df.head(100)

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Label encoding
# Fit the encoder on the distinct outcome values, then encode every row
outcomes = np.unique(combined_df['Outcome'])
le = LabelEncoder().fit(outcomes)
y = le.transform(combined_df['Outcome'])

# Results: the encoded class codes and the number of compounds in each
classes, counts = np.unique(y, return_counts=True)

print("Classes                          : ", classes)
print("Number of cpds in each class     : ", counts)
print("Total number of cpds             : ", len(y))

In [None]:
# Encoded outcome labels wrapped in a Series (same values as y)
S = pd.Series(le.transform(combined_df["Outcome"]))

In [None]:
# Histogram of the encoded labels. Bin edges sit at -0.5, 0.5, ..., 4.5 so
# each bar is centred on an integer code; ticks are drawn at 0..4.
ax = S.hist(bins=np.arange(-0.5, 5))
ax.set_xticks(range(5))

In [None]:
# Target
y = np.int32(S)  # S must already be defined by an earlier cell

# One NumPy array per fingerprint column, built from the per-row lists
x_maccs = np.array(combined_df['MACCS_Descriptors'].tolist())
x_morgan = np.array(combined_df['Morgan_Descriptors'].tolist())
x_apf = np.array(combined_df['APF_Descriptors'].tolist())  # atom-pair fingerprint (APF) column


# MORGAN_RF

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import joblib
import os

# ==========================
# 1️⃣ Define features and target
# ==========================
# x_morgan, y sudah didefinisikan sebelumnya

# ==========================
# 2️⃣ Hyperparameter grid
# ==========================
# Candidate hyperparameters. max_features is the fingerprint length
# integer-divided by each divisor below (1 keeps every feature).
paramgrid = {
    "max_features": [x_morgan.shape[1] // divisor for divisor in (1, 2, 4, 12, 10, 7, 5, 3)],
    "n_estimators": [10, 100, 300, 500],
}

# Grid-search candidates are ranked by quadratic-weighted Cohen's kappa
kappa_scorer = make_scorer(score_func=cohen_kappa_score, weights='quadratic')

# Outer cross-validation used for performance reporting (seeded shuffle)
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)

# One list per metric; each one receives one value per outer fold
accuracies, auc_scores, precisions, recalls, f1_scores = ([] for _ in range(5))
specificities, sensitivity_scores, ppvs, npvs, ccrs = ([] for _ in range(5))
confusion_matrices = []

def calculate_ccr(sensitivity, specificity):
    """Return the arithmetic mean of ``sensitivity`` and ``specificity``."""
    total = sensitivity + specificity
    return total / 2

# ==========================
# 5️⃣ Loop CV to evaluate performance
# ==========================
# Nested cross-validation on the Morgan fingerprints. Each outer fold is held
# out for testing; hyperparameters are tuned on the remaining data with an
# inner 5-fold grid search scored by `kappa_scorer`.
for train_idx, test_idx in cv.split(x_morgan, y):
    X_train, X_test = x_morgan[train_idx], x_morgan[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # A fixed random_state makes the forests, and therefore the reported
    # metrics, reproducible from run to run (the outer split is seeded too).
    grid = GridSearchCV(
        estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        n_jobs=-1
    )

    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Evaluate the tuned model on the held-out outer fold
    y_pred = best_model.predict(X_test)
    # Second probability column: class 1 when the labels are encoded as 0/1
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_proba))
    precisions.append(precision_score(y_test, y_pred, zero_division=0))
    recalls.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

    # labels=[0, 1] forces a 2x2 matrix even when a fold's truth and
    # predictions contain a single class, so the unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    confusion_matrices.append(cm)

    tn, fp, fn, tp = cm.ravel()
    # An empty denominator yields 0, the same convention used for PPV/NPV
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    sensitivity_scores.append(sensitivity)
    specificities.append(specificity)

    ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
    npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)
    ccrs.append(calculate_ccr(sensitivity, specificity))

# ==========================
# 6️⃣ Report metrics
# ==========================
# Average every per-fold metric over the outer folds. The recall entry is
# taken from the confusion-matrix-based sensitivity list.
cv_metrics = {
    "Accuracy": np.mean(accuracies),
    "AUC": np.mean(auc_scores),
    "Precision": np.mean(precisions),
    "Recall (Sensitivity)": np.mean(sensitivity_scores),
    "F1": np.mean(f1_scores),
    "Specificity": np.mean(specificities),
    "PPV": np.mean(ppvs),
    "NPV": np.mean(npvs),
    "CCR": np.mean(ccrs)
}

for k, v in cv_metrics.items():
    print(f'CV {k}: {v:.4f}')

# ==========================
# 7️⃣ Train final model on full dataset
# ==========================
# Same search as inside the CV loop, now on all rows. The forest is seeded
# so that the saved model can be regenerated exactly.
grid_final = GridSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid=paramgrid,
    scoring=kappa_scorer,
    cv=5,
    n_jobs=-1
)
grid_final.fit(x_morgan, y)
final_model = grid_final.best_estimator_

# ==========================
# 8️⃣ Save final model
# ==========================
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
os.makedirs(model_folder, exist_ok=True)
# NOTE(review): this model file uses the prefix 'FDAMDD_' while the metrics
# file below and the other model files use 'Dermal_' - confirm which name
# downstream code expects before renaming.
model_path = os.path.join(model_folder, 'FDAMDD_rf_morgan.pkl')
joblib.dump(final_model, model_path, compress=9)
print("Final Random Forest model saved successfully!")

# ==========================
# 9️⃣ Save CV metrics report
# ==========================
# One-row sheet: one column per metric
metrics_df = pd.DataFrame([cv_metrics])
metrics_path = os.path.join(model_folder, 'Dermal_rf_morgan_CV_metrics.xlsx')
metrics_df.to_excel(metrics_path, index=False)
print(f"CV metrics report saved successfully at: {metrics_path}")


# MACCS_RF

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import joblib
from tqdm import tqdm
import os

# ==========================
# Features and target
# ==========================
# x_maccs, y sudah didefinisikan sebelumnya

# ==========================
# Hyperparameter grid
# ==========================
# Candidate hyperparameters. max_features is the MACCS vector length
# integer-divided by each divisor below (1 keeps every feature).
paramgrid = {
    "max_features": [x_maccs.shape[1] // divisor for divisor in (1, 2, 4, 12, 10, 7, 5, 3)],
    "n_estimators": [10, 100, 300, 500],
}

# Grid-search candidates are ranked by quadratic-weighted Cohen's kappa
kappa_scorer = make_scorer(score_func=cohen_kappa_score, weights='quadratic')

# Outer 10-fold stratified cross-validation with a seeded shuffle
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)

# One list per metric; each one receives one value per outer fold
accuracies, auc_scores, precisions, recalls, f1_scores = ([] for _ in range(5))
specificities, sensitivity_scores, ppvs, npvs, ccrs = ([] for _ in range(5))
confusion_matrices = []

def calculate_ccr(sensitivity, specificity):
    """Return the arithmetic mean of ``sensitivity`` and ``specificity``."""
    total = sensitivity + specificity
    return total / 2

# ==========================
# Outer CV loop dengan progress bar
# ==========================
# Nested cross-validation on the MACCS keys, with a progress bar over the
# outer folds. Hyperparameters are tuned inside each outer-training split by
# an inner 5-fold grid search scored by `kappa_scorer`.
for fold_idx, (train_idx, test_idx) in enumerate(tqdm(cv.split(x_maccs, y), total=cv.get_n_splits(), desc="Outer CV folds")):
    X_train, X_test = x_maccs[train_idx], x_maccs[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # A fixed random_state makes the forests, and therefore the reported
    # metrics, reproducible from run to run (the outer split is seeded too).
    grid = GridSearchCV(
        estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        n_jobs=-1,
        verbose=2
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Evaluate the tuned model on the held-out outer fold
    y_pred = best_model.predict(X_test)
    # Second probability column: class 1 when the labels are encoded as 0/1
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_proba))
    precisions.append(precision_score(y_test, y_pred, zero_division=0))
    recalls.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

    # labels=[0, 1] forces a 2x2 matrix even when a fold's truth and
    # predictions contain a single class, so the unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    confusion_matrices.append(cm)

    tn, fp, fn, tp = cm.ravel()
    # An empty denominator yields 0, the same convention used for PPV/NPV
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    sensitivity_scores.append(sensitivity)
    specificities.append(specificity)

    ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
    npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)
    ccrs.append(calculate_ccr(sensitivity, specificity))

# ==========================
# Report metrics
# ==========================
# Average every per-fold metric over the outer folds. The recall entry is
# taken from the confusion-matrix-based sensitivity list.
cv_metrics = {
    "Accuracy": np.mean(accuracies),
    "AUC": np.mean(auc_scores),
    "Precision": np.mean(precisions),
    "Recall (Sensitivity)": np.mean(sensitivity_scores),
    "F1": np.mean(f1_scores),
    "Specificity": np.mean(specificities),
    "PPV": np.mean(ppvs),
    "NPV": np.mean(npvs),
    "CCR": np.mean(ccrs)
}

for k, v in cv_metrics.items():
    print(f'CV {k}: {v:.4f}')

# ==========================
# Fit the final model on the whole dataset
# ==========================
# Same search as inside the CV loop, now on all rows. The forest is seeded
# so that the saved model can be regenerated exactly.
print("Training final model on full dataset...")
grid_final = GridSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid=paramgrid,
    scoring=kappa_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2
)
grid_final.fit(x_maccs, y)
final_model = grid_final.best_estimator_

# ==========================
# Save final model
# ==========================
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
os.makedirs(model_folder, exist_ok=True)

# NOTE(review): this model file uses the prefix 'FDAMDD_' while the metrics
# file below and the other model files use 'Dermal_' - confirm which name
# downstream code expects before renaming.
model_path = os.path.join(model_folder, 'FDAMDD_rf_macckeys.pkl')
joblib.dump(final_model, model_path, compress=9)
print("Final Random Forest model saved successfully!")

# ==========================
# Save CV metrics report
# ==========================
# One-row sheet: one column per metric
metrics_df = pd.DataFrame([cv_metrics])
metrics_path = os.path.join(model_folder, 'Dermal_rf_macckeys_CV_metrics.xlsx')
metrics_df.to_excel(metrics_path, index=False)
print(f"CV metrics report saved successfully at: {metrics_path}")


# APF_RF

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
import joblib
from tqdm import tqdm
import os

# ==========================
# Features and target
# ==========================
# x_apf, y sudah didefinisikan sebelumnya
# Pastikan y = np.int32(S)

# ==========================
# Hyperparameter grid
# ==========================
# Candidate hyperparameters. max_features is the APF vector length
# integer-divided by each divisor below (1 keeps every feature).
paramgrid = {
    "max_features": [x_apf.shape[1] // divisor for divisor in (1, 2, 4, 12, 10, 7, 5, 3)],
    "n_estimators": [10, 100, 300, 500],
}

# Grid-search candidates are ranked by quadratic-weighted Cohen's kappa
kappa_scorer = make_scorer(score_func=cohen_kappa_score, weights='quadratic')

# Outer 10-fold stratified cross-validation with a seeded shuffle
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)

# One list per metric; each one receives one value per outer fold
accuracies, auc_scores, precisions, recalls, f1_scores = ([] for _ in range(5))
specificities, sensitivity_scores, ppvs, npvs, ccrs = ([] for _ in range(5))
confusion_matrices = []

def calculate_ccr(sensitivity, specificity):
    """Return the arithmetic mean of ``sensitivity`` and ``specificity``."""
    total = sensitivity + specificity
    return total / 2

# ==========================
# Outer CV loop dengan progress bar
# ==========================
# Nested cross-validation on the APF fingerprints, with a progress bar over
# the outer folds. Hyperparameters are tuned inside each outer-training split
# by an inner 5-fold grid search scored by `kappa_scorer`.
for fold_idx, (train_idx, test_idx) in enumerate(tqdm(cv.split(x_apf, y), total=cv.get_n_splits(), desc="Outer CV folds")):
    X_train, X_test = x_apf[train_idx], x_apf[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # A fixed random_state makes the forests, and therefore the reported
    # metrics, reproducible from run to run (the outer split is seeded too).
    grid = GridSearchCV(
        estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        n_jobs=-1,
        verbose=2
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Evaluate the tuned model on the held-out outer fold
    y_pred = best_model.predict(X_test)
    # Second probability column: class 1 when the labels are encoded as 0/1
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_proba))
    precisions.append(precision_score(y_test, y_pred, zero_division=0))
    recalls.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

    # labels=[0, 1] forces a 2x2 matrix even when a fold's truth and
    # predictions contain a single class, so the unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    confusion_matrices.append(cm)

    tn, fp, fn, tp = cm.ravel()
    # An empty denominator yields 0, the same convention used for PPV/NPV
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    sensitivity_scores.append(sensitivity)
    specificities.append(specificity)

    ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
    npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)
    ccrs.append(calculate_ccr(sensitivity, specificity))

# ==========================
# Report metrics
# ==========================
# Average every per-fold metric over the outer folds. The recall entry is
# taken from the confusion-matrix-based sensitivity list.
cv_metrics = {
    "Accuracy": np.mean(accuracies),
    "AUC": np.mean(auc_scores),
    "Precision": np.mean(precisions),
    "Recall (Sensitivity)": np.mean(sensitivity_scores),
    "F1": np.mean(f1_scores),
    "Specificity": np.mean(specificities),
    "PPV": np.mean(ppvs),
    "NPV": np.mean(npvs),
    "CCR": np.mean(ccrs)
}

for k, v in cv_metrics.items():
    print(f'CV {k}: {v:.4f}')

# ==========================
# Fit the final model on the whole dataset
# ==========================
# Same search as inside the CV loop, now on all rows. The forest is seeded
# so that the saved model can be regenerated exactly.
print("Training final model on full dataset...")
grid_final = GridSearchCV(
    estimator=RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid=paramgrid,
    scoring=kappa_scorer,
    cv=5,
    n_jobs=-1,
    verbose=2
)
grid_final.fit(x_apf, y)
final_model = grid_final.best_estimator_

# ==========================
# Save final model & metrics report
# ==========================
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
os.makedirs(model_folder, exist_ok=True)

# Save model
model_path = os.path.join(model_folder, 'Dermal_rf_apf.pkl')
joblib.dump(final_model, model_path, compress=9)
print("Final Random Forest model saved successfully!")

# Save CV metrics as a one-row sheet: one column per metric
metrics_df = pd.DataFrame([cv_metrics])
metrics_path = os.path.join(model_folder, 'Dermal_rf_apf_CV_metrics.xlsx')
metrics_df.to_excel(metrics_path, index=False)
print(f"CV metrics report saved successfully at: {metrics_path}")


# MORGAN_XGBOOST

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm
import os

# ==========================
# Features and target
# ==========================
# x_morgan, y sudah didefinisikan sebelumnya
# Pastikan y = np.int32(S) jika perlu

# ==========================
# Hyperparameter grid untuk XGBoost
# ==========================
# Hyperparameter grid explored for XGBoost
paramgrid = dict(
    max_depth=[3, 5, 7, 10],
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
)

# Grid-search candidates are ranked by quadratic-weighted Cohen's kappa
kappa_scorer = make_scorer(score_func=cohen_kappa_score, weights='quadratic')

# Outer 10-fold stratified cross-validation with a seeded shuffle
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)

# One list per metric; each one receives one value per outer fold
accuracies, auc_scores, precisions, recalls, f1_scores = ([] for _ in range(5))
specificities, sensitivity_scores, ppvs, npvs, ccrs = ([] for _ in range(5))
confusion_matrices = []

def calculate_ccr(sensitivity, specificity):
    """Return the arithmetic mean of ``sensitivity`` and ``specificity``."""
    total = sensitivity + specificity
    return total / 2

# ==========================
# Outer CV loop dengan progress
# ==========================
print("Starting 10-fold cross-validation...")

# Nested cross-validation on the Morgan fingerprints. Hyperparameters are
# tuned inside each outer-training split by an inner 5-fold grid search
# scored by `kappa_scorer`.
for fold, (train_idx, test_idx) in enumerate(tqdm(cv.split(x_morgan, y), total=10, desc="CV Folds")):
    X_train, X_test = x_morgan[train_idx], x_morgan[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    grid = GridSearchCV(
        estimator=XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss'),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        verbose=1,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Evaluate the tuned model on the held-out outer fold
    y_pred = best_model.predict(X_test)
    # Second probability column: class 1 when the labels are encoded as 0/1
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_proba))
    precisions.append(precision_score(y_test, y_pred, zero_division=0))
    recalls.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

    # labels=[0, 1] forces a 2x2 matrix even when a fold's truth and
    # predictions contain a single class, so the unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    confusion_matrices.append(cm)

    tn, fp, fn, tp = cm.ravel()
    # An empty denominator yields 0, the same convention used for PPV/NPV
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    sensitivity_scores.append(sensitivity)
    specificities.append(specificity)

    ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
    npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)
    ccrs.append(calculate_ccr(sensitivity, specificity))

    print(f"Fold {fold+1} done. Best params: {grid.best_params_}")

# ==========================
# Report metrics
# ==========================
# Mean of every per-fold metric; the key order becomes the column order of
# the exported sheet. The recall entry uses the sensitivity list.
cv_metrics = {
    label: np.mean(per_fold)
    for label, per_fold in (
        ("Accuracy", accuracies),
        ("AUC", auc_scores),
        ("Precision", precisions),
        ("Recall (Sensitivity)", sensitivity_scores),
        ("F1", f1_scores),
        ("Specificity", specificities),
        ("PPV", ppvs),
        ("NPV", npvs),
        ("CCR", ccrs),
    )
}

print("\n=== 10-Fold CV Metrics ===")
for k, v in cv_metrics.items():
    print(f'{k}: {v:.4f}')

# Repeat the grid search on all rows and keep the refitted best estimator
print("\nTraining final XGBoost model on full dataset with GridSearchCV...")
grid_final = GridSearchCV(
    XGBClassifier(eval_metric='logloss', use_label_encoder=False, objective='binary:logistic'),
    paramgrid,
    cv=5,
    scoring=kappa_scorer,
    n_jobs=-1,
    verbose=1,
)
final_model = grid_final.fit(x_morgan, y).best_estimator_

# Output locations for the model and the metrics sheet
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
model_path = os.path.join(model_folder, 'Dermal_xgb_morgan.pkl')
metrics_path = os.path.join(model_folder, 'Dermal_xgb_morgan_CV_metrics.xlsx')
os.makedirs(model_folder, exist_ok=True)

# Persist the fitted model
joblib.dump(final_model, model_path, compress=9)
print("Final XGBoost model saved successfully!")

# Persist the averaged CV metrics as a one-row sheet
metrics_df = pd.DataFrame([cv_metrics])
metrics_df.to_excel(metrics_path, index=False)
print(f"CV metrics report saved successfully at: {metrics_path}")


# MACCS_XGBOOST

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm
import os

# ==========================
# Features and target
# ==========================
# x_maccs, y sudah didefinisikan sebelumnya

# ==========================
# Hyperparameter grid untuk XGBoost
# ==========================
# Hyperparameter grid explored for XGBoost
paramgrid = dict(
    max_depth=[3, 5, 7, 10],
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
)

# Grid-search candidates are ranked by quadratic-weighted Cohen's kappa
kappa_scorer = make_scorer(score_func=cohen_kappa_score, weights='quadratic')

# Outer 10-fold stratified cross-validation with a seeded shuffle
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)

# One list per metric; each one receives one value per outer fold
accuracies, auc_scores, precisions, recalls, f1_scores = ([] for _ in range(5))
specificities, sensitivity_scores, ppvs, npvs, ccrs = ([] for _ in range(5))
confusion_matrices = []

def calculate_ccr(sensitivity, specificity):
    """Return the arithmetic mean of ``sensitivity`` and ``specificity``."""
    total = sensitivity + specificity
    return total / 2

# ==========================
# Outer CV loop dengan progress
# ==========================
print("Starting 10-fold cross-validation for MACCS features...")

# Nested cross-validation on the MACCS keys. Hyperparameters are tuned inside
# each outer-training split by an inner 5-fold grid search scored by
# `kappa_scorer`.
for fold, (train_idx, test_idx) in enumerate(tqdm(cv.split(x_maccs, y), total=10, desc="CV Folds")):
    X_train, X_test = x_maccs[train_idx], x_maccs[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    grid = GridSearchCV(
        estimator=XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss'),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        verbose=1,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Evaluate the tuned model on the held-out outer fold
    y_pred = best_model.predict(X_test)
    # Second probability column: class 1 when the labels are encoded as 0/1
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_proba))
    precisions.append(precision_score(y_test, y_pred, zero_division=0))
    recalls.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

    # labels=[0, 1] forces a 2x2 matrix even when a fold's truth and
    # predictions contain a single class, so the unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    confusion_matrices.append(cm)

    tn, fp, fn, tp = cm.ravel()
    # An empty denominator yields 0, the same convention used for PPV/NPV
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    sensitivity_scores.append(sensitivity)
    specificities.append(specificity)

    ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
    npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)
    ccrs.append(calculate_ccr(sensitivity, specificity))

    print(f"Fold {fold+1} done. Best params: {grid.best_params_}")

# ==========================
# Report metrics
# ==========================
# Mean of every per-fold metric; the key order becomes the column order of
# the exported sheet. The recall entry uses the sensitivity list.
cv_metrics = {
    label: np.mean(per_fold)
    for label, per_fold in (
        ("Accuracy", accuracies),
        ("AUC", auc_scores),
        ("Precision", precisions),
        ("Recall (Sensitivity)", sensitivity_scores),
        ("F1", f1_scores),
        ("Specificity", specificities),
        ("PPV", ppvs),
        ("NPV", npvs),
        ("CCR", ccrs),
    )
}

print("\n=== 10-Fold CV Metrics for MACCS ===")
for k, v in cv_metrics.items():
    print(f'{k}: {v:.4f}')

# Repeat the grid search on all rows and keep the refitted best estimator
print("\nTraining final XGBoost model on full dataset (MACCS features)...")
grid_final = GridSearchCV(
    XGBClassifier(eval_metric='logloss', use_label_encoder=False, objective='binary:logistic'),
    paramgrid,
    cv=5,
    scoring=kappa_scorer,
    n_jobs=-1,
    verbose=1,
)
final_model = grid_final.fit(x_maccs, y).best_estimator_

# Output locations for the model and the metrics sheet
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
model_path = os.path.join(model_folder, 'Dermal_xgb_maccs.pkl')
metrics_path = os.path.join(model_folder, 'Dermal_xgb_maccs_CV_metrics.xlsx')
os.makedirs(model_folder, exist_ok=True)

# Persist the fitted model
joblib.dump(final_model, model_path, compress=9)
print("Final XGBoost model saved successfully!")

# Persist the averaged CV metrics as a one-row sheet
metrics_df = pd.DataFrame([cv_metrics])
metrics_df.to_excel(metrics_path, index=False)
print(f"CV metrics report saved successfully at: {metrics_path}")


# APF_XGBOOST

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
import joblib
from tqdm import tqdm
import os

# ==========================
# Features and target
# ==========================
# x_apf, y sudah didefinisikan sebelumnya

# ==========================
# Hyperparameter grid untuk XGBoost
# ==========================
# Hyperparameter grid explored for XGBoost
paramgrid = dict(
    max_depth=[3, 5, 7, 10],
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1, 0.2],
)

# Grid-search candidates are ranked by quadratic-weighted Cohen's kappa
kappa_scorer = make_scorer(score_func=cohen_kappa_score, weights='quadratic')

# Outer 10-fold stratified cross-validation with a seeded shuffle
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)

# One list per metric; each one receives one value per outer fold
accuracies, auc_scores, precisions, recalls, f1_scores = ([] for _ in range(5))
specificities, sensitivity_scores, ppvs, npvs, ccrs = ([] for _ in range(5))
confusion_matrices = []

def calculate_ccr(sensitivity, specificity):
    """Return the arithmetic mean of ``sensitivity`` and ``specificity``."""
    total = sensitivity + specificity
    return total / 2

# ==========================
# Outer CV loop dengan progress
# ==========================
print("Starting 10-fold cross-validation for APF features...")

# Nested cross-validation on the APF fingerprints. Hyperparameters are tuned
# inside each outer-training split by an inner 5-fold grid search scored by
# `kappa_scorer`.
for fold, (train_idx, test_idx) in enumerate(tqdm(cv.split(x_apf, y), total=10, desc="CV Folds")):
    X_train, X_test = x_apf[train_idx], x_apf[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    grid = GridSearchCV(
        estimator=XGBClassifier(objective='binary:logistic', use_label_encoder=False, eval_metric='logloss'),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        verbose=1,
        n_jobs=-1
    )
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_

    # Evaluate the tuned model on the held-out outer fold
    y_pred = best_model.predict(X_test)
    # Second probability column: class 1 when the labels are encoded as 0/1
    y_proba = best_model.predict_proba(X_test)[:, 1]

    # Metrics
    accuracies.append(accuracy_score(y_test, y_pred))
    auc_scores.append(roc_auc_score(y_test, y_proba))
    precisions.append(precision_score(y_test, y_pred, zero_division=0))
    recalls.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

    # labels=[0, 1] forces a 2x2 matrix even when a fold's truth and
    # predictions contain a single class, so the unpacking below cannot fail.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    confusion_matrices.append(cm)

    tn, fp, fn, tp = cm.ravel()
    # An empty denominator yields 0, the same convention used for PPV/NPV
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    sensitivity_scores.append(sensitivity)
    specificities.append(specificity)

    ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
    npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)
    ccrs.append(calculate_ccr(sensitivity, specificity))

    print(f"Fold {fold+1} done. Best params: {grid.best_params_}")

# ==========================
# Report metrics
# ==========================
# Mean of every per-fold metric; the key order becomes the column order of
# the exported sheet. The recall entry uses the sensitivity list.
cv_metrics = {
    label: np.mean(per_fold)
    for label, per_fold in (
        ("Accuracy", accuracies),
        ("AUC", auc_scores),
        ("Precision", precisions),
        ("Recall (Sensitivity)", sensitivity_scores),
        ("F1", f1_scores),
        ("Specificity", specificities),
        ("PPV", ppvs),
        ("NPV", npvs),
        ("CCR", ccrs),
    )
}

print("\n=== 10-Fold CV Metrics for APF ===")
for k, v in cv_metrics.items():
    print(f'{k}: {v:.4f}')

# Repeat the grid search on all rows and keep the refitted best estimator
print("\nTraining final XGBoost model on full dataset (APF features)...")
grid_final = GridSearchCV(
    XGBClassifier(eval_metric='logloss', use_label_encoder=False, objective='binary:logistic'),
    paramgrid,
    cv=5,
    scoring=kappa_scorer,
    n_jobs=-1,
    verbose=1,
)
final_model = grid_final.fit(x_apf, y).best_estimator_

# Output locations for the model and the metrics sheet
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
model_path = os.path.join(model_folder, 'Dermal_xgb_apf.pkl')
metrics_path = os.path.join(model_folder, 'Dermal_xgb_apf_CV_metrics.xlsx')
os.makedirs(model_folder, exist_ok=True)

# Persist the fitted model
joblib.dump(final_model, model_path, compress=9)
print("Final XGBoost model saved successfully!")

# Persist the averaged CV metrics as a one-row sheet
metrics_df = pd.DataFrame([cv_metrics])
metrics_df.to_excel(metrics_path, index=False)
print(f"CV metrics report saved successfully at: {metrics_path}")


# MORGAN_MACCS_APF_SVM

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import make_scorer, cohen_kappa_score, accuracy_score, roc_auc_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC
import joblib
from tqdm import tqdm
import os

# ==========================
# Features and target
# ==========================
# Pastikan x_morgan, x_maccs, x_apf, dan y sudah didefinisikan sebelumnya
# Fingerprint matrices to evaluate, keyed by the label used in progress output
feature_sets = dict(Morgan=x_morgan, MACCS=x_maccs, APF=x_apf)

# Hyperparameter grid explored for the SVM
paramgrid = dict(
    C=[0.1, 1, 10, 100],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=['scale', 'auto'],
)

# Grid-search candidates are ranked by quadratic-weighted Cohen's kappa
kappa_scorer = make_scorer(score_func=cohen_kappa_score, weights='quadratic')

# Outer 10-fold stratified cross-validation with a seeded shuffle
cv = StratifiedKFold(random_state=42, shuffle=True, n_splits=10)

def calculate_ccr(sensitivity, specificity):
    """Return the CCR: the arithmetic mean of sensitivity and specificity.

    Args:
        sensitivity: True-positive rate of the classifier.
        specificity: True-negative rate of the classifier.

    Returns:
        ``(sensitivity + specificity) / 2``.
    """
    combined = sensitivity + specificity
    return combined / 2

# ==========================
# Output folder
# ==========================
# Trained models and CV metric reports are written into this directory.
model_folder = r'C:\Fauzan\Manuskrip QSAR 1\Major Revision\Acute Dermal Toxicity (manual split)\Model'
os.makedirs(model_folder, exist_ok=True)

# ==========================
# Loop over each feature set
# ==========================
# For every feature set:
#   1. estimate performance with the outer splitter `cv`, running a fresh
#      inner grid search (5-fold, scored by `kappa_scorer`) in each fold;
#   2. run one more grid search on the full data to obtain the final model;
#   3. save the final model (.pkl) and the averaged CV metrics (.xlsx).
# Take the fold count from the splitter so the progress bar and the report
# header always agree with the CV configuration.
n_outer_folds = cv.get_n_splits()

for name, X in feature_sets.items():
    print(f"\n===== Processing {name} features =====")

    # Per-fold metric accumulators (averaged after the outer loop).
    accuracies, auc_scores, precisions, recalls, f1_scores = [], [], [], [], []
    specificities, sensitivity_scores, ppvs, npvs, ccrs = [], [], [], [], []
    confusion_matrices = []

    for fold, (train_idx, test_idx) in enumerate(
        tqdm(cv.split(X, y), total=n_outer_folds, desc=f"{name} CV Folds")
    ):
        # NOTE(review): assumes X and y support positional integer-array
        # indexing (e.g. NumPy arrays) — confirm against the loading cell.
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        # Inner search sees only the training part of the outer fold.
        # random_state pins the internal shuffling SVC uses for its
        # probability estimates, so AUC values are reproducible.
        grid = GridSearchCV(
            estimator=SVC(probability=True, random_state=42),
            param_grid=paramgrid,
            scoring=kappa_scorer,
            cv=5,
            verbose=1,
            n_jobs=-1
        )
        grid.fit(X_train, y_train)
        best_model = grid.best_estimator_

        y_pred = best_model.predict(X_test)
        # Column 1 holds the probability of the second entry in classes_.
        y_proba = best_model.predict_proba(X_test)[:, 1]

        # Metrics
        accuracies.append(accuracy_score(y_test, y_pred))
        auc_scores.append(roc_auc_score(y_test, y_proba))
        precisions.append(precision_score(y_test, y_pred, zero_division=0))
        recalls.append(recall_score(y_test, y_pred, zero_division=0))
        f1_scores.append(f1_score(y_test, y_pred, zero_division=0))

        cm = confusion_matrix(y_test, y_pred)
        confusion_matrices.append(cm)

        # Unpacking into four counts requires a 2x2 matrix, i.e. two classes
        # present across y_test / y_pred.
        tn, fp, fn, tp = cm.ravel()
        # Guard every ratio the same way so an empty denominator yields 0
        # instead of a NaN that would silently poison the averaged report.
        sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
        specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
        sensitivity_scores.append(sensitivity)
        specificities.append(specificity)

        ppvs.append(tp / (tp + fp) if (tp + fp) > 0 else 0)
        npvs.append(tn / (tn + fn) if (tn + fn) > 0 else 0)
        ccrs.append(calculate_ccr(sensitivity, specificity))

        print(f"Fold {fold+1} done. Best params: {grid.best_params_}")

    # ==========================
    # Report metrics
    # ==========================
    # Each entry is the mean of the per-fold values collected above.
    cv_metrics = {
        "Accuracy": np.mean(accuracies),
        "AUC": np.mean(auc_scores),
        "Precision": np.mean(precisions),
        "Recall (Sensitivity)": np.mean(sensitivity_scores),
        "F1": np.mean(f1_scores),
        "Specificity": np.mean(specificities),
        "PPV": np.mean(ppvs),
        "NPV": np.mean(npvs),
        "CCR": np.mean(ccrs)
    }

    print(f"\n=== {n_outer_folds}-Fold CV Metrics for {name} ===")
    for k, v in cv_metrics.items():
        print(f'{k}: {v:.4f}')

    # ==========================
    # Train on the full dataset with the best hyperparameters
    # ==========================
    # A separate grid search over all samples selects the hyperparameters
    # of the model that is actually saved.
    print(f"\nTraining final SVM model on full dataset ({name} features)...")
    grid_final = GridSearchCV(
        estimator=SVC(probability=True, random_state=42),
        param_grid=paramgrid,
        scoring=kappa_scorer,
        cv=5,
        verbose=1,
        n_jobs=-1
    )
    grid_final.fit(X, y)
    final_model = grid_final.best_estimator_

    # ==========================
    # Save model & metrics
    # ==========================
    model_path = os.path.join(model_folder, f'Dermal_SVM_{name}.pkl')
    joblib.dump(final_model, model_path, compress=9)

    # One-row table: a column per metric.
    metrics_df = pd.DataFrame([cv_metrics])
    metrics_path = os.path.join(model_folder, f'Dermal_SVM_{name}_CV_metrics.xlsx')
    metrics_df.to_excel(metrics_path, index=False)

    print(f"Final SVM model for {name} saved at:\n{model_path}")
    print(f"CV metrics report saved at:\n{metrics_path}")


In [None]:
import os


def rename_with_prefix(filepath, prefix="Resp_"):
    """Rename ``filepath`` so its file name starts with ``prefix``.

    The file stays in the same directory. The call is safe to repeat:
    nothing is renamed when the source file is missing (for example because
    an earlier run already renamed it) or when the target name already
    exists (so an existing file is never overwritten).

    Args:
        filepath: Path of the file to rename.
        prefix: Text placed in front of the file name.

    Returns:
        The new path when the file was renamed, otherwise ``None``.
    """
    dir_name, base_name = os.path.split(filepath)
    new_path = os.path.join(dir_name, f"{prefix}{base_name}")

    if not os.path.isfile(filepath):
        print(f"Skipped (source not found):\n{filepath}\n")
        return None
    if os.path.exists(new_path):
        print(f"Skipped (target already exists):\n{new_path}\n")
        return None

    os.rename(filepath, new_path)
    print(f"Renamed:\n{filepath}\n -> {new_path}\n")
    return new_path


# List of the original model files
files = [
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Respiratory Irritation New\Model\final_SVM_morgan.pkl",
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Respiratory Irritation New\Model\final_xgb_apf.pkl",
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Respiratory Irritation New\Model\final_xgb_maccs.pkl",
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Respiratory Irritation New\Model\final_xgb_morgan.pkl",
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Respiratory Irritation New\Model\final_rf_apf.pkl",
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Respiratory Irritation New\Model\final_rf_macckeys.pkl",
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Respiratory Irritation New\Model\final_rf_morgan.pkl",
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Respiratory Irritation New\Model\final_SVM_APF.pkl",
    r"C:\Fauzan\Manuskrip QSAR 1\Major Revision\Respiratory Irritation New\Model\final_SVM_MACCS.pkl"
]

# Insert "Resp_" at the start of each file name; files that were already
# renamed (or are missing) are reported and skipped instead of aborting
# the whole cell.
for filepath in files:
    rename_with_prefix(filepath)
