In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, confusion_matrix
import xgboost as xgb

In [None]:
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
# Morgan fingerprint settings handed to RDKit below.
radius = 2
n_bits = 2048

# Raw training table; only the "SMILES" column is read in this cell.
df = pd.read_csv("Dataset/AD.csv")
smiles_list = df["SMILES"].tolist()

morgan_data = []      # one row per parsable molecule: [smiles, bit_0, ..., bit_{n_bits-1}]
invalid_smiles = []   # entries RDKit could not parse (or that are not strings at all)
valid_positions = []  # positional row numbers of `df` that produced a fingerprint

for position, smiles in enumerate(smiles_list):
    # Empty CSV cells are read back as float NaN; treat any non-string as
    # invalid instead of handing it to RDKit.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        invalid_smiles.append(smiles)
        continue
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    bit_arr = [int(x) for x in fp]
    morgan_data.append([smiles] + bit_arr)
    valid_positions.append(position)

if invalid_smiles:
    print(f"Skipped {len(invalid_smiles)} of {len(smiles_list)} rows with unparsable SMILES")

columns = ["SMILES"] + [f"bit_{i}" for i in range(n_bits)]
morgan_df = pd.DataFrame(morgan_data, columns=columns)
morgan_df.to_csv("res_morgan_fingerprint_AD.csv", index=False)

# Join the original columns to the fingerprints row-for-row. `morgan_df` only
# holds the parsable molecules, so `df` is first restricted to those same rows;
# concatenating the unfiltered frame would pair every row after a skipped
# SMILES with another molecule's fingerprint.
merged_df = pd.concat(
    [df.iloc[valid_positions].reset_index(drop=True), morgan_df],
    axis=1,
)

In [None]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
# Same fingerprint settings as used for the training set.
radius = 2
n_bits = 2048

# External set to be scored later; note this rebinds `df`, `smiles_list`,
# `morgan_data`, `invalid_smiles` and `columns` from the previous cell.
df = pd.read_csv("Dataset/TEST2.csv")
smiles_list = df["SMILES"].tolist()

morgan_data = []     # one row per parsable molecule: [smiles, bit_0, ..., bit_{n_bits-1}]
invalid_smiles = []  # entries RDKit could not parse (or that are not strings at all)

for smiles in smiles_list:
    # Empty CSV cells are read back as float NaN; treat any non-string as
    # invalid instead of handing it to RDKit.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        invalid_smiles.append(smiles)
        continue
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    bit_arr = [int(x) for x in fp]
    morgan_data.append([smiles] + bit_arr)

if invalid_smiles:
    # Rows listed here are absent from `morgan_df2`, so it can be shorter than
    # the CSV it was built from.
    print(f"Skipped {len(invalid_smiles)} of {len(smiles_list)} rows with unparsable SMILES")

columns = ["SMILES"] + [f"bit_{i}" for i in range(n_bits)]
morgan_df2 = pd.DataFrame(morgan_data, columns=columns)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import catboost as cb
from sklearn.metrics import roc_auc_score
from scipy.stats import randint, uniform
import warnings
warnings.filterwarnings('ignore')


# Positional selection on the merged table: column 1 is used as the target and
# everything from column 3 onwards as model input.
# NOTE(review): this assumes the source CSV has exactly two columns so that
# column 3 is the first fingerprint bit -- confirm against Dataset/AD.csv.
label = merged_df.iloc[:, 1]
features = merged_df.iloc[:, 3:]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, stratify=label, random_state=42
)

# One entry per model family:
#   "model"  - the unfitted base estimator
#   "params" - candidate values (lists) or scipy distributions to sample from
#   "method" - "grid" for an exhaustive search, anything else for a randomized one
search_spaces = {
    "DecisionTree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [3, 5, 10],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 3, 5]
        },
        "method": "grid"
    },
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5],
            # "auto" was an alias of "sqrt" for classifiers and is rejected by
            # scikit-learn >= 1.3, so it either duplicated or failed half of
            # the candidates; "log2" is a genuinely different alternative.
            "max_features": ["sqrt", "log2"]
        },
        "method": "random"
    },
    "LogisticRegression": {
        # Same fixed settings as the final LogisticRegression built from the
        # tuned parameters, so the search scores what is later deployed.
        "model": LogisticRegression(solver='liblinear', max_iter=200, random_state=42),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "penalty": ["l1", "l2"]
        },
        "method": "grid"
    },
    "XGBoost": {
        "model": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": randint(3, 10),            # integers 3..9
            "learning_rate": uniform(0.01, 0.2),    # uniform(loc, scale): [0.01, 0.21]
            "subsample": uniform(0.6, 0.4),         # [0.6, 1.0]
            "colsample_bytree": uniform(0.6, 0.4),  # [0.6, 1.0]
            "gamma": uniform(0, 0.5)                # [0, 0.5]
        },
        "method": "random"
    },
    "GradientBoosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5],
            "subsample": [0.6, 0.8, 1.0]
        },
        "method": "grid"
    },
    "CatBoost": {
        "model": cb.CatBoostClassifier(verbose=False, random_state=42),
        "params": {
            "iterations": [100, 200],
            "depth": [3, 5, 7],
            "learning_rate": [0.01, 0.05, 0.1],
            "l2_leaf_reg": [1, 3, 5],
            "bagging_temperature": [0, 1, 3]
        },
        "method": "random"
    }
}

import pprint

results = []      # one summary row per model, written to CSV below
best_params = {}  # model name -> winning hyper-parameters


def _to_builtin(value):
    """Return NumPy scalars as plain Python numbers; pass anything else through.

    Values sampled from scipy distributions are NumPy scalars. Their text form
    depends on the NumPy version, which makes the saved CSV hard to read back,
    so they are normalised before being stored.
    """
    return value.item() if isinstance(value, np.generic) else value


# Tune every configured model with 5-fold cross-validated ROC AUC on the
# training split, then score the refitted winner once on the held-out split.
for name, config in search_spaces.items():
    print(f"\n {name} ...")
    if config["method"] == "grid":
        searcher = GridSearchCV(
            estimator=config["model"],
            param_grid=config["params"],
            scoring="roc_auc",
            cv=5,
            n_jobs=-1
        )
    else:
        searcher = RandomizedSearchCV(
            estimator=config["model"],
            param_distributions=config["params"],
            n_iter=30,
            scoring="roc_auc",
            cv=5,
            n_jobs=-1,
            random_state=42
        )

    searcher.fit(X_train, y_train)
    best_model = searcher.best_estimator_
    # Probability of the second class, used as the ranking score for ROC AUC.
    y_prob = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)

    # Store plain Python values so the CSV text stays a simple dict literal.
    tuned = {key: _to_builtin(value) for key, value in searcher.best_params_.items()}

    results.append({
        "Model": name,
        "Best_Params": tuned,
        "AUC": auc
    })

    best_params[name] = tuned

    print(f" {name}  AUC: {auc:.4f}")
    print(f"  {tuned}")


# The "Best_Params" column is written as the text form of each dict.
results_df = pd.DataFrame(results)
results_df.to_csv("model_best_params_auc_ad.csv", index=False)


print("\n:\n")
pprint.pprint(best_params)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import pprint

warnings.filterwarnings("ignore")


def _parse_best_params(text):
    """Turn the text form of a saved parameter dict back into a dict.

    Plain literals are parsed with ``ast.literal_eval``. Files written under
    NumPy >= 2 may contain scalars spelled like ``np.float64(0.1)``, which
    ``literal_eval`` rejects; those fall back to ``eval`` with builtins
    removed and only ``np`` exposed.
    """
    import ast

    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError):
        # NOTE(security): this still evaluates file contents as code. Only use
        # a results CSV you produced yourself; regenerate it rather than
        # loading one from an untrusted source.
        return eval(text, {"__builtins__": {}, "np": np})


# Same positional selection and seeded split as the tuning cell, so the
# train/test partition is identical.
# NOTE(review): column 3 is assumed to be the first fingerprint bit -- confirm.
label = merged_df.iloc[:, 1]
features = merged_df.iloc[:, 3:]

# Sorted class values, used as tick labels on the confusion-matrix plots.
binary_classes = sorted(label.unique())

X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, stratify=label, random_state=42
)


# Reload the tuned hyper-parameters written by the search cell. This rebinds
# `best_params` from the in-memory dict to a DataFrame indexed by model name.
best_params = pd.read_csv("model_best_params_auc_ad.csv", index_col="Model")
best_params_dict = best_params['Best_Params'].apply(_parse_best_params).to_dict()


# One unfitted estimator per model family, configured with its tuned values.
# Only CatBoost tolerates a missing entry; the other lookups raise KeyError.
models = {
    'DecisionTree': DecisionTreeClassifier(random_state=42, **best_params_dict["DecisionTree"]),
    'RandomForest': RandomForestClassifier(random_state=42, **best_params_dict["RandomForest"]),
    'LogisticRegression': LogisticRegression(solver='liblinear', max_iter=200, random_state=42, **best_params_dict["LogisticRegression"]),
    'XGBoost': xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42, **best_params_dict["XGBoost"]),
    'GradientBoosting': GradientBoostingClassifier(random_state=42, **best_params_dict["GradientBoosting"]),
    'CatBoost': cb.CatBoostClassifier(verbose=False, random_state=42, **best_params_dict.get("CatBoost", {}))
}


results = []


def evaluate_model(model, model_name, threshold=0.5):
    """Fit ``model`` on the training split, score it on the test split, save plots.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and ``binary_classes``, and appends one metrics row to the notebook-level
    ``results`` list.

    Parameters
    ----------
    model : estimator exposing ``fit`` and ``predict_proba``
    model_name : str
        Used in the results row, the plot titles and the output file names.
    threshold : float, default 0.5
        A sample is predicted as 1 when its second-column probability is at
        least this value, otherwise 0.

    Returns
    -------
    float
        ROC AUC of the second-column probabilities on the test split.

    Side effects
    ------------
    Writes ``{model_name}_confusion_matrix.svg`` and
    ``{model_name}_roc_curve.svg`` to the working directory.
    """
    model.fit(X_train, y_train)
    y_prob = model.predict_proba(X_test)[:, 1]
    y_pred = (y_prob >= threshold).astype(int)

    # Threshold-dependent metrics on the hard predictions.
    metrics_row = {
        "Model": model_name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
    }
    conf_matrix = confusion_matrix(y_test, y_pred)
    results.append(metrics_row)

    # --- confusion-matrix heatmap ---
    cm_fig, cm_ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=binary_classes,
        yticklabels=binary_classes,
        ax=cm_ax,
    )
    cm_ax.set_title(f'{model_name} Confusion Matrix (Threshold={threshold})')
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')
    cm_fig.tight_layout()
    cm_fig.savefig(f'{model_name}_confusion_matrix.svg', format='svg')
    plt.close(cm_fig)

    # --- ROC curve (threshold-independent) ---
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)

    roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
    roc_ax.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.2f})', lw=2)
    # Diagonal reference line from (0, 0) to (1, 1).
    roc_ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
    roc_ax.set_title(f'{model_name} ROC Curve', fontsize=14)
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.legend(loc='lower right')
    roc_ax.grid(alpha=0.3)
    roc_fig.tight_layout()
    roc_fig.savefig(f"{model_name}_roc_curve.svg", format='svg')
    plt.close(roc_fig)

    return auc


# Fit, score and plot every tuned model in dictionary order; each call also
# appends that model's metrics row to `results`.
for model_name in models:
    model = models[model_name]
    auc = evaluate_model(model, model_name, threshold=0.5)
    print(f"{model_name} AUC: {auc:.4f}")


# Collect the per-model metric rows into one table and save it without the index.
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf="model_evaluation_results.csv", index=False)

In [None]:
# Feature matrix for the external set: every column of `morgan_df2` except the
# leading "SMILES" column.
new_features = morgan_df2.iloc[:, 1:]

# NOTE: despite the variable name this is the CatBoost entry of `models`, as
# fitted on the training split by the evaluation loop.
rf_model = models["CatBoost"]
pred_labels = rf_model.predict(new_features)
pred_probs = rf_model.predict_proba(new_features)[:, 1]

# Hard labels from the second-column probability at a fixed cut-off.
threshold = 0.5
pred_labels_custom = (pred_probs >= threshold).astype(int)

df22 = pd.read_csv("Dataset/TEST2.csv")

# `morgan_df2` only contains molecules whose SMILES could be parsed, in file
# order. Keep just those rows here so each prediction lands on the molecule it
# was computed for; assigning to the unfiltered frame fails with a length
# mismatch as soon as one SMILES was skipped.
has_fingerprint = df22["SMILES"].isin(morgan_df2["SMILES"])
if not has_fingerprint.all():
    print(f"Dropping {int((~has_fingerprint).sum())} rows without a fingerprint from the prediction output")
df22 = df22.loc[has_fingerprint].copy()

if len(df22) != len(pred_probs):
    raise ValueError(
        f"Row count mismatch: {len(df22)} input rows vs {len(pred_probs)} predictions; "
        "re-run the TEST2 fingerprint cell so morgan_df2 matches Dataset/TEST2.csv"
    )

df22["Predicted_Label"] = pred_labels_custom
df22["Predicted_Probability"] = pred_probs
df22.to_csv("prediction_results_AD.csv", index=False)