In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb

In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
import warnings
warnings.filterwarnings('ignore')
# Morgan (circular) fingerprint settings; reused by the later fingerprinting cell.
radius = 2
n_bits = 2048
fingerprint_csv = "res_morgan_fingerprint2.csv"

df = pd.read_csv("stroke.csv")
smiles_list = df["SMILES"].tolist()
morgan_data = []
valid_index = []       # index labels of the df rows whose SMILES RDKit could parse
invalid_smiles = []

for row_label, smiles in zip(df.index, smiles_list):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Unparseable structure: remember it for the report below and skip the row.
        invalid_smiles.append(smiles)
        continue
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    bit_arr = [int(x) for x in fp]
    morgan_data.append([smiles] + bit_arr)
    valid_index.append(row_label)


columns = ["SMILES"] + [f"bit_{i}" for i in range(n_bits)]


# Keep the originating df index on every fingerprint row so that the merge below
# lines rows up by identity, not by position.
morgan_df = pd.DataFrame(morgan_data, columns=columns, index=valid_index)
morgan_df.to_csv(fingerprint_csv, index=False)
# Report the file that was actually written (the old message named a different file).
print(f"Morgan saved in {fingerprint_csv}")

if invalid_smiles:
    print("problem：")
    for s in invalid_smiles:
        print(s)

# join="inner" drops rows whose SMILES could not be parsed. A positional concat would
# shift every later fingerprint onto the wrong label and leave NaN rows at the end.
# When every SMILES parses, the result is the same frame as before.
merged_df = pd.concat([df, morgan_df], axis=1, join="inner")

Morgan saved in res_morgan_fingerprint.csv


In [3]:
# Fingerprint the external compounds in newtest.csv with the same radius / n_bits
# that were used for the training fingerprints.
df2 = pd.read_csv("newtest.csv")
smiles_list2 = df2["SMILES"].tolist()
morgan_data2 = []
invalid_smiles = []   # NOTE: rebinding this name discards the list collected for stroke.csv
for smiles in smiles_list2:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        invalid_smiles.append(smiles)
        continue
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    bit_arr = [int(x) for x in fp]
    morgan_data2.append([smiles] + bit_arr)
columns = ["SMILES"] + [f"bit_{i}" for i in range(n_bits)]
morgan_df2 = pd.DataFrame(morgan_data2, columns=columns)

# Skipped molecules make morgan_df2 shorter than df2, so say so explicitly instead
# of dropping them silently.
if invalid_smiles:
    print(f"{len(invalid_smiles)} SMILES in newtest.csv could not be parsed and were skipped:")
    for s in invalid_smiles:
        print(s)


In [4]:
# Sanity-check the merged training frame: the stroke.csv columns followed by
# morgan_df's SMILES column and its bit_0 .. bit_{n_bits-1} fingerprint columns.
merged_df

Unnamed: 0,cid,SMILES,value,SMILES.1,bit_0,bit_1,bit_2,bit_3,bit_4,bit_5,...,bit_2038,bit_2039,bit_2040,bit_2041,bit_2042,bit_2043,bit_2044,bit_2045,bit_2046,bit_2047
0,51,C(CC(=O)O)C(=O)C(=O)O,0,C(CC(=O)O)C(=O)C(=O)O,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,79,C1=CC(=CN=C1)C#N,0,C1=CC(=CN=C1)C#N,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,240,C1=CC=C(C=C1)C=O,0,C1=CC=C(C=C1)C=O,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,311,C(C(=O)O)C(CC(=O)O)(C(=O)O)O,0,C(C(=O)O)C(CC(=O)O)(C(=O)O)O,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,323,C1=CC=C2C(=C1)C=CC(=O)O2,0,C1=CC=C2C(=C1)C=CC(=O)O2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2564,135871453,CC1=C([C@@H](NC(=NCC2=CC=CC=C2)N1CC3=CC=CC=C3)...,1,CC1=C([C@@H](NC(=NCC2=CC=CC=C2)N1CC3=CC=CC=C3)...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2565,135894587,CC1=C(C=CC(=C1)O)C2=NNC(=C2)C(=O)NCC3=CC(=CC(=...,1,CC1=C(C=CC(=C1)O)C2=NNC(=C2)C(=O)NCC3=CC(=CC(=...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2566,135896301,CC1=CC2=C(C=C1C)N[C@]3(C[C@H]4CCCC(=O)N5[C@H]4...,1,CC1=CC2=C(C=C1C)N[C@]3(C[C@H]4CCCC(=O)N5[C@H]4...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2567,136153084,CCN1C(=C([C@@H](NC1=NCC2=CC(=CC=C2)Cl)C3=CC(=C...,1,CCN1C(=C([C@@H](NC1=NCC2=CC(=CC=C2)Cl)C3=CC(=C...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from scipy.stats import randint, uniform
import warnings
import os
warnings.filterwarnings('ignore')


# Positional selection: the third column of merged_df is the class label and
# everything from the fifth column onward is the fingerprint feature matrix.
label = merged_df.iloc[:, 2]
features = merged_df.iloc[:, 4:]

# Stratified 80/20 hold-out with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=42,
    stratify=label,
)


# Hyper-parameter search configuration, one entry per model:
#   "model"  - the unfitted estimator to tune
#   "params" - a parameter grid ("grid") or lists/distributions to sample from ("random")
#   "method" - "grid" selects GridSearchCV, anything else RandomizedSearchCV
search_spaces = {
    "DecisionTree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [3, 5, 10],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 3, 5]
        },
        "method": "grid"
    },
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5],
            # "auto" was an alias of "sqrt" for classifiers and is rejected by
            # scikit-learn >= 1.3. With warnings suppressed, those candidates would
            # silently fail and score NaN. "log2" keeps two distinct options.
            "max_features": ["sqrt", "log2"]
        },
        "method": "random"
    },
    "LogisticRegression": {
        # Same solver, iteration cap and seed as the LogisticRegression that is
        # rebuilt from best_params in the evaluation cell, so the tuned C / penalty
        # are selected under the settings they are later used with.
        "model": LogisticRegression(solver='liblinear', max_iter=200, random_state=42),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "penalty": ["l1", "l2"]
        },
        "method": "grid"
    },
    "XGBoost": {
        "model": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": randint(3, 10),
            # scipy's uniform(loc, scale) samples from [loc, loc + scale].
            "learning_rate": uniform(0.01, 0.2),
            "subsample": uniform(0.6, 0.4),
            "colsample_bytree": uniform(0.6, 0.4),
            "gamma": uniform(0, 0.5)
        },
        "method": "random"
    },
    "GradientBoosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5],
            "subsample": [0.6, 0.8, 1.0]
        },
        "method": "grid"
    }
}


# Tune every configured model with 5-fold CV on the training split (ROC-AUC scoring),
# score the refit best estimator on the held-out split, and collect the winners.
results = []
best_params = {}

# Arguments shared by GridSearchCV and RandomizedSearchCV.
common_search_kwargs = dict(scoring="roc_auc", cv=5, n_jobs=-1)

for name, config in search_spaces.items():
    # Name the model being tuned; the placeholder-free f-string printed the same
    # " ..." line for every model, so the log could not be attributed.
    print(f"\n{name} ...")
    if config["method"] == "grid":
        searcher = GridSearchCV(
            estimator=config["model"],
            param_grid=config["params"],
            **common_search_kwargs,
        )
    else:
        searcher = RandomizedSearchCV(
            estimator=config["model"],
            param_distributions=config["params"],
            n_iter=30,
            random_state=42,
            **common_search_kwargs,
        )

    searcher.fit(X_train, y_train)
    best_model = searcher.best_estimator_
    # Probability of the second class in classes_ is used as the ranking score.
    y_prob = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)

    results.append({
        "Model": name,
        "Best_Params": searcher.best_params_,
        "AUC": auc
    })

    best_params[name] = searcher.best_params_

    print(f"   best: {searcher.best_params_}")


results_df = pd.DataFrame(results)
save_dir = "stroke"
os.makedirs(save_dir, exist_ok=True)
# Build the path from save_dir so the directory that is created and the directory
# that is written to cannot drift apart.
results_df.to_csv(os.path.join(save_dir, "model_best_params_auc.csv"), index=False)

print("\n:\n")
import pprint
pprint.pprint(best_params)


 ...
   best: {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2}

 ...
   best: {'n_estimators': 200, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': None}

 ...
   best: {'C': 10, 'penalty': 'l2'}

 ...
   best: {'colsample_bytree': 0.6298202574719083, 'gamma': 0.49344346830025865, 'learning_rate': 0.1644489538593315, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.9261845713819337}

 ...
   best: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.8}

:

{'DecisionTree': {'max_depth': 10,
                  'min_samples_leaf': 5,
                  'min_samples_split': 2},
 'GradientBoosting': {'learning_rate': 0.1,
                      'max_depth': 3,
                      'n_estimators': 200,
                      'subsample': 0.8},
 'LogisticRegression': {'C': 10, 'penalty': 'l2'},
 'RandomForest': {'max_depth': None,
                  'max_features': 'sqrt',
                  'min_samples_split': 5,
                  'n_estima

In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns


# Same positional label / feature selection and the same seeded, stratified 80/20
# split as the tuning cell.
label = merged_df.iloc[:, 2]
features = merged_df.iloc[:, 4:]

# Sorted distinct label values; used as tick labels on the confusion matrices.
binary_classes = sorted(label.unique())

X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.2,
    random_state=42,
    stratify=label,
)


# Build each estimator with its fixed settings, then overlay the tuned
# hyper-parameters stored in best_params (set_params returns the estimator itself).
models = {
    model_key: estimator.set_params(**best_params[model_key])
    for model_key, estimator in {
        'DecisionTree': DecisionTreeClassifier(random_state=42),
        'RandomForest': RandomForestClassifier(random_state=42),
        'LogisticRegression': LogisticRegression(solver='liblinear', max_iter=200, random_state=42),
        'XGBoost': xgb.XGBClassifier(
            objective='binary:logistic',
            random_state=42,
            use_label_encoder=False,
            eval_metric='logloss',
        ),
        'GradientBoosting': GradientBoostingClassifier(random_state=42),
    }.items()
}

# One metrics row per model is appended here by evaluate_model.
results = []

# Figure that accumulates the ROC curves of all models.
plt.figure(figsize=(10, 8))


def evaluate_model(model, model_name, roc_ax=None):
    """Fit one model, record its test metrics, save its confusion matrix, add its ROC curve.

    Uses the module-level X_train / X_test / y_train / y_test split, appends one
    row to the module-level ``results`` list and writes
    ``{model_name}_confusion_matrix.svg``.

    Parameters
    ----------
    model : estimator exposing fit / predict / predict_proba
    model_name : str
        Used in the results row, the plot title, the SVG file name and the ROC legend.
    roc_ax : matplotlib Axes, optional
        Axes that receives the ROC curve. Defaults to the axes that are current
        when the function is called.

    Returns
    -------
    float
        ROC-AUC on the test split.
    """
    # Resolve the ROC axes *before* the confusion-matrix figure is opened, so the
    # curve never depends on which figure pyplot treats as current after a close().
    if roc_ax is None:
        roc_ax = plt.gca()

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    # Pin the row/column order to binary_classes so the matrix always matches the
    # tick labels drawn below.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=binary_classes)

    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "F1-Score": f1,
        "Precision": precision,
        "Recall": recall
    })

    # The confusion matrix gets its own figure, which is closed once saved.
    # NOTE(review): the SVG goes to the working directory, unlike the other
    # outputs written under stroke/ — confirm this is intended.
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        conf_matrix, annot=True, fmt='d', cmap='Blues',
        xticklabels=binary_classes, yticklabels=binary_classes, ax=cm_ax,
    )
    cm_ax.set_title(f'{model_name} 混淆矩阵')
    cm_ax.set_xlabel('预测标签')
    cm_ax.set_ylabel('真实标签')
    cm_fig.savefig(f'{model_name}_confusion_matrix.svg', format='svg')
    plt.close(cm_fig)

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)
    roc_ax.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.2f})', lw=2)

    return auc

# ==== Evaluate every model ====
for model_name, model in models.items():
    auc = evaluate_model(model, model_name)
    print(f"{model_name}  AUC: {auc:.2f}")


# Finish the shared ROC plot: chance diagonal, labels, legend, grid; then save and close.
roc_ax = plt.gca()
roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--', lw=1)
roc_ax.set_title('ROC曲线', fontsize=16)
roc_ax.set_xlabel('假阳性率', fontsize=12)
roc_ax.set_ylabel('真阳性率', fontsize=12)
roc_ax.legend(loc='lower right', fontsize=10)
roc_ax.grid(alpha=0.5)
roc_ax.figure.savefig('stroke/combined_roc_curves.svg', format='svg')
plt.close(roc_ax.figure)


# Persist the per-model metric rows collected by evaluate_model.
results_df = pd.DataFrame(results)
results_df.to_csv('stroke/model_evaluation_results.csv', index=False)



DecisionTree  AUC: 0.96
RandomForest  AUC: 0.99
LogisticRegression  AUC: 0.99
XGBoost  AUC: 0.99
GradientBoosting  AUC: 0.99


In [7]:
# Score the external compounds with the tuned XGBoost model fitted in the
# evaluation cell and write the predictions next to the original newtest.csv columns.
new_features = morgan_df2.iloc[:, 1:]   # drop the leading SMILES column, keep bit_* columns
xgb_model = models["XGBoost"]
rf_model = xgb_model   # old (misleading) name kept as an alias for any later cell that uses it
pred_labels = xgb_model.predict(new_features)
pred_probs = xgb_model.predict_proba(new_features)[:, 1]

# morgan_df2 only contains rows whose SMILES parsed, so it can be shorter than df2.
# A SMILES string either parses or it does not, so the df2 rows whose SMILES occur in
# morgan_df2 are exactly the fingerprinted rows, in the same order.
parsed_mask = df2["SMILES"].isin(morgan_df2["SMILES"])
parsed_rows = df2.index[parsed_mask]
if len(parsed_rows) != len(pred_labels):
    raise ValueError(
        f"{len(pred_labels)} predictions cannot be aligned with "
        f"{len(parsed_rows)} parsable rows of newtest.csv"
    )

# Rows that could not be fingerprinted stay in the output with missing predictions.
# The nullable Int64 dtype keeps labels written as 0/1 rather than 0.0/1.0.
df2["Predicted_Label"] = (
    pd.Series(pred_labels, index=parsed_rows).reindex(df2.index).astype("Int64")
)
df2["Predicted_Probability"] = pd.Series(pred_probs, index=parsed_rows).reindex(df2.index)
df2.to_csv("stroke/prediction_results.csv", index=False)