In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb

In [2]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs
# Morgan fingerprint settings; the cell that featurises newtest.csv reuses both values.
radius = 2
n_bits = 2048

# Training table; only its "SMILES" column is read in this cell.
df = pd.read_csv("anti_inflam.csv")
smiles_list = df["SMILES"].tolist()

morgan_data = []      # one row per parsed molecule: [smiles, bit_0, ..., bit_{n_bits - 1}]
invalid_smiles = []   # SMILES strings for which MolFromSmiles gave None

for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # A None result is treated as an unparsable SMILES: the row is recorded and
        # left out of morgan_df, so morgan_df may end up shorter than df.
        invalid_smiles.append(smiles)
        continue
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    bit_arr = [int(x) for x in fp]
    morgan_data.append([smiles] + bit_arr)

columns = ["SMILES"] + [f"bit_{i}" for i in range(n_bits)]
morgan_df = pd.DataFrame(morgan_data, columns=columns)

# Build the message from the path that is actually written; the previous message
# named "res_morgan_fingerprint.csv" while the file on disk is "...fingerprint1.csv".
morgan_csv_path = "res_morgan_fingerprint1.csv"
morgan_df.to_csv(morgan_csv_path, index=False)
print(f"Morgan saved in {morgan_csv_path}")

if invalid_smiles:
    print("problem：")
    for s in invalid_smiles:
        print(s)


Morgan saved in res_morgan_fingerprint.csv


In [3]:
# Featurise the compounds to be scored, using the same radius / n_bits as the training set.
df2 = pd.read_csv("newtest.csv")
smiles_list2 = df2["SMILES"].tolist()

morgan_data2 = []
valid_rows2 = []      # positions of the df2 rows that produced a fingerprint
invalid_smiles = []   # SMILES strings for which MolFromSmiles gave None

for row_pos, smiles in enumerate(smiles_list2):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        invalid_smiles.append(smiles)
        continue
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    bit_arr = [int(x) for x in fp]
    morgan_data2.append([smiles] + bit_arr)
    valid_rows2.append(row_pos)

columns = ["SMILES"] + [f"bit_{i}" for i in range(n_bits)]
morgan_df2 = pd.DataFrame(morgan_data2, columns=columns)

if invalid_smiles:
    # Skipped molecules used to vanish silently, leaving df2 longer than morgan_df2;
    # the prediction cells assign one prediction per df2 row and would then fail on
    # the length mismatch. Report the skipped SMILES and keep df2 row-aligned.
    print("problem：")
    for s in invalid_smiles:
        print(s)
    df2 = df2.iloc[valid_rows2].reset_index(drop=True)


In [4]:
# morgan_df only contains rows whose SMILES could be parsed, so a plain positional
# concat pairs every row after the first skipped molecule with the wrong fingerprint.
# Keep just the df rows that have a fingerprint, then check the pairing explicitly.
parsed_mask = df["SMILES"].isin(morgan_df["SMILES"])
df_parsed = df.loc[parsed_mask].reset_index(drop=True)
if list(df_parsed["SMILES"]) != list(morgan_df["SMILES"]):
    raise ValueError("df and morgan_df rows do not line up by SMILES")

# Same column layout as before: all df columns, then "SMILES" and the bit_* columns.
merged_df = pd.concat([df_parsed, morgan_df.reset_index(drop=True)], axis=1)

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from scipy.stats import randint, uniform
import warnings
import os
warnings.filterwarnings('ignore')


# Label for this section is taken by column position from merged_df.
# NOTE(review): columns 6+ are presumably the bit_* fingerprint columns - confirm
# against the layout of anti_inflam.csv.
label = merged_df.iloc[:, 2]
features = merged_df.iloc[:, 6:]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, stratify=label, random_state=42
)

# For each model: the base estimator, its hyper-parameter space, and whether the
# space is searched exhaustively ("grid") or by sampling ("random").
search_spaces = {
    "DecisionTree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [3, 5, 10],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 3, 5]
        },
        "method": "grid"
    },
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5],
            # "auto" was dropped: for classifiers it was an alias of "sqrt" and it has
            # been removed from scikit-learn, where it makes the candidate fit fail
            # (scored as NaN, unnoticed because warnings are filtered above).
            "max_features": ["sqrt"]
        },
        "method": "random"
    },
    "LogisticRegression": {
        # Seeded like the estimator in the evaluation cell so the search is repeatable.
        "model": LogisticRegression(solver='liblinear', random_state=42),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "penalty": ["l1", "l2"]
        },
        "method": "grid"
    },
    "XGBoost": {
        "model": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": randint(3, 10),
            "learning_rate": uniform(0.01, 0.2),
            "subsample": uniform(0.6, 0.4),
            "colsample_bytree": uniform(0.6, 0.4),
            "gamma": uniform(0, 0.5)
        },
        "method": "random"
    },
    "GradientBoosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5],
            "subsample": [0.6, 0.8, 1.0]
        },
        "method": "grid"
    }
}


results = []       # one summary row per model, written to CSV below
best_params = {}   # model name -> best hyper-parameters; read by the evaluation cell

for name, config in search_spaces.items():
    print("\n ...")
    if config["method"] == "grid":
        searcher = GridSearchCV(
            estimator=config["model"],
            param_grid=config["params"],
            scoring="roc_auc",
            cv=5,
            n_jobs=-1
        )
    else:
        searcher = RandomizedSearchCV(
            estimator=config["model"],
            param_distributions=config["params"],
            n_iter=30,
            scoring="roc_auc",
            cv=5,
            n_jobs=-1,
            random_state=42
        )

    searcher.fit(X_train, y_train)

    # Score the selected estimator on the held-out split.
    best_model = searcher.best_estimator_
    y_prob = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)

    results.append({
        "Model": name,
        "Best_Params": searcher.best_params_,
        "AUC": auc
    })

    best_params[name] = searcher.best_params_

    print(f"   best: {searcher.best_params_}")


results_df = pd.DataFrame(results)
save_dir = "stat3"
os.makedirs(save_dir, exist_ok=True)
# Derive the output path from save_dir so the folder name lives in one place.
results_df.to_csv(os.path.join(save_dir, "model_best_params_auc.csv"), index=False)

print("\n:\n")
import pprint
pprint.pprint(best_params)


 ...
   best: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}

 ...
   best: {'n_estimators': 200, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': None}

 ...
   best: {'C': 0.1, 'penalty': 'l2'}

 ...
   best: {'colsample_bytree': 0.8417669517111269, 'gamma': 0.26992054565083656, 'learning_rate': 0.050612244946953884, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.7159005811655073}

 ...
   best: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}

:

{'DecisionTree': {'max_depth': 10,
                  'min_samples_leaf': 1,
                  'min_samples_split': 2},
 'GradientBoosting': {'learning_rate': 0.1,
                      'max_depth': 3,
                      'n_estimators': 200,
                      'subsample': 1.0},
 'LogisticRegression': {'C': 0.1, 'penalty': 'l2'},
 'RandomForest': {'max_depth': None,
                  'max_features': 'sqrt',
                  'min_samples_split': 2,
                  'n_es

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns


label = merged_df.iloc[:, 2]
features = merged_df.iloc[:, 6:]

# Sorted label values, used as tick labels on the confusion-matrix heatmaps.
binary_classes = sorted(label.unique())

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, stratify=label, random_state=42)

# Fresh estimators configured with the hyper-parameters stored in best_params.
models = {
    'DecisionTree': DecisionTreeClassifier(random_state=42).set_params(**best_params["DecisionTree"]),
    'RandomForest': RandomForestClassifier(random_state=42).set_params(**best_params["RandomForest"]),
    'LogisticRegression': LogisticRegression(solver='liblinear', max_iter=200, random_state=42).set_params(**best_params["LogisticRegression"]),
    'XGBoost': xgb.XGBClassifier(objective='binary:logistic', random_state=42, use_label_encoder=False, eval_metric='logloss').set_params(**best_params["XGBoost"]),
    'GradientBoosting': GradientBoostingClassifier(random_state=42).set_params(**best_params["GradientBoosting"])
}

results = []

# Every artefact of this target goes into one folder. The confusion matrices used to
# be written to the working directory under a model-only file name, so each target's
# run overwrote the previous target's plots.
save_dir = "stat3"

# Explicit handles for the combined ROC plot: the curves no longer depend on which
# figure pyplot considers "current" after a confusion-matrix figure is closed.
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))


def evaluate_model(model, model_name):
    """Fit `model` on the training split and evaluate it on the held-out split.

    Uses the module-level X_train / y_train / X_test / y_test. Side effects:
    appends one metrics row to `results`, saves the confusion matrix as
    `<save_dir>/<model_name>_confusion_matrix.svg`, and draws the model's ROC
    curve on the shared `roc_ax`.

    Returns the ROC AUC computed from the predicted probability of the second class.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Metric defaults are used as-is; presumably the labels are 0/1 - TODO confirm.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "F1-Score": f1,
        "Precision": precision,
        "Recall": recall
    })

    # Confusion matrix on its own figure, closed once it has been saved.
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=binary_classes, yticklabels=binary_classes, ax=cm_ax)
    cm_ax.set_title(f'{model_name} 混淆矩阵')
    cm_ax.set_xlabel('预测标签')
    cm_ax.set_ylabel('真实标签')
    cm_fig.savefig(f'{save_dir}/{model_name}_confusion_matrix.svg', format='svg')
    plt.close(cm_fig)

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)
    roc_ax.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.2f})', lw=2)

    return auc

for model_name, model in models.items():
    auc = evaluate_model(model, model_name)
    print(f"{model_name}  AUC: {auc:.2f}")


# Diagonal reference line, then labels and legend for the combined ROC figure.
roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--', lw=1)
roc_ax.set_title('ROC曲线', fontsize=16)
roc_ax.set_xlabel('假阳性率', fontsize=12)
roc_ax.set_ylabel('真阳性率', fontsize=12)
roc_ax.legend(loc='lower right', fontsize=10)
roc_ax.grid(alpha=0.5)
roc_fig.savefig(f'{save_dir}/combined_roc_curves.svg', format='svg')
plt.close(roc_fig)


results_df = pd.DataFrame(results)
results_df.to_csv(f'{save_dir}/model_evaluation_results.csv', index=False)



DecisionTree  AUC: 0.78
RandomForest  AUC: 0.93
LogisticRegression  AUC: 0.95
XGBoost  AUC: 0.94
GradientBoosting  AUC: 0.93


In [7]:
# Inspect the fingerprint table built for the compounds read from newtest.csv.
morgan_df2

Unnamed: 0,SMILES,bit_0,bit_1,bit_2,bit_3,bit_4,bit_5,bit_6,bit_7,bit_8,...,bit_2038,bit_2039,bit_2040,bit_2041,bit_2042,bit_2043,bit_2044,bit_2045,bit_2046,bit_2047
0,CC1=CC(=CC(=C1)[C@@H]2CCCN2CC3=CC=C(C=C3)OC4=C...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,CCCCCC1=CC(=C(C(=C1)O)[C@@H]2C=C(CC[C@H]2C(=C)...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,C[C@@H]1CC[C@H](C2=C(CC[C@H]12)C)/C=C(\C)/C(=O)O,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,CC(=O)NC1=CC(=CC=C1)N2CCN(CC2)CCCCNS(=O)(=O)CC...,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,CC1(C2CCC1([C@H](C2)OC(=O)N[C@@](C)(CC3=CNC4=C...,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,C(CCN=C(N)N)CN,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
251,CN[C@H]1CC[C@H](C2=CC=CC=C12)C3=CC(=C(C=C3)Cl)Cl,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
252,C1(C(C(C(C(C1O)O)O)O)O)O,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
253,CC(CC1C2=CC=CC=C2CCC3=CC=CC=C13)CN(C)C.Cl,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Score the new compounds with the logistic-regression model fitted for this target.
new_features = morgan_df2.iloc[:, 1:]   # everything after the leading SMILES column
selected_model = models["LogisticRegression"]   # was called rf_model, which misnamed it
pred_labels = selected_model.predict(new_features)
pred_probs = selected_model.predict_proba(new_features)[:, 1]

# One prediction per fingerprint row; fail with a clear message if df2 has other rows.
if len(pred_labels) != len(df2):
    raise ValueError(
        f"{len(pred_labels)} predictions for {len(df2)} rows in df2: "
        "df2 and morgan_df2 are not row-aligned"
    )

# Build a new frame instead of adding columns to the shared df2, so this cell can be
# re-run (or skipped) without changing what the other prediction cells start from.
prediction_df = df2.assign(Predicted_Label=pred_labels, Predicted_Probability=pred_probs)
prediction_df.to_csv("stat3/prediction_results.csv", index=False)

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from scipy.stats import randint, uniform
import warnings
import os
warnings.filterwarnings('ignore')


# Label for this section is taken by column position from merged_df.
# NOTE(review): columns 6+ are presumably the bit_* fingerprint columns - confirm
# against the layout of anti_inflam.csv.
label = merged_df.iloc[:, 3]
features = merged_df.iloc[:, 6:]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, stratify=label, random_state=42
)

# For each model: the base estimator, its hyper-parameter space, and whether the
# space is searched exhaustively ("grid") or by sampling ("random").
search_spaces = {
    "DecisionTree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [3, 5, 10],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 3, 5]
        },
        "method": "grid"
    },
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5],
            # "auto" was dropped: for classifiers it was an alias of "sqrt" and it has
            # been removed from scikit-learn, where it makes the candidate fit fail
            # (scored as NaN, unnoticed because warnings are filtered above).
            "max_features": ["sqrt"]
        },
        "method": "random"
    },
    "LogisticRegression": {
        # Seeded like the estimator in the evaluation cell so the search is repeatable.
        "model": LogisticRegression(solver='liblinear', random_state=42),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "penalty": ["l1", "l2"]
        },
        "method": "grid"
    },
    "XGBoost": {
        "model": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": randint(3, 10),
            "learning_rate": uniform(0.01, 0.2),
            "subsample": uniform(0.6, 0.4),
            "colsample_bytree": uniform(0.6, 0.4),
            "gamma": uniform(0, 0.5)
        },
        "method": "random"
    },
    "GradientBoosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5],
            "subsample": [0.6, 0.8, 1.0]
        },
        "method": "grid"
    }
}


results = []       # one summary row per model, written to CSV below
best_params = {}   # model name -> best hyper-parameters; read by the evaluation cell

for name, config in search_spaces.items():
    print("\n ...")
    if config["method"] == "grid":
        searcher = GridSearchCV(
            estimator=config["model"],
            param_grid=config["params"],
            scoring="roc_auc",
            cv=5,
            n_jobs=-1
        )
    else:
        searcher = RandomizedSearchCV(
            estimator=config["model"],
            param_distributions=config["params"],
            n_iter=30,
            scoring="roc_auc",
            cv=5,
            n_jobs=-1,
            random_state=42
        )

    searcher.fit(X_train, y_train)

    # Score the selected estimator on the held-out split.
    best_model = searcher.best_estimator_
    y_prob = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)

    results.append({
        "Model": name,
        "Best_Params": searcher.best_params_,
        "AUC": auc
    })

    best_params[name] = searcher.best_params_

    print(f"   best: {searcher.best_params_}")


results_df = pd.DataFrame(results)
save_dir = "mtor"
os.makedirs(save_dir, exist_ok=True)
# Derive the output path from save_dir so the folder name lives in one place.
results_df.to_csv(os.path.join(save_dir, "model_best_params_auc.csv"), index=False)

print("\n:\n")
import pprint
pprint.pprint(best_params)


 ...
   best: {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2}

 ...
   best: {'n_estimators': 200, 'min_samples_split': 2, 'max_features': 'sqrt', 'max_depth': None}

 ...
   best: {'C': 0.1, 'penalty': 'l2'}

 ...
   best: {'colsample_bytree': 0.8123738333268545, 'gamma': 0.22389158228654582, 'learning_rate': 0.12057861781426558, 'max_depth': 9, 'n_estimators': 200, 'subsample': 0.9046478461314871}

 ...
   best: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}

:

{'DecisionTree': {'max_depth': 10,
                  'min_samples_leaf': 5,
                  'min_samples_split': 2},
 'GradientBoosting': {'learning_rate': 0.1,
                      'max_depth': 5,
                      'n_estimators': 200,
                      'subsample': 1.0},
 'LogisticRegression': {'C': 0.1, 'penalty': 'l2'},
 'RandomForest': {'max_depth': None,
                  'max_features': 'sqrt',
                  'min_samples_split': 2,
                  'n_est

In [10]:
label = merged_df.iloc[:, 3]
features = merged_df.iloc[:, 6:]

# Sorted label values, used as tick labels on the confusion-matrix heatmaps.
binary_classes = sorted(label.unique())

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, stratify=label, random_state=42)

# Fresh estimators configured with the hyper-parameters stored in best_params.
models = {
    'DecisionTree': DecisionTreeClassifier(random_state=42).set_params(**best_params["DecisionTree"]),
    'RandomForest': RandomForestClassifier(random_state=42).set_params(**best_params["RandomForest"]),
    'LogisticRegression': LogisticRegression(solver='liblinear', max_iter=200, random_state=42).set_params(**best_params["LogisticRegression"]),
    'XGBoost': xgb.XGBClassifier(objective='binary:logistic', random_state=42, use_label_encoder=False, eval_metric='logloss').set_params(**best_params["XGBoost"]),
    'GradientBoosting': GradientBoostingClassifier(random_state=42).set_params(**best_params["GradientBoosting"])
}

results = []

# Every artefact of this target goes into one folder. The confusion matrices used to
# be written to the working directory under a model-only file name, so each target's
# run overwrote the previous target's plots.
save_dir = "mtor"

# Explicit handles for the combined ROC plot: the curves no longer depend on which
# figure pyplot considers "current" after a confusion-matrix figure is closed.
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))


def evaluate_model(model, model_name):
    """Fit `model` on the training split and evaluate it on the held-out split.

    Uses the module-level X_train / y_train / X_test / y_test. Side effects:
    appends one metrics row to `results`, saves the confusion matrix as
    `<save_dir>/<model_name>_confusion_matrix.svg`, and draws the model's ROC
    curve on the shared `roc_ax`.

    Returns the ROC AUC computed from the predicted probability of the second class.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Metric defaults are used as-is; presumably the labels are 0/1 - TODO confirm.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "F1-Score": f1,
        "Precision": precision,
        "Recall": recall
    })

    # Confusion matrix on its own figure, closed once it has been saved.
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=binary_classes, yticklabels=binary_classes, ax=cm_ax)
    cm_ax.set_title(f'{model_name} 混淆矩阵')
    cm_ax.set_xlabel('预测标签')
    cm_ax.set_ylabel('真实标签')
    cm_fig.savefig(f'{save_dir}/{model_name}_confusion_matrix.svg', format='svg')
    plt.close(cm_fig)

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)
    roc_ax.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.2f})', lw=2)
    return auc

for model_name, model in models.items():
    auc = evaluate_model(model, model_name)
    print(f"{model_name}  AUC: {auc:.2f}")

# Diagonal reference line, then labels and legend for the combined ROC figure.
roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--', lw=1)
roc_ax.set_title('ROC曲线', fontsize=16)
roc_ax.set_xlabel('假阳性率', fontsize=12)
roc_ax.set_ylabel('真阳性率', fontsize=12)
roc_ax.legend(loc='lower right', fontsize=10)
roc_ax.grid(alpha=0.5)
roc_fig.savefig(f'{save_dir}/combined_roc_curves.svg', format='svg')
plt.close(roc_fig)

results_df = pd.DataFrame(results)
results_df.to_csv(f'{save_dir}/model_evaluation_results.csv', index=False)



DecisionTree  AUC: 0.77
RandomForest  AUC: 0.89
LogisticRegression  AUC: 0.87
XGBoost  AUC: 0.89
GradientBoosting  AUC: 0.88


In [None]:
# Score the new compounds with the XGBoost model fitted for this target.
new_features = morgan_df2.iloc[:, 1:]   # everything after the leading SMILES column
selected_model = models["XGBoost"]      # was called rf_model, which misnamed it
pred_labels = selected_model.predict(new_features)
pred_probs = selected_model.predict_proba(new_features)[:, 1]

# One prediction per fingerprint row; fail with a clear message if df2 has other rows.
if len(pred_labels) != len(df2):
    raise ValueError(
        f"{len(pred_labels)} predictions for {len(df2)} rows in df2: "
        "df2 and morgan_df2 are not row-aligned"
    )

# Build a new frame instead of adding columns to the shared df2, so this cell can be
# re-run (or skipped) without changing what the other prediction cells start from.
prediction_df = df2.assign(Predicted_Label=pred_labels, Predicted_Probability=pred_probs)
prediction_df.to_csv("mtor/prediction_results.csv", index=False)

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import roc_auc_score
from scipy.stats import randint, uniform
import warnings
import os
warnings.filterwarnings('ignore')


# Label for this section is taken by column position from merged_df.
# NOTE(review): columns 6+ are presumably the bit_* fingerprint columns - confirm
# against the layout of anti_inflam.csv.
label = merged_df.iloc[:, 4]
features = merged_df.iloc[:, 6:]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, stratify=label, random_state=42
)

# For each model: the base estimator, its hyper-parameter space, and whether the
# space is searched exhaustively ("grid") or by sampling ("random").
search_spaces = {
    "DecisionTree": {
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "max_depth": [3, 5, 10],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 3, 5]
        },
        "method": "grid"
    },
    "RandomForest": {
        "model": RandomForestClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5],
            # "auto" was dropped: for classifiers it was an alias of "sqrt" and it has
            # been removed from scikit-learn, where it makes the candidate fit fail
            # (scored as NaN, unnoticed because warnings are filtered above).
            "max_features": ["sqrt"]
        },
        "method": "random"
    },
    "LogisticRegression": {
        # Seeded like the estimator in the evaluation cell so the search is repeatable.
        "model": LogisticRegression(solver='liblinear', random_state=42),
        "params": {
            "C": [0.01, 0.1, 1, 10],
            "penalty": ["l1", "l2"]
        },
        "method": "grid"
    },
    "XGBoost": {
        "model": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        "params": {
            "n_estimators": [50, 100, 200],
            "max_depth": randint(3, 10),
            "learning_rate": uniform(0.01, 0.2),
            "subsample": uniform(0.6, 0.4),
            "colsample_bytree": uniform(0.6, 0.4),
            "gamma": uniform(0, 0.5)
        },
        "method": "random"
    },
    "GradientBoosting": {
        "model": GradientBoostingClassifier(random_state=42),
        "params": {
            "n_estimators": [100, 200],
            "learning_rate": [0.01, 0.1, 0.2],
            "max_depth": [3, 5],
            "subsample": [0.6, 0.8, 1.0]
        },
        "method": "grid"
    }
}


results = []       # one summary row per model, written to CSV below
best_params = {}   # model name -> best hyper-parameters; read by the evaluation cell

for name, config in search_spaces.items():
    print("\n ...")
    if config["method"] == "grid":
        searcher = GridSearchCV(
            estimator=config["model"],
            param_grid=config["params"],
            scoring="roc_auc",
            cv=5,
            n_jobs=-1
        )
    else:
        searcher = RandomizedSearchCV(
            estimator=config["model"],
            param_distributions=config["params"],
            n_iter=30,
            scoring="roc_auc",
            cv=5,
            n_jobs=-1,
            random_state=42
        )

    searcher.fit(X_train, y_train)

    # Score the selected estimator on the held-out split.
    best_model = searcher.best_estimator_
    y_prob = best_model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_prob)

    results.append({
        "Model": name,
        "Best_Params": searcher.best_params_,
        "AUC": auc
    })

    best_params[name] = searcher.best_params_

    print(f"   best: {searcher.best_params_}")


results_df = pd.DataFrame(results)
save_dir = "nfkb"
os.makedirs(save_dir, exist_ok=True)
# Derive the output path from save_dir so the folder name lives in one place.
results_df.to_csv(os.path.join(save_dir, "model_best_params_auc.csv"), index=False)

print("\n:\n")
import pprint
pprint.pprint(best_params)


 ...
   best: {'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2}

 ...
   best: {'n_estimators': 200, 'min_samples_split': 5, 'max_features': 'sqrt', 'max_depth': None}

 ...
   best: {'C': 0.1, 'penalty': 'l2'}

 ...
   best: {'colsample_bytree': 0.7243929286862649, 'gamma': 0.16259166101337352, 'learning_rate': 0.15592123566761282, 'max_depth': 8, 'n_estimators': 200, 'subsample': 0.8244973703390804}

 ...
   best: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}

:

{'DecisionTree': {'max_depth': 10,
                  'min_samples_leaf': 5,
                  'min_samples_split': 2},
 'GradientBoosting': {'learning_rate': 0.1,
                      'max_depth': 5,
                      'n_estimators': 200,
                      'subsample': 1.0},
 'LogisticRegression': {'C': 0.1, 'penalty': 'l2'},
 'RandomForest': {'max_depth': None,
                  'max_features': 'sqrt',
                  'min_samples_split': 5,
                  'n_est

In [13]:
label = merged_df.iloc[:, 4]
features = merged_df.iloc[:, 6:]

# Sorted label values, used as tick labels on the confusion-matrix heatmaps.
binary_classes = sorted(label.unique())

X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2, stratify=label, random_state=42)

# Fresh estimators configured with the hyper-parameters stored in best_params.
models = {
    'DecisionTree': DecisionTreeClassifier(random_state=42).set_params(**best_params["DecisionTree"]),
    'RandomForest': RandomForestClassifier(random_state=42).set_params(**best_params["RandomForest"]),
    'LogisticRegression': LogisticRegression(solver='liblinear', max_iter=200, random_state=42).set_params(**best_params["LogisticRegression"]),
    'XGBoost': xgb.XGBClassifier(objective='binary:logistic', random_state=42, use_label_encoder=False, eval_metric='logloss').set_params(**best_params["XGBoost"]),
    'GradientBoosting': GradientBoostingClassifier(random_state=42).set_params(**best_params["GradientBoosting"])
}

results = []

# Every artefact of this target goes into one folder. The confusion matrices used to
# be written to the working directory under a model-only file name, so each target's
# run overwrote the previous target's plots.
save_dir = "nfkb"

# Explicit handles for the combined ROC plot: the curves no longer depend on which
# figure pyplot considers "current" after a confusion-matrix figure is closed.
roc_fig, roc_ax = plt.subplots(figsize=(10, 8))


def evaluate_model(model, model_name):
    """Fit `model` on the training split and evaluate it on the held-out split.

    Uses the module-level X_train / y_train / X_test / y_test. Side effects:
    appends one metrics row to `results`, saves the confusion matrix as
    `<save_dir>/<model_name>_confusion_matrix.svg`, and draws the model's ROC
    curve on the shared `roc_ax`.

    Returns the ROC AUC computed from the predicted probability of the second class.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Metric defaults are used as-is; presumably the labels are 0/1 - TODO confirm.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)

    results.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "F1-Score": f1,
        "Precision": precision,
        "Recall": recall
    })

    # Confusion matrix on its own figure, closed once it has been saved.
    cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=binary_classes, yticklabels=binary_classes, ax=cm_ax)
    cm_ax.set_title(f'{model_name} 混淆矩阵')
    cm_ax.set_xlabel('预测标签')
    cm_ax.set_ylabel('真实标签')
    cm_fig.savefig(f'{save_dir}/{model_name}_confusion_matrix.svg', format='svg')
    plt.close(cm_fig)

    fpr, tpr, _ = roc_curve(y_test, y_prob)
    auc = roc_auc_score(y_test, y_prob)
    roc_ax.plot(fpr, tpr, label=f'{model_name} (AUC = {auc:.2f})', lw=2)

    return auc


for model_name, model in models.items():
    auc = evaluate_model(model, model_name)
    print(f"{model_name}  AUC: {auc:.2f}")


# Diagonal reference line, then labels and legend for the combined ROC figure.
roc_ax.plot([0, 1], [0, 1], color='grey', linestyle='--', lw=1)
roc_ax.set_title('ROC曲线', fontsize=16)
roc_ax.set_xlabel('假阳性率', fontsize=12)
roc_ax.set_ylabel('真阳性率', fontsize=12)
roc_ax.legend(loc='lower right', fontsize=10)
roc_ax.grid(alpha=0.5)
roc_fig.savefig(f'{save_dir}/combined_roc_curves.svg', format='svg')
plt.close(roc_fig)


results_df = pd.DataFrame(results)
results_df.to_csv(f'{save_dir}/model_evaluation_results.csv', index=False)



DecisionTree  AUC: 0.82
RandomForest  AUC: 0.93
LogisticRegression  AUC: 0.90
XGBoost  AUC: 0.92
GradientBoosting  AUC: 0.92


In [None]:
# Score the new compounds with the random-forest model fitted for this target.
new_features = morgan_df2.iloc[:, 1:]   # everything after the leading SMILES column
selected_model = models["RandomForest"]
pred_labels = selected_model.predict(new_features)
pred_probs = selected_model.predict_proba(new_features)[:, 1]

# One prediction per fingerprint row; fail with a clear message if df2 has other rows.
if len(pred_labels) != len(df2):
    raise ValueError(
        f"{len(pred_labels)} predictions for {len(df2)} rows in df2: "
        "df2 and morgan_df2 are not row-aligned"
    )

# Build a new frame instead of adding columns to the shared df2, so this cell can be
# re-run (or skipped) without changing what the other prediction cells start from.
prediction_df = df2.assign(Predicted_Label=pred_labels, Predicted_Probability=pred_probs)
prediction_df.to_csv("nfkb/prediction_results.csv", index=False)