In [1]:
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np
import pandas as pd
df1 = pd.read_csv(r"D:\desktop\B\数据\标准化后数据\df1.csv")
df2 = pd.read_excel(r"D:\desktop\B\数据\标准化后数据\df2.xlsx")
df1[['mort', 'CONS','MV','CRRT']] = df1[['mort',  'CONS','MV','CRRT']].astype('category')
df2[['mort',  'CONS','MV','CRRT']] = df2[['mort',  'CONS','MV','CRRT']].astype('category')
selected_features = ["CONS", "LDH", "MV", "AST", "CRRT", "U", "L", "HR", "D.Dimer", "CR", "age", "ALT"]
target = 'mort'
X = df1[selected_features]
y = df1[target]
X_train, X_internal_test, y_train, y_internal_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_external_test = df2[selected_features]
y_external_test = df2[target]

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.utils import resample
import numpy as np

RANDOM_SEED = 42

# 定义计算置信区间的函数
def bootstrap_metric_ci(y_true, y_pred, metric_func, n_bootstraps=10000, ci=95, random_state=42):
    rng = np.random.RandomState(random_state)
    bootstrapped_scores = []
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    for _ in range(n_bootstraps):
        indices = resample(np.arange(len(y_true)), replace=True, random_state=rng)
        score = metric_func(y_true[indices], y_pred[indices])
        bootstrapped_scores.append(score)
    
    lower_bound = np.percentile(bootstrapped_scores, (100-ci)/2)
    upper_bound = np.percentile(bootstrapped_scores, 100-(100-ci)/2)
    return lower_bound, upper_bound

# 设置逻辑回归模型的超参数
params_lr = {
    'C': [0.1, 0.5, 1.0, 2.0, 5.0], 
    'solver': ['lbfgs', 'liblinear'], 
    'max_iter': [100, 200, 300]  
}

best_models_lr = {}

# 逐步添加特征并训练模型
for i in range(1, len(selected_features) + 1):
    current_features = selected_features[:i]
    X_train_current = X_train[current_features]
    X_internal_test_current = X_internal_test[current_features]
    X_external_test_current = X_external_test[current_features]
    
    # 创建逻辑回归模型并配置GridSearchCV
    model_lr = LogisticRegression(random_state=RANDOM_SEED)
    grid_lr = GridSearchCV(model_lr, params_lr, cv=10, scoring='roc_auc', n_jobs=-1)
    grid_lr.fit(X_train_current, y_train)

    best_model_lr = grid_lr.best_estimator_

    # 使用交叉验证在训练集和内部验证集上进行预测
    y_train_pred_cv = cross_val_predict(best_model_lr, X_train_current, y_train, cv=10, method='predict')
    y_train_proba_cv = cross_val_predict(best_model_lr, X_train_current, y_train, cv=10, method='predict_proba')[:, 1]
    
    y_internal_pred_cv = cross_val_predict(best_model_lr, X_internal_test_current, y_internal_test, cv=10, method='predict')
    y_internal_proba_cv = cross_val_predict(best_model_lr, X_internal_test_current, y_internal_test, cv=10, method='predict_proba')[:, 1]

    # 进行外部测试集预测
    y_external_pred_lr = best_model_lr.predict(X_external_test_current)
    y_external_proba_lr = best_model_lr.predict_proba(X_external_test_current)[:, 1]

    # 计算训练集的各项指标及其置信区间
    train_metrics_lr = {
        'accuracy': accuracy_score(y_train, y_train_pred_cv),
        'precision': precision_score(y_train, y_train_pred_cv, average='weighted', zero_division=0),
        'recall': recall_score(y_train, y_train_pred_cv, average='weighted'),
        'f1_score': f1_score(y_train, y_train_pred_cv, average='weighted'),
        'auc': roc_auc_score(y_train, y_train_proba_cv)
    }

    train_ci_lr = {
        'accuracy_ci': bootstrap_metric_ci(y_train, y_train_pred_cv, accuracy_score),
        'precision_ci': bootstrap_metric_ci(y_train, y_train_pred_cv, lambda y_true, y_pred: precision_score(y_true, y_pred, average='weighted', zero_division=0)),
        'recall_ci': bootstrap_metric_ci(y_train, y_train_pred_cv, lambda y_true, y_pred: recall_score(y_true, y_pred, average='weighted')),
        'f1_score_ci': bootstrap_metric_ci(y_train, y_train_pred_cv, lambda y_true, y_pred: f1_score(y_true, y_pred, average='weighted')),
        'auc_ci': bootstrap_metric_ci(y_train, y_train_proba_cv, roc_auc_score)
    }

    # 计算内部验证集的各项指标及其置信区间
    internal_metrics_lr = {
        'accuracy': accuracy_score(y_internal_test, y_internal_pred_cv),
        'precision': precision_score(y_internal_test, y_internal_pred_cv, average='weighted', zero_division=0),
        'recall': recall_score(y_internal_test, y_internal_pred_cv, average='weighted'),
        'f1_score': f1_score(y_internal_test, y_internal_pred_cv, average='weighted'),
        'auc': roc_auc_score(y_internal_test, y_internal_proba_cv)
    }

    internal_ci_lr = {
        'accuracy_ci': bootstrap_metric_ci(y_internal_test, y_internal_pred_cv, accuracy_score),
        'precision_ci': bootstrap_metric_ci(y_internal_test, y_internal_pred_cv, lambda y_true, y_pred: precision_score(y_true, y_pred, average='weighted', zero_division=0)),
        'recall_ci': bootstrap_metric_ci(y_internal_test, y_internal_pred_cv, lambda y_true, y_pred: recall_score(y_true, y_pred, average='weighted')),
        'f1_score_ci': bootstrap_metric_ci(y_internal_test, y_internal_pred_cv, lambda y_true, y_pred: f1_score(y_true, y_pred, average='weighted')),
        'auc_ci': bootstrap_metric_ci(y_internal_test, y_internal_proba_cv, roc_auc_score)
    }

    # 计算外部验证集的各项指标及其置信区间
    external_metrics_lr = {
        'accuracy': accuracy_score(y_external_test, y_external_pred_lr),
        'precision': precision_score(y_external_test, y_external_pred_lr, average='weighted', zero_division=0),
        'recall': recall_score(y_external_test, y_external_pred_lr, average='weighted'),
        'f1_score': f1_score(y_external_test, y_external_pred_lr, average='weighted'),
        'auc': roc_auc_score(y_external_test, y_external_proba_lr)
    }

    external_ci_lr = {
        'accuracy_ci': bootstrap_metric_ci(y_external_test, y_external_pred_lr, accuracy_score),
        'precision_ci': bootstrap_metric_ci(y_external_test, y_external_pred_lr, lambda y_true, y_pred: precision_score(y_true, y_pred, average='weighted', zero_division=0)),
        'recall_ci': bootstrap_metric_ci(y_external_test, y_external_pred_lr, lambda y_true, y_pred: recall_score(y_true, y_pred, average='weighted')),
        'f1_score_ci': bootstrap_metric_ci(y_external_test, y_external_pred_lr, lambda y_true, y_pred: f1_score(y_true, y_pred, average='weighted')),
        'auc_ci': bootstrap_metric_ci(y_external_test, y_external_proba_lr, roc_auc_score)
    }

    # 保存结果
    best_models_lr[i] = {
        'features': current_features,
        'model': best_model_lr,
        'train_metrics': train_metrics_lr,
        'train_ci': train_ci_lr,
        'internal_metrics': internal_metrics_lr,
        'internal_ci': internal_ci_lr,
        'external_metrics': external_metrics_lr,
        'external_ci': external_ci_lr,
        'best_params': grid_lr.best_params_
    }

# 输出结果
for i, results in best_models_lr.items():
    print(f"\nModel with top {i} features: {results['features']}")
    print("Best Parameters:", results['best_params'])
    
    print("Training Set Metrics:")
    for metric, value in results['train_metrics'].items():
        print(f"{metric.capitalize()}: {value:.4f} 置信区间: {results['train_ci'][metric+'_ci'][0]:.4f}-{results['train_ci'][metric+'_ci'][1]:.4f}")

    print("Internal Validation Metrics:")
    for metric, value in results['internal_metrics'].items():
        print(f"{metric.capitalize()}: {value:.4f} 置信区间: {results['internal_ci'][metric+'_ci'][0]:.4f}-{results['internal_ci'][metric+'_ci'][1]:.4f}")
    
    print("External Validation Metrics:")
    for metric, value in results['external_metrics'].items():
        print(f"{metric.capitalize()}: {value:.4f} 置信区间: {results['external_ci'][metric+'_ci'][0]:.4f}-{results['external_ci'][metric+'_ci'][1]:.4f}")



Model with top 1 features: ['CONS']
Best Parameters: {'C': 0.1, 'max_iter': 100, 'solver': 'lbfgs'}
Training Set Metrics:
Accuracy: 0.7647 置信区间: 0.7059-0.8235
Precision: 0.5848 置信区间: 0.4983-0.6782
Recall: 0.7647 置信区间: 0.7059-0.8235
F1_score: 0.6627 置信区间: 0.5842-0.7438
Auc: 0.7939 置信区间: 0.7064-0.8738
Internal Validation Metrics:
Accuracy: 0.7273 置信区间: 0.6364-0.8182
Precision: 0.5289 置信区间: 0.4050-0.6694
Recall: 0.7273 置信区间: 0.6364-0.8182
F1_score: 0.6124 置信区间: 0.4949-0.7364
Auc: 0.7077 置信区间: 0.5747-0.8272
External Validation Metrics:
Accuracy: 0.7500 置信区间: 0.6635-0.8269
Precision: 0.5625 置信区间: 0.4402-0.6838
Recall: 0.7500 置信区间: 0.6635-0.8269
F1_score: 0.6429 置信区间: 0.5292-0.7486
Auc: 0.7628 置信区间: 0.6643-0.8610

Model with top 2 features: ['CONS', 'LDH']
Best Parameters: {'C': 0.5, 'max_iter': 100, 'solver': 'liblinear'}
Training Set Metrics:
Accuracy: 0.8627 置信区间: 0.8137-0.9069
Precision: 0.8594 置信区间: 0.8081-0.9080
Recall: 0.8627 置信区间: 0.8137-0.9069
F1_score: 0.8606 置信区间: 0.8090-0.9065
A

In [3]:
import pandas as pd
import os

# 设置输出文件夹路径
output_folder = r"D:\desktop\B\结果\表格\内部外部验证结果"
os.makedirs(output_folder, exist_ok=True)  # 确保输出文件夹存在

# 用于存储结果的列表
output_data = []

# 遍历模型结果并将其组织为字典
for i, results in best_models_lr.items():
    model_name = f"Model with top {i} features"
    for metric in results['train_metrics'].keys():
        output_data.append({
            'Model': model_name,
            'Dataset': 'Training Set',
            'Metric': metric.capitalize(),
            'Value': results['train_metrics'][metric],
            'Confidence Interval': f"{results['train_ci'][metric+'_ci'][0]:.4f}-{results['train_ci'][metric+'_ci'][1]:.4f}"
        })
        output_data.append({
            'Model': model_name,
            'Dataset': 'Internal Validation Set',
            'Metric': metric.capitalize(),
            'Value': results['internal_metrics'][metric],
            'Confidence Interval': f"{results['internal_ci'][metric+'_ci'][0]:.4f}-{results['internal_ci'][metric+'_ci'][1]:.4f}"
        })
        output_data.append({
            'Model': model_name,
            'Dataset': 'External Validation Set',
            'Metric': metric.capitalize(),
            'Value': results['external_metrics'][metric],
            'Confidence Interval': f"{results['external_ci'][metric+'_ci'][0]:.4f}-{results['external_ci'][metric+'_ci'][1]:.4f}"
        })

# 将列表转换为 DataFrame
output_df = pd.DataFrame(output_data)

# 设置文件名
output_file = os.path.join(output_folder, "LR_Results.xlsx")

# 将 DataFrame 保存为 Excel 文件
output_df.to_excel(output_file, index=False)

print(f"结果已保存至：{output_file}")


结果已保存至：D:\desktop\B\结果\表格\内部外部验证结果\LR_Results.xlsx
