今天的内容是进行多种机器学习模型的建立与分析，数据集依然是之前的heart.csv（已编码好）

In [54]:
import pandas as  pd
import matplotlib.pyplot as plt

# Plot configuration: use the SimHei font for Chinese labels and keep the
# minus sign on an ASCII glyph so negative tick labels still render.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Mapping from the dataset's English column names to Chinese display names.
medical_feature_map = {
    "age": "年龄",
    "sex": "性别",
    "cp": "胸痛类型",
    "trestbps": "静息血压",  # unit: mm Hg
    "chol": "血清胆固醇浓度",  # unit: mg/dl
    "fbs": "空腹血糖",  # unit: mg/dl; usually thresholded at 120 mg/dl, 1 = above 120 mg/dl
    "restecg": "静息心电图结果",
    "thalach": "最大心率",
    "exang": "运动诱发心绞痛",
    "oldpeak": "运动相对静息的ST段压低",  # unit: mV (per the original note)
    "slope": "ST段峰值斜率",
    "ca": "荧光检查显示的主要血管数量",  # 0-3 vessels
    "thal": "地中海贫血症状态",
    "target": "患病目标变量",  # usually 1 = disease, 0 = healthy
}

# Load the (already encoded) dataset and switch to the Chinese column names
# in a single step, then preview the first rows.
data = pd.read_csv('heart.csv').rename(columns=medical_feature_map)
data.head()

Unnamed: 0,年龄,性别,胸痛类型,静息血压,血清胆固醇浓度,空腹血糖,静息心电图结果,最大心率,运动诱发心绞痛,运动相对静息的ST段压低,ST段峰值斜率,荧光检查显示的主要血管数量,地中海贫血症状态,患病目标变量
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


可视化部分见Day 9

这里不对数据进行异常值处理：保留真实存在的极端样本，让模型在训练时接触到完整的数据分布，以增强模型的鲁棒性。

此时还不能对连续变量进行归一化/标准化，这是为了避免训练集与测试集之间的数据泄露问题。一旦在划分数据集之前对全集应用此类预处理，训练过程就间接利用了测试集的**统计信息（如最大值/最小值、均值和标准差）**，这会导致对模型在未知数据上的性能做出过于乐观的估计。

---
# 机器学习建模
## 数据划分

In [55]:
# Display the DataFrame's current column labels.
data.columns

Index(['年龄', '性别', '胸痛类型', '静息血压', '血清胆固醇浓度', '空腹血糖', '静息心电图结果', '最大心率',
       '运动诱发心绞痛', '运动相对静息的ST段压低', 'ST段峰值斜率', '荧光检查显示的主要血管数量', '地中海贫血症状态',
       '患病目标变量'],
      dtype='object')

In [56]:
from sklearn.model_selection import train_test_split
# Split off the label column, keep every other column as a predictor, and
# hold out 20% of the rows as a test set (fixed seed for reproducibility).
y = data['患病目标变量']
X = data.drop(columns=['患病目标变量'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f'训练集形状：{X_train.shape}, 测试集形状：{X_test.shape}')

训练集形状：(242, 13), 测试集形状：(61, 13)


## 数据归一化
接下来对连续特征进行归一化。注意：缩放参数（最大值、最小值）只从训练集学习，因此测试集中个别取值可能略微超出 [0, 1] 范围（例如下方输出中的 -0.011547），这是正常现象。

In [57]:
from sklearn.preprocessing import MinMaxScaler,StandardScaler

# Columns holding continuous measurements; only these are rescaled.
continuous_features = ['年龄', '静息血压', '血清胆固醇浓度', '最大心率', '运动相对静息的ST段压低']

# The frames returned by train_test_split can be flagged by pandas as derived
# from `data`, so assigning columns into them may raise SettingWithCopyWarning
# on some pandas versions. Taking explicit copies makes both splits
# independent objects that are safe to modify in place.
X_train = X_train.copy()
X_test = X_test.copy()

min_max_scaler = MinMaxScaler()

# Fit on the training split only (learns each column's min and max), then
# transform it. scikit-learn returns a NumPy array, so the result is written
# back into the DataFrame columns.
X_train[continuous_features] = min_max_scaler.fit_transform(X_train[continuous_features])

# Reuse the parameters learned from the training split for the test split.
# Never call fit_transform() on the test data: that would leak test-set
# statistics into preprocessing.
X_test[continuous_features] = min_max_scaler.transform(X_test[continuous_features])
X_test

Unnamed: 0,年龄,性别,胸痛类型,静息血压,血清胆固醇浓度,空腹血糖,静息心电图结果,最大心率,运动诱发心绞痛,运动相对静息的ST段压低,ST段峰值斜率,荧光检查显示的主要血管数量,地中海贫血症状态
179,0.583333,1,0,0.571429,0.334873,0,0,0.210526,1,0.107143,1,1,1
228,0.625000,1,3,0.775510,0.362587,0,0,0.622807,0,0.035714,1,0,3
111,0.583333,1,2,0.571429,-0.011547,1,1,0.745614,0,0.035714,2,1,3
246,0.562500,0,0,0.408163,0.642032,0,0,0.543860,1,0.339286,1,2,3
60,0.875000,0,2,0.163265,0.309469,1,0,0.368421,0,0.000000,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
249,0.833333,1,2,0.469388,0.284065,0,0,0.508772,0,0.357143,1,3,3
104,0.437500,1,2,0.357143,0.150115,0,1,0.657895,0,0.000000,2,0,2
300,0.812500,1,0,0.510204,0.143187,1,1,0.464912,0,0.607143,1,2,3
193,0.645833,1,0,0.520408,0.348730,0,0,0.473684,1,0.500000,1,2,3


## 模型训练与评估

In [58]:
from sklearn.svm import SVC #支持向量机分类器
from sklearn.neighbors import KNeighborsClassifier #K近邻分类器
from sklearn.linear_model import LogisticRegression #逻辑回归分类器
import xgboost as xgb #XGBoost分类器
import lightgbm as lgb #LightGBM分类器
from sklearn.ensemble import RandomForestClassifier #随机森林分类器
from sklearn.tree import DecisionTreeClassifier #决策树分类器
from sklearn.naive_bayes import GaussianNB #高斯朴素贝叶斯分类器
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 用于评估分类器性能的指标
from sklearn.metrics import classification_report, confusion_matrix #用于生成分类报告和混淆矩阵
import warnings
# Silence all Python warnings from here on to keep the cell outputs short.
warnings.filterwarnings('ignore')

In [59]:
# Support vector machine: train on the training split, predict the test split.
svm_model = SVC(random_state=42)
svm_pred = svm_model.fit(X_train, y_train).predict(X_test)

# Per-class report, then the raw confusion matrix.
print('SVM分类报告：', classification_report(y_test, svm_pred), sep='\n')
print('SVM混淆矩阵：', confusion_matrix(y_test, svm_pred), sep='\n')

# Headline scores; precision, recall and F1 are called with their defaults,
# which for binary labels score the positive class.
svm_accuracy, svm_precision, svm_recall, svm_f1 = (
    metric(y_test, svm_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(
    "SVM 模型评估指标：",
    f"准确率: {svm_accuracy:.4f}",
    f"精确率: {svm_precision:.4f}",
    f"召回率: {svm_recall:.4f}",
    f"F1 值: {svm_f1:.4f}",
    sep="\n",
)

SVM分类报告：
              precision    recall  f1-score   support

           0       0.89      0.83      0.86        29
           1       0.85      0.91      0.88        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

SVM混淆矩阵：
[[24  5]
 [ 3 29]]
SVM 模型评估指标：
准确率: 0.8689
精确率: 0.8529
召回率: 0.9062
F1 值: 0.8788


In [60]:
# K-nearest neighbours with default settings: train, then predict the test split.
knn_model = KNeighborsClassifier()
knn_pred = knn_model.fit(X_train, y_train).predict(X_test)

# Per-class report, then the raw confusion matrix.
print("\nKNN分类报告：", classification_report(y_test, knn_pred), sep="\n")
print("KNN混淆矩阵：", confusion_matrix(y_test, knn_pred), sep="\n")

# Headline scores, computed in the order accuracy, precision, recall, F1.
knn_accuracy, knn_precision, knn_recall, knn_f1 = (
    metric(y_test, knn_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(
    "KNN模型评估指标：",
    f"准确率: {knn_accuracy:.4f}",
    f"精确率: {knn_precision:.4f}",
    f"召回率: {knn_recall:.4f}",
    f"F1 值: {knn_f1:.4f}",
    sep="\n",
)


KNN分类报告：
              precision    recall  f1-score   support

           0       0.78      0.86      0.82        29
           1       0.86      0.78      0.82        32

    accuracy                           0.82        61
   macro avg       0.82      0.82      0.82        61
weighted avg       0.82      0.82      0.82        61

KNN混淆矩阵：
[[25  4]
 [ 7 25]]
KNN模型评估指标：
准确率: 0.8197
精确率: 0.8621
召回率: 0.7812
F1 值: 0.8197


In [61]:
# Logistic regression: train on the training split, predict the test split.
logreg_model = LogisticRegression(random_state=42)
logreg_pred = logreg_model.fit(X_train, y_train).predict(X_test)

# Per-class report, then the raw confusion matrix.
print("\n逻辑回归分类报告：", classification_report(y_test, logreg_pred), sep="\n")
print("逻辑回归混淆矩阵：", confusion_matrix(y_test, logreg_pred), sep="\n")

# Headline scores, computed in the order accuracy, precision, recall, F1.
logreg_accuracy, logreg_precision, logreg_recall, logreg_f1 = (
    metric(y_test, logreg_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(
    "逻辑回归模型评估指标：",
    f"准确率: {logreg_accuracy:.4f}",
    f"精确率: {logreg_precision:.4f}",
    f"召回率: {logreg_recall:.4f}",
    f"F1 值: {logreg_f1:.4f}",
    sep="\n",
)


逻辑回归分类报告：
              precision    recall  f1-score   support

           0       0.81      0.86      0.83        29
           1       0.87      0.81      0.84        32

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61

逻辑回归混淆矩阵：
[[25  4]
 [ 6 26]]
逻辑回归模型评估指标：
准确率: 0.8361
精确率: 0.8667
召回率: 0.8125
F1 值: 0.8387


In [62]:
# Gaussian naive Bayes: train on the training split, predict the test split.
nb_model = GaussianNB()
nb_pred = nb_model.fit(X_train, y_train).predict(X_test)

# Per-class report, then the raw confusion matrix.
print("\n朴素贝叶斯分类报告：", classification_report(y_test, nb_pred), sep="\n")
print("朴素贝叶斯混淆矩阵：", confusion_matrix(y_test, nb_pred), sep="\n")

# Headline scores, computed in the order accuracy, precision, recall, F1.
nb_accuracy, nb_precision, nb_recall, nb_f1 = (
    metric(y_test, nb_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(
    "朴素贝叶斯模型评估指标：",
    f"准确率: {nb_accuracy:.4f}",
    f"精确率: {nb_precision:.4f}",
    f"召回率: {nb_recall:.4f}",
    f"F1 值: {nb_f1:.4f}",
    sep="\n",
)
    


朴素贝叶斯分类报告：
              precision    recall  f1-score   support

           0       0.84      0.90      0.87        29
           1       0.90      0.84      0.87        32

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61

朴素贝叶斯混淆矩阵：
[[26  3]
 [ 5 27]]
朴素贝叶斯模型评估指标：
准确率: 0.8689
精确率: 0.9000
召回率: 0.8438
F1 值: 0.8710


In [63]:
# Decision tree (seeded): train on the training split, predict the test split.
dt_model = DecisionTreeClassifier(random_state=42)
dt_pred = dt_model.fit(X_train, y_train).predict(X_test)

# Per-class report, then the raw confusion matrix.
print("\n决策树分类报告：", classification_report(y_test, dt_pred), sep="\n")
print("决策树 混淆矩阵：", confusion_matrix(y_test, dt_pred), sep="\n")

# Headline scores, computed in the order accuracy, precision, recall, F1.
dt_accuracy, dt_precision, dt_recall, dt_f1 = (
    metric(y_test, dt_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(
    "决策树模型评估指标：",
    f"准确率: {dt_accuracy:.4f}",
    f"精确率: {dt_precision:.4f}",
    f"召回率: {dt_recall:.4f}",
    f"F1 值: {dt_f1:.4f}",
    sep="\n",
)


决策树分类报告：
              precision    recall  f1-score   support

           0       0.70      0.90      0.79        29
           1       0.88      0.66      0.75        32

    accuracy                           0.77        61
   macro avg       0.79      0.78      0.77        61
weighted avg       0.79      0.77      0.77        61

决策树 混淆矩阵：
[[26  3]
 [11 21]]
决策树模型评估指标：
准确率: 0.7705
精确率: 0.8750
召回率: 0.6562
F1 值: 0.7500


In [64]:
# Random forest (seeded): train on the training split, predict the test split.
rf_model = RandomForestClassifier(random_state=42)
rf_pred = rf_model.fit(X_train, y_train).predict(X_test)

# Per-class report, then the raw confusion matrix.
print("\n随机森林分类报告：", classification_report(y_test, rf_pred), sep="\n")
print("随机森林混淆矩阵：", confusion_matrix(y_test, rf_pred), sep="\n")

# Headline scores, computed in the order accuracy, precision, recall, F1.
rf_accuracy, rf_precision, rf_recall, rf_f1 = (
    metric(y_test, rf_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(
    "随机森林模型评估指标：",
    f"准确率: {rf_accuracy:.4f}",
    f"精确率: {rf_precision:.4f}",
    f"召回率: {rf_recall:.4f}",
    f"F1 值: {rf_f1:.4f}",
    sep="\n",
)


随机森林分类报告：
              precision    recall  f1-score   support

           0       0.83      0.83      0.83        29
           1       0.84      0.84      0.84        32

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61

随机森林混淆矩阵：
[[24  5]
 [ 5 27]]
随机森林模型评估指标：
准确率: 0.8361
精确率: 0.8438
召回率: 0.8438
F1 值: 0.8438


In [65]:
# XGBoost (seeded): train on the training split, predict the test split.
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_pred = xgb_model.fit(X_train, y_train).predict(X_test)

# Per-class report, then the raw confusion matrix.
print("\nXGBoost 分类报告：", classification_report(y_test, xgb_pred), sep="\n")
print("XGBoost 混淆矩阵：", confusion_matrix(y_test, xgb_pred), sep="\n")

# Headline scores, computed in the order accuracy, precision, recall, F1.
xgb_accuracy, xgb_precision, xgb_recall, xgb_f1 = (
    metric(y_test, xgb_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(
    "XGBoost 模型评估指标：",
    f"准确率: {xgb_accuracy:.4f}",
    f"精确率: {xgb_precision:.4f}",
    f"召回率: {xgb_recall:.4f}",
    f"F1 值: {xgb_f1:.4f}",
    sep="\n",
)


XGBoost 分类报告：
              precision    recall  f1-score   support

           0       0.78      0.86      0.82        29
           1       0.86      0.78      0.82        32

    accuracy                           0.82        61
   macro avg       0.82      0.82      0.82        61
weighted avg       0.82      0.82      0.82        61

XGBoost 混淆矩阵：
[[25  4]
 [ 7 25]]
XGBoost 模型评估指标：
准确率: 0.8197
精确率: 0.8621
召回率: 0.7812
F1 值: 0.8197


In [66]:
import warnings
warnings.filterwarnings('ignore')

# LightGBM (seeded): train on the training split, predict the test split.
lgb_model = lgb.LGBMClassifier(random_state=42)
lgb_pred = lgb_model.fit(X_train, y_train).predict(X_test)

# Per-class report, then the raw confusion matrix.
print("\nLightGBM 分类报告：", classification_report(y_test, lgb_pred), sep="\n")
print("LightGBM 混淆矩阵：", confusion_matrix(y_test, lgb_pred), sep="\n")

# Headline scores, computed in the order accuracy, precision, recall, F1.
lgb_accuracy, lgb_precision, lgb_recall, lgb_f1 = (
    metric(y_test, lgb_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(
    "LightGBM 模型评估指标：",
    f"准确率: {lgb_accuracy:.4f}",
    f"精确率: {lgb_precision:.4f}",
    f"召回率: {lgb_recall:.4f}",
    f"F1 值: {lgb_f1:.4f}",
    sep="\n",
)

[LightGBM] [Info] Number of positive: 133, number of negative: 109
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000224 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 237
[LightGBM] [Info] Number of data points in the train set: 242, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.549587 -> initscore=0.199001
[LightGBM] [Info] Start training from score 0.199001

LightGBM 分类报告：
              precision    recall  f1-score   support

           0       0.81      0.86      0.83        29
           1       0.87      0.81      0.84        32

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61

LightGBM 混淆矩阵：
[[25  4]
 [ 6 26]]
LightGBM 模型评估指标：
准确率: 0.8361
精确率: 0.8667
召回率: 0.8125
F1 值: 0.8387
