# 糖尿病预测数据预处理与建模（基于 BRFSS 数据集）

本 Notebook 包括以下模块：

- 数据划分与标准化  
- 类别分布检查与上采样  
- 标准化可视化  
- XGBoost 模型训练与评估  


In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from xgboost import XGBClassifier

# 对数据进行预处理

In [2]:

# Step 2: load the raw BRFSS 2015 CSV export.
df = pd.read_csv('data/CDC_BRFSS2015.csv')

# Step 3: assign canonical column names.
# NOTE(review): the header is overwritten positionally, so this assumes the CSV
# holds exactly these 22 columns in this order — confirm against the file.
df.columns = [
    'Diabetes_binary', 'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
    'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth',
    'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income'
]

# Step 4: drop extreme BMI outliers. `.copy()` turns the filtered result into
# an independent frame, so the column assignments below do not write into a
# slice of the original (which triggers pandas' SettingWithCopyWarning).
df = df[df['BMI'] <= 60].copy()

# Step 5: feature engineering.

# Obesity flag: 1 when BMI >= 30, else 0.
df['IsObese'] = (df['BMI'] >= 30).astype(int)

# BMI bands. Intervals are left-closed (`right=False`) so BMI == 30 is labelled
# 'Obese', in agreement with IsObese above. The top edge is open-ended so that
# BMI == 60, which the filter above keeps, still receives a label.
df['BMI_Category'] = pd.cut(df['BMI'], bins=[0, 18.5, 25, 30, np.inf], right=False,
                            labels=['Underweight', 'Normal', 'Overweight', 'Obese'])

# Age bands.
# NOTE(review): these bin edges read like ages in years. If 'Age' holds a
# small ordinal age-category code instead, every row falls into 'Young' —
# confirm the encoding of this column.
df['AgeGroup'] = pd.cut(df['Age'], bins=[0, 30, 50, 80],
                        labels=['Young', 'Middle', 'Old'])

# Unhealthy-days score: mental + physical unhealthy days, doubled when the
# respondent reports difficulty walking (DiffWalk == 1).
df['TotalUnhealthyDays'] = df['MentHlth'] + df['PhysHlth']
df['HealthRiskScore'] = df['TotalUnhealthyDays'] * (df['DiffWalk'] + 1)

# Step 6: min-max scale the continuous columns to [0, 1].
# NOTE(review): the scaler is fit on every row, i.e. before the train/test
# splits performed in later cells; fit on training rows only if strict
# leakage control is required.
scaler = MinMaxScaler()
scaled_cols = ['BMI', 'MentHlth', 'PhysHlth', 'TotalUnhealthyDays', 'HealthRiskScore']
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

# Step 7: keep the modelling columns only; the target stays last because later
# cells read the label with `iloc[:, -1]`.
final_cols = [
    'HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk',
    'Sex', 'Age', 'Education', 'Income', 'BMI', 'IsObese', 'HealthRiskScore',
    'Diabetes_binary'
]
df_final = df[final_cols]

# Step 8: persist the result. Create the output directory first so the write
# does not fail on a fresh checkout.
os.makedirs('aftprocessdata', exist_ok=True)
df_final.to_csv('aftprocessdata/processed_brfss.csv', index=False)
print("✅ 已生成处理后的文件：processed_brfss.csv")



✅ 已生成处理后的文件：processed_brfss.csv


# 加载数据（请确保已预处理为 processed_brfss.csv）

In [3]:

# Load the preprocessed dataset written by the preprocessing cell.
data = pd.read_csv('aftprocessdata/processed_brfss.csv')
data.head()

# Class balance check: the label is the last column of the frame.
label_col = data.iloc[:, -1]
n_class0 = (label_col == 0).sum()
n_class1 = (label_col == 1).sum()
print("Class 0:", n_class0)
print("Class 1:", n_class1)


Class 0: 217700
Class 1: 35175


# 上采样函数定义与执行
通过有放回地重复抽取少数类样本，使其数量与多数类相等，从而平衡训练数据中的正负样本数量。


In [4]:
def oversample(df):
    """Balance a binary-labelled frame by upsampling the smaller class.

    The label is read from the last column and is compared against the values
    0 and 1; rows carrying any other label are not part of the result.

    Args:
        df: DataFrame whose last column is the 0/1 class label.

    Returns:
        A new DataFrame containing every majority-class row plus minority-class
        rows drawn with replacement until both classes have the same size,
        shuffled and re-indexed from 0. If one class is absent, the rows of the
        other class are returned (shuffled) without any resampling.
    """
    labels = df.iloc[:, -1]
    zeros = df[labels == 0]
    ones = df[labels == 1]

    # On a tie, class 0 is treated as the majority.
    if len(zeros) >= len(ones):
        df_majority, df_minority = zeros, ones
    else:
        df_majority, df_minority = ones, zeros

    if df_minority.empty:
        # Drawing with replacement from zero rows cannot yield any rows, so
        # there is nothing to balance against; keep the majority as-is.
        df_upsampled = df_majority
    else:
        df_minority_upsampled = resample(
            df_minority, replace=True, n_samples=len(df_majority), random_state=42
        )
        df_upsampled = pd.concat([df_majority, df_minority_upsampled])

    # Shuffle so the classes are interleaved, then discard the old index.
    return df_upsampled.sample(frac=1, random_state=0).reset_index(drop=True)

# Run the oversampling on the loaded dataset.
data_balanced = oversample(data)
# Final expression of the cell: displays the per-class row counts after balancing.
data_balanced['Diabetes_binary'].value_counts()


Diabetes_binary
0.0    217700
1.0    217700
Name: count, dtype: int64

# 数据划分以及标准化

In [5]:
def data_split(df):
    """Split a frame into stratified 80/20 train and test parts.

    Every column except the last is treated as a feature; the last column is
    the label and is also used for stratification.

    Returns:
        The four-element result of `train_test_split`:
        (train features, test features, train labels, test labels).
    """
    features, labels = df.iloc[:, :-1], df.iloc[:, -1]
    return train_test_split(
        features,
        labels,
        test_size=0.2,
        stratify=labels,
        random_state=0,
    )

def data_scaling(train_x, test_x):
    """Standardise train and test features using training statistics only.

    The scaler is fitted on `train_x` and then applied to both inputs, so the
    test data never influences the fitted mean and variance.

    Returns:
        A `(train_x_scaled, test_x_scaled)` tuple.
    """
    fitted = StandardScaler().fit(train_x)
    return fitted.transform(train_x), fitted.transform(test_x)

# Split first, then oversample only the training portion. Splitting an already
# oversampled frame lets copies of the same minority row land in both the
# train and the test part, which leaks training data into the evaluation set
# and inflates the reported metrics.
train_x, test_x, train_y, test_y = data_split(data)

# `oversample` expects the label in the last column, so re-attach it to the
# training features before balancing and separate it again afterwards.
train_balanced = oversample(pd.concat([train_x, train_y], axis=1))
train_x, train_y = train_balanced.iloc[:, :-1], train_balanced.iloc[:, -1]

# Standardise using statistics learned from the (balanced) training features.
train_x_scaled, test_x_scaled = data_scaling(train_x, test_x)


# 创建训练集、验证集和测试集

In [7]:
# Build a stratified 70/15/15 train/validation/test split of the preprocessed
# data and write each part to disk.
df = pd.read_csv('aftprocessdata/processed_brfss.csv')

# Separate the features from the target column.
X = df.drop(columns=['Diabetes_binary'])
y = df['Diabetes_binary']

# First cut: 70% training, 30% held out, keeping the class ratio in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Second cut: halve the held-out part into validation and test (15% each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Report the size of each part.
print(f"训练集：{len(X_train)}")
print(f"验证集：{len(X_val)}")
print(f"测试集：{len(X_test)}")

# Persist every part. Create the output directory first so the writes do not
# fail when it is missing.
os.makedirs("sign", exist_ok=True)
split_parts = {
    "X_train": X_train,
    "y_train": y_train,
    "X_val": X_val,
    "y_val": y_val,
    "X_test": X_test,
    "y_test": y_test,
}
for split_name, split_part in split_parts.items():
    split_part.to_csv(f"sign/{split_name}.csv", index=False)

训练集：177012
验证集：37931
测试集：37932
