In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split  
from sklearn.feature_selection import SelectKBest, chi2  
from sklearn.neighbors import KNeighborsClassifier,RadiusNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score  
from imblearn.over_sampling import SMOTE 
from imblearn.pipeline import Pipeline 
from imblearn.under_sampling import RandomUnderSampler

#加载数据  
file_path = r"D:\cxdownload\data.csv"  
data = pd.read_csv(file_path)

#删除第一列并处理 'sex' 列  
data.drop(data.columns[0], axis=1, inplace=True)  
data['Sex'] = (data['Sex'] >= 0.5).astype(int)  # 将值转换为二进制  

#对 'sex' 列进行独热编码  
data = pd.get_dummies(data, columns=['Sex'], drop_first=True)

print(data.shape)
data.head()

(300000, 22)


Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Age,Education,Income,target,Sex_1
0,0.0,0.0,1.0,18.11946,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,2.0,1.88054,0.0,0.0,6.0,6.0,8.0,0.0,False
1,0.0,1.0,1.0,28.0,1.0,0.0,0.0,1.0,1.0,1.0,...,0.0,2.0,2.0,0.0,0.0,7.0,6.0,8.0,0.0,False
2,0.0,0.0,1.0,32.0,0.0,0.0,0.0,1.0,1.0,1.0,...,0.0,3.0,0.0,0.487226,0.0,9.0,4.0,6.512774,0.0,False
3,1.0,1.0,1.0,39.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,4.0,3.0,0.0,1.0,10.0,6.0,5.0,2.0,True
4,1.0,0.0,1.0,33.0,0.0,0.0,0.0,1.0,1.0,0.0,...,0.0,5.0,20.0,15.0,0.0,6.0,4.0,5.0,0.0,False


In [22]:
# 分离特征和目标变量  
X = data.drop(columns=['target'])  # 特征  
y = data['target']  # 目标变量  

# 使用卡方检验选择前 15 个特征  
kbest = SelectKBest(chi2, k=18)  # 基于卡方检验选择前 15 个特征  
X_new = kbest.fit_transform(X, y)  

# 获取选择的特征名称  
selected_features = kbest.get_support(indices=True)  
X_selected = X.iloc[:, selected_features]  # 提取特征子集  

print("选择的特征名称:", X_selected.columns.tolist()) 

# 步骤 6: 将数据分为训练集和测试集  
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)  

# 步骤 2: 创建过采样和欠采样的组合  
over = SMOTE(sampling_strategy='minority')  # 只过采样少数类  
under = RandomUnderSampler(sampling_strategy='majority')  # 欠采样多数类  
pipeline = Pipeline(steps=[('o', over), ('u', under)])  

# 步骤 3: 在训练集上应用重采样  
X_resampled, y_resampled = pipeline.fit_resample(X_train, y_train)  

# 查看重采样后的类别分布  
print("重采样后的类别分布：")  
print(pd.Series(y_resampled).value_counts()) 

#选择模型：KNN，带有权重的KNN
models = []
models.append(("KNN", KNeighborsClassifier(n_neighbors=2)))
models.append(("KNN with weights", KNeighborsClassifier(
    n_neighbors=2, weights="distance")))


#分别计算score的值
results = []
for name, model in models:
    model.fit(X_train, y_train)
    results.append((name, model.score(X_test, y_test)))
for i in range(len(results)):
    print("name: {}; score: {}".format(results[i][0],results[i][1]))


选择的特征名称: ['HighBP', 'HighChol', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Age', 'Education', 'Income']
重采样后的类别分布：
target
1.0    201689
0.0     33554
2.0     33554
Name: count, dtype: int64
name: KNN; score: 0.8853333333333333
name: KNN with weights; score: 0.9044


In [12]:
#把数据集分成10份,其中1份作为交叉验证数据集计算模型准确性,9份作为训练数据集
#计算出10次不同训练数据集和交叉验证数据集组合得到的模型准确性评分，再求平均值，得到更准确的结果
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

results = []
for name, model in models:
    kfold = KFold(n_splits=10)
    cv_result = cross_val_score(model, X, y, cv=kfold)
    results.append((name, cv_result))
for i in range(len(results)):
    print("name: {}; cross val score: {}".format(
        results[i][0],results[i][1].mean()))


name: KNN; cross val score: 0.89277
name: KNN with weights; cross val score: 0.9144233333333334


In [21]:
knn = KNeighborsClassifier(n_neighbors=2,weights="distance")
knn.fit(X_train, y_train)
train_score = knn.score(X_train, y_train)
test_score = knn.score(X_test, y_test)
print("train score: {}; test score: {}".format(train_score, test_score))


train score: 0.9939666666666667; test score: 0.9044


In [22]:
# 步骤 8: 进行预测  
y_pred = knn.predict(X_test)  

# 步骤 9: 评估模型  
# 打印混淆矩阵和分类报告  
print("混淆矩阵:")  
print(confusion_matrix(y_test, y_pred))  
print("\n分类报告:")  
print(classification_report(y_test, y_pred))  

hhh=knn.predict(X_train)
f11 = f1_score(y_train, hhh, average='macro')  # 计算 F1 分数  
print("\nF11 Score (macro):", f11)
# 计算并打印 F1 分数  
f1 = f1_score(y_test, y_pred, average='macro')  # 计算 F1 分数  
print("\nF1 Score (macro):", f1)


混淆矩阵:[[46918   61  3332] [  978   121   144] [ 5418    11  3017]]分类报告:              precision    recall  f1-score   support         0.0       0.88      0.93      0.91     50311         1.0       0.63      0.10      0.17      1243         2.0       0.46      0.36      0.41      8446    accuracy                           0.80     60000   macro avg       0.65      0.48      0.50     60000weighted avg       0.84      0.65      0.72     60000

F11 Score (macro): 0.9838509829131882

F1 Score (macro): 0.7084622313109533
