In [34]:
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [35]:
# Load the heart-disease dataset from the CSV file and display the frame.
heart_disease = pd.read_csv(filepath_or_buffer="../data/heart_disease.csv")
heart_disease

Unnamed: 0,年龄,性别,胸痛类型,静息血压,胆固醇,空腹血糖,静息心电图结果,最大心率,运动性心绞痛,运动后的ST下降,峰值ST段的斜率,主血管数量,地中海贫血,是否患有心脏病
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [36]:
# Handle missing values: drop every row that contains at least one NaN.
# The name is rebound to the result instead of using inplace=True, so the
# frame object displayed by the loading cell is not silently mutated.
heart_disease = heart_disease.dropna()
heart_disease

Unnamed: 0,年龄,性别,胸痛类型,静息血压,胆固醇,空腹血糖,静息心电图结果,最大心率,运动性心绞痛,运动后的ST下降,峰值ST段的斜率,主血管数量,地中海贫血,是否患有心脏病
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [37]:
# Print a summary of the frame: column names, non-null counts and dtypes.
heart_disease.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   年龄        1025 non-null   int64  
 1   性别        1025 non-null   int64  
 2   胸痛类型      1025 non-null   int64  
 3   静息血压      1025 non-null   int64  
 4   胆固醇       1025 non-null   int64  
 5   空腹血糖      1025 non-null   int64  
 6   静息心电图结果   1025 non-null   int64  
 7   最大心率      1025 non-null   int64  
 8   运动性心绞痛    1025 non-null   int64  
 9   运动后的ST下降  1025 non-null   float64
 10  峰值ST段的斜率  1025 non-null   int64  
 11  主血管数量     1025 non-null   int64  
 12  地中海贫血     1025 non-null   int64  
 13  是否患有心脏病   1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [38]:
# Feature matrix: every column except the target label.
X = heart_disease.drop(columns=["是否患有心脏病"])
# Target vector: the label column on its own.
y = heart_disease["是否患有心脏病"]

In [39]:
# Hold out 30% of the rows as a test set; random_state is fixed so the
# split is reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)

In [40]:
# Numerical features (standardized by the preprocessing step).
numerical_features = [
    "年龄",
    "静息血压",
    "胆固醇",
    "最大心率",
    "运动后的ST下降",
    "主血管数量",
]
# Categorical features (one-hot encoded by the preprocessing step).
categorical_features = [
    "胸痛类型",
    "静息心电图结果",
    "峰值ST段的斜率",
    "地中海贫血",
]
# Binary features (passed through unchanged by the preprocessing step).
binary_features = [
    "性别",
    "空腹血糖",
    "运动性心绞痛",
]

In [41]:
# Column-wise preprocessing:
#   - "num":    standardize the numerical columns,
#   - "cat":    one-hot encode the categorical columns, dropping the first
#               level of each,
#   - "binary": pass the binary columns through unchanged.
# Columns not listed in any group are dropped (ColumnTransformer default).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(drop="first"), categorical_features),
        ("binary", "passthrough", binary_features),
    ]
)
# x_train / x_test are overwritten with the transformed arrays below. To keep
# this cell safe to re-run, the raw rows are re-selected from X through the
# label indices (y_train / y_test keep the row labels and order produced by
# train_test_split) instead of reading the possibly already-transformed
# x_train / x_test, which no longer carry the column names the transformer
# selects on.
# The transformer is fitted on the training rows only and then applied to the
# test rows.
x_train = preprocessor.fit_transform(X.loc[y_train.index])
x_test = preprocessor.transform(X.loc[y_test.index])

In [42]:
# Train a 2-nearest-neighbours classifier on the preprocessed training data
# (fit returns the estimator itself) and report its accuracy on the test set.
knn = KNeighborsClassifier(n_neighbors=2).fit(x_train, y_train)
knn.score(x_test, y_test)

0.9253246753246753

In [43]:
# The classifier was trained on the output of `preprocessor`, so the fitted
# transformer is persisted next to it; without it the saved model cannot be
# applied to raw rows in a fresh session.
joblib.dump(preprocessor, "knn_heart_disease_preprocessor")
# Persist the trained classifier (kept as the last expression so the cell
# still displays the list of written file names for the model).
joblib.dump(knn, "knn_heart_disease")

['knn_heart_disease']

In [44]:
# Reload the persisted classifier from disk.
knn_loaded = joblib.load("knn_heart_disease")
# Predict a single test sample; the 10:11 slice keeps the input 2-D.
y_pred = knn_loaded.predict(x_test[10:11])
# Print the true label next to the predicted label for that sample.
print(y_test.iloc[10], y_pred)

0 [0]


# 网格搜索

In [45]:
# Exhaustive search over n_neighbors = 1..10 using 10-fold cross-validation
# on the training split. GridSearchCV is imported in the notebook's import cell.
# NOTE(review): x_train was transformed by a preprocessor fitted on the whole
# training split, so each validation fold has already influenced the scaling
# statistics; wrapping the preprocessor and classifier in a Pipeline would
# avoid this.
knn = KNeighborsClassifier()
param_grid = {"n_neighbors": list(range(1, 11))}
# `knn` is rebound to the fitted search object; later cells read
# best_estimator_ / best_score_ from it.
knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=10)
knn.fit(x_train, y_train)
# Show the per-candidate results as a rich table rather than printed text.
pd.DataFrame(knn.cv_results_)

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.001001      0.000896         0.003844        0.000766   
1       0.000702      0.000459         0.003098        0.000303   
2       0.000700      0.000458         0.003399        0.000495   
3       0.000804      0.000402         0.003028        0.000380   
4       0.000654      0.000453         0.003046        0.000126   
5       0.000503      0.000503         0.003136        0.000532   
6       0.000704      0.000461         0.002821        0.000354   
7       0.000575      0.000476         0.005135        0.004324   
8       0.000905      0.000302         0.005397        0.004219   
9       0.000704      0.000461         0.003370        0.000438   

   param_n_neighbors               params  split0_test_score  \
0                  1   {'n_neighbors': 1}           0.986111   
1                  2   {'n_neighbors': 2}           0.930556   
2                  3   {'n_neighbors': 3}           0.888889   
3     

In [46]:
# Report the best model found by the grid search and its mean
# cross-validated score, one per line.
print(knn.best_estimator_, knn.best_score_, sep="\n")

KNeighborsClassifier(n_neighbors=1)
0.9707355242566511


In [47]:
# Evaluate the best model on the held-out test set.
# On the first run `knn` is the fitted grid-search object and is replaced by
# its best estimator; on a re-run `knn` is already that estimator (which has
# no best_estimator_ attribute), so the fallback keeps it unchanged instead of
# raising AttributeError.
knn = getattr(knn, "best_estimator_", knn)
# NOTE(review): n_neighbors=1 being selected with a near-perfect score can be
# a sign of duplicate rows shared between the train and test splits - worth
# checking heart_disease.duplicated().sum() before trusting this number.
print(knn.score(x_test, y_test))

0.9902597402597403
