In [69]:
# import necessary library
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier

In [82]:
# import csv dataset
# The path is relative, so the CSV is looked up from the notebook's working directory.
file = "tubes2_HeartDisease_train.csv"
df = pd.read_csv(filepath_or_buffer=file)
# Preview the first ten rows of the raw frame.
df.head(n=10)

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14
0,54,1,4,125,216,0,0,140,0,0.0,?,?,?,1
1,55,1,4,158,217,0,0,110,1,2.5,2,?,?,1
2,54,0,3,135,304,1,0,170,0,0.0,1,0,3,0
3,48,0,3,120,195,0,0,125,0,0.0,?,?,?,0
4,50,1,4,120,0,0,1,156,1,0.0,1,?,6,3
5,64,0,4,130,303,0,0,122,0,2.0,2,2,3,0
6,63,1,4,130,308,0,0,138,1,2.0,2,?,?,2
7,58,1,2,130,251,0,0,110,0,0.0,?,?,?,0
8,42,1,2,150,268,0,0,136,0,0.0,?,?,?,0
9,54,1,3,120,258,0,2,147,0,4.0,2,0,7,0


In [71]:
# split feature and label
# Column14 is used as the label; every other column is kept as a feature.
label = df["Column14"]
# drop() returns a new frame here, so `df` itself still contains Column14.
feature = df.drop(columns=["Column14"])

In [108]:
# handle missing value

# Column names of the feature frame, in their original order.
header = list(feature.columns)

# Missing entries appear as the literal string '?' in the data; turn them into
# NaN so the imputers below can recognise them.
feature_impute = feature.replace(to_replace='?', value=np.nan)

# Columns filled with their most frequent value vs. columns filled with their mean.
# NOTE(review): Column11 and Column12 only show small integer values in the data
# preview, yet they are mean-imputed (producing values such as 1.762089) --
# confirm that treating them as continuous is intended.
discrete_value = ['Column1','Column2','Column3','Column6','Column7','Column9','Column13']
continues_value = ['Column4','Column5','Column8','Column10','Column11','Column12']

imputer_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit_transform learns the fill value per column and applies it in one step;
# both imputers stay fitted afterwards.
feature_impute[discrete_value] = imputer_mode.fit_transform(feature_impute[discrete_value])
feature_impute[continues_value] = imputer_mean.fit_transform(feature_impute[continues_value])

feature_impute.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,54,1,4,125.0,216.0,0,0,140.0,0,0.0,1.762089,0.686792,3
1,55,1,4,158.0,217.0,0,0,110.0,1,2.5,2.0,0.686792,3
2,54,0,3,135.0,304.0,1,0,170.0,0,0.0,1.0,0.0,3
3,48,0,3,120.0,195.0,0,0,125.0,0,0.0,1.762089,0.686792,3
4,50,1,4,120.0,0.0,0,1,156.0,1,0.0,1.0,0.686792,6


In [109]:
# feature scale
# The scaled values are wrapped back into a DataFrame. The index and column
# labels are taken from the frame that was scaled, so rows stay aligned with
# `label` when correlations are computed (Series.corr aligns on index) and the
# cell does not depend on a separately maintained list of column names.
feature_scale = pd.DataFrame(
    preprocessing.scale(feature_impute),
    index=feature_impute.index,
    columns=feature_impute.columns,
)
feature_scale.head()

  


Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,0.051624,0.532316,0.794607,-0.396834,0.145063,-0.407637,-0.745075,0.065227,-0.765871,-0.522566,-4.385103e-16,0.0,-0.57954
1,0.156899,0.532316,0.794607,1.383611,0.154309,-0.407637,-0.745075,-1.119498,1.305703,-0.19077,0.4698444,0.0,-0.57954
2,0.051624,-1.878582,-0.28567,0.142695,0.958673,2.453165,-0.745075,1.249952,-0.765871,-0.522566,-1.50503,-1.261202,-0.57954
3,-0.580027,-1.878582,-0.28567,-0.666598,-0.049093,-0.407637,-0.745075,-0.527135,-0.765871,-0.522566,-4.385103e-16,0.0,-0.57954
4,-0.369477,0.532316,0.794607,-0.666598,-1.851978,-0.407637,0.492481,0.69708,1.305703,-0.522566,-1.50503,0.0,1.219112


In [110]:
# display feature correlation with label
# Walk the scaled frame column by column and print "<column name> <correlation>".
for column_name, scaled_values in feature_scale.items():
    correlation = scaled_values.corr(label)
    print(column_name, correlation)

Column1 0.35500668737181995
Column2 0.2591122494555663
Column3 0.387827436918273
Column4 0.11219603416675152
Column5 -0.22660619371755436
Column6 0.12959342746947924
Column7 0.1449938817339308
Column8 -0.35097943499695294
Column9 0.3552562329958971
Column10 0.21986764610588136
Column11 0.26465461503796067
Column12 0.3088923320458529
Column13 0.29820868614856366


In [111]:
# drop feature with low correlation
# NOTE(review): according to the correlations printed earlier in this notebook,
# Column12 (0.31) and Column13 (0.30) correlate more strongly with the label than
# Column6 (0.13) and Column7 (0.14), which are kept -- confirm the selection rule.
dropped_columns = ["Column2", "Column4", "Column12", "Column13"]

# One non-mutating drop; errors="ignore" lets the cell be re-run after the
# columns are already gone instead of raising KeyError.
feature_scale = feature_scale.drop(columns=dropped_columns, errors="ignore")

In [114]:
# Search for the number of neighbours (up to 700) that gives the best accuracy
# on the held-out split.
# NOTE(review): X_train, y_train, X_test and y_test are not created in the cells
# visible here; presumably they come from a train_test_split cell -- confirm it
# runs before this one on a fresh kernel.

# Named best_accuracy (not `max`) so the builtin max() stays usable afterwards.
best_accuracy = 0
k = 0

# predict() rejects n_neighbors larger than the number of fitted samples,
# so the search stops at the training-set size when that is below 700.
largest_k = min(700, len(X_train))

for i in range(1, largest_k + 1):
    # construct model
    knn_model = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)

    y_predict = knn_model.predict(X_test)

    # Score each candidate once and compare the unrounded value, so gains
    # smaller than 0.01 are not lost; ties keep the smallest k.
    accuracy = accuracy_score(y_test, y_predict)
    if best_accuracy < accuracy:
        best_accuracy = accuracy
        k = i

# Rounding is applied for display only.
# Note: knn_model is the model from the final iteration, not the one fitted with k.
print('Accuracy score: ', round(best_accuracy, 2), ' with k = ', k)

Accuracy score:  0.62  with k =  1
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.63  with k =  2
Accuracy score:  0.6

Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with 

Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with k =  45
Accuracy score:  0.65  with 