KNN : predict whether a person will have diabetes or not 

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split #to split yr data
from sklearn.preprocessing import StandardScaler #preprocessing
from sklearn.neighbors import KNeighborsClassifier #KNN
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

#these are the 3 metrics for measuring accuracy 


In [2]:
dataset=pd.read_csv('diabetes.csv')

In [3]:
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
dataset.shape

(768, 9)

In [6]:
#replace zeros
zeros_not_accepted=['Glucose','BloodPressure','SkinThickness','BMI','BMI']
for column in zeros_not_accepted:
    dataset[column]=dataset[column].replace(0,np.NaN)
    mean=int(dataset[column].mean(skipna=True)) #skipna=true gonna exclude NA values while counting the result
    dataset[column]=dataset[column].replace(np.NaN,mean)

In [9]:
dataset['Glucose']

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
5      116.0
6       78.0
7      115.0
8      197.0
9      125.0
10     110.0
11     168.0
12     139.0
13     189.0
14     166.0
15     100.0
16     118.0
17     107.0
18     103.0
19     115.0
20     126.0
21      99.0
22     196.0
23     119.0
24     143.0
25     125.0
26     147.0
27      97.0
28     145.0
29     117.0
30     109.0
31     158.0
32      88.0
33      92.0
34     122.0
35     103.0
36     138.0
37     102.0
38      90.0
39     111.0
40     180.0
41     133.0
42     106.0
43     171.0
44     159.0
45     180.0
46     146.0
47      71.0
48     103.0
49     105.0
50     103.0
51     101.0
52      88.0
53     176.0
54     150.0
55      73.0
56     187.0
57     100.0
58     146.0
59     105.0
60      84.0
61     133.0
62      44.0
63     141.0
64     114.0
65      99.0
66     109.0
67     109.0
68      95.0
69     146.0
70     100.0
71     139.0
72     126.0
73     129.0
74      79.0
75     121.0
76      62.0

In [8]:
pd.set_option('max_rows',790)

In [11]:
result=dataset[dataset['Glucose']==0]

In [12]:
result

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome


In [14]:
#split the data
X=dataset.iloc[:,0:8]
y=dataset.iloc[:,8]
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=0,test_size=0.2)

In [15]:
#feature scaling
scaler=StandardScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)


In [17]:
#define the model using KneighborsClassifier and fir the train data in the model
#calculate value of K
import math
math.sqrt(len(y_test))
#since it's better to choose a odd number then K=11

12.409673645990857

In [19]:
#define the model
classifier=KNeighborsClassifier(n_neighbors=11,p=2,metric='euclidean')
#p=2 whether diabete or not 
#euclidean is a metric to calcultate the nearest neighbors (distance)

In [20]:
#fit the model
classifier.fit(X_train,y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=11)

In [21]:
#predict
y_pred=classifier.predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [22]:
#evaluate the model using confusion matrix
cm=confusion_matrix(y_test,y_pred)
print(cm)

[[97 10]
 [18 29]]


In [23]:
print(f1_score(y_test,y_pred))

0.6744186046511628


In [24]:
print(accuracy_score(y_test,y_pred))

0.8181818181818182


In [None]:
#the accuracy of 80% tells us that it correctly classified 80% of observations