# Predict whether a patient will be diagnosed with diabetes or not

In [14]:
#import Scikit-learn libraries
import pandas as pd #pandas dataframe
import numpy as np

from sklearn.model_selection import train_test_split #split to train particular model
from sklearn.preprocessing import StandardScaler #normalize/uniform values to avoid scew results
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix #test the model
from sklearn.metrics import f1_score 
from sklearn.metrics import accuracy_score

In [15]:
#Read the diabetes records from the CSV file in the working directory
dataset = pd.read_csv('diabetes.csv')
#Report how many rows were loaded, then preview the first five of them
print(dataset.shape[0])
dataset.head()

2000


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,2,138,62,35,0,33.6,0.127,47,1
1,0,84,82,31,125,38.2,0.233,23,0
2,0,145,0,0,0,44.2,0.63,31,1
3,0,135,68,42,250,42.3,0.365,24,1
4,1,139,62,41,480,40.7,0.536,21,0


In [16]:
#Replace zeroes as you cant have zero skin thickness, or you might be dead
#A zero in any of these columns is treated as a missing measurement and is
#replaced by the mean of that column's non-zero values
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in zero_not_accepted:
    #np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0
    without_zeroes = dataset[column].replace(0, np.nan)  #mark zeroes as missing
    #int() truncates the mean, so the fill value is always a whole number
    mean = int(without_zeroes.mean(skipna=True))  #mean of the remaining (non-missing) values
    dataset[column] = without_zeroes.fillna(mean)  #fill every missing entry with that mean

In [17]:
#Split the dataset into train and test sets
#Columns are selected by name rather than by position, so a reordered or
#extended CSV cannot silently put the wrong column into the target
x = dataset.drop(columns='Outcome') #all rows, every feature column (everything except the label)
y = dataset['Outcome'] #all rows, just the Outcome label column
#Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0, test_size=0.2)


In [18]:
#Feature scaling
#Rule of thumb: scale the features for any algorithm that computes distances or assumes normality.
sc_x = StandardScaler()
sc_x.fit(x_train) #fit the scaler on the training rows only
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test) #the test rows are transformed with the scaler fitted above, never refitted

# Using KNeighbors Classifier 

In [19]:
import math

#Square root of the number of test samples; used below as the starting point for choosing K
n_test_samples = len(y_test)
math.sqrt(n_test_samples)

20.0

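`n_neighbors` is K, the number of nearest neighbours that vote on each prediction.
`p` is the power parameter of the Minkowski metric; `p = 2` corresponds to the Euclidean distance, which is the metric used here.
Define the model using KNeighborsClassifier and fit it to the training data.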

In [22]:
#Define the model: Init K-NN
#n_neighbors=19: the square root computed earlier was 20, and an odd K is chosen
#                so that a vote between the two classes cannot end in a tie
#p=2: power parameter of the Minkowski distance; 2 corresponds to the Euclidean distance
#metric='euclidean': the distance used to decide which samples are the nearest neighbours
knn_settings = dict(n_neighbors=19, p=2, metric='euclidean')
classifier = KNeighborsClassifier(**knn_settings)

In [23]:
#Fit model: train the classifier on the scaled training features and their labels
classifier.fit(X=x_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=19, p=2,
           weights='uniform')

In [25]:
#Predict a label for every row of the scaled test set, then display the predictions
y_pred = classifier.predict(X=x_test)
y_pred

array([0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1,

In [26]:
#Evaluate model: tabulate the true labels against the predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[236  36]
 [ 45  83]]


In [27]:
#F1 ranges from 0 (lowest) to 1 (highest), where 1 indicates perfect precision and recall
test_f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(test_f1)

0.672064777328


In [29]:
#Accuracy of the predictions on the held-out test set
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(test_accuracy)

0.7975


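An accuracy of about 80% (0.7975) suggests that the model is a reasonably good fit. The lower F1 score (0.67) reflects the errors on the diabetic class: in the confusion matrix, 45 of the 128 diabetic patients in the test set were predicted as non-diabetic, and 36 non-diabetic patients were predicted as diabetic.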