In [1]:
import os 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import math


In [2]:
# Load the diabetes data set. The path is written as a raw string so the
# backslashes are taken literally: in a normal string literal '\J' and '\d' are
# invalid escape sequences, which Python warns about (SyntaxWarning on 3.12+).
# The resulting path value is unchanged.
# read_csv infers zip compression from the '.zip' extension.
dataset = pd.read_csv(r'C:\Jupyter\diabetescsv.zip')
dataset.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
####### Feature engineering, step 1: inspect the raw data #######
## describe() reports per-column summary statistics; the 'min' row shows which
## columns contain zeros.
dataset.describe()


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [4]:
## True if any cell of the frame is null/NaN, False otherwise.
dataset.isna().to_numpy().any()

False

In [5]:
# Zeros in the listed columns are treated as missing readings: each zero is
# turned into NaN and then filled with the mean of the column's remaining
# (non-zero) values.
# NOTE(review): 'Pregnancies' is in the list, so rows with zero pregnancies are
# also overwritten with the column mean - confirm this is intended.
# NOTE(review): the means are computed on the whole data set, before the
# train/test split, so test rows influence the imputed values.
remove_zeros = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for column in remove_zeros:
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    dataset[column] = dataset[column].replace(0, np.nan)
    mean = dataset[column].mean(skipna=True)
    dataset[column] = dataset[column].fillna(mean)
### After imputation every listed column should have a non-zero minimum.
dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,4.494673,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,2.975395,30.435949,12.096346,8.790942,85.021108,6.875151,0.331329,11.760232,0.476951
min,1.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,2.0,99.75,64.0,25.0,121.5,27.5,0.24375,24.0,0.0
50%,4.494673,117.0,72.202592,29.15342,155.548223,32.4,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,155.548223,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
#### Train / test split ####
## Features are the first eight columns; the target is the ninth column.
X = dataset.iloc[:, :8]
y = dataset.iloc[:, 8]
## Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
splits = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = splits


In [11]:

####### Feature engineering, step 2: scaling #######
## The scaler is fitted on the training rows only; the same fitted scaler is
## then used to transform both the training and the test rows.
scaledX = StandardScaler().fit(X_train)
X_train = scaledX.transform(X_train)
X_test = scaledX.transform(X_test)
X_test


array([[-1.18394458,  2.55672007,  0.27943597, ...,  1.46959259,
         2.78594417, -0.96569189],
       [-0.85230955, -0.48575468,  0.11279888, ...,  0.13885774,
        -0.1876381 , -0.88240283],
       [-0.18903948, -1.51093639, -0.88702365, ...,  0.19609364,
        -0.22668514, -0.71582471],
       ...,
       [-0.18903948,  0.67170854,  1.1126214 , ...,  1.62699134,
         0.53623395, -0.96569189],
       [-0.52067451, -0.18812128,  0.11279888, ..., -0.90569758,
        -1.07971278, -0.79911377],
       [-1.18394458, -0.48575468, -0.05383821, ..., -0.26179362,
         1.06487079, -0.79911377]])

In [12]:
math.sqrt(len(y_test))
# The square root of the test-set size is used as a starting point for k.
# It truncates to 12; 11 is used in the model instead - presumably so that an
# odd k avoids tied votes between the two classes.


12.409673645990857

In [13]:
#### Model: k-nearest neighbours ####
## 11 neighbours with Euclidean distance; fit() returns the fitted estimator,
## so construction and training are chained.
classifier = KNeighborsClassifier(
    n_neighbors=11,
    p=2,
    metric='euclidean',
).fit(X_train, y_train)

## Predicted class label for every test row.
y_pred = classifier.predict(X_test)
y_pred



array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
      dtype=int64)

In [14]:

### Evaluate the predictions against the held-out labels ###
cm = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
score = accuracy_score(y_test, y_pred)

# Print each metric under its heading, in the same order as before.
for heading, value in (
    ('---Confusion Matrix---', cm),
    ('---F1 Score---', f1),
    ('---Accuracy Score---', score),
):
    print(heading)
    print(value)

---Confusion Matrix---
[[94 13]
 [19 28]]
---F1 Score---
0.6363636363636364
---Accuracy Score---
0.7922077922077922
