In [1]:
# Import needed packages for classification
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix

from sklearn.metrics import f1_score

from sklearn.metrics import accuracy_score

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including deprecation notices from numpy / pandas / scikit-learn.
warnings.filterwarnings("ignore")

In [2]:
# Load the diabetes dataset from a CSV file located in the working directory.
data = pd.read_csv('diabetes.csv')

In [3]:
# Preview the first five rows to check the column names and raw values.
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
# Handle missing values. In this dataset a missing measurement is recorded as 0,
# which is not a plausible value for the columns listed below, so each 0 is
# treated as "missing" and replaced by the column mean.
# NOTE(review): the mean is computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.

zero_not_accepted = ['Glucose','BloodPressure','SkinThickness','BMI','Insulin']

for col in zero_not_accepted:
    # Turn the zeros into NaN first so they do not drag the mean down.
    # (np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.)
    data[col] = data[col].replace(0, np.nan)
    # int() truncates the mean to a whole number, e.g. 155.5 -> 155.
    mean = int(data[col].mean(skipna=True))
    # Fill every missing entry with that truncated mean.
    data[col] = data[col].fillna(mean)

In [5]:
# Display one of the cleaned columns.
# After the imputation above the zeros have been replaced, so nothing here
# should look like missing data (the column dtype is now float64).
data['Glucose']

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
       ...  
763    101.0
764    122.0
765    121.0
766    126.0
767     93.0
Name: Glucose, Length: 768, dtype: float64

In [6]:
# Recall slicing with iloc[rows, columns].
# ":" keeps every row; "0:8" keeps column positions 0 through 7, because the
# end index of a slice is excluded.
# Position 8 is the 'Outcome' column. It is the label, not a feature, so this
# slice leaves it out of the training data.

X = data.iloc[:,0:8]
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63
764,2,122.0,70.0,27.0,155.0,36.8,0.340,27
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1,126.0,60.0,29.0,155.0,30.1,0.349,47


In [7]:
# Still slicing data.
# Here we only need the answer (the label): column position 8, which is the
# column named 'Outcome'.

y = data.iloc[:,8]
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [8]:
# Recall that we imported train_test_split at the top.
# We pass it X, y, test_size and random_state.
# random_state is optional; fixing it makes the shuffle reproducible, so that
# all of us get the same split and can compare answers.
# test_size=0.2 means 20% of the rows are set aside for testing later and the
# remaining 80% are used for training.

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [9]:
# Training portion of the features; the row labels are the original index values.
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
603,7,150.0,78.0,29.0,126.0,35.2,0.692,54
118,4,97.0,60.0,23.0,155.0,28.2,0.443,22
247,0,165.0,90.0,33.0,680.0,52.3,0.427,23
157,1,109.0,56.0,21.0,135.0,25.2,0.833,23
468,8,120.0,72.0,29.0,155.0,30.0,0.183,38
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63
192,7,159.0,66.0,29.0,155.0,30.4,0.383,36
629,4,94.0,65.0,22.0,155.0,24.7,0.148,21
559,11,85.0,74.0,29.0,155.0,30.1,0.300,35


In [10]:
# Held-out portion of the features, kept aside for evaluation.
X_test

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
661,1,199.0,76.0,43.0,155.0,42.9,1.394,22
122,2,107.0,74.0,30.0,100.0,33.6,0.404,23
113,4,76.0,62.0,29.0,155.0,34.0,0.391,25
14,5,166.0,72.0,19.0,175.0,25.8,0.587,51
529,0,111.0,65.0,29.0,155.0,24.6,0.660,31
...,...,...,...,...,...,...,...,...
476,2,105.0,80.0,45.0,191.0,33.7,0.711,29
482,4,85.0,58.0,22.0,49.0,27.8,0.306,28
230,4,142.0,86.0,29.0,155.0,44.0,0.645,22
527,3,116.0,74.0,15.0,105.0,26.3,0.107,24


In [11]:
# Labels that belong to the training rows.
y_train

603    1
118    0
247    0
157    0
468    1
      ..
763    0
192    1
629    0
559    0
684    0
Name: Outcome, Length: 614, dtype: int64

In [12]:
# Labels that belong to the held-out test rows.
y_test

661    1
122    0
113    0
14     1
529    0
      ..
476    1
482    0
230    1
527    0
380    0
Name: Outcome, Length: 154, dtype: int64

In [13]:
# Standardise the features.
# The scaler learns its parameters from the training split only; that same
# fitted transformation is then applied to both the training and test splits.
# Note: X_train and X_test are overwritten with the scaled arrays.

scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [14]:
# Scaled training features (now a NumPy array rather than a DataFrame).
X_train

array([[ 0.90832902,  0.93641795,  0.44764174, ...,  0.36863635,
         0.67740401,  1.69955804],
       [ 0.03644676, -0.81630913, -1.05200558, ..., -0.63294341,
        -0.07049698, -0.96569189],
       [-1.12606292,  1.43247278,  1.44740662, ...,  2.81535261,
        -0.11855487, -0.88240283],
       ...,
       [ 0.03644676, -0.91552009, -0.63543688, ..., -1.13373329,
        -0.95656442, -1.04898095],
       [ 2.0708387 , -1.21315299,  0.11438678, ..., -0.36108605,
        -0.50001442,  0.11706589],
       [ 0.32707418,  0.47343344,  0.7808967 , ..., -0.08922869,
         0.52121586,  2.94889395]])

In [15]:
# Scaled test features (now a NumPy array rather than a DataFrame).
X_test

array([[-0.8354355 ,  2.55686374,  0.28101426, ...,  1.47037408,
         2.78594417, -0.96569189],
       [-0.54480808, -0.4856059 ,  0.11438678, ...,  0.13970383,
        -0.1876381 , -0.88240283],
       [ 0.03644676, -1.51078589, -0.8853781 , ...,  0.19693696,
        -0.22668514, -0.71582471],
       ...,
       [ 0.03644676,  0.67185537,  1.11415166, ...,  1.62776519,
         0.53623395, -0.96569189],
       [-0.25418066, -0.187973  ,  0.11438678, ..., -0.90480077,
        -1.07971278, -0.79911377],
       [-0.8354355 , -0.4856059 , -0.0522407 , ..., -0.26092807,
         1.06487079, -0.79911377]])

In [16]:
# Total number of labelled samples in the full dataset.
len(y)

768

In [17]:
# Square root of the number of training rows.
# NOTE(review): presumably the sqrt(n) rule of thumb for choosing k — confirm.
import math
math.sqrt(len(X_train))

24.779023386727733

In [18]:
# Square root of the number of test rows.
# NOTE(review): presumably the basis for the n_neighbors=11 used by the
# classifier below (an odd number close to this value) — confirm.
import math
math.sqrt(len(y_test))

12.409673645990857

In [19]:
# K-nearest-neighbours classifier: each prediction is a vote among the 11
# closest training points, with closeness measured by Euclidean distance.
classifier = KNeighborsClassifier(n_neighbors=11,p=2,metric='euclidean')

In [20]:
# Fit the classifier on the scaled training features and their labels.
classifier.fit(X_train,y_train)

KNeighborsClassifier(metric='euclidean', n_neighbors=11)

In [21]:
# Predict a class label for every row of the scaled test set.
y_pred = classifier.predict(X_test)

In [22]:
# Compare the predictions with the true test labels.
# confusion_matrix puts the true classes on rows and the predicted classes on
# columns, so for labels 0/1 the layout is [[TN, FP], [FN, TP]].
conf_matrix = confusion_matrix(y_test,y_pred)
print(conf_matrix)
# F1 score for the positive class (label 1): harmonic mean of precision and recall.
print(f1_score(y_test,y_pred))

[[94 13]
 [15 32]]
0.6956521739130436


In [23]:
# Accuracy: the fraction of test rows whose predicted label equals the true
# label, i.e. (TN + TP) divided by the number of test rows.
print(accuracy_score(y_test,y_pred))

0.8181818181818182


In [24]:
# Show the cleaned DataFrame again (pandas abbreviates the middle rows).
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0
764,2,122.0,70.0,27.0,155.0,36.8,0.340,27,0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30,0
766,1,126.0,60.0,29.0,155.0,30.1,0.349,47,1


In [39]:
# One hand-written sample (the feature values of row 0 of `data`), passed
# through the already-fitted scaler so it is on the same scale the model was
# trained on.
ex_list = scaler.transform([[6, 148.0, 72.0, 35.0, 155.0, 33.6, 0.627, 50]])
ex_list

array([[ 0.6177016 ,  0.87027731, -0.0522407 ,  0.65409323, -0.01555866,
         0.13970383,  0.48216882,  1.3664018 ]])

In [40]:
# Predict the class of the scaled example; the result is a one-element array.
predii = classifier.predict(ex_list)
predii

array([1], dtype=int64)

In [44]:
# Pull the single predicted label out of the array.
predii[0]

1

In [47]:
# Generate a random UUID and show it as a 32-character hexadecimal string.
# NOTE(review): the value is not used anywhere else in the visible notebook.
import uuid
uuid.uuid4().hex

'13e75254682c41b0b263d25101d177fa'

In [42]:
import pickle

In [None]:
# Persist the fitted KNN model to disk.
# The context manager closes the file even if pickling raises; a bare open()
# left the handle open and the data possibly unflushed.
with open("classifierModel.pkl", "wb") as model_file:
    pickle.dump(classifier, model_file)

In [43]:
# Persist the fitted scaler so new samples can be scaled exactly like the
# training data before they are given to the saved model.
# The context manager closes the file even if pickling raises.
with open('scalerObject.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)