In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:
# Load the diabetes dataset (a commonly used dataset available on kaggle.com).
# It is used here for a classification problem: diabetic vs. non-diabetic.
dataset = pd.read_csv('diabetes.csv')

# Show the number of rows first, then a preview of the leading rows.
print(dataset.shape[0])
print(dataset.head())

768
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [3]:
# Data cleaning: a value of zero in the columns listed below does not make
# sense, so zeros are treated as missing values. One way to handle them is to
# replace each missing value with the mean of the remaining values in that
# column.
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

for column in zero_not_accepted:
    # Mark zeros as missing. `np.nan` is used because the `np.NaN` alias was
    # removed in NumPy 2.0 and raises AttributeError there.
    observed = dataset[column].replace(0, np.nan)
    # The mean is taken over the non-missing values only and truncated to an
    # integer before being used as the fill value.
    # NOTE(review): the truncation loses precision for fractional columns such
    # as BMI, and the mean is computed over the full dataset before the
    # train/test split - confirm both are intended.
    fill_value = int(observed.mean(skipna=True))
    dataset[column] = observed.fillna(fill_value)

In [4]:
# Separate the features (first eight columns) from the target (ninth column),
# then hold out 20% of the rows as a test set. The fixed random_state makes the
# split repeatable.
x = dataset.iloc[:, :8]
y = dataset.iloc[:, 8]

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [5]:
# Feature scaling: the scaler is fitted on the training split only, and the
# same fitted scaler is then applied to both the training and the test split.
sc_x = StandardScaler()
sc_x.fit(x_train)
x_train = sc_x.transform(x_train)
x_test = sc_x.transform(x_test)

In [6]:
# Choose k for k-NN: take the square root of the number of test samples,
# round it down to an integer, and if the result is even subtract one so that
# k is odd.
# NOTE(review): k is derived from the size of the test split here; the
# square-root heuristic is often stated over the training-set size - confirm
# which is intended.
import math

k = int(math.sqrt(len(y_test)))
k = k if k % 2 else k - 1

In [7]:
# Define the model: a k-nearest-neighbours classifier using the k chosen above
# and the Euclidean distance metric.
classifier = KNeighborsClassifier(
    n_neighbors=k,
    p=2,
    metric='euclidean',
)

In [8]:
# Fit the classifier on the scaled training features and their labels.
classifier.fit(X=x_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')

In [9]:
# Predict a label for every row of the scaled test set.
y_pred = classifier.predict(X=x_test)
# print(y_pred)  # uncomment to inspect the raw predictions

[1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0
 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0
 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 0]


In [10]:
# Evaluate the model with a confusion matrix of true vs. predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[94 13]
 [15 32]]
0.6956521739130436
0.8181818181818182


In [11]:
# Evaluate the model with the F1 score on the test set.
print(f1_score(y_true=y_test, y_pred=y_pred))

0.6956521739130436


In [12]:
# Evaluate the model with the accuracy score on the test set.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8181818181818182
