In [6]:
import pandas as pd 
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [7]:
# loading in the dataset and checking to see that it has loaded propperly

dataset = pd.read_csv('./Datasets/diabetes.csv')
print(len(dataset))
print(dataset.head())

768
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI   
0            6      148             72             35        0  33.6  \
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [8]:
# removing no inputs in the data with the mean number of the data to help the effectivness of the k-nn 

zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', "BMI", 'Insulin']

for column in zero_not_accepted:
    dataset[column] = dataset[column].replace(0,np.NaN)
    mean = int(dataset[column].mean(skipna=True))
    dataset[column] = dataset[column].replace(np.NaN, mean)


In [11]:
# splitting the input and output data
X = dataset.iloc[:, 0:8]
Y = dataset.iloc[:, 8]
# splitting the data into the training data and the testing data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, random_state=0, test_size = 0.2)

In [13]:
# standardizing the data to be of numbers between 0-1
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

In [14]:
# setting up and training k-nn
classifier = KNeighborsClassifier(n_neighbors =11, p=2, metric='euclidean' )
classifier.fit(X_train, Y_train)

In [15]:
# testing the k-nn and outputing the confusin matrix
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)

[[95 12]
 [18 29]]


In [16]:
# producing f1 score and accuracy score
print(f1_score(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))

0.6590909090909092
0.8051948051948052
