In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# metrics used to evaluate the model's predictions on the test set
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [3]:
# Load the diabetes data, then show the row count and the first five rows.
dataset = pd.read_csv('diabetes.csv')
print(dataset.shape[0])
print(dataset.head(5))

768
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  Unnamed: 9  
0                     0.627   50        1         NaN  
1                     0.351   31        0         NaN  
2                     0.672   32        1         NaN  
3                     0.167   21        0         NaN  
4                     2.288   33        1         NaN  


In [5]:
# Replace zeros with a per-column mean.
# A 0 in any of these columns is treated as a missing measurement rather than
# a real reading, so it is first turned into NaN and then filled in.
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

# NOTE(review): the mean is computed over the whole dataset, i.e. before the
# train/test split performed later in the notebook, so test rows influence the
# fill value. Consider imputing after the split using training rows only.
for column in zero_not_accepted:
    # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0.
    dataset[column] = dataset[column].replace(0, np.nan)
    # The mean ignores the NaNs just introduced; int() truncates it to a whole number.
    mean = int(dataset[column].mean(skipna=True))
    dataset[column] = dataset[column].fillna(mean)

In [6]:
# Spot-check one of the columns that had its zeros replaced.
print(dataset.loc[:, 'Glucose'])

0      148.0
1       85.0
2      183.0
3       89.0
4      137.0
       ...  
763    101.0
764    122.0
765    121.0
766    126.0
767     93.0
Name: Glucose, Length: 768, dtype: float64


In [8]:
# Split the dataset into train and test sets before going any further.
# Per the preview printed earlier, the first eight columns (Pregnancies ... Age)
# are the features and the ninth column (Outcome) is the label.
X = dataset.iloc[:, :8]
y = dataset.iloc[:, 8]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [9]:
# Feature scaling: any algorithm that computes distances or assumes normality
# needs its features on a comparable scale.
# The scaler is fitted on the training split only; the same fitted transform
# is then applied to both the training and the test split.
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

In [13]:
import math

# Square root of the number of test samples.
# NOTE(review): presumably a rule-of-thumb starting point for choosing k - confirm.
n_test_samples = len(y_test)
math.sqrt(n_test_samples)

12.409673645990857

In [10]:
# Define the K-NN model; fitting on the training data happens in the next cell.
# n_neighbors is the "k" in K-NN. metric selects the distance function
# (Euclidean here). p is the Minkowski power parameter, where p=2 corresponds
# to Euclidean distance.
# NOTE(review): p looks redundant once metric='euclidean' is given - confirm
# against the scikit-learn docs.
classifier = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
)

In [11]:
# Fit the classifier on the scaled training features and their labels.
classifier.fit(X=X_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

In [16]:
# Predict labels for the scaled test features; the bare name on the last line
# makes the notebook display the resulting array.
y_pred = classifier.predict(X=X_test)
y_pred

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [20]:
# Evaluate the model on the held-out test set.

# Confusion matrix of true labels against predicted labels
# (scikit-learn puts true classes on the rows and predicted classes on the columns).
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

# F1 score first, then accuracy.
# F1 is the harmonic mean of precision and recall, so it is lowered by both
# false positives and false negatives; accuracy is the fraction of correct predictions.
for metric in (f1_score, accuracy_score):
    print(metric(y_test, y_pred))

[[94 13]
 [15 32]]
0.6956521739130436
0.8181818181818182
