### KNN
- Build a KNN model that predicts, with a high accuracy score, whether or not a person will have diabetes.
- Perform some appropriate Pre-Processing steps on the given dataset for better results. 
- Implement the KNN algorithm on your own.
- Try other preprocessing and model-tuning steps that may increase accuracy, such as increasing the value of K, normalizing the features, and using different distance metrics.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the diabetes dataset from the notebook-relative dataset folder.
df = pd.read_csv("./dataset/diabetes.csv")
# Preview the first rows (head() returns 5 rows by default).
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Remove exact duplicate rows in place; the first occurrence is kept,
# which is the default behaviour of `keep`.
df.drop_duplicates(inplace=True)

In [4]:
# Drop, in place, every row that contains at least one missing (NaN) value.
# NOTE(review): the preview above shows zeros in columns such as Insulin and
# SkinThickness; if those zeros stand for "not measured", dropna will not
# catch them -- confirm against the dataset description.
df.dropna(axis=0, how="any", inplace=True)

In [5]:
# Features: every column except the target.
x = df.drop(columns=['Outcome'])
# Target: the 'Outcome' column, kept as a single-column DataFrame.
y = df[['Outcome']]

In [6]:
# Convert to plain NumPy arrays, which drops the column names.
# np.asarray accepts both a DataFrame and an existing ndarray, so this cell
# can be re-run safely; calling .to_numpy() a second time would raise
# AttributeError because x / y would already be arrays.
x = np.asarray(x)
y = np.asarray(y)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the samples for evaluation; the fixed random_state keeps
# the split reproducible between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [1]:
class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Each prediction is the majority label among the ``k`` training samples
    that are closest to the query point.

    Parameters
    ----------
    k : int
        Number of neighbours that vote; must be a positive integer. If ``k``
        is larger than the number of training samples, all samples vote.
    metric : {"euclidean", "manhattan"}, default "euclidean"
        Distance used to rank the training samples.

    Raises
    ------
    ValueError
        If ``k`` is not a positive integer or ``metric`` is not supported.
    """

    _METRICS = ("euclidean", "manhattan")

    def __init__(self, k, metric="euclidean"):
        # bool is a subclass of int, so reject it explicitly.
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        if metric not in self._METRICS:
            raise ValueError(
                f"metric must be one of {self._METRICS}, got {metric!r}"
            )
        self.k = int(k)
        self.metric = metric

    def fit(self, x_train, y_train):
        """Store the training data; all computation happens in ``predict``.

        Parameters
        ----------
        x_train : array-like
            Training samples, one sample per entry along the first axis.
        y_train : array-like
            Labels, one per training sample. Column vectors such as
            ``(n, 1)`` are flattened to 1-D.

        Raises
        ------
        ValueError
            If the training set is empty or the number of labels does not
            match the number of samples.
        """
        samples = np.asarray(x_train, dtype=float)
        labels = np.asarray(y_train).ravel()  # make 1D array
        if samples.ndim == 0 or samples.shape[0] == 0:
            raise ValueError("x_train must contain at least one sample")
        if samples.shape[0] != labels.shape[0]:
            raise ValueError(
                f"x_train has {samples.shape[0]} samples but y_train has "
                f"{labels.shape[0]} labels"
            )
        self.x_train = samples
        self.y_train = labels

    def predict(self, x_test):
        """Return a list with one predicted label per sample in ``x_test``."""
        queries = np.asarray(x_test, dtype=float)
        return [self._predict_one(query) for query in queries]

    def _predict_one(self, x):
        """Predict the label of a single sample by majority vote."""
        distances = self._distances(x)

        # A stable sort makes the choice among equally distant neighbours
        # deterministic (earlier training samples win).
        nearest = np.argsort(distances, kind="stable")[: self.k]

        # np.unique returns the labels sorted, and argmax picks the first
        # maximum, so a tied vote resolves to the smallest label.
        labels, votes = np.unique(self.y_train[nearest], return_counts=True)
        return labels[np.argmax(votes)]

    def _distances(self, x):
        """Return the distance from ``x`` to every training sample."""
        # Flatten each per-sample difference so the reduction over axis 1
        # works for scalar, vector or higher-dimensional samples alike.
        diff = (self.x_train - x).reshape(len(self.x_train), -1)
        if self.metric == "manhattan":
            return np.abs(diff).sum(axis=1)
        return np.sqrt((diff ** 2).sum(axis=1))

In [10]:
# Build a classifier that votes among the 5 nearest neighbours and hand it
# the training split.
model = KNN(k=5)
model.fit(x_train, y_train)

In [11]:
# Predict a label for every sample in the held-out test split.
y_pred = model.predict(x_test=x_test)

In [12]:
from sklearn.metrics import confusion_matrix

In [13]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[83, 16],
       [34, 21]], dtype=int64)