<a href="https://colab.research.google.com/github/kristianJW54/ML-AI-Models-Projects/blob/main/KNN_Model_Build.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [102]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.stats import mode

from sklearn.neighbors import KNeighborsClassifier

**Select Data and Split into train and validate sets**

In [137]:
path = "/content/drive/MyDrive/Colab Notebooks/ML - AI Learning/diabetes.csv"

data = pd.read_csv(path)

data.head()

X = data.iloc[:,:-1].values

Y = data.iloc[:,-1:].values

Y = Y.ravel()

# Splitting dataset into train and test set

X_train, X_test, Y_train, Y_test = train_test_split(
  X, Y, test_size = 1/3, random_state = 0 )

# Scikit-learn k-NN implementation
knn_sklearn = KNeighborsClassifier(n_neighbors=4)
knn_sklearn.fit(X_train, Y_train)
Y_predict_sklearn = knn_sklearn.predict(X_test)




**Basic KNN**


A data point will have a list of features : [x1, x2, x3...xn]

Which will have a seperate classified label : [0] or [1]



Distance = the square root of the squared difference between the training data and the test/unclassified data



**Build Classifier Class**

- First create a method for calculating the distance
- Next a method to classify the data
- A method to determine validation error is needed to determine the best n for K
- A fit() method to store the training data
- Finally a predict method to predict new data and classify

In [135]:
class knn():
  def __init__(self, k):
    self.k = k
    # self.X_train = X_train
    # self.y_train = y_train

  #Storing Training Set
  def fit(self, X_train, Y_train):
    self.X_train = X_train
    self.Y_train = Y_train
    self.validation_error = 0.0
    self.Y_predict = None

    # no_of_test_examples, no_of_features
    self.m, self.n = self.X_train.shape


  def distance(self, x, x_train):
    return np.sqrt( np.sum( np.square( x - x_train ) ) )

  def find_nearest(self, x):

    # calculate all the euclidean distances between current
    # test example x and training set X_train

    #Initialise array of zeros for the length of X_train
    distances = np.zeros(self.m)


    for i in range(self.m):
      #Finding the distance using the previous created method, between current test (x)
      #And the current training row in the loop: X_train[i]
      d = self.distance(x , self.X_train[i])
      # print(f"X train Row: {self.X_train[i]}")
      # print(f"X test Row: {x}")
      # print(f"Distance for d: {d} Number {i}")
      #Updating the distances array of zeros with the actual distance at the specified index from the loop
      distances[i] = d

      # sort Y_train according to euclidean_distance_array and
      # store into Y_train_sorted

    inds = distances.argsort()
    Y_train_sorted = self.Y_train[inds]
    # print(f"Distances Updated: {inds}")
    #Return the K number of list items in the sorted array
    # print("Y_train_sorted")
    # print(Y_train_sorted.T)
    return Y_train_sorted[:self.k]


  def mode(self, neighbors):
    num1 = sum(neighbors)
    num0 = len(neighbors) - num1

    # Return the majority class, or 0 in case of a tie
    if num1 > num0:
        return 1
    elif num1 < num0:
        return 0
    else:
        return 0  # Return 0 in case of a tie

  def classify(self, X_test):

    self.X_test = X_test

    #Storing the length and features length of the test set
    self.m_test, self.n = X_test.shape

    #Initialise an array of zeros for Y_train using the length of the test set
    Y_predict = np.zeros(self.m_test, dtype="int")
    # print("Y Initial:")
    # print(Y_predict)

    for i in range(self.m_test):

      #This will loop through each row of the test set
      x = self.X_test[i]
      # print(f"Looped X (test) Number {i}:")
      # print(x)

      neighbor = np.zeros(self.k)
      # print("Initial Neighbor:")
      # print(neighbor)
      neighbor = self.find_nearest(x)
      # print("Neighbor:")
      # print(neighbor)


      Y_predict[i] = self.mode(neighbor)
      # most frequent class in K neighbors

      # Y_predict[i] = mode( neighbor )[0][0]

    self.Y_predict = Y_predict
    return Y_predict


test = knn(k=4)
test.fit(X_train, Y_train)
test.classify(X_test)
print("Custom:")
print(test.Y_predict)
print("SKlearn")
print(Y_predict_sklearn)
print("Actual")
print(Y_test.T)


Custom:
[0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
SKlearn
[0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0]
Actual
[1 0 0 1 1 1 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 1 1 1 0 0 1 1 0 1 1 0 0 0 0 0]


**Comparing Accuracy of Custom Model to SKLearn Model**

In [136]:
from sklearn.metrics import accuracy_score

#Testing accuracy scores


# Custom k-NN implementation
test_custom = knn(k=3)
test_custom.fit(X_train, Y_train)
Y_predict_custom = test_custom.classify(X_test)

# Scikit-learn k-NN implementation
knn_sklearn = KNeighborsClassifier(n_neighbors=3)
knn_sklearn.fit(X_train, Y_train)
Y_predict_sklearn = knn_sklearn.predict(X_test)

# Compare the predictions
print("Custom k-NN Predictions:")
print(Y_predict_custom)

print("\nScikit-learn k-NN Predictions:")
print(Y_predict_sklearn)

# Compare accuracy
accuracy_custom = accuracy_score(Y_test, Y_predict_custom)
accuracy_sklearn = accuracy_score(Y_test, Y_predict_sklearn)

print("\nAccuracy (Custom):", accuracy_custom)
print("Accuracy (Scikit-learn):", accuracy_sklearn)


Custom k-NN Predictions:
[0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0]

Scikit-learn k-NN Predictions:
[0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0]

Accuracy (Custom): 0.6388888888888888
Accuracy (Scikit-learn): 0.6388888888888888


**Optimizing the class**

- Experimenting with feature scaling to optimize the accuracy
- Implementing a K identifier by comparing accuracy scores for all K values and choosing the best one (basic grid search)

In [163]:
class knn_optimal():
  def __init__(self, k=None):
    self.k = k
    self.X_train = None
    self.Y_train = None
    self.Y_predict = None
    self.accuracy = None
    self.scaled = None

  def fit(self, X_train, Y_train, normalize=True):
    if normalize:
      self.X_train = self.normalize(X_train)
      self.scaled = True
    else:
      self.X_train = X_train
      self.scaled = False

    self.Y_train = Y_train

    self.m, self.n = self.X_train.shape

  def normalize(self, x):
    #Scale each row
    mean = x.mean(axis=0)
    std = x.std(axis=0)
    normalized = (x - mean) / std
    return normalized

  def accuracy_score(self, Y_test):
    length = len(Y_test)
    sum = np.sum(self.Y_predict == Y_test)
    accuracy = sum/length

    self.accuracy = accuracy
    return accuracy

  def mode(self, vector):
    num1 = sum(vector)
    num0 = len(vector) - num1
    if num1 > num0:
        return 1
    elif num1 < num0:
        return 0
    else:
        return 0

  #Grid Search method
  def grid_search(self, X_test, Y_test):

    best_k = None
    best_accuracy = 0.0
    max_k = int(np.sqrt(len(self.X_train)))

    for k in range(1, max_k + 1):
      self.k = k
      self.fit(self.X_train, self.Y_train, self.scaled)
      Y_val_predict = self.predict(X_test)
      accuracy = self.accuracy_score(Y_test)

      if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_k = k

    print(f"Best Accuracy: {best_accuracy}")
    print(f"Best K: {best_k}")


  #Distance Method
  def distance(self, x, x_train):
    return np.sqrt(np.sum(np.square(x - x_train)))

  def find_nearest(self, x):

    distances = np.zeros(self.m)

    for i in range(self.m):

      d = self.distance(x, self.X_train[i])
      distances[i] = d

    indexes = distances.argsort()
    Y_train_sorted = self.Y_train[indexes]

    return Y_train_sorted[:self.k]


  def predict(self, X_test):
    if self.scaled:
      self.X_test = self.normalize(X_test)
    else:
      self.X_test = X_test

    self.m_test, self.n = self.X_test.shape

    Y_predict = np.zeros(self.m_test, dtype="int")

    for i in range(self.m_test):

      x = self.X_test[i]

      neighbor = np.zeros(self.k)
      neighbor = self.find_nearest(x)

      Y_predict[i] = self.mode(neighbor)
    self.Y_predict = Y_predict

    return Y_predict



test2 = knn_optimal(k=1)
test2.fit(X_train, Y_train)
predict = test2.predict(X_test)
print(predict)
test2.grid_search(X_test, Y_test)





[0 0 0 0 1 0 0 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0]
Best Accuracy: 0.6944444444444444
Best K: 1


**Improved Accuracy of predictions by 0.05555555555555558**