# import libraries

In [None]:
import numpy as np
from collections import Counter

# define Euclidean distance

In [None]:
def euclidean_distance(point1, point2):
    """Return the Euclidean (straight-line) distance between two points.

    Both arguments are converted with ``np.array`` before subtracting, so
    plain Python sequences of numbers are accepted.
    """
    first = np.array(point1)
    second = np.array(point2)
    offset = first - second
    squared_total = np.sum(offset ** 2)
    return np.sqrt(squared_total)

# prediction function

In [None]:
def knn_predict(training_data, training_labels, test_point, k):
    """Predict the label of ``test_point`` by majority vote of its nearest neighbours.

    Args:
        training_data: Sequence of training points; each point is a sequence
            of numbers with the same length as ``test_point``.
        training_labels: Sequence of labels, where ``training_labels[i]`` is
            the label of ``training_data[i]``.
        test_point: The point to classify.
        k: Number of nearest neighbours that take part in the vote. If it is
            larger than the number of training points, all of them vote.

    Returns:
        The most common label among the ``k`` nearest training points. When
        several labels are equally common, the one met first in order of
        increasing distance wins.

    Raises:
        ValueError: If ``k`` is smaller than 1, if the training set is empty,
            or if the number of points and labels differ.
    """
    # A negative k would silently slice off the *farthest* points instead of
    # selecting the nearest ones, and k == 0 would leave nothing to vote on.
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    if len(training_data) != len(training_labels):
        raise ValueError(
            "training_data and training_labels must have the same length, "
            f"got {len(training_data)} and {len(training_labels)}"
        )
    if len(training_data) == 0:
        raise ValueError("training_data must not be empty")

    # Pair every distance with the label of the point it was measured to.
    distances = [
        (euclidean_distance(test_point, point), label)
        for point, label in zip(training_data, training_labels)
    ]
    # Sort on the distance only, so labels never need to be comparable.
    distances.sort(key=lambda pair: pair[0])
    k_nearest_labels = [label for _, label in distances[:k]]
    return Counter(k_nearest_labels).most_common(1)[0][0]

# training set, test point and k

In [None]:
# Toy data set: three 'A' points with small coordinates, two 'B' points with larger ones.
training_data = [
    [1, 2],
    [2, 3],
    [3, 4],
    [6, 7],
    [7, 8],
]
training_labels = ['A'] * 3 + ['B'] * 2

# Point to classify and the number of neighbours that vote.
test_point, k = [4, 5], 3

In [None]:
# Classify the test point with the from-scratch KNN and show the predicted label.
prediction = knn_predict(
    training_data=training_data,
    training_labels=training_labels,
    test_point=test_point,
    k=k,
)
print(prediction)

# using scikit-learn

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Reading the Dataset

In [None]:
# Load the breast-cancer data set into a DataFrame.
# NOTE(review): this URL looks like a Kaggle dataset page rather than a raw
# CSV file - confirm that pandas can actually download it.
df = pd.read_csv('https://www.kaggle.com/datasets/yasserh/breast-cancer-dataset.csv')

# Target is the 'diagnosis' column; the features are every other column
# except the 'Unnamed: 32' and 'id' columns, which are dropped as well.
y = df['diagnosis']
X = df.drop(columns=['diagnosis', 'Unnamed: 32', 'id'])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# training

In [None]:
# Fit one classifier per neighbour count (k = 2 .. 20) and record how it
# scores on the training split and on the test split.
scores = {}  # k -> [training score, test score]

for k in range(2, 21):
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train, y_train)

    training_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    scores[k] = [training_score, test_score]

# Parallel lists for plotting, in the same order as the loop above.
K = list(scores)
training = [train_part for train_part, _ in scores.values()]
test = [test_part for _, test_part in scores.values()]

# evaluate

In [None]:
# Print one line per k in the form "k : [training score, test score]".
for k_value, score_pair in scores.items():
    print(f"{k_value} : {score_pair}")

# Plotting the training and test scores graph

In [None]:
# Strip plot of the training score obtained for each value of k.
ax = sns.stripplot(x=K, y=training)
ax.set_xlabel('Values of k')
ax.set_ylabel('Training Score')
plt.show()

In [None]:
# Strip plot of the test score obtained for each value of k.
ax = sns.stripplot(x=K, y=test)
ax.set_xlabel('Values of k')
ax.set_ylabel('Test Score')
plt.show()

In [None]:
# Overlay training scores (black) and test scores (green) against k.
# The legend and axis labels make the two series distinguishable, which the
# choice of k read off this plot depends on.
plt.scatter(K, training, color='k', label='Training Score')
plt.scatter(K, test, color='g', label='Test Score')
plt.xlabel('Values of k')
plt.ylabel('Score')
plt.legend()
plt.show()

### From the scatter plot above, we can conclude that the optimal value of k is around 5.

# resources
https://www.geeksforgeeks.org/machine-learning/k-nearest-neighbor-algorithm-in-python/

https://www.geeksforgeeks.org/machine-learning/implementation-of-k-nearest-neighbors-from-scratch-using-python/

https://www.geeksforgeeks.org/machine-learning/mathematical-explanation-of-k-nearest-neighbour/

https://www.geeksforgeeks.org/machine-learning/weighted-k-nn/

https://www.geeksforgeeks.org/machine-learning/elbow-method-for-optimal-value-of-k-in-kmeans/