In [93]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from mlxtend.plotting import plot_decision_regions
import numpy as np
from collections import Counter

In [94]:
# Read the raw dataset into a DataFrame.
# NOTE(review): absolute path - presumably the Colab upload location; confirm.
df = pd.read_csv(filepath_or_buffer='/content/data.csv')

In [95]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [96]:
# Remove the 'id' identifier column and the 'Unnamed: 32' column (blank in the
# preview rows) so they are not used as features.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError once the columns
# have already been removed.
df = df.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [97]:
# The first column is the target; every remaining column is a feature.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Splitting the data into training and test sets

In [98]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)


# Scaling the features (KNN is distance-based, so features need a common scale)

In [99]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# perform KNN

In [100]:
# Baseline scikit-learn classifier voting over the 15 nearest neighbours.
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X=X_train, y=y_train)

In [101]:
# Accuracy of the fitted classifier on the held-out test split.
accuracy_score(y_true=y_test, y_pred=knn.predict(X_test))

0.9912280701754386

# Check best value for K

In [102]:
# Collect the test-split accuracy for every k from 1 to 15.
# NOTE(review): k is compared on the test split itself, so the best score in
# this list is an optimistic estimate.
scores = []
for i in range(1, 16):
    # fit() returns the estimator, so construction and training can be chained.
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train, y_train)
    scores += [accuracy_score(y_test, knn.predict(X_test))]

In [103]:
# Accuracy per k; the first entry corresponds to k=1, the last to k=15.
scores

[0.956140350877193,
 0.9736842105263158,
 0.9824561403508771,
 0.9736842105263158,
 0.9649122807017544,
 0.9736842105263158,
 0.9649122807017544,
 0.9736842105263158,
 0.9824561403508771,
 0.9824561403508771,
 0.9736842105263158,
 0.9824561403508771,
 0.9824561403508771,
 0.9824561403508771,
 0.9912280701754386]

# Creating our own KNN class from scratch

In [104]:
class MyKNN:
    """Minimal k-nearest-neighbours classifier using Euclidean distance.

    Parameters
    ----------
    k : int
        Number of nearest training samples that vote on each prediction.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k):
        if k < 1:
            raise ValueError("k must be a positive integer")
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        """Memorise the training samples and their labels.

        Parameters
        ----------
        X_train : array-like, one row per sample
        y_train : array-like, one label per row of ``X_train``

        Raises
        ------
        ValueError
            If the number of samples and labels differ.
        """
        # Store plain arrays so rows and labels are addressed by position.
        # A pandas Series coming out of a shuffled split keeps its original
        # index, so label-based lookup with positional neighbour indices
        # would return the wrong label or raise KeyError.
        features = np.asarray(X_train)
        labels = np.asarray(y_train)
        if len(features) != len(labels):
            raise ValueError("X_train and y_train must have the same length")
        self.X_train = features
        self.y_train = labels

    def predict(self, X_test):
        """Predict a label for every row of ``X_test``.

        Parameters
        ----------
        X_test : array-like, one row per sample to classify

        Returns
        -------
        numpy.ndarray
            Predicted labels, in the same order as the rows of ``X_test``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError("fit must be called before predict")

        y_pred = []
        for sample in np.asarray(X_test):
            distances = [
                self.calculate_euclidean(sample, row) for row in self.X_train
            ]
            # Pair each distance with its training-row position and keep the
            # k closest. sorted() is stable, so equal distances keep their
            # training order.
            nearest = sorted(enumerate(distances), key=lambda pair: pair[1])[:self.k]
            y_pred.append(self.calculate_labels(nearest))

        return np.array(y_pred)

    def calculate_labels(self, data):
        """Return the majority label among the given neighbours.

        ``data`` is an iterable of ``(training_row_position, distance)``
        pairs. When several labels share the highest count, the one
        encountered first in ``data`` wins.
        """
        labels = [self.y_train[position] for position, _ in data]
        return Counter(labels).most_common(1)[0][0]

    def calculate_euclidean(self, x, y):
        """Return the Euclidean distance between the vectors ``x`` and ``y``."""
        return np.linalg.norm(x - y)

In [105]:
myknn = MyKNN(k=15)
myknn.fit(X_train, y_train)
# Evaluate the from-scratch model itself: scoring `knn` here would report the
# scikit-learn classifier's accuracy and never exercise MyKNN.predict.
accuracy_score(y_test, myknn.predict(X_test))

0.9912280701754386