## Imports

In [None]:
import numpy as np
from sklearn import preprocessing, model_selection, neighbors
import pandas as pd 
from math import sqrt
from scipy.spatial import distance
import random
from collections import Counter

# Load the dataset into a DataFrame. read_csv is called with its defaults and a
# later cell drops a column named 'id', so the file presumably begins with a
# header row that names the columns — TODO confirm against the data file.
df = pd.read_csv('breast-cancer-wisconsin.data.txt')


## Understand The Data

In [None]:
# Report the frame's dimensions, one per line: row count, then column count.
print(*df.shape, sep='\n')

In [None]:
# Count the null entries in each column.
df.isnull().sum()

In [None]:
# Show the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Display pandas' summary statistics for the DataFrame.
df.describe()

In [None]:
# Print, for every column, how many distinct values it contains.
for column in df:
    distinctValues = df[column].unique()
    print(f"{column} : {len(distinctValues)}")

## Clean And Prepare The Data

In [None]:
# Swap every '?' cell for the numeric sentinel -99999, in place. A later cell
# casts the whole frame to float, which a literal '?' would presumably break.
# NOTE(review): '?' looks like the dataset's missing-value marker — confirm.
df.replace('?', -99999, inplace= True)

In [None]:
# Remove the 'id' column in place so it does not end up among the feature
# values that later cells hand to the classifier.
df.drop(['id'], axis= 1,inplace= True)

In [None]:
# Cast every column to float and convert the frame into a plain list of rows
# (each row is itself a list of floats).
fullData = df.astype(float).values.tolist()

In [None]:
# Shuffle the rows in place; the later train/test split slices by position.
# No seed is set in the visible cells, so the order differs from run to run.
random.shuffle(fullData)

In [None]:
# Peek at the first five shuffled rows.
print(fullData[:5])

## K Nearest Neighbors 

In [None]:
def kNearestNeighbors(data, predict, k=3):
    """Classify ``predict`` by majority vote among its ``k`` nearest neighbours.

    Args:
        data: Mapping of class label -> iterable of feature vectors that
            belong to that class.
        predict: Feature vector to classify; it must have the same length
            as the vectors stored in ``data``.
        k: Number of nearest neighbours that take part in the vote.

    Returns:
        The class label that occurs most often among the ``k`` points with
        the smallest Euclidean distance to ``predict``.

    Warns:
        UserWarning: when ``k`` is not larger than the number of classes in
            ``data``.
    """
    # Imported locally: the notebook's import cell does not bring in
    # ``warnings``, so the check below used to fail with a NameError.
    import warnings

    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups!')

    # Convert the query point once instead of once per training sample.
    target = np.array(predict)

    distances = []
    for group in data:
        for features in data[group]:
            euclidean = np.linalg.norm(np.array(features) - target)
            distances.append([euclidean, group])

    # Sorting the [distance, label] pairs orders them by distance first.
    votes = [pair[1] for pair in sorted(distances)[:k]]

    voteResult = Counter(votes).most_common(1)[0][0]

    return voteResult

In [None]:
# Fraction of the (already shuffled) rows held out for testing.
testSize = 0.2

# Feature vectors grouped by class label; the labels are the values found in
# the last column of each row.
trainSet = {2:[], 4:[]}
testSet = {2:[], 4:[]}

# Split at a single index so the two slices are disjoint and together cover
# every row. The previous `fullData[:-n:]` test slice was identical to the
# training slice, and `[:-0]` would have emptied the training data whenever the
# hold-out size rounded down to zero.
splitIndex = len(fullData) - int(testSize*len(fullData))
trainData = fullData[:splitIndex]
testData = fullData[splitIndex:]

In [None]:
# Bucket each row's features (everything except the last column) under its
# class label (the last column).
for row in trainData:
    trainSet[row[-1]].append(row[:-1])

# The test rows get their own top-level loop over testData; previously this
# loop was nested inside the one above and re-read trainData, so testSet was
# filled with repeated copies of the training rows.
for row in testData:
    testSet[row[-1]].append(row[:-1])


In [None]:
# Score the classifier: predict every held-out sample with k=5 and compare
# the winning vote against the label the sample is stored under.
correct = 0
total = 0

for label, samples in testSet.items():
    for sample in samples:
        predicted = kNearestNeighbors(trainSet, sample, k=5)
        total += 1
        if predicted == label:
            correct += 1

print('Accuracy:', correct / total)
