# K Nearest Neighbors

### Importing the necessary modules

In [1]:
import numpy as np
from math import sqrt
import warnings
from matplotlib import style
from collections import Counter
import random
import pandas as pd

### Creating the KNN function

In [2]:
def k_nearest_neighbors(data, predict, k=3):
    """Classify ``predict`` by majority vote among its nearest neighbours.

    Parameters
    ----------
    data : dict
        Maps each class label to a list of feature vectors of that class.
    predict : sequence of numbers
        Feature vector to classify; must have the same length as the
        feature vectors stored in ``data``.
    k : int, optional
        Number of nearest neighbours that get a vote (default 3).

    Returns
    -------
    tuple
        ``(label, confidence)``: the winning class label and the fraction
        of the votes actually cast that went to it.

    Raises
    ------
    IndexError
        If no votes are collected, e.g. ``data`` holds no feature vectors.
    """
    if len(data) >= k:
        warnings.warn('K is set to a value less than total voting groups')

    # Convert the query point once instead of once per training point.
    target = np.array(predict)

    distances = []
    for group in data:
        for features in data[group]:
            euclidian_distance = np.linalg.norm(np.array(features) - target)
            distances.append([euclidian_distance, group])

    # Labels of the k closest training points (fewer if data has < k points).
    votes = [i[1] for i in sorted(distances)[:k]]
    vote_results, vote_count = Counter(votes).most_common(1)[0]
    # Divide by the number of votes cast, not k, so the confidence is still a
    # true fraction when fewer than k neighbours were available.
    confidence = vote_count / len(votes)
    return vote_results, confidence

### Reading the Data

In [3]:
# Load the data file (path is relative to the current working directory) into a DataFrame
df = pd.read_csv('breast-cancer-wisconsin.data')

### Exploring the Data Set

In [4]:
# Display the first rows to inspect the column names and typical values
df.head()

Unnamed: 0,id,clump_thickness,unif_cell_size,unif_cell_shape,marg_adhesion,single_epith_cell_size,bare_nuclei,bland_chrom,norm_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [5]:
# Report, per column, whether any value is NaN.
# NOTE: placeholder strings such as '?' are not NaN and are not detected here.
df.isnull().any()

id                        False
clump_thickness           False
unif_cell_size            False
unif_cell_shape           False
marg_adhesion             False
single_epith_cell_size    False
bare_nuclei               False
bland_chrom               False
norm_nucleoli             False
mitoses                   False
class                     False
dtype: bool

In [6]:
# Missing measurements are encoded as '?'. Replace those values with a large
# negative sentinel so the affected rows end up far away from every real
# sample and are unlikely to be picked as nearest neighbours.
df.replace('?', -99999, inplace=True)
# The ID column carries no information for the distance calculation, so drop
# it. `axis` is passed by keyword: pandas 2.0 no longer accepts it positionally.
df.drop(['id'], axis=1, inplace=True)
# Convert every row to a plain list of floats (features followed by the class).
full_data = df.astype(float).values.tolist()
random.shuffle(full_data)

### Dividing the Data into Training and Test Sets

In [7]:
random.shuffle(full_data)  # Shuffle the rows so the split is random
test_size = 0.2  # Fraction of the rows held out for testing
train_set = {2: [], 4: []}  # Feature vectors per class: 2 = Benign, 4 = Malignant
test_set = {2: [], 4: []}
# Split on an explicit index. Negative slicing breaks when the test count
# rounds down to 0: full_data[:-0] is empty and full_data[-0:] is everything.
split_index = len(full_data) - int(test_size*len(full_data))
train_data = full_data[:split_index]
test_data = full_data[split_index:]

for i in train_data:
    train_set[i[-1]].append(i[:-1])  # Last column is the class, the rest are features

for i in test_data:
    test_set[i[-1]].append(i[:-1])

### Testing the model

In [8]:
# Classify every held-out sample and record whether the vote matched its label.
k = 5  # Number of neighbours consulted per prediction
outcomes = [
    label == k_nearest_neighbors(train_set, sample, k)[0]
    for label, samples in test_set.items()
    for sample in samples
]
total = len(outcomes)
correct = sum(outcomes)  # True counts as 1
print('Accuracy: {}%'.format(correct/total*100))

Accuracy: 96.40287769784173%


### Creating a Prediction function that works with many values at once

In [9]:
def predict(data):
    """Classify every feature vector in ``data``.

    Relies on the module-level ``train_set`` and ``k``. Returns a list holding
    one ``k_nearest_neighbors`` result per element of ``data``, in order.
    """
    return [k_nearest_neighbors(train_set, sample, k) for sample in data]

### Predicting some values

In [10]:
# Prediction input: the first ten rows of full_data with the class column stripped
predict_data = [full_data[row_idx][:-1] for row_idx in range(10)]

In [11]:
# Prediction results: one (predicted class, confidence) tuple per input row
predict(predict_data)

[(2, 1.0),
 (4, 1.0),
 (2, 1.0),
 (2, 1.0),
 (4, 1.0),
 (4, 1.0),
 (4, 1.0),
 (2, 1.0),
 (2, 1.0),
 (2, 1.0)]