In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from math import sqrt
import pandas as pd                   #for insering data files
import random

In [2]:
df = pd.read_csv('breast-cancer-wisconsin.data.txt')
df.replace('?',-99999,inplace= True)                 #replacing all ? values with large negative values(making them outliers)
df.drop(['id'],1,inplace=True)                                    #dropping the id coloumn since it's of no use in algorithm
full_data=df.astype(float).values.tolist()            #converting all our values to float and then to list

In [3]:
full_data

[[5.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [5.0, 4.0, 4.0, 5.0, 7.0, 10.0, 3.0, 2.0, 1.0, 2.0],
 [3.0, 1.0, 1.0, 1.0, 2.0, 2.0, 3.0, 1.0, 1.0, 2.0],
 [6.0, 8.0, 8.0, 1.0, 3.0, 4.0, 3.0, 7.0, 1.0, 2.0],
 [4.0, 1.0, 1.0, 3.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [8.0, 10.0, 10.0, 8.0, 7.0, 10.0, 9.0, 7.0, 1.0, 4.0],
 [1.0, 1.0, 1.0, 1.0, 2.0, 10.0, 3.0, 1.0, 1.0, 2.0],
 [2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 5.0, 2.0],
 [4.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [5.0, 3.0, 3.0, 3.0, 2.0, 3.0, 4.0, 4.0, 1.0, 4.0],
 [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 1.0, 1.0, 2.0],
 [8.0, 7.0, 5.0, 10.0, 7.0, 9.0, 5.0, 5.0, 4.0, 4.0],
 [7.0, 4.0, 6.0, 4.0, 6.0, 1.0, 4.0, 3.0, 1.0, 4.0],
 [4.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [4.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [10.0, 7.0, 7.0, 6.0, 4.0, 10.0, 4.0, 1

In [4]:
random.shuffle(full_data)       #to shuffle the data so that we get all labels in both training as well as training data
test_size=0.2              
train_set={2:[],4:[]}        #dictionary for train_set(it'll have both unique labels(2 and 4))
test_set={2:[],4:[]}              #dictionary for test_set(it'll have both unique labels(2 and 4))

train_data=full_data[:-int(test_size*len(full_data))]        
# or train_data=full_data[:(int(len(full_data))-int((test_size*len(full_data))))]
test_data=full_data[-int(test_size*len(full_data)):]


In [5]:
train_data

[[5.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0],
 [6.0, 10.0, 10.0, 10.0, 10.0, 10.0, 8.0, 10.0, 10.0, 4.0],
 [1.0, 1.0, 1.0, 1.0, 4.0, 3.0, 1.0, 1.0, 1.0, 2.0],
 [1.0, 2.0, 3.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [4.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0],
 [3.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [3.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [10.0, 10.0, 10.0, 10.0, 7.0, 10.0, 7.0, 10.0, 4.0, 4.0],
 [6.0, 9.0, 7.0, 5.0, 5.0, 8.0, 4.0, 2.0, 1.0, 2.0],
 [8.0, 4.0, 6.0, 3.0, 3.0, 1.0, 4.0, 3.0, 1.0, 2.0],
 [1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [3.0, 6.0, 6.0, 6.0, 5.0, 10.0, 6.0, 8.0, 3.0, 4.0],
 [1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0],
 [5.0, 2.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 7.0, 1.0, 1.0, 2.0],
 [3.0, 2.0, 2.0, 3.0, 2.0, 1.0, 1.0, 1.0, 1.0, 2.0],
 [1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [9.0, 8.0, 8.0, 5.0, 6.0, 2.0, 

In [6]:
test_data

[[3.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [5.0, 4.0, 6.0, 8.0, 4.0, 1.0, 8.0, 10.0, 1.0, 4.0],
 [5.0, 3.0, 3.0, 3.0, 2.0, 3.0, 4.0, 4.0, 1.0, 4.0],
 [1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [8.0, 4.0, 4.0, 1.0, 2.0, 9.0, 3.0, 3.0, 1.0, 4.0],
 [10.0, 8.0, 8.0, 2.0, 8.0, 10.0, 4.0, 8.0, 10.0, 4.0],
 [10.0, 10.0, 10.0, 10.0, 10.0, 1.0, 8.0, 8.0, 8.0, 4.0],
 [7.0, 5.0, 6.0, 10.0, 5.0, 10.0, 7.0, 9.0, 4.0, 4.0],
 [4.0, 3.0, 1.0, 1.0, 2.0, 1.0, 4.0, 8.0, 1.0, 2.0],
 [5.0, 2.0, 2.0, 2.0, 2.0, 1.0, 1.0, 1.0, 2.0, 2.0],
 [2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0],
 [1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [5.0, 3.0, 4.0, 1.0, 8.0, 10.0, 4.0, 9.0, 1.0, 4.0],
 [10.0, 6.0, 6.0, 2.0, 4.0, 10.0, 9.0, 7.0, 1.0, 4.0],
 [5.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [1.0, 1.0, 2.0, 1.0, 3.0, -99999.0, 1.0, 1.0, 1.0, 2.0],
 [1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0],
 [6.0, 1.0, 1.0, 1.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0],
 [5.0, 10.0, 10.0, 8.0, 5.0

In [7]:
if train_data+test_data==full_data:
    print("division successful")

division successful


In [8]:
for i in train_data:                             #insering values from train_data to train_set dictionary
    train_set[i[-1]].append(i[:-1])
    
for i in test_data:                              #insering values from test_data to test_set dictionary
    test_set[i[-1]].append(i[:-1])

In [9]:
def k_nearest_neighbors(data,predict,k=3):
    
    if len(data)>=k:
        print("k is set to a a value less than total voting group")
    
    distances=[]
    for group in data:
        for features in data[group]:
            euclidean_distance=np.linalg.norm(np.array(features)-np.array(predict))
            distances.append([euclidean_distance,group])
    #print distances        
    #print sorted distances
    #print (sorted (distances)[:k])
    
    votes=[i[1] for i in sorted (distances)[:k]]
   
    vote_result=Counter(votes).most_common(1)[0][0]
    return vote_result

In [10]:
for k in range(1,60):
    correct=0
    total=0

    for group in test_set:
        for data in test_set[group]:
            vote=k_nearest_neighbors(train_set,data,k=6)
            if group==vote:
                correct+=1
            total+=1
        Accuracy=correct/total
    print("Accuracy", Accuracy)

('Accuracy', 0)
('Accuracy', 0)
('Accuracy', 0)
('Accuracy', 0)
('Accuracy', 0)
('Accuracy', 0)
('Accuracy', 0)


KeyboardInterrupt: 