In [161]:
import csv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [162]:
class distancePredictedResult:
    """Simple record pairing a distance with the label it belongs to."""

    def __init__(self, distance, predictedResult):
        # Both values are stored as given; no validation or conversion is done.
        self.distance, self.predictedResult = distance, predictedResult

In [163]:
#load the dataset
def loadDataset(filename, header, test_size=0.28, random_state=None):
    """Load the CSV, encode the label, min-max normalise and split train/test.

    Parameters
    ----------
    filename : str
        Path of the CSV file. It must contain a 'diagnosis' column whose
        values are 'M' or 'B'.
    header : list of str
        Column names assigned to the frame after loading. Must contain
        'id' (dropped) and 'diagnosis' (moved to the last position).
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.28, the ratio this
        function has always used (i.e. 72% training / 28% testing).
    random_state : int or None, optional
        Forwarded to ``train_test_split``. ``None`` keeps the split
        unseeded; pass an integer for a reproducible split.

    Returns
    -------
    tuple of (DataFrame, DataFrame)
        The training and testing frames, normalised to [0, 1], without the
        'id' column and with 'diagnosis' as the last column.
    """
    df = pd.read_csv(filename)
    # Encode the label numerically: 'M' -> 1, 'B' -> 0 (anything else -> NaN).
    df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})
    df = df.astype(float)
    df.columns = header
    # Move the label to the last column; getNeighbors reads it from there.
    df = df[[c for c in df if c not in ['diagnosis']] + ['diagnosis']]
    # Keyword form: pandas 2.0 no longer accepts the axis positionally.
    df = df.drop(columns='id')

    # Min-max normalisation.
    # NOTE(review): the min/max are computed on the full dataset before the
    # split, so testing rows influence the scaling of the training rows.
    col_min = df.min()
    col_range = df.max() - col_min
    # A constant column has zero range; dividing by it would turn the whole
    # column into NaN and poison every distance. Dividing by 1 maps it to 0.
    col_range = col_range.replace(0, 1)
    normalized_df = (df - col_min) / col_range

    # Split the dataset into a training set and a testing set.
    train, test = train_test_split(
        normalized_df, test_size=test_size, random_state=random_state
    )
    return train, test

In [164]:
#find out the euclideanDistance of two instances
import math
def euclideanDistance(instance1, instance2, length):
    """Euclidean distance between the first `length` entries of two instances.

    Each entry is converted with float() before subtraction, so numeric
    strings are accepted. Entries at position `length` and beyond are ignored.
    """
    total = 0
    for position in range(length):
        gap = float(instance1[position]) - float(instance2[position])
        total += gap ** 2
    return math.sqrt(total)

In [181]:
#get the k nearest neighbors
def getNeighbors(trainingSet, predictData, k):
    """Return the k training rows nearest to `predictData`.

    Parameters
    ----------
    trainingSet : DataFrame
        Training rows; every column except the last is used as a feature
        and the last column is taken as the row's label.
    predictData : sequence
        The instance to classify, laid out like a training row (only the
        feature positions are read).
    k : int
        Number of neighbours wanted.

    Returns
    -------
    list of distancePredictedResult
        Up to k entries sorted by ascending distance. Fewer than k are
        returned when the training set has fewer than k rows; an empty
        list when k <= 0 or the training set is empty.
    """
    # Number of feature columns (everything but the trailing label column).
    length = trainingSet.shape[1] - 1
    # Work on plain positional sequences: integer keys on a label-indexed
    # Series rely on a positional fallback that pandas has deprecated.
    target = list(predictData)
    distances = [
        distancePredictedResult(euclideanDistance(target, row, length), row[length])
        for row in trainingSet.to_numpy()
    ]
    distances.sort(key=lambda x: x.distance)
    # Slicing tolerates k larger than the training set instead of raising
    # IndexError; max() keeps a negative k from slicing off the tail.
    return distances[:max(k, 0)]

In [182]:
#get the prediction
def getPrediction(neighbors):
    """Return the label that occurs most often among `neighbors`.

    Each neighbour contributes one vote through its `predictedResult`
    attribute. On a tie the label that was seen first wins, because the
    sort is stable. An empty `neighbors` raises IndexError.
    """
    tally = {}
    for neighbor in neighbors:
        label = neighbor.predictedResult
        tally[label] = tally.get(label, 0) + 1
    ranked = sorted(tally, key=tally.get, reverse=True)
    return ranked[0]

In [186]:
#get the accuracy
def getAccuracy(testingSet, predictions, columnToBePredict, verbose=True):
    """Return the percentage of testing rows whose prediction matches the label.

    Parameters
    ----------
    testingSet : DataFrame
        Rows to score; the true label is read from `columnToBePredict`.
    predictions : mapping
        Predicted label for each row, keyed by the row's index in
        `testingSet`. A missing key raises KeyError.
    columnToBePredict : str
        Name of the column holding the true label.
    verbose : bool, optional
        When True (the default) one line per row is printed with the
        prediction and the true label. Pass False to silence it.

    Returns
    -------
    float
        Accuracy in percent (0-100). 0.0 for an empty testing set, where
        the ratio would otherwise be a division by zero.
    """
    total = len(testingSet)
    if total == 0:
        return 0.0

    correct = 0
    for index, row in testingSet.iterrows():
        actual = row[columnToBePredict]
        predicted = predictions[index]
        if actual == predicted:
            correct += 1
        if verbose:
            print('prediction: ', predicted, '     result: ', actual)

    return (correct / float(total)) * 100

In [193]:
def main():
    """Run the k-NN experiment on 'wisc_bc_data.csv' and print the accuracy.

    Loads and splits the data, picks k as the integer part of the square
    root of the training-set size, predicts every testing row and prints
    the percentage returned by getAccuracy.
    """
    columnToBePredict = 'diagnosis'
    header = [
        'id', 'diagnosis',
        'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
        'smoothness_mean', 'compactness_mean', 'concavity_mean', 'points_mean',
        'symmetry_mean', 'dimension_mean',
        'radius_se', 'texture_se', 'perimeter_se', 'area_se',
        'smoothness_se', 'compactness_se', 'concavity_se', 'points_se',
        'symmetry_se', 'dimension_se',
        'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
        'smoothness_worst', 'compactness_worst', 'concavity_worst', 'points_worst',
        'symmetry_worst', 'dimension_worst',
    ]
    filename = 'wisc_bc_data.csv'

    trainingSet, testingSet = loadDataset(filename, header)
    # k = floor(sqrt(number of training rows))
    k = int(math.sqrt(len(trainingSet)))

    # One prediction per testing row, keyed by the row's index label.
    predictions = {
        index: getPrediction(getNeighbors(trainingSet, row, k))
        for index, row in testingSet.iterrows()
    }

    print(getAccuracy(testingSet, predictions, columnToBePredict))
    

In [194]:
# Run the experiment only when this code is executed directly (script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

prediction:  0.0      result:  0.0
prediction:  0.0      result:  0.0
prediction:  0.0      result:  0.0
prediction:  0.0      result:  0.0
prediction:  1.0      result:  1.0
prediction:  0.0      result:  0.0
prediction:  1.0      result:  1.0
prediction:  0.0      result:  0.0
prediction:  0.0      result:  0.0
prediction:  1.0      result:  1.0
prediction:  1.0      result:  1.0
prediction:  1.0      result:  1.0
prediction:  1.0      result:  1.0
prediction:  1.0      result:  1.0
prediction:  1.0      result:  1.0
prediction:  0.0      result:  0.0
prediction:  1.0      result:  1.0
prediction:  0.0      result:  0.0
prediction:  1.0      result:  1.0
prediction:  0.0      result:  0.0
prediction:  1.0      result:  1.0
prediction:  1.0      result:  1.0
prediction:  1.0      result:  1.0
prediction:  0.0      result:  0.0
prediction:  1.0      result:  1.0
prediction:  1.0      result:  1.0
prediction:  0.0      result:  0.0
prediction:  0.0      result:  0.0
prediction:  0.0    