In [1]:
import pandas as pd
import numpy as np

In [2]:
# Column labels applied to the file, in file order. Passing them via `names=`
# means no row of the file is consumed as a header.
names = [
    "ID",
    "Clump Thickness",
    "Cell Size Uniformity",
    "Cell Shape Uniformity",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class",
]
df = pd.read_csv("breast-cancer-wisconsin.data", names=names)
# "?" placeholders become NaN so they can be mean-imputed further down.
# `np.nan` is the canonical spelling; the upper-case `np.NAN` alias is not
# available in the NumPy 2.x main namespace.
df = df.replace("?", np.nan)
df

Unnamed: 0,ID,Clump Thickness,Cell Size Uniformity,Cell Shape Uniformity,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2


In [3]:
# Float64 matrix of the frame with column 0 ("ID") sliced away.
data = df.values.astype(np.float64, copy=False)[:, 1:]
# Rescale the last column ("Class") as (x - 2) / 2, which sends 2 -> 0 and 4 -> 1.
data[:, -1] = (data[:, -1] - 2) / 2
data

array([[ 5.,  1.,  1., ...,  1.,  1.,  0.],
       [ 5.,  4.,  4., ...,  2.,  1.,  0.],
       [ 3.,  1.,  1., ...,  1.,  1.,  0.],
       ...,
       [ 5., 10., 10., ..., 10.,  2.,  1.],
       [ 4.,  8.,  6., ...,  6.,  1.,  1.],
       [ 4.,  8.,  8., ...,  4.,  1.,  1.]])

In [4]:
# Per-column averages with NaN entries skipped (axis 0 collapses the rows).
# `data` is already float64, so the result dtype is float64 without asking for it.
means = np.nanmean(data, axis=0)
means

array([4.41773963, 3.13447783, 3.2074392 , 2.80686695, 3.21602289,
       3.54465593, 3.43776824, 2.86695279, 1.58941345, 0.34477825])

In [5]:
# Mean-impute: overwrite every NaN with the mean of its own column.
# One vectorised lookup replaces the per-element Python double loop; `data`
# is still modified in place.
# NOTE(review): `means` is computed over every row of `data`, i.e. before the
# train/test split below, so the filled values also reflect held-out rows.
nan_rows, nan_cols = np.nonzero(np.isnan(data))
data[nan_rows, nan_cols] = means[nan_cols]
# The default index is already 0..n-1, so no explicit `index=` is needed.
df = pd.DataFrame(data=data, columns=names[1:])
df

Unnamed: 0,Clump Thickness,Cell Size Uniformity,Cell Shape Uniformity,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5.0,1.0,1.0,1.0,2.0,1.000000,3.0,1.0,1.0,0.0
1,5.0,4.0,4.0,5.0,7.0,10.000000,3.0,2.0,1.0,0.0
2,3.0,1.0,1.0,1.0,2.0,2.000000,3.0,1.0,1.0,0.0
3,6.0,8.0,8.0,1.0,3.0,4.000000,3.0,7.0,1.0,0.0
4,4.0,1.0,1.0,3.0,2.0,1.000000,3.0,1.0,1.0,0.0
5,8.0,10.0,10.0,8.0,7.0,10.000000,9.0,7.0,1.0,1.0
6,1.0,1.0,1.0,1.0,2.0,10.000000,3.0,1.0,1.0,0.0
7,2.0,1.0,2.0,1.0,2.0,1.000000,3.0,1.0,1.0,0.0
8,2.0,1.0,1.0,1.0,2.0,1.000000,1.0,1.0,5.0,0.0
9,4.0,2.0,1.0,1.0,2.0,1.000000,2.0,1.0,1.0,0.0


In [6]:
# 80/20 split sizes: the training share is rounded to a whole row count and
# the test set takes whatever rows remain.
total_rows = len(df)
num_train = round(total_rows * 0.8)
num_test = total_rows - num_train

In [7]:
# Positional split: the first `num_train` rows train the model, the rest test it.
train_rows, test_rows = df.iloc[:num_train], df.iloc[num_train:]
# Within each part, the last column is the label and everything before it is a feature.
train_data, train_labels = train_rows.iloc[:, :-1], train_rows.iloc[:, -1]
test_data, test_labels = test_rows.iloc[:, :-1], test_rows.iloc[:, -1]
test_data

Unnamed: 0,Clump Thickness,Cell Size Uniformity,Cell Shape Uniformity,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
559,5.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
560,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
561,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
562,1.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0
563,3.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,1.0
564,4.0,1.0,1.0,1.0,2.0,1.0,3.0,2.0,1.0
565,5.0,7.0,10.0,10.0,5.0,10.0,10.0,10.0,1.0
566,3.0,1.0,2.0,1.0,2.0,1.0,3.0,1.0,1.0
567,4.0,1.0,1.0,1.0,2.0,3.0,2.0,1.0,1.0
568,8.0,4.0,4.0,1.0,6.0,10.0,2.0,5.0,2.0


In [8]:
from kNN import *

In [9]:
def single_validate(model, test_data, test_labels):
    """Classify a held-out set with `model`, print its metrics and return them.

    The confusion-matrix and metric helpers called here are not defined in
    this notebook; presumably they come from the `kNN` star import.

    Args:
        model: object exposing ``classify(test_data)``.
        test_data: samples handed unchanged to ``model.classify``.
        test_labels: reference labels compared against the predictions.

    Returns:
        dict: the values that were printed, keyed by ``"tn"``, ``"fn"``,
        ``"tp"``, ``"fp"``, ``"accuracy"``, ``"tpr"``, ``"ppv"``, ``"tnr"``
        and ``"f1"``, so callers can collect and compare runs instead of
        reading them off the printed report.
    """
    gen_labels = model.classify(test_data)
    tn, fn, tp, fp = generate_confusion_matrix(test_labels, gen_labels)
    print("    True Negative:", tn)
    print("    False Negative:", fn)
    print("    True Positive:", tp)
    # NOTE: this label lacks the trailing colon the other lines use; it is kept
    # as-is so the printed report stays byte-identical.
    print("    False Positive", fp)
    # Each metric is printed right after it is computed, so a failing helper
    # still leaves the earlier lines of the report on screen.
    acc = get_accuracy(tn, fn, tp, fp)
    print("    Accuracy:", acc)
    tpr = get_true_positive_rate(tp, fn)
    print("    True Positive Rate:", tpr)
    ppv = get_precision(tp, fp)
    print("    Positive Predictive Value:", ppv)
    tnr = get_true_negative_rate(tn, fp)
    print("    True Negative Rate:", tnr)
    f1 = get_f1_score(ppv, tpr)
    print("    F1 Score:", f1)
    return {
        "tn": tn,
        "fn": fn,
        "tp": tp,
        "fp": fp,
        "accuracy": acc,
        "tpr": tpr,
        "ppv": ppv,
        "tnr": tnr,
        "f1": f1,
    }

In [10]:
# Neighbour counts to evaluate, in the order they are reported.
kvals = [2, 3, 4, 5, 6, 7, 8, 17, 33]
# Build the classifier once on the training split, starting from the first k;
# the loop then switches k via `updateK` before each evaluation.
model = kNearestNeighbors(kvals[0], train_data, train_labels)
for k in kvals:
    print("k =", k)
    model.updateK(k)
    single_validate(model, test_data, test_labels)

k = 2
    True Negative: 105
    False Negative: 2
    True Positive: 33
    False Positive 0
    Accuracy: 0.9857142857142858
    True Positive Rate: 0.9428571428571428
    Positive Predictive Value: 1.0
    True Negative Rate: 1.0
    F1 Score: 0.9705882352941176
k = 3
    True Negative: 104
    False Negative: 1
    True Positive: 34
    False Positive 1
    Accuracy: 0.9857142857142858
    True Positive Rate: 0.9714285714285714
    Positive Predictive Value: 0.9714285714285714
    True Negative Rate: 0.9904761904761905
    F1 Score: 0.9714285714285714
k = 4
    True Negative: 105
    False Negative: 2
    True Positive: 33
    False Positive 0
    Accuracy: 0.9857142857142858
    True Positive Rate: 0.9428571428571428
    Positive Predictive Value: 1.0
    True Negative Rate: 1.0
    F1 Score: 0.9705882352941176
k = 5
    True Negative: 104
    False Negative: 1
    True Positive: 34
    False Positive 1
    Accuracy: 0.9857142857142858
    True Positive Rate: 0.9714285714285714
  