In [56]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score

from sklearn.neighbors import KNeighborsClassifier

import math

In [10]:
dataset = pd.read_csv("C:\\MyRWork\\Data\\UCI-Breast-Cancer-Wisconsin\\breast-cancer-wisconsin.data", header=["Sample_code_number"
,"Clump_Thickness"
,"Uniformity_of_Cell_Size"
,"Uniformity_of_Cell_Shape"
,"Marginal_Adhesion"
,"Single_Epithelial_Cell_Size"
,"Bare_Nuclei"
,"Bland_Chromatin"
,"Normal_Nucleoli"
,"Mitoses"
,"Class"])

In [22]:
dataset.head()
print(dataset.info)
print(dataset.dtypes)

<bound method DataFrame.info of      Sample_code_number  Clump_Thickness  Uniformity_of_Cell_Size  \
0               1000025                5                        1   
1               1002945                5                        4   
2               1015425                3                        1   
3               1016277                6                        8   
4               1017023                4                        1   
5               1017122                8                       10   
6               1018099                1                        1   
7               1018561                2                        1   
8               1033078                2                        1   
9               1033078                4                        2   
10              1035283                1                        1   
11              1036172                2                        1   
12              1041801                5                        3   
13

In [13]:
# Does it contain any missing value

dataset[dataset.isnull().any(axis=1)].head()

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class


In [17]:
# Does it contain any zero values

dataset[dataset.isin([0]).any(axis=1)].head()

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class


In [28]:
# Does it contain any special symbols

dataset[dataset.isin(['?']).any(axis=1)].head()

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2


In [30]:
Spl_Char_Not_Accepted = ["Sample_code_number"
,"Clump_Thickness"
,"Uniformity_of_Cell_Size"
,"Uniformity_of_Cell_Shape"
,"Marginal_Adhesion"
,"Single_Epithelial_Cell_Size"
,"Bare_Nuclei"
,"Bland_Chromatin"
,"Normal_Nucleoli"
,"Mitoses"
,"Class"]

for column in Spl_Char_Not_Accepted:
    dataset[column] = dataset[column].replace('?', np.NaN)


In [31]:
dataset['Bare_Nuclei'] = dataset[['Bare_Nuclei']].apply(pd.to_numeric)

In [32]:
for column in Spl_Char_Not_Accepted:
    mean = int(dataset[column].mean(skipna = True))
    dataset[column] = dataset[column].replace(np.NaN, mean)

In [34]:
dataset.iloc[[23,40,139,145,158]]

Unnamed: 0,Sample_code_number,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Class
23,1057013,8,4,5,1,2,3.0,7,3,1,4
40,1096800,6,6,6,9,6,3.0,7,8,1,2
139,1183246,1,1,1,1,1,3.0,2,1,1,2
145,1184840,1,1,3,1,2,3.0,2,1,1,2
158,1193683,1,1,2,1,3,3.0,1,1,1,2


In [36]:
# Split the dataset

X = dataset.iloc[:,0:10]
y = dataset.iloc[:,10]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0, test_size = 0.2)

# Feature Scaling
Sc_X = StandardScaler()
X_train = Sc_X.fit_transform(X_train)
X_test = Sc_X.fit_transform(X_test)

In [45]:
# decide for n_neighbors = k, because the dataset has 699 rows

math.sqrt(len(y_test))

# take 11 or 13 to start with classifier,odd values will make it easy to pick the group

11.832159566199232

In [44]:
# Define the model : Init Knn

Classifier = KNeighborsClassifier(n_neighbors=11, p=2, metric='euclidean')
Classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
           metric_params=None, n_jobs=1, n_neighbors=11, p=2,
           weights='uniform')

In [49]:
y_pred = Classifier.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
cm

array([[83,  2],
       [ 2, 53]], dtype=int64)

In [57]:
print("accuracy_score: ", accuracy_score(y_test, y_pred))
print("f1_score: ", f1_score(y_test, y_pred, average="macro"))
print("precision_score: ", precision_score(y_test, y_pred, average="macro"))

accuracy_score:  0.9714285714285714
f1_score:  0.9700534759358288
precision_score:  0.9700534759358288
