# 1 Importing libraries

In [14]:
# import environment first
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# 2 Data browsing and analysis

Import and check the dataset.

In [15]:
from sklearn import datasets

# Read the breast-cancer measurements from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Breast_cancer_data.csv')
# A bare name as the last expression makes Jupyter render the frame as a table.
df

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.80,1001.0,0.11840,0
1,20.57,17.77,132.90,1326.0,0.08474,0
2,19.69,21.25,130.00,1203.0,0.10960,0
3,11.42,20.38,77.58,386.1,0.14250,0
4,20.29,14.34,135.10,1297.0,0.10030,0
...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0
565,20.13,28.25,131.20,1261.0,0.09780,0
566,16.60,28.08,108.30,858.1,0.08455,0
567,20.60,29.33,140.10,1265.0,0.11780,0


The dataset has 569 rows and 6 columns.
The first five columns are features, and the last, 'diagnosis', is the target.

In [16]:
# Class balance of the target: number of rows per diagnosis label.
# Only the last expression of a cell is rendered, so this is the single
# statement here; the per-label counts add up to the number of non-null labels.
df['diagnosis'].value_counts()

1    357
0    212
Name: diagnosis, dtype: int64

For 'diagnosis', 1 means benign (357 rows) and 0 means malignant (212 rows).

In [17]:
# Show every row that contains at least one missing value; an empty result
# means the data is complete. The null mask is reduced to one boolean per row
# with `.any(axis=1)` so each affected row is listed exactly once -- indexing
# with the raw 2-D mask can repeat a row once per missing cell.
df[df.isnull().any(axis=1)]

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis


There are no missing values in the dataset.

# 3 Data training

In [18]:
from sklearn.preprocessing import StandardScaler

features = ['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area', 'mean_smoothness']
target = ['diagnosis']

# Feature matrix: one row per sample, one column per entry in `features`.
x = df.loc[:, features].values
# Target labels. Selecting with a one-element list yields a 2-D, single-column
# array; it is flattened later, right before the train/test split.
y = df.loc[:, target].values

# Standardize the features (StandardScaler centres each column and scales it
# to unit variance).
# NOTE(review): the scaler is fit on every row, i.e. before the train/test
# split performed in a later cell, so test-set statistics influence the
# scaling. Consider fitting the scaler on the training split only.
x = StandardScaler().fit_transform(x)

# Put the scaled features in their own frame instead of overwriting `df`:
# the raw frame still carries 'diagnosis', so this cell and the earlier
# inspection cells remain re-runnable.
scaled_df = pd.DataFrame(data=x, columns=features)
scaled_df

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness
0,1.097064,-2.073335,1.269934,0.984375,1.568466
1,1.829821,-0.353632,1.685955,1.908708,-0.826962
2,1.579888,0.456187,1.566503,1.558884,0.942210
3,-0.768909,0.253732,-0.592687,-0.764464,3.283553
4,1.750297,-1.151816,1.776573,1.826229,0.280372
...,...,...,...,...,...
564,2.110995,0.721473,2.060786,2.343856,1.041842
565,1.704854,2.085134,1.615931,1.723842,0.102458
566,0.702284,2.045574,0.672676,0.577953,-0.840484
567,1.838341,2.336457,1.982524,1.735218,1.525767


In [22]:
# Flatten the label array into a 1-D vector before handing it to scikit-learn.
y = np.ravel(y)
# Hold out a test split (train_test_split's default size); the fixed
# random_state makes the shuffle repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [31]:
# Start with k = 3 nearest neighbours.
knn_model = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier on the training split only; the test split is kept
# aside so the model can be evaluated on samples it has not seen.
knn_model.fit(X=x_train, y=y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [33]:
# Predict a diagnosis label for every sample in the held-out test split.
y_pred = knn_model.predict(X=x_test)

print(y_pred)

[1 1 1 0 1 0 0 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 0 0 0 0 0 1 0 1 1 1 0
 0 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0
 1 0 0 1 1 0 1 0 1 0 1 1 0 0 0 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0
 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 1 0 0 0 1 1 1 0 1 0 0 1 1 0 1 0 1]


In [16]:
from sklearn.metrics import accuracy_score

# Fraction of test samples whose predicted label equals the true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9090909090909091


In [46]:
# Candidate neighbour counts to evaluate: every integer k from 1 to 19.
k_array = np.arange(1, 20)
k_array

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])

In [60]:
# Sweep over the candidate k values and print the test-set accuracy of each.
# NOTE(review): picking k from these test-set scores makes the final test
# accuracy optimistic; a validation split or cross-validation would avoid that.
for k in k_array:
    knn_try = KNeighborsClassifier(n_neighbors=k)
    knn_try.fit(x_train, y_train)
    # Keep the score under its own name: `y_pred` holds the prediction array
    # from the earlier cell and must not be replaced by a float.
    accuracy = accuracy_score(y_test, knn_try.predict(x_test))
    print(k, accuracy)

1 0.8601398601398601
2 0.8181818181818182
3 0.8811188811188811
4 0.8881118881118881
5 0.9020979020979021
6 0.8951048951048951
7 0.8951048951048951
8 0.8951048951048951
9 0.8811188811188811
10 0.9020979020979021
11 0.9020979020979021
12 0.9020979020979021
13 0.9090909090909091
14 0.9020979020979021
15 0.916083916083916
16 0.916083916083916
17 0.916083916083916
18 0.916083916083916
19 0.9230769230769231


In [69]:
# Build the k = 5 classifier and fit it on the training split
# (`fit` returns the estimator itself, so the calls can be chained).
knn_5 = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)
# Score its predictions on the held-out test split.
y_pred5 = knn_5.predict(X=x_test)
print(accuracy_score(y_true=y_test, y_pred=y_pred5))

0.9020979020979021


# 4 Test our Model

In [76]:
# Predictions of the k = 5 model for the test split.
y_pred = knn_5.predict(X=x_test)
# First array: our predictions; second array: the actual values.
print(y_pred, y_test, sep='\n')

[1 1 1 0 1 0 0 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 1 1 0
 0 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0
 1 0 0 1 1 0 1 0 1 0 1 1 0 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0
 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 0 1]
[1 0 1 0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 0
 1 0 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0
 1 0 0 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0
 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 1 0 0 1 1 1 0 1 0 0 1 1 1 0 0 1]


In [79]:
# Print the predicted labels followed by the actual labels of the test split.
print(
    y_pred,  # our prediction
    y_test,  # actual values
    sep='\n',
)

[1 1 1 0 1 0 0 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 1 1 0
 0 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 0
 1 0 0 1 1 0 1 0 1 0 1 1 0 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0
 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 0 0 0 1 1 1 0 1 0 0 1 1 0 1 0 1]
[1 0 1 0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 0 1 1 0 1 0 1 1 0 0 0 0 1 0 0 1 1 0
 1 0 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 0
 1 0 0 0 1 0 1 0 1 0 1 1 0 1 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0
 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 1 0 0 1 1 1 0 1 0 0 1 1 1 0 0 1]


In [80]:
# Measure how accurate the model is on the held-out test split:
# the fraction of test samples whose predicted label matches the true one.

from sklearn.metrics import accuracy_score
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.9020979020979021
