In [2]:
# Import the packages required
import numpy as np
from sklearn import datasets, metrics
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [3]:
# load the data from the datasets available in scikit learn
dataset = datasets.load_breast_cancer()
print(type(dataset))

<class 'sklearn.utils.Bunch'>


In [16]:
# print the data structure information
print(dataset.keys())
print(dataset.data.shape)
print(dataset.feature_names)
print(dataset.target_names)
print(dataset.DESCR)

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
(569, 30)
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
['malignant' 'benign']
.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from ce

In [13]:
# select the x and y data
x = dataset.data
y = dataset.target
print(x.shape, y.shape)
print(x[0], y[0])

(569, 30) (569,)
[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01] 0


In [6]:
# Split the dataset in training data and testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(455, 30) (455,)
(114, 30) (114,)


In [7]:
# Define the algorithm to be used, fit the model and generate a prediction
alg = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
alg.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [8]:
# Generate a prediction over the test data using the model
y_pred = alg.predict(x_test)
print(y_pred.shape)
print(y_pred[:10])

(114,)
[1 0 0 1 0 1 1 1 0 0]


In [9]:
# Validate, using the confusion matrix
matrix = metrics.confusion_matrix(y_test, y_pred)
print(matrix)

[[37  4]
 [ 4 69]]


In [14]:
# Calculate the metrics
precision = metrics.precision_score(y_test, y_pred)
print('Precision :', precision * 100, '%')
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy  :', accuracy * 100, '%')
recall = metrics.recall_score(y_test, y_pred)
print('Recall    :', recall * 100, '%')
f1 = metrics.f1_score(y_test, y_pred)
print('F1        :', f1 * 100, '%')
roc_auc = metrics.roc_auc_score(y_test, y_pred)
print('ROC/AUC   :', roc_auc * 100, '%')

Precision : 94.52054794520548 %
Accuracy  : 92.98245614035088 %
Recall    : 94.52054794520548 %
F1        : 94.52054794520548 %
ROC/AUC   : 92.38222519211494 %


In [15]:
# Verify the model error based on R²
print('certainty:', alg.score(x_train, y_train) * 100, '%')

certainty: 94.5054945054945 %
