In [17]:
import numpy as np
from numpy.random import rand
import pandas as pd 
from pandas import Series, DataFrame
import math
import scipy

import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sb

import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics

In [18]:
%matplotlib inline
rcParams['figure.figsize'] = 5, 4
sb.set_style('whitegrid')

### Background
Breast cancer is an uncontrolled growth of breast cells. Breast cancer occurs as a result of mutations, or abnormal changes, in the genes responsible for regulating the growth of cells and keeping them healthy. A tumor can be benign or malignant.*

\*Source: http://www.breastcancer.org/symptoms/understand_bc/what_is_bc


### Data
Dataset source: https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)

### Attributes

* id_number - Sample code number
* bclump_thickness - Clump Thickness (1 - 10)
* cell_size - Uniformity of Cell Size (1 - 10)
* cell_shape - Uniformity of Cell Shape (1 - 10)
* marginal_adhesion - Marginal Adhesion (1 - 10)
* epi_cell_size - Single Epithelial Cell Size (1 - 10)
* bare_nuclei - Bare Nuclei (1 - 10)
* bland_chromatin - Bland Chromatin (1 - 10)
* normal_nucleoli - Normal Nucleoli (1 - 10)
* mitoses -  Mitoses (1 - 10)
* class - Class (2 for benign, 4 for malignant)

In [19]:
# read file
f = open('breast-cancer-wisconsin.data.txt', 'r')
data = f.read()
data = data.split('\n')

# 2d array
data_arr = []
for row in data:
    row_split = row.split(",")
    data_arr.append(row_split)
data_arr

# create a DF
data_df = pd.DataFrame(data_arr)

# asssign column names
column_names = ["id_number", "clump_thickness", "cell_size","cell_shape", "marginal_adhesion", "epi_cell_size", "bare_nuclei", "bland_chromatin", "normal_nucleoli", "mitoses", "class"]
data_df.columns = column_names

### Cleaning Data

In [20]:
# drop id - since it's a feature that is not useful for arithmetic differences between features
data_df = data_df.drop(['id_number'], axis=1)

# replace unknown character `?` with -99999
data_df.replace("?", -99999, inplace=True)

# remove last row 699 - because it has NaN values
data_df = data_df.drop(data_df.index[699])
data_df.tail()
data_df.dtypes

clump_thickness      object
cell_size            object
cell_shape           object
marginal_adhesion    object
epi_cell_size        object
bare_nuclei          object
bland_chromatin      object
normal_nucleoli      object
mitoses              object
class                object
dtype: object

### Confirm Data is Cleaned

In [21]:
data_df.isnull().sum()

clump_thickness      0
cell_size            0
cell_shape           0
marginal_adhesion    0
epi_cell_size        0
bare_nuclei          0
bland_chromatin      0
normal_nucleoli      0
mitoses              0
class                0
dtype: int64

### Preparing Data Set for K-NN Classification

In [22]:
# Extract all labeled features except 'class'
X = data_df.drop(['class'], axis=1)

# Extract only 'class'
y = data_df['class']

In [23]:
# Split our data set for training and testing. Test size: 33%
x_train, x_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.33, random_state=17)

In [24]:
# Create a KNeighborsClassifier instance
clf = neighbors.KNeighborsClassifier()

# Train the model with training data set
clf.fit(x_train, y_train)

print(clf)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')


In [25]:
# check for model score
accuracy = clf.score(x_test, y_test)
print(accuracy) 

0.974025974025974


### Findings
Wow 97% is a really a good score for our model!! 

In [26]:
y_expect = y_test

# Predict with test data set 
y_predict = clf.predict(x_test)

# classification report
print(metrics.classification_report(y_expect, y_predict))

             precision    recall  f1-score   support

          2       0.98      0.98      0.98       156
          4       0.96      0.96      0.96        75

avg / total       0.97      0.97      0.97       231



### Predictions

In [27]:
predict_measure_one = np.array([4,2, 1, 1, 1, 2, 3, 2, 1])
predict_measure_one = predict_measure_one.reshape(1, -1)
prediction_one = clf.predict(predict_measure_one)
print(prediction_one)

['2']


### Results

The model's accuracy score is 97% which means when we pass in new unlabeled data, we can predict with high confidence whether the tumor is benign or malignant.

Author: Venkata Karthik Thota  
kthotav@gmail.com  
https://www.linkedin.com/in/kthotav/