# Project Cancer Detection

## Breast Cancer Wisconsin (Original) Data Set

In [2]:
import os
#os.chdir('..')
#os.chdir('..')
#os.getcwd()

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


In [6]:
# Names for the columns of the data file, which is read without a header row:
# an id column, nine feature columns and the Class label.
# NOTE(review): the UCI attribute list spells the 8th name "Bland Chromatin";
# "Blend Chromatin" is left unchanged here because it is used as a column key.
col = [
    'id',
    'Clump Thickness',
    'Uniformity of Cell Size',
    'Uniformity of Cell Shape',
    'Marginal Adhesion',
    'Single Epithelial Cell Size',
    'Bare Nuclei',
    'Blend Chromatin',
    'Normal Nucleoli',
    'Mitoses',
    'Class',
]
df = pd.read_csv(
    'Data/Cancer/breast-cancer-wisconsin.data.csv',
    header=None,
    names=col,
)
df.head()

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Blend Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


### Data Pre-processing

In [7]:
# (row positions, column positions) of every missing cell; two empty arrays
# mean the frame contains no NaN values.
np.nonzero(df.isna().to_numpy())

(array([], dtype=int32), array([], dtype=int32))

In [8]:
# Print each column's dtype and non-null count to spot columns that were not
# parsed as numbers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id                             699 non-null int64
Clump Thickness                699 non-null int64
Uniformity of Cell Size        699 non-null int64
Uniformity of Cell Shape       699 non-null int64
Marginal Adhesion              699 non-null int64
Single Epithelial Cell Size    699 non-null int64
Bare Nuclei                    699 non-null object
Blend Chromatin                699 non-null int64
Normal Nucleoli                699 non-null int64
Mitoses                        699 non-null int64
Class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 57.4+ KB


In [10]:
# Summary statistics for the one column that was read as object dtype.
df.loc[:, 'Bare Nuclei'].describe()

count     699
unique     11
top         1
freq      402
Name: Bare Nuclei, dtype: object

In [11]:
# Frequency of each distinct Bare Nuclei entry, to reveal non-numeric values.
df.loc[:, 'Bare Nuclei'].value_counts()

1     402
10    132
2      30
5      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: Bare Nuclei, dtype: int64

In [12]:
# Show the rows whose Bare Nuclei entry is the "?" placeholder.
is_placeholder = df['Bare Nuclei'].eq("?")
df[is_placeholder]

Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Blend Chromatin,Normal Nucleoli,Mitoses,Class
23,1057013,8,4,5,1,2,?,7,3,1,4
40,1096800,6,6,6,9,6,?,7,8,1,2
139,1183246,1,1,1,1,1,?,2,1,1,2
145,1184840,1,1,3,1,2,?,2,1,1,2
158,1193683,1,1,2,1,3,?,1,1,1,2
164,1197510,5,1,1,1,2,?,3,1,1,2
235,1241232,3,1,4,1,2,?,3,1,1,2
249,169356,3,1,1,1,2,?,3,1,1,2
275,432809,3,1,3,1,2,?,2,1,1,2
292,563649,8,8,8,1,2,?,6,10,1,4


In [13]:
# Number of rows for each Class code.
df.loc[:, 'Class'].value_counts()

2    458
4    241
Name: Class, dtype: int64

It looks safe to drop the 16 rows that have "?" in `Bare Nuclei`: most of them belong to class 2, and class 2 is the majority class (458 of 699 rows), so we can afford to lose a few of them.

In [14]:
# "?" marks a missing Bare Nuclei value. Coerce the column to numbers so that
# "?" becomes NaN, and assign the result back to the frame. The previous form
# called an in-place method on the `df['Bare Nuclei']` selection, which leaves
# `df` untouched when pandas Copy-on-Write is active, and it relied on the
# `np.NAN` alias that NumPy 2.0 removed.
df['Bare Nuclei'] = pd.to_numeric(df['Bare Nuclei'], errors='coerce')
# Drop the incomplete rows, then store the remaining values as integers
# instead of digit strings so the column matches the other features.
df = df.dropna().astype({'Bare Nuclei': int})

In [15]:
# (rows, columns) of the frame at this point in the notebook.
df.shape

(683, 11)

Note that in the `Class` column, 2 means benign and 4 means malignant.

We need to convert `Class` into a 0/1 binary label (0 = benign, 1 = malignant):

$$\frac{\text{df["Class"]}}{2}-1$$

In [16]:
# Recode the label: 2 -> 0 and 4 -> 1 (the same result as Class / 2 - 1 for
# these two codes). An explicit mapping keeps the labels as integers and makes
# the cell safe to re-run: values that are already 0/1 are left unchanged,
# whereas repeating the arithmetic would turn them into -1 and -0.5.
df['Class'] = df['Class'].replace({2: 0, 4: 1}).astype(int)

In [18]:
# Number of rows for each Class value after recoding.
df.loc[:, 'Class'].value_counts()

0.0    444
1.0    239
Name: Class, dtype: int64

In [19]:
# List the column labels before selecting features and target.
df.columns

Index(['id', 'Clump Thickness', 'Uniformity of Cell Size',
       'Uniformity of Cell Shape', 'Marginal Adhesion',
       'Single Epithelial Cell Size', 'Bare Nuclei', 'Blend Chromatin',
       'Normal Nucleoli', 'Mitoses', 'Class'],
      dtype='object')

In [20]:
# Feature frame: every column except the id and the Class label.
X = df.drop(columns=['id', 'Class'])
# Remember the feature names so they can be reattached after scaling.
X_col = X.columns

In [21]:
# Target vector: the Class column.
y = df.loc[:, 'Class']

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
# Keep the fitted scaler so the same transformation can be applied to other
# data later instead of being thrown away.
scaler = StandardScaler()
# Build the input from `df[X_col]` rather than from `X`: this cell rebinds `X`
# to a NumPy array, so reading `X.values` again on a re-run would raise
# AttributeError. The explicit float cast also covers a feature column that is
# still stored as object dtype.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the features. Consider fitting on the
# training split only.
X = scaler.fit_transform(df[X_col].astype(float).values)



**Training**

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# Wrap the scaled array back into a labelled frame. Reuse `df`'s index so the
# rows carry the same labels as `y` (taken from `df`, whose index has gaps
# after rows were dropped); without it the frame would get a fresh 0..n-1
# index that no longer lines up with `y` by label.
df1 = pd.DataFrame(X, columns=X_col, index=df.index)

In [26]:
# Preview the first rows of the scaled feature frame.
df1.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Blend Chromatin,Normal Nucleoli,Mitoses
0,0.197905,-0.702212,-0.741774,-0.639366,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484
1,0.197905,0.277252,0.262783,0.758032,1.695166,1.772867,-0.181827,-0.285105,-0.3484
2,-0.511643,-0.702212,-0.741774,-0.639366,-0.555608,-0.424217,-0.181827,-0.612927,-0.3484
3,0.552679,1.583204,1.602192,-0.639366,-0.105454,0.125054,-0.181827,1.354008,-0.3484
4,-0.156869,-0.702212,-0.741774,0.059333,-0.555608,-0.698853,-0.181827,-0.612927,-0.3484


In [27]:
# Use 80% of the rows for training and the rest for testing; the fixed
# random_state makes the shuffle repeatable.
split_parts = train_test_split(df1, y, train_size=0.8, random_state=42)
X_train, X_test, y_train, y_test = split_parts



In [28]:
from sklearn.neighbors import KNeighborsClassifier

In [29]:
# k-nearest-neighbours classifier with k=5. The Minkowski metric with p=2 is
# the Euclidean distance.
knn = KNeighborsClassifier(
    n_neighbors=5,
    metric='minkowski',
    p=2,
)

In [30]:
# Fit the classifier on the training split; the estimator's repr is shown as
# the cell output.
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [None]:
from sklearn.model_selection impo