# ANN on Breast Cancer Data

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# uplaod data file
df = pd.read_csv(r"C:\Users\Jwpel\Downloads\breast_cancer_wisconsin.csv")
df.head()

Unnamed: 0,id_number,clump_thickness,uniformity_cell_size,uniformity_cell_shape,marginal_adhesion,single_epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
df.shape

(699, 11)

In [4]:
# info on columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id_number                      699 non-null int64
clump_thickness                699 non-null int64
uniformity_cell_size           699 non-null int64
uniformity_cell_shape          699 non-null int64
marginal_adhesion              699 non-null int64
single_epithelial_cell_size    699 non-null int64
bare_nuclei                    699 non-null object
bland_chromatin                699 non-null int64
normal_nucleoli                699 non-null int64
mitoses                        699 non-null int64
class                          699 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [5]:
# frequency on object variable
df["bare_nuclei"].value_counts()

1     402
10    132
5      30
2      30
3      28
8      21
4      19
?      16
9       9
7       8
6       4
Name: bare_nuclei, dtype: int64

In [6]:
# frequency on target variable
df["class"].value_counts()

2    458
4    241
Name: class, dtype: int64

In [39]:
# replace na with mean value
df["bare_nuclei"].fillna(df["bare_nuclei"].mean(), inplace=True)

In [40]:
# check if replacement worked
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
id_number                      699 non-null int64
clump_thickness                699 non-null int64
uniformity_cell_size           699 non-null int64
uniformity_cell_shape          699 non-null int64
marginal_adhesion              699 non-null int64
single_epithelial_cell_size    699 non-null int64
bare_nuclei                    699 non-null float64
bland_chromatin                699 non-null int64
normal_nucleoli                699 non-null int64
mitoses                        699 non-null int64
class                          699 non-null int64
dtypes: float64(1), int64(10)
memory usage: 60.1 KB


In [52]:
# check impute with mean
df["bare_nuclei"].value_counts()

1.000000     402
10.000000    132
5.000000      30
2.000000      30
3.000000      28
8.000000      21
4.000000      19
3.544656      16
9.000000       9
7.000000       8
6.000000       4
Name: bare_nuclei, dtype: int64

In [41]:
# assigned predictor and target variables
X = df.drop(["class"], axis=1)
y = df["class"]
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 10 columns):
id_number                      699 non-null int64
clump_thickness                699 non-null int64
uniformity_cell_size           699 non-null int64
uniformity_cell_shape          699 non-null int64
marginal_adhesion              699 non-null int64
single_epithelial_cell_size    699 non-null int64
bare_nuclei                    699 non-null float64
bland_chromatin                699 non-null int64
normal_nucleoli                699 non-null int64
mitoses                        699 non-null int64
dtypes: float64(1), int64(9)
memory usage: 54.7 KB


In [42]:
# split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=9)

In [44]:
# scale the train and test data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
  """


In [47]:
# configuration of ANN model
mlp = MLPClassifier(solver = "sgd", hidden_layer_sizes=(20,20,20), max_iter=3000, learning_rate="adaptive")
mlp.fit(X_train, y_train)
mlp_predict = mlp.predict(X_test)

In [49]:
# assign accuracy score
accuracy = accuracy_score(y_test, mlp_predict)
accuracy

0.9857142857142858

In [50]:
# test prediction frequency 
y_test.value_counts()

2    89
4    51
Name: class, dtype: int64

In [51]:
# confusion matrix
matrix = confusion_matrix(y_test, mlp_predict)
conf_mat = pd.DataFrame(data=matrix, columns = [["Pred:Benign", "Pred:Malignant"]],
                       index = [["Act:Benign", "Act:Malignant"]])
conf_mat

Unnamed: 0,Pred:Benign,Pred:Malignant
Act:Benign,87,2
Act:Malignant,0,51
