# Breast Cancer Dataset (Machine Learning Demo)

## Import Packages

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text, plot_tree
from sklearn.neural_network import MLPClassifier

## Load Breast Cancer Dataset

In [2]:
dataframe = pd.read_csv('Breast_cancer_data.csv')
dataframe

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.80,1001.0,0.11840,0
1,20.57,17.77,132.90,1326.0,0.08474,0
2,19.69,21.25,130.00,1203.0,0.10960,0
3,11.42,20.38,77.58,386.1,0.14250,0
4,20.29,14.34,135.10,1297.0,0.10030,0
...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.11100,0
565,20.13,28.25,131.20,1261.0,0.09780,0
566,16.60,28.08,108.30,858.1,0.08455,0
567,20.60,29.33,140.10,1265.0,0.11780,0


## Data Preparation

We split the data into two: train and test set.

In [3]:
train_df = dataframe.sample(frac = 0.8, random_state=0)
test_df = dataframe.drop(train_df.index)

train_X = train_df.drop(columns={'diagnosis'})
train_y = train_df['diagnosis']

test_X = test_df.drop(columns={'diagnosis'})
test_y = test_df['diagnosis']

In [4]:
print('number of rows:',len(dataframe),len(train_df),len(test_df))

number of rows: 569 455 114


### Train set

In [5]:
train_df

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
512,13.40,20.52,88.64,556.7,0.11060,0
457,13.21,25.25,84.10,537.9,0.08791,1
439,14.02,15.66,89.59,606.5,0.07966,1
298,14.26,18.17,91.22,633.1,0.06576,1
37,13.03,18.42,82.61,523.8,0.08983,1
...,...,...,...,...,...,...
86,14.48,21.46,94.25,648.2,0.09444,0
266,10.60,18.95,69.28,346.4,0.09688,1
36,14.25,21.72,93.63,633.0,0.09823,0
193,12.34,26.86,81.15,477.4,0.10340,0


In [6]:
train_X,train_y

(     mean_radius  mean_texture  mean_perimeter  mean_area  mean_smoothness
 512        13.40         20.52           88.64      556.7          0.11060
 457        13.21         25.25           84.10      537.9          0.08791
 439        14.02         15.66           89.59      606.5          0.07966
 298        14.26         18.17           91.22      633.1          0.06576
 37         13.03         18.42           82.61      523.8          0.08983
 ..           ...           ...             ...        ...              ...
 86         14.48         21.46           94.25      648.2          0.09444
 266        10.60         18.95           69.28      346.4          0.09688
 36         14.25         21.72           93.63      633.0          0.09823
 193        12.34         26.86           81.15      477.4          0.10340
 58         13.05         19.31           82.61      527.2          0.08060
 
 [455 rows x 5 columns],
 512    0
 457    1
 439    1
 298    1
 37     1
       ..
 

### Test Set

In [7]:
test_df

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,diagnosis
0,17.99,10.38,122.80,1001.0,0.11840,0
9,12.46,24.04,83.97,475.9,0.11860,0
23,21.16,23.04,137.20,1404.0,0.09428,0
28,15.30,25.27,102.40,732.4,0.10820,0
41,10.95,21.35,71.90,371.1,0.12270,0
...,...,...,...,...,...,...
544,13.87,20.70,89.77,584.8,0.09578,1
551,11.13,22.44,71.49,378.4,0.09566,1
558,14.59,22.68,96.39,657.1,0.08473,1
559,11.51,23.93,74.52,403.5,0.09261,1


In [8]:
test_X,test_y

(     mean_radius  mean_texture  mean_perimeter  mean_area  mean_smoothness
 0          17.99         10.38          122.80     1001.0          0.11840
 9          12.46         24.04           83.97      475.9          0.11860
 23         21.16         23.04          137.20     1404.0          0.09428
 28         15.30         25.27          102.40      732.4          0.10820
 41         10.95         21.35           71.90      371.1          0.12270
 ..           ...           ...             ...        ...              ...
 544        13.87         20.70           89.77      584.8          0.09578
 551        11.13         22.44           71.49      378.4          0.09566
 558        14.59         22.68           96.39      657.1          0.08473
 559        11.51         23.93           74.52      403.5          0.09261
 568         7.76         24.54           47.92      181.0          0.05263
 
 [114 rows x 5 columns],
 0      0
 9      0
 23     0
 28     0
 41     0
       ..
 

## Create, Train, and Evaluate the Models

### Decision Tree Model

In [9]:
dt_classifier = DecisionTreeClassifier()
dt_classifier = dt_classifier.fit(train_X, train_y)

features = train_X.columns.tolist()
tree_text = export_text(dt_classifier, feature_names=features)
# print(tree_text)

dt_prediction = dt_classifier.predict(test_X)
print("predictions:",list(dt_prediction))
print("correct ans:",list(test_y))

# Accuracy
accdt = dt_classifier.score(test_X, test_y)

print("Decision Tree accuracy:", accdt)

predictions: [0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
correct ans: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]
Decision Tree accuracy: 0.9122807017543859


### Neural Network Model

In [10]:
mlp_classifier = MLPClassifier(hidden_layer_sizes=(100,))
mlp_classifier = mlp_classifier.fit(train_X, train_y)

mlp_prediction = mlp_classifier.predict(test_X)
print("predictions:",list(mlp_prediction))
print("correct ans:",list(test_y))

# Accuracy
accmlp = mlp_classifier.score(test_X, test_y)

print("Neural Net accuracy:", accmlp)

predictions: [0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1]
correct ans: [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1]
Neural Net accuracy: 0.8508771929824561
