In [53]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, cross_validate, KFold

In [54]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
bc_data = pd.read_csv('/content/drive/My Drive/HI1020/BCdataset.csv')
bc_data

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.380,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.990,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.570,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.910,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.540,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,25.450,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,23.690,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,18.980,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,25.740,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400


In [56]:
keep = bc_data[['diagnosis','radius_mean', 'perimeter_mean', 'smoothness_mean', 'concave points_mean']]
bc_df = keep.dropna()
bc_df

Unnamed: 0,diagnosis,radius_mean,perimeter_mean,smoothness_mean,concave points_mean
0,M,17.99,122.80,0.11840,0.14710
1,M,20.57,132.90,0.08474,0.07017
2,M,19.69,130.00,0.10960,0.12790
3,M,11.42,77.58,0.14250,0.10520
4,M,20.29,135.10,0.10030,0.10430
...,...,...,...,...,...
564,M,21.56,142.00,0.11100,0.13890
565,M,20.13,131.20,0.09780,0.09791
566,M,16.60,108.30,0.08455,0.05302
567,M,20.60,140.10,0.11780,0.15200


In [58]:
# Step 4: Making the dataset available for “Supervised” Machine Learning algorithms

In [94]:
## Select variables (x-values) and the outcome (y-values)
X = bc_df.values[:, 1:5]
Y = bc_df.values[:, 0]

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 100)

In [96]:
## Data normalization
min_max_scaler = preprocessing.MinMaxScaler()
X_train_scaled = min_max_scaler.fit_transform(X_train)
X_test_scaled = min_max_scaler.transform(X_test)

bc_train_norm = pd.DataFrame(X_train_scaled)
bc_test_norm= pd.DataFrame(X_test_scaled)

In [97]:
bc_train_norm

Unnamed: 0,0,1,2,3
0,0.607175,0.595743,0.412386,0.472068
1,0.494060,0.488632,0.433059,0.327883
2,0.317052,0.303849,0.362372,0.093439
3,0.273984,0.264184,0.314706,0.142346
4,0.333617,0.317877,0.343685,0.094235
...,...,...,...,...
393,0.306640,0.301638,0.495351,0.156660
394,0.531923,0.528713,0.561253,0.526839
395,0.221449,0.206689,0.207457,0.057753
396,0.278243,0.272269,0.421685,0.115060


In [98]:
bc_test_norm

Unnamed: 0,0,1,2,3
0,0.517251,0.557045,0.635280,0.595427
1,0.348289,0.336673,0.419157,0.228777
2,0.623740,0.603345,0.248894,0.383996
3,0.193999,0.182572,0.433059,0.128380
4,0.247953,0.246562,0.514309,0.107654
...,...,...,...,...
166,0.313739,0.305853,0.381421,0.223111
167,0.318472,0.320710,0.598267,0.297465
168,0.299068,0.286435,0.299630,0.120477
169,0.601022,0.595052,0.409317,0.548211


In [100]:
## Split data into train/test sets with K-fold cross validation
cv_bc = KFold(n_splits=5, random_state=1, shuffle=True)
DT_results = cross_validate(DecisionTreeClassifier(), X, Y, cv= cv_bc, scoring=['accuracy'], return_train_score=True)

accuracy_score_train= np.around(DT_results['train_accuracy'], 2)
mean_accuracy_train= DT_results['train_accuracy'].mean()
accuracy_score_test= np.around(DT_results['test_accuracy'], 3)
mean_accuracy_test= DT_results['test_accuracy'].mean()

print("\nAccuracy score on the Train set using DT:", accuracy_score_train)
print("Mean Accuracy score on the Train set using DT: ", mean_accuracy_train)
print("\nAccuracy score on Test set using DT:", accuracy_score_test)
print("Mean Accuracy score on the Test set using DT: ", mean_accuracy_test)


Accuracy score on the Train set using DT: [1. 1. 1. 1. 1.]
Mean Accuracy score on the Train set using DT:  1.0

Accuracy score on Test set using DT: [0.904 0.895 0.877 0.93  0.92 ]
Mean Accuracy score on the Test set using DT:  0.9051234280391244


In [101]:
# Step 5: Building and evaluating decision tree machine learning predictive model

In [102]:
diagnosis_class= DecisionTreeClassifier()
diagnosis_class.fit(X_train_scaled, y_train)
y_pred = diagnosis_class.predict(X_test_scaled)

In [103]:
accuracy = accuracy_score(y_test, y_pred)
precision= precision_score(y_test, y_pred, average='weighted')
recall= recall_score(y_test, y_pred, average='weighted')

print("Decision Tree Accuracy: %.3f " % accuracy)
print("Decision Tree Precision: %.3f " % precision)
print("Decision Tree Recall: %.3f " % recall)

Decision Tree Accuracy: 0.901 
Decision Tree Precision: 0.900 
Decision Tree Recall: 0.901 
