# Wisconsin Diagnostic Breast Cancer (WDBC)

This Jupyter Notebook develops a pattern classifier on the Wisconsin Diagnostic Breast Cancer (WDBC) dataset.

In [None]:
# Importing Libraries
import pandas as pd
from sklearn.model_selection import KFold
import numpy as np
from sklearn import tree

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from matplotlib import pyplot as plt

import seaborn as sns

import shap
import xgboost

import plotly.express as px

In [None]:
# Load the raw CSV and discard the two columns that are not features:
# the record identifier ('id') and the NaN column ('Unnamed: 32').
df = (
    pd.read_csv('data.csv')
    .drop(columns=['Unnamed: 32', 'id'])
)

In [None]:
# Check the output class distribution (number of rows per diagnosis value).
# The column is passed by keyword: recent seaborn releases accept `x`/`y`
# only as keyword arguments, and a positional first argument is taken as
# `data` rather than as the variable to count.
ax = sns.countplot(data=df, x='diagnosis')
ax.set_title('Diagnosis class distribution');

In [None]:
# Run a Decision Tree Classifier on the entire dataset (all feature columns)
# and evaluate it with shuffled 10-fold cross-validation.

# Fixed seed shared by the fold shuffling and the tree, so that the reported
# metrics are identical on every Restart & Run All.
RANDOM_STATE = 42

# Features: every column except the target.
# Target: 'B' -> 0, 'M' -> 1. `map` builds the integer labels directly instead
# of relying on `replace` to downcast an object column to integers.
X = np.array(df.drop(columns=['diagnosis']))
y = np.array(df['diagnosis'].map({'B': 0, 'M': 1}))

kf = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

print(kf)

fold_metrics = []  # one dict of scores per fold
Classifier = []    # the fitted tree of each fold, in fold order

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
    clf = clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Scores are computed on the held-out fold only; class 1 ('M') is the
    # positive class for F1 / precision / recall.
    SubMetric = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
    }

    fold_metrics.append(SubMetric)
    Classifier.append(clf)

# One row per fold, one column per metric.
Metrics = pd.DataFrame(fold_metrics)

print('\n Predict test results:')
print(Metrics)

print('\n Predict test statistics:')
# Summary across folds, reporting the 50th and 90th percentiles.
print(Metrics.describe(percentiles=[.5, .9]).transpose())

In [None]:
# Use SHAP to determine the most important features for the classification.
# X keeps its DataFrame form here so the summary plot can show column names.
# Target: 'B' -> 0, 'M' -> 1. `map` builds the integer labels directly instead
# of relying on `replace` to downcast an object column to integers.
X = df.drop(columns=['diagnosis'])
y = df['diagnosis'].map({'B': 0, 'M': 1})

shap.initjs()

# Train a small boosted-tree model (10 rounds) on the full dataset; it is used
# only as the model to be explained, not for evaluation.
# NOTE(review): no "objective" is passed, so xgboost falls back to its default
# (presumably squared-error regression). For a 0/1 target "binary:logistic"
# may be more appropriate; confirm before relying on the ranking, since the
# later cells hard-code the top features found here.
model = xgboost.train({"learning_rate": 0.01}, xgboost.DMatrix(X, label=y), 10)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Bar summary plot of the SHAP values, one bar per feature.
shap.summary_plot(shap_values, X, plot_type="bar")

In [None]:
# Plot a 3D scatter of the 3 most important features obtained with SHAP,
# colored by diagnosis. The title lets the figure stand alone when skimmed.
fig = px.scatter_3d(
    df,
    x='radius_worst',
    y='concave points_worst',
    z='texture_worst',
    color='diagnosis',
    title='Diagnosis by radius_worst, concave points_worst and texture_worst',
)
fig.show()

In [None]:
# Plot a 2D scatter of the 2 most important features obtained with SHAP,
# colored by diagnosis. The title lets the figure stand alone when skimmed.
fig = px.scatter(
    df,
    x='radius_worst',
    y='concave points_worst',
    color='diagnosis',
    title='Diagnosis by radius_worst and concave points_worst',
)
fig.show()

In [None]:
# Run a Decision Tree Classifier on the 3 most important features obtained
# with SHAP and evaluate it with shuffled 10-fold cross-validation.

# Fixed seed shared by the fold shuffling and the tree, so that the reported
# metrics are identical on every Restart & Run All.
RANDOM_STATE = 42

# Target: 'B' -> 0, 'M' -> 1. `map` builds the integer labels directly instead
# of relying on `replace` to downcast an object column to integers.
X = np.array(df[['radius_worst', 'concave points_worst', 'texture_worst']])
y = np.array(df['diagnosis'].map({'B': 0, 'M': 1}))

kf = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

print(kf)

fold_metrics = []  # one dict of scores per fold
Classifier = []    # the fitted tree of each fold, in fold order

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
    clf = clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Scores are computed on the held-out fold only; class 1 ('M') is the
    # positive class for F1 / precision / recall.
    SubMetric = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
    }

    fold_metrics.append(SubMetric)
    Classifier.append(clf)

# One row per fold, one column per metric.
Metrics = pd.DataFrame(fold_metrics)

print('\n Predict test results:')
print(Metrics)

print('\n Predict test statistics:')
# Summary across folds, reporting the 50th and 90th percentiles.
print(Metrics.describe(percentiles=[.5, .9]).transpose())

In [None]:
# Run a Decision Tree Classifier on the 2 most important features obtained
# with SHAP and evaluate it with shuffled 10-fold cross-validation.

# Fixed seed shared by the fold shuffling and the tree, so that the reported
# metrics are identical on every Restart & Run All.
RANDOM_STATE = 42

# Target: 'B' -> 0, 'M' -> 1. `map` builds the integer labels directly instead
# of relying on `replace` to downcast an object column to integers.
X = np.array(df[['radius_worst', 'concave points_worst']])
y = np.array(df['diagnosis'].map({'B': 0, 'M': 1}))

kf = KFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)

print(kf)

fold_metrics = []  # one dict of scores per fold
Classifier = []    # the fitted tree of each fold, in fold order

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
    clf = clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    # Scores are computed on the held-out fold only; class 1 ('M') is the
    # positive class for F1 / precision / recall.
    SubMetric = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1': f1_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
    }

    fold_metrics.append(SubMetric)
    Classifier.append(clf)

# One row per fold, one column per metric.
Metrics = pd.DataFrame(fold_metrics)

print('\n Predict test results:')
print(Metrics)

print('\n Predict test statistics:')
# Summary across folds, reporting the 50th and 90th percentiles.
print(Metrics.describe(percentiles=[.5, .9]).transpose())