In [None]:
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Predicting Malignant Breast Cancer Cells

The data come from the University of California, Irvine (UCI) Machine Learning Repository. They are read in for you below, with column names based on the original description of the dataset.

https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/

In [None]:
# Column labels, in file order, based on the dataset's original description.
COLUMN_NAMES = [
    'samplenum',
    'clump thickness',
    'uniform cell size',
    'uniform cell shape',
    'marginal adhesion',
    'single epithelial cell size',
    'bare nuclei',
    'bland chromatin',
    'normal nucleoli',
    'mitoses',
    'class',
]
DATA_URL = ('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/'
            'breast-cancer-wisconsin.data')

# `names=` supplies the header, so every row of the file is read as data;
# '?' entries are parsed as NaN.
df = pd.read_csv(DATA_URL, sep=',', na_values='?', names=COLUMN_NAMES)

Here I drop the `samplenum` column and any rows that contain at least one NaN value, then describe the overall dataset. Note that the class variable takes two values, [2, 4], which represent ['benign', 'malignant'], respectively.

In [None]:
# Remove the 'samplenum' column and every row that has at least one NaN.
# errors='ignore' keeps this cell safe to re-run: once 'samplenum' is gone,
# a second execution is a no-op instead of raising KeyError.
df = df.drop(columns=['samplenum'], errors='ignore').dropna()
df.describe()

A quick look at the correlation between feature variables and the target variable (class).

In [None]:
# Correlate only the numeric columns so this cell still runs if it is
# re-executed after the string-valued 'cancer_class' column has been added
# to df (DataFrame.corr raises on non-numeric columns in recent pandas).
corr_matrix = df.select_dtypes(include='number').corr()

# Explicit figure/axes instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, cmap=plt.cm.coolwarm, ax=ax)
ax.set_title('Pairwise correlation of features and class')

plt.show()

Describe the balance of the target class outcomes (i.e., the 'class' column).

In [None]:
# Human-readable labels for the numeric class codes (2 -> benign, 4 -> malignant).
class_labels = {2: 'benign', 4: 'malignant'}
df['cancer_class'] = df['class'].map(class_labels)

# One bar per label, coloured by that same label.
ax = sns.histplot(data=df, x='cancer_class', hue='cancer_class')
ax.set_title('Histogram of the two classes from the whole dataset')
plt.show()

Split the dataset into feature and target variables, keeping only the predictors most highly correlated with the target.

In [None]:
# Predictor columns kept for modelling; the string label column is the target.
predictor_columns = [
    'uniform cell size',
    'bare nuclei',
    'bland chromatin',
    'clump thickness',
    'normal nucleoli',
]
features = df[predictor_columns]
target = df['cancer_class']

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=1,
)

How many values are in the training dataset? How many are in the testing dataset?

Fit a simple Naive Bayes model...

In [None]:
# Fit Gaussian Naive Bayes on the training split (fit returns the estimator
# itself), then predict labels for the held-out test rows.
model = GaussianNB().fit(Xtrain, ytrain)
y_model = model.predict(Xtest)

What are the accuracy, recall, and precision of the Naive Bayes model?

Create a decision tree classifier for the same data that are used in the Naive Bayes model...

What are the accuracy, recall, and precision of the decision tree model?

What are the important features from the decision tree model?