In [12]:
import sklearn
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [13]:
# Load the breast-cancer dataset that ships with scikit-learn.
data = load_breast_cancer()     # import dataset (indexed by string keys like a dictionary; see the next cell)

In [14]:
# Pull the four pieces we need out of the dataset into separate variables
# (the printed output further down shows them as arrays rather than plain lists):
features, feature_names = data['data'], data['feature_names']   # measurements and the name of each measurement
labels, label_names = data['target'], data['target_names']      # class code per sample and the class names

In [15]:
# Take a look at the data -- one item per output line:
print(
    label_names,        # Class names
    labels[0],          # Class code (0/1) of the first sample
    feature_names[0],   # Name of the first feature
    features[0],        # All feature values of the first sample
    sep='\n',
)

['malignant' 'benign']
0
mean radius
[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]


Output shows that our first
data instance is a malignant tumour (label 0) whose mean radius is
1.799e+01, i.e. 17.99.

In [16]:
# Split our data: 33% of the samples are randomly held out as the test set and
# the remaining 67% are used for training. random_state is fixed so the same
# split is produced on every run.
train, test, train_labels, test_labels = train_test_split(
    features, labels, test_size=0.33, random_state=42
)

In [17]:
# Build a Gaussian Naive Bayes classifier and train it on the training split.
# NOTE(review): `model` is never read in the visible cells; predictions below go through `gnb`.
gnb = GaussianNB()  # Initializing our classifier
model = gnb.fit(train, train_labels)    # Training classifier on the training features and their labels

In [18]:
# Now we will make predictions on the held-out test features using our trained classifier:
preds = gnb.predict(test)   # predicted class codes (0/1), as shown in the output below
print(preds)

[1 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0
 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 1 1 0
 1 1 0 0 0 1 1 1 0 0 1 1 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 0 0 1 0 1 1 0 1 0 0
 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 0 1 1 1 1 1 1 0 0
 0 1 1]


In [19]:
# Evaluating classifier accuracy on the held-out test set:
print(accuracy_score(test_labels, preds))   # true labels first, predicted labels second

0.9414893617021277


Hence, with about 94% accuracy on the held-out test set, the 30 features in our dataset are, taken together, good indicators of tumour class.