# Introductory classifier

In [None]:
# In addition to the tools we'll need here, we also load the Wisconsin
# Breast Cancer dataset into a Bunch called wbc. Bunches are sklearn
# objects, very similar to Python dicts.
import sklearn
from sklearn.datasets import load_breast_cancer       # dataset loader
from sklearn.model_selection import train_test_split  # train/test partitioning
from sklearn.naive_bayes import GaussianNB            # the classifier we'll train
from sklearn.metrics import accuracy_score            # scores the predictions
wbc = load_breast_cancer()                            # the dataset, as a Bunch

In [None]:
# Names of the two possible diagnoses: malignant and benign.
diagnoses = wbc.target_names

# The diagnosis recorded for each sample. Malignancies are labeled
# with 0 and benign diagnoses with 1.
results = wbc.target

# Names of the measured properties of cells imaged after fine needle
# aspiration: the mean, standard error, and "worst" (largest) value
# of characteristics such as radius, concavity, etc.
observations = wbc.feature_names

# The actual measurements, one row per sample.
measurements = wbc.data

In [None]:
# train_test_split() partitions the data into a training subset and a
# testing subset.
#
# train_size sets the fraction of the dataset used to train the
# classifier. We begin with just 5%, a very small fraction, and can
# raise it from there. Whatever is left over is used to test the
# classifier.
#
# random_state fixes the seed of the shuffling, so the same split is
# produced on every run.
#
# (Wrapping the statement in parentheses lets us spread it across
# multiple lines.)

(
    trainingSet,
    testingSet,
    trainingDiagnoses,
    testingDiagnoses,
) = train_test_split(
    measurements,
    results,
    train_size=0.05,
    random_state=40,
)

In [None]:
# The values of the various cell characteristics are continuous, so we
# pick the Gaussian variant of the Naive Bayes classifier.
classifier = GaussianNB()

# Train the classifier on the training subset. fit() returns the
# fitted estimator itself, so trainedModel and classifier refer to the
# same object.
trainedModel = classifier.fit(trainingSet, trainingDiagnoses)

# Ask the trained model to predict the diagnoses for the test subset.
# Comparing these predictions with the actual diagnoses lets us
# assess the efficacy of the classifier.
modelPredicts = trainedModel.predict(testingSet)

In [None]:
# What fraction of the test diagnoses did the model predict correctly?
print(accuracy_score(y_true=testingDiagnoses, y_pred=modelPredicts))

In [None]:
# Idea for a possible assignment: parametrize the evaluation of the model,
# plotting accuracy vs. train_size for train_size values from 5% to 95%.
# Maybe compare it to another classifier?