In [1]:
# imports
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import sklearn

from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [2]:
# random seed and test fraction declaration
random_seed = 25
np.random.seed(random_seed)

test_data_fraction = 0.2

In [8]:
import warnings
warnings.filterwarnings('ignore')

# load the data and put into a dataframe
bc_sk = datasets.load_breast_cancer()
bc_sk.data = MinMaxScaler().fit_transform(bc_sk.data)
bc_data = pd.DataFrame(data= np.c_[bc_sk['data'], bc_sk['target']],columns= list(bc_sk['feature_names'])+['target'])
bc_features = bc_data.iloc[:,0:-1]
bc_labels = bc_data["target"]

# split the data into test and train and create a decision tree
X_train, X_test, Y_train, Y_test = train_test_split(bc_features, bc_labels, test_size=test_data_fraction,  random_state=random_seed)
Y_test_predicted = DecisionTreeClassifier(criterion = "gini", random_state=random_seed).fit(X=X_train, y=Y_train).predict(X_test)

In [4]:
# display the confusion matrix on the test data
confusion_matrix(Y_test, Y_test_predicted)

array([[33,  6],
       [ 3, 72]], dtype=int64)

In [5]:
# this is where the support numbers come from
print("Total records in Y_test_predicted = ", len(Y_test_predicted))
print("Total records in Y_test_predicted = 0 = 33+3 = ", 33+3)
print("Total records in Y_Test = ", len(Y_test))
print("Total records in Y_Test = 0 = 33 + 6 = ", 33+6, '\n')

# this is how the accuracy was calculated
print("Accuracy = (33+72)/114 = ", (33+72)/114)
print(f'Accuracy: {sklearn.metrics.accuracy_score(Y_test, Y_test_predicted)}' + '\n')

# this is how the macro precision is calculated
print("Precision Macro for label 0 = 33/(33+3) = ", 33/(33+3))
print("Precision Macro for label 1 = 72/(6+72) = ", 72/(6+72))
print("Precision Macro average = (33/(33+3) + 72/(6+72))/2 = ", (33/(33+3) + 72/(6+72))/2)
print(f'Precision Macro: {sklearn.metrics.precision_score(Y_test, Y_test_predicted, average="macro")}', '\n')

# this is how the recal macro is calculated
print("Recal Macro for label 0 = 33/(33+6) = ", 33/(33+6))
print("Recal Macro for label 1 = 72/(3+72) = ", 72/(3+72))
print("Recal Macro average = (33/(33+6) + 72/(3+72))/2 = ", (33/(33+6) + 72/(3+72))/2)
print(f'Recall Macro: {sklearn.metrics.recall_score(Y_test, Y_test_predicted, average="macro")}', '\n')

# This is how the F1 macro is calculated
print("F1 Macro for label 0 = 2*33/(2*33 + 6 + 3)", 2*33/(2*33 + 6 + 3)) 
print("F1 Macro for label 1 = 2*72/(2*72 + 6 + 3)", 2*72/(2*72 + 6 + 3))
print("F1 Macro average = (2*33/(2*33 + 6 + 3) + 2*72/(2*72 + 6 + 3))/2 = ", (2*33/(2*33 + 6 + 3) + 2*72/(2*72 + 6 + 3))/2)
print(f'F1 Macro: { sklearn.metrics.f1_score(Y_test, Y_test_predicted, average="macro") }' + '\n')

print("Weighted average for precision = .9167*39/114 + .9231*75/114 = ", .9167*39/114 + .9231*75/114)
print("Weighted average for recal = .8462*39/114 + .9600*75/114 = ", .8462*39/114 + .9600*75/114)
print("Weighted average for precision = .8800*39/114 + .9412*75/114 = ", .8800*39/114 + .9412*75/114)

Total records in Y_test_predicted =  114
Total records in Y_test_predicted = 0 = 33+3 =  36
Total records in Y_Test =  114
Total records in Y_Test = 0 = 33 + 6 =  39 

Accuracy = (33+72)/114 =  0.9210526315789473
Accuracy: 0.9210526315789473

Precision Macro for label 0 = 33/(33+3) =  0.9166666666666666
Precision Macro for label 1 = 72/(6+72) =  0.9230769230769231
Precision Macro average = (33/(33+3) + 72/(6+72))/2 =  0.9198717948717949
Precision Macro: 0.9198717948717949 

Recal Macro for label 0 = 33/(33+6) =  0.8461538461538461
Recal Macro for label 1 = 72/(3+72) =  0.96
Recal Macro average = (33/(33+6) + 72/(3+72))/2 =  0.9030769230769231
Recall Macro: 0.9030769230769231 

F1 Macro for label 0 = 2*33/(2*33 + 6 + 3) 0.88
F1 Macro for label 1 = 2*72/(2*72 + 6 + 3) 0.9411764705882353
F1 Macro average = (2*33/(2*33 + 6 + 3) + 2*72/(2*72 + 6 + 3))/2 =  0.9105882352941177
F1 Macro: 0.9105882352941176

Weighted average for precision = .9167*39/114 + .9231*75/114 =  0.9209105263157895
Weig

In [6]:
print("Decision Tree")
print(classification_report(Y_test,Y_test_predicted,digits=4))

Decision Tree
              precision    recall  f1-score   support

         0.0     0.9167    0.8462    0.8800        39
         1.0     0.9231    0.9600    0.9412        75

    accuracy                         0.9211       114
   macro avg     0.9199    0.9031    0.9106       114
weighted avg     0.9209    0.9211    0.9202       114



In [7]:
# the following shows the complete calculations for the micro metrics
print(confusion_matrix(Y_test, Y_test_predicted), '\n')

print("Precision Micro = (33+72)/(33+72+3+6) = ", (33+72)/(33+72+3+6))
print(f'Precision Micro: {sklearn.metrics.precision_score(Y_test, Y_test_predicted, average="micro")}', '\n')

print("Recal Micro = (33+72)/(33+72+3+6) = ", (33+72)/(33+72+3+6))
print(f'Recall Micro: {sklearn.metrics.recall_score(Y_test, Y_test_predicted, average="micro")}', '\n')

print("F1 Micro = (2*recal*precision) / (recal + precision) = ", (2*.910526315789473*.910526315789473)/(.910526315789473+.910526315789473))
print(f'F1 Micro: { sklearn.metrics.f1_score(Y_test, Y_test_predicted, average="micro") }')

[[33  6]
 [ 3 72]] 

Precision Micro = (33+72)/(33+72+3+6) =  0.9210526315789473
Precision Micro: 0.9210526315789473 

Recal Micro = (33+72)/(33+72+3+6) =  0.9210526315789473
Recall Micro: 0.9210526315789473 

F1 Micro = (2*recal*precision) / (recal + precision) =  0.910526315789473
F1 Micro: 0.9210526315789473
