# Lab 2 : Decision Tree

## Preparing the dataset

Import necessities

In [1]:
import os
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report, confusion_matrix
import graphviz

Install the `ucimlrepo` package, which is used to download the dataset

In [2]:
# Install the `ucimlrepo` package with the %pip magic; a later cell imports `fetch_ucirepo` from it.
%pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


Fetch the Breast Cancer Wisconsin (Diagnostic) dataset (UCI repository id 17) and load it into memory as `pandas.DataFrame` objects

In [3]:
# Imported here rather than in the first cell because the package is installed by the preceding %pip cell.
from ucimlrepo import fetch_ucirepo

# Dataset id 17 of the UCI repository (Breast Cancer Wisconsin, Diagnostic).
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Model inputs and prediction targets, both taken from the fetched object's `.data` attribute.
feature, label = (
    breast_cancer_wisconsin_diagnostic.data.features,
    breast_cancer_wisconsin_diagnostic.data.targets,
)

Split dataset into two parts: training and testing (validation)

In [4]:
# Fixed seed for the shuffling done by train_test_split, so that re-running the
# notebook reproduces exactly the same train/test partitions.
SPLIT_RANDOM_STATE = 42

# Each dict maps a proportion label such as '60/40' to the corresponding subset.
train_features, train_labels = dict(), dict()
test_features, test_labels = dict(), dict()

# Labels are 'train%/test%'.
train_test_propotions = ['60/40', '40/60', '80/20', '90/10']

for propotion in train_test_propotions:
    # Read the test share straight from the label instead of computing
    # 1 - train/100, which introduces floating-point error (e.g. 0.0999...).
    test_size = int(propotion.split('/')[1]) / 100
    feature_train, feature_test, label_train, label_test = train_test_split(
        feature, label,
        test_size=test_size,
        stratify=label,  # keep the class ratio identical in both subsets
        random_state=SPLIT_RANDOM_STATE)
    train_features[propotion] = feature_train
    train_labels[propotion] = label_train
    test_features[propotion] = feature_test
    test_labels[propotion] = label_test

## Building the tree classifier

Train 4 models, one per train/test proportion, and store them (their accuracies are computed in the evaluation section below)

In [5]:
# Fixed seed for the tree builder: without it, ties between equally good splits
# may be broken differently on each run, giving different trees and metrics.
MODEL_RANDOM_STATE = 42

# Maps each proportion label (e.g. '60/40') to the classifier fitted on that split.
models = dict()

for propotion in train_test_propotions:
    model = DecisionTreeClassifier(random_state=MODEL_RANDOM_STATE)
    model.fit(train_features[propotion], train_labels[propotion])
    models[propotion] = model

The `export_model_image` function exports a visualization of a decision tree as a PNG image under a custom file name.

In [6]:
def export_model_image(model, filename, save_picture=True):
    """Build a Graphviz visualization of a fitted decision tree.

    Parameters
    ----------
    model
        Fitted decision tree, passed unchanged to ``export_graphviz``.
    filename : str
        Output path handed to ``graph.render``; the output format is PNG.
    save_picture : bool, default True
        When True the graph is rendered to disk. When False nothing is
        written and the graph is only returned.

    Returns
    -------
    graphviz.Source
        The graph object, so callers can display or render it themselves.
    """
    dot_data = export_graphviz(
        model,
        out_file=None,  # return the DOT source as a string instead of writing a file
        # Column names of the notebook-level `feature` frame the models were trained on.
        feature_names=list(feature),
        # Display names for the dataset's 'B' and 'M' labels, in that order.
        class_names=['benign', 'malignant'],
        filled=True,
        rounded=True,
        special_characters=True,
    )

    graph = graphviz.Source(dot_data)
    graph.format = 'png'
    if save_picture:
        graph.render(filename=filename)
    return graph

Export visualizations of all models

In [7]:
visualization_folder = 'images'

# Create the output folder if needed; exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of exists() followed by mkdir().
os.makedirs(visualization_folder, exist_ok=True)

for propotion, model in models.items():
    # '/' in a label such as '60/40' would be read as a path separator,
    # so replace it before building the file name.
    file_tag = propotion.replace('/', '_')
    export_model_image(model, f'{visualization_folder}/Decision_Tree_({file_tag})')

## Evaluating the decision tree classifiers

Create a classification report and a confusion matrix for each model

In [10]:
# Both dicts are keyed by proportion label (e.g. '60/40').
model_classification_reports = dict()
model_confusion_matrices = dict()

for model_name, model in models.items():
    true_labels = test_labels[model_name]
    # Predict once and reuse the result for both metrics instead of running
    # inference on the same test set twice.
    predicted_labels = model.predict(test_features[model_name])

    cls_report = classification_report(true_labels, predicted_labels)
    conf_matrix = confusion_matrix(true_labels, predicted_labels)
    print(f'Model: {model_name}')
    print(f'Classification Report:\n{cls_report}')
    print(f'Confusion Matrix:\n{conf_matrix}\n')
    model_classification_reports[model_name] = cls_report
    model_confusion_matrices[model_name] = conf_matrix

Model: 60/40
Classification Report:
              precision    recall  f1-score   support

           B       0.97      0.93      0.95       143
           M       0.89      0.95      0.92        85

    accuracy                           0.94       228
   macro avg       0.93      0.94      0.94       228
weighted avg       0.94      0.94      0.94       228

Confusion Matrix:
[[133  10]
 [  4  81]]

Model: 40/60
Classification Report:
              precision    recall  f1-score   support

           B       0.94      0.91      0.92       215
           M       0.86      0.90      0.88       127

    accuracy                           0.91       342
   macro avg       0.90      0.90      0.90       342
weighted avg       0.91      0.91      0.91       342

Confusion Matrix:
[[196  19]
 [ 13 114]]

Model: 80/20
Classification Report:
              precision    recall  f1-score   support

           B       0.92      0.96      0.94        72
           M       0.92      0.86      0.89  