# Lab 2 : Decision Tree

## Preparing the dataset

Import necessities

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import export_graphviz
import graphviz

Install the `ucimlrepo` package, which is used to download the dataset

In [2]:
# Install `ucimlrepo` into the kernel's environment; it provides `fetch_ucirepo`, used below to load the dataset.
%pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


Fetch the dataset from the UCI Machine Learning Repository and load it into memory as `pandas.DataFrame` objects

In [3]:
# Imported here rather than in the first cell because the package is only
# installed by the `%pip install` cell above.
from ucimlrepo import fetch_ucirepo

# Repository id 17 is the Breast Cancer Wisconsin (Diagnostic) dataset.
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)

# Separate the measurements (features) from the diagnosis column (targets).
dataset_tables = breast_cancer_wisconsin_diagnostic.data
feature, label = dataset_tables.features, dataset_tables.targets

Split the dataset into training and testing (validation) parts, once for each training proportion (40%, 60%, 80% and 90%)

In [4]:
# Fixed seed so that a fresh "Restart & Run All" reproduces exactly the same
# splits (and therefore the same accuracies and tree images).
SPLIT_SEED = 42

# Parallel lists: index k holds the split made with train_propotions[k].
train_feature, train_label = [], []
test_feature, test_label = [], []

# Fraction of the samples used for training in each experiment.
train_propotions = [0.4, 0.6, 0.8, 0.9]
for propotion in train_propotions:
    feature_train, feature_test, label_train, label_test = train_test_split(
        feature, label,
        test_size=1-propotion,
        stratify=label,  # keep the class ratio of `label` in both parts
        random_state=SPLIT_SEED)
    train_feature.append(feature_train)
    test_feature.append(feature_test)
    train_label.append(label_train)
    test_label.append(label_test)

In [5]:
# Sanity check: one training part and one testing part per proportion (4 + 4 = 8).
sum(len(parts) for parts in (train_feature, test_feature))

8

## Building the tree classifier

Train 4 models and save them along with their accuracies

In [6]:
# Fixed seed: the tree picks at random between equally good candidate splits,
# so without it the fitted trees (and accuracies) change from run to run.
MODEL_SEED = 42

# Maps "model_<n>" -> (fitted classifier, accuracy on the matching test part).
models = {}

# Walk the four parallel split lists together; n is 1-based and follows the
# order in which the splits were created.
for split_number, (x_train, y_train, x_test, y_test) in enumerate(
        zip(train_feature, train_label, test_feature, test_label), start=1):
    model = DecisionTreeClassifier(criterion='entropy', random_state=MODEL_SEED)
    model.fit(x_train, y_train)
    label_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, label_pred)

    models[f"model_{split_number}"] = model, accuracy

The `export_model_image` function exports an image visualizing the decision tree, saved under a custom file name.

In [7]:
def export_model_image(model, filename, save_picture=True):
    """Build a Graphviz visualization of a fitted decision tree.

    Parameters
    ----------
    model
        Fitted tree estimator, passed straight to `export_graphviz`.
    filename : str
        Base name used when rendering; the PNG is written next to the
        Graphviz source file of that name.
    save_picture : bool, default True
        When True, render the tree to a PNG file and open it in the system
        viewer. When False, nothing is written or opened and the graph is
        only returned.

    Returns
    -------
    graphviz.Source
        The graph object; as the last expression of a cell it is displayed
        inline in the notebook.
    """
    dot_data = export_graphviz(
        model,
        out_file=None,  # return the DOT source as a string instead of writing a file
        # Column labels of the notebook-level `feature` DataFrame the models were trained on.
        feature_names=list(feature.columns),
        # Names are matched to the classes in ascending order.
        # NOTE(review): assumes the labels sort as benign first, then malignant — confirm against model.classes_.
        class_names=['benign', 'malignant'],
        filled=True,
        rounded=True,
        special_characters=True,
    )

    graph = graphviz.Source(dot_data)
    graph.format = 'png'
    if save_picture:
        # One call both renders and opens the result; a separate view() call
        # would render the same file a second time.
        graph.render(filename=filename, view=True)
    return graph

Export visualizations of all models

In [8]:
# Each entry is (fitted model, accuracy); only the model is needed to draw the tree.
for model_name, (fitted_model, _accuracy) in models.items():
    export_model_image(fitted_model, filename=model_name)