# <center style='color: darkblue'> Lab02: *Decision Tree* </center>

**Student**
> Full name: Huynh Duc Thien <br>
ID: 21127693 <br>
Contact: hdthien21@clc.fitus.edu.vn <br>
Course: MTH00057_21CLC07 <br>
Class: 21CLC07

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import graphviz
from IPython.display import display, Image
from sklearn import tree
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

## <font style='color: darkblue'> 1. Preparing the data sets </font> <a id="c1"></a>

In [None]:
# Column layout of the nursery data file; the last column ('class') is the target.
col_names = ['parents', 'has_nurs', 'form', 'children', 'housing', 'fiance', 'social', 'health', 'class']
# Column names are supplied explicitly via `names=`, so pandas does not take
# them from the first row of the file.
data = pd.read_csv('./nursery.data.csv', names=col_names)

features = data.drop(['class'], axis=1)
labels = data['class']

# Integer-encode every object-typed feature column. `fit_transform` is invoked
# once per column, so the shared encoder object ends up fitted on whichever
# column was processed last and must not be reused for inverse transforms.
encoder = LabelEncoder()
categorical_cols = features.select_dtypes(include=['object']).columns
features[categorical_cols] = features[categorical_cols].apply(encoder.fit_transform)

features_name = col_names[:-1]
# `export_graphviz(class_names=...)` expects the names in ascending order (the
# same order as a fitted classifier's `classes_`). `unique()` returns them in
# order of first appearance, which mislabels the tree nodes, so sort them.
classes_name = sorted(labels.unique().tolist())

# (train fraction, test fraction) pairs to evaluate.
proportions = [(0.4, 0.6), (0.6, 0.4), (0.8, 0.2), (0.9, 0.1)]
# One (features_train, labels_train, features_test, labels_test) tuple per proportion.
train_test_sets = []

for prop in proportions:
    # Stratified, shuffled split with a fixed seed so each run yields the same subsets.
    features_train, features_test, labels_train, labels_test = train_test_split(
        features.copy(),
        labels.copy(),
        train_size=prop[0],
        test_size=prop[1],
        shuffle=True,
        stratify=labels,
        random_state=0,
    )
    train_test_sets.append((features_train, labels_train, features_test, labels_test))

***Visualize the distribution***

In [None]:
# Per-class sample counts of the full data set, ordered by class name.
data_counts = labels.value_counts().sort_index()

# Per-class counts of every train/test subset, aligned to the same class order
# as `data_counts`; a class missing from a subset is counted as 0.
train_counts = []
test_counts = []
for _, train_labels, _, test_labels in train_test_sets:
    train_counts.append(train_labels.value_counts().reindex(data_counts.index, fill_value=0))
    test_counts.append(test_labels.value_counts().reindex(data_counts.index, fill_value=0))

# One grouped bar chart per split: original / training / test side by side.
bar_width = 0.25
for (train_ratio, test_ratio), train_count, test_count in zip(proportions, train_counts, test_counts):
    plt.figure(figsize=(12, 6))
    plt.title(f"Data Distribution (Train: {train_ratio}, Test: {test_ratio})")

    # Shift the outer two series by one bar width so the three bars of a class
    # sit next to each other, centred on the class tick.
    plt.bar(np.arange(len(data_counts)) - bar_width, data_counts, bar_width, label='Original Data')
    plt.bar(np.arange(len(train_count)), train_count, bar_width, label='Training Data')
    plt.bar(np.arange(len(test_count)) + bar_width, test_count, bar_width, label='Test Data')

    plt.xlabel("Class")
    plt.ylabel("Count")
    plt.xticks(np.arange(len(data_counts)), data_counts.index, rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

## <font style='color: darkblue'> 2. Building the decision tree classifiers </font> <a id="c2"></a>

In [None]:
# Fit one entropy-based decision tree per train/test proportion, keep the fitted
# models in `dtc_list` (same order as `proportions`) and render each tree to PNG.
dtc_list = []
for (features_train, labels_train, _, _), proportion in zip(train_test_sets, proportions):
    print(f"------------ Proportion - Train: {proportion[0]} - Test: {proportion[1]} ------------")
    dtc = DecisionTreeClassifier(criterion="entropy", splitter="best", random_state=0)
    dtc.fit(features_train, labels_train)
    dtc_list.append(dtc)

    # Take the class names from the fitted model: `export_graphviz` indexes
    # `class_names` in the order of `classes_`, so any other ordering (or a
    # class absent from this training split) would mislabel the nodes.
    dot_data = tree.export_graphviz(dtc,
                                    out_file=None,
                                    feature_names=features_name,
                                    class_names=dtc.classes_.tolist(),
                                    filled=True,
                                    special_characters=True)
    graph = graphviz.Source(dot_data)

    # Render to "<path>.png"; cleanup=True removes the intermediate DOT source file.
    output_path = f"./dt_with_proportion/decision_tree_proportion_{proportion[0]}_{proportion[1]}"
    graph.render(output_path, format='png', cleanup=True)
    display(Image(f"{output_path}.png"))

## <font style='color: darkblue'> 3. Evaluating the decision tree classifiers </font> <a id="c3"></a>

In [None]:
# Evaluate every fitted tree on its own held-out test set: print the
# classification report and confusion matrix, then draw the matrix as a heatmap.
for dtc, (_, _, features_test, labels_test), proportion in zip(dtc_list, train_test_sets, proportions):
    labels_pred = dtc.predict(features_test)

    # zero_division=1 reports 1.0 (instead of warning) for a metric whose
    # denominator is zero, e.g. a class that is never predicted.
    report = classification_report(labels_test, labels_pred, zero_division=1)
    print(f"\n\n\nClassification Report - Proportion: {proportion}")
    print(report)

    # Pin the row/column order to the model's classes. Without `labels=` the
    # matrix only covers classes that occur in the test labels or predictions,
    # so it could be smaller than `dtc.classes_` and the heatmap tick labels
    # below would no longer line up with the cells.
    matrix = confusion_matrix(labels_test, labels_pred, labels=dtc.classes_)
    print("Confusion Matrix:")
    print(matrix)

    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix,
                annot=True,
                cmap="Purples",
                fmt="d",
                cbar=True,
                xticklabels=dtc.classes_,
                yticklabels=dtc.classes_)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.title(f"Decision Tree Classifier Heatmap\nProportion: {proportion}")
    plt.show()

## <font style='color: darkblue'> 4. The depth and accuracy of a decision tree </font> <a id="c4"></a>

In [None]:
# Depth limits to compare; None lets the tree grow without a depth limit.
max_depth_values = [2, 3, 4, 5, 6, 7, None]
# Test accuracy for each entry of `max_depth_values`, in the same order.
accuracy_scores = []
# Index 2 of `proportions` is the 80/20 train/test split.
features_train_arc, labels_train_arc, features_test_arc, labels_test_arc = train_test_sets[2]

for max_depth in max_depth_values:
    if max_depth is not None:
        print(f"------------ Max depth: {max_depth} ------------")
    else:
        print("------------ No limit depth ------------")

    # Same criterion and seed as the trees built in section 2, so the sweep is
    # reproducible and the unlimited-depth tree matches the 80/20 tree there.
    dtc = DecisionTreeClassifier(criterion="entropy", max_depth=max_depth, random_state=0)
    dtc.fit(features_train_arc, labels_train_arc)

    labels_pred_arc = dtc.predict(features_test_arc)

    accuracy = accuracy_score(labels_test_arc, labels_pred_arc)
    accuracy_scores.append(accuracy)

    # Class names must follow the order of the fitted model's `classes_`,
    # otherwise the rendered nodes carry the wrong class label.
    dot_data = tree.export_graphviz(dtc,
                                    out_file=None,
                                    feature_names=features_name,
                                    class_names=dtc.classes_.tolist(),
                                    filled=True)
    graph = graphviz.Source(dot_data)

    # Render to "<path>.png"; cleanup=True removes the intermediate DOT source file.
    output_path = f"./limited_depth_tree/decision_tree_depth_{max_depth}"
    graph.render(output_path, format='png', cleanup=True)
    display(Image(f"{output_path}.png"))


# Text table: one row per depth setting with the accuracy measured for it.
print("\n\nAccuracy board")
print("max_depth \tAccuracy")
for depth_limit, score in zip(max_depth_values, accuracy_scores):
    print(f"{depth_limit}\t\t{score}")


# Use string categories on the x-axis so the unlimited-depth entry (None)
# can be shown as 'No limit' next to the numeric depths.
max_depth_str_formatted = [
    'No limit' if depth is None else str(depth)
    for depth in max_depth_values
]

# Line chart of test accuracy against the depth limit.
plt.plot(max_depth_str_formatted, accuracy_scores, marker='o', color='purple')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.title('Accuracy vs. Max Depth')
plt.grid(True)
plt.show()