## **Zadanie 4 - Drzewo decyzyjne ID3**

Cel zadania polega na implementacji drzewa decyzyjnego tworzonego algorytmem ID3 z ograniczeniem maksymalnej głębokości drzewa, jak również na stworzeniu i zbadaniu jakości klasyfikatora dla zbioru danych [Tic-Tac-Toe Endgame](https://archive.ics.uci.edu/dataset/101/tic+tac+toe+endgame).

**Kroki do wykonania:**
- Zaimplementuj drzewo decyzyjne ID3 (z ograniczeniem jego maksymalnej głębokości).
- Zbadaj skuteczność działania klasyfikatora dla zbioru danych Tic-Tac-Toe Endgame, obliczając dokładność i macierz pomyłek.

**Uwagi**
- Należy pamiętać o podziale danych na zbiory trenujący, walidacyjny i testowy.
- Zaimplementowana metoda powinna być uniwersalna - nie należy "zaszywać" na sztywno w kodzie np. nazwy pliku ze zbiorem danych czy wartości atrybutów.

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Fetch the dataset registered under id 101 (the Tic-Tac-Toe Endgame set linked
# in the task description above).
tic_tac_toe_endgame = fetch_ucirepo(id=101)

# data (as pandas dataframes): X holds the feature columns, y the target(s)
X = tic_tac_toe_endgame.data.features
y = tic_tac_toe_endgame.data.targets

In [None]:
# Print the metadata object shipped with the fetched dataset.
print(tic_tac_toe_endgame.metadata)

In [None]:
# Print the per-variable information shipped with the fetched dataset.
print(tic_tac_toe_endgame.variables)

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Encode the target classes as integers.
# The fetch cell notes that the data arrives as pandas dataframes, so `y` is
# flattened to 1-D first: LabelEncoder expects a 1-D array and only accepts a
# column vector with a DataConversionWarning.
le = LabelEncoder()
y = le.fit_transform(np.ravel(y))

# Keep the encoded labels as a pandas Series.
y = pd.Series(y)

In [None]:
# Bare expression: shows the encoded target as the notebook cell output.
y

In [None]:
# Candidate attribute values, hard-coded for this dataset.
# NOTE(review): presumably 'x', 'o' and 'b' (blank) -- confirm against the
# dataset description printed above.
ALL_VARIABLES_VALUES = ['x', 'o', 'b']

In [None]:
class Node:
    """One vertex of a decision tree.

    Attributes (all start unset and are filled in through the setters):
        feature_name: name of the feature this node splits on, or None.
        value: feature value stored on this node, or None.
        label: class label assigned to this node, or None.
        children: child nodes, in the order they were added.
    """

    def __init__(self):
        # Each node owns a fresh list, so children are never shared.
        self.children = []
        self.feature_name = self.value = self.label = None

    def add_child(self, child):
        """Append ``child`` to this node's children."""
        self.children += [child]

    def set_feature_name(self, feature_name):
        """Store the name of the feature this node splits on."""
        self.feature_name = feature_name

    def set_value(self, value):
        """Store the feature value associated with this node."""
        self.value = value

    def set_label(self, label):
        """Store the class label assigned to this node."""
        self.label = label

In [None]:
class DecisionTreeID3:
    """ID3 decision tree classifier for categorical features.

    The tree is grown greedily: every internal node splits on the feature with
    the highest information gain and gets one child per candidate value of
    that feature. Growth stops when ``max_depth`` is reached, when a node is
    pure, or when no unused feature is left.

    Args:
        max_depth: Maximum number of splits on any root-to-leaf path.
            ``None`` means no explicit limit.
        leaf_classes: Optional iterable of attribute values to branch on for
            every feature. When ``None`` (default) the values are collected
            per feature from the training data in ``fit``, so nothing about
            the dataset is hard-coded.
    """

    def __init__(self, max_depth = 5, leaf_classes = None):
        self.max_depth = max_depth
        # Copy so that later changes to the caller's list cannot alter the tree.
        self.leaf_classes = None if leaf_classes is None else list(leaf_classes)
        self.root = None
        # feature name -> values seen in the training data (filled by fit).
        self._feature_values = {}

    @staticmethod
    def _as_labels(y):
        """Return ``y`` as a flat NumPy array so it can be indexed by position."""
        return np.asarray(y).ravel()

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent label; ties go to the smallest label."""
        values, counts = np.unique(labels, return_counts=True)
        return values[np.argmax(counts)]

    def _branch_values(self, X, feature):
        """Return the attribute values that get a child when splitting on ``feature``."""
        if self.leaf_classes is not None:
            return self.leaf_classes
        known = getattr(self, "_feature_values", {}).get(feature)
        if known is None:
            # id3() was called without fit(): fall back to the values in X.
            known = list(np.unique(np.asarray(X[feature])))
        return known

    def calculate_entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels ``y``.

        Works for any label type that ``np.unique`` can handle and returns
        0.0 for an empty input.
        """
        labels = self._as_labels(y)
        if labels.size == 0:
            return 0.0
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / labels.size
        # Every count is positive, so log2 is always defined here.
        return float(-np.sum(probabilities * np.log2(probabilities)))

    def calculate_information_gain(self, X, y, feature):
        """Return the information gain of splitting ``(X, y)`` on ``feature``.

        Rows of ``X`` and entries of ``y`` are matched by position, not by
        pandas index.
        """
        labels = self._as_labels(y)
        column = X[feature]
        weighted_entropy = 0.0
        for value in np.unique(np.asarray(column)):
            mask = (column == value).to_numpy()
            weight = mask.sum() / len(mask)
            weighted_entropy += weight * self.calculate_entropy(labels[mask])

        return self.calculate_entropy(labels) - weighted_entropy

    def choose_best_feature(self, X, y):
        """Return the column of ``X`` with the highest information gain.

        Ties are resolved in favour of the column that comes first in ``X``.
        """
        gains = {
            feature: self.calculate_information_gain(X, y, feature)
            for feature in X.columns
        }
        return max(gains, key=gains.get)

    def fit(self, X_train, y_train):
        """Build the tree from the training features and labels.

        Args:
            X_train: DataFrame of categorical features.
            y_train: Labels, one per row of ``X_train`` (matched by position).

        Raises:
            ValueError: If the inputs are empty or differ in length.
        """
        labels = self._as_labels(y_train)
        if len(labels) != len(X_train):
            raise ValueError("X_train and y_train must have the same number of rows")
        if len(labels) == 0:
            raise ValueError("cannot fit a decision tree on an empty data set")

        if self.leaf_classes is None:
            self._feature_values = {
                feature: list(np.unique(np.asarray(X_train[feature])))
                for feature in X_train.columns
            }
        else:
            self._feature_values = {}

        self.root = Node()
        self.id3(self.root, X_train, labels, 0)

    def id3(self, node, X, y, depth):
        """Recursively grow the subtree rooted at ``node`` from ``(X, y)``.

        Every node, internal or leaf, stores the majority class of its data in
        ``label``. Internal nodes additionally get a ``feature_name`` and
        children; their label is the fallback prediction for attribute values
        that have no matching child.
        """
        labels = self._as_labels(y)
        majority = self._majority_label(labels)
        node.set_label(majority)

        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(np.unique(labels)) <= 1 or len(X.columns) == 0:
            return None

        split_feature = self.choose_best_feature(X, labels)
        node.set_feature_name(split_feature)

        column = X[split_feature]
        # A feature is constant inside each branch, so drop it from the candidates.
        remaining = X.drop(columns=split_feature)

        for value in self._branch_values(X, split_feature):
            child_node = Node()
            child_node.set_value(value)
            node.add_child(child_node)

            mask = (column == value).to_numpy()
            if not mask.any():
                # No training rows for this value: predict the parent's majority.
                child_node.set_label(majority)
            else:
                self.id3(child_node, remaining.loc[mask], labels[mask], depth + 1)

    def predict(self, X):
        """Return a list with the predicted label for every row of ``X``.

        Raises:
            RuntimeError: If the tree has not been fitted yet.
        """
        if self.root is None:
            raise RuntimeError("DecisionTreeID3 must be fitted before calling predict")
        return [self.predict_single(self.root, row) for _, row in X.iterrows()]

    def predict_single(self, node, x):
        """Return the label for sample ``x``, starting the descent at ``node``.

        ``x`` is indexed by feature name. If a feature value has no matching
        child, the label stored on the current node is returned.
        """
        while node.feature_name is not None:
            observed = x[node.feature_name]
            for child in node.children:
                if observed == child.value:
                    node = child
                    break
            else:
                # Attribute value unseen at this node: stop and use its label.
                return node.label
        return node.label

    def print_tree(self, node, level=0, prefix="Root"):
        """Print the subtree rooted at ``node``, indenting each level by three spaces."""
        indent = "   " * level
        if node.value is not None:
            prefix = f"{prefix} ({node.value})"

        if node.feature_name is None:
            print(f"{indent}{prefix}: Leaf: {node.label}")
        else:
            print(f"{indent}{prefix}: {node.feature_name}")

        for child in node.children:
            self.print_tree(child, level + 1, prefix="-")

In [None]:
def get_train_val_test_split(X, y, train_val_test_split = (0.8, 0.1, 0.1), random_state = 42):
    """Split ``X`` and ``y`` into training, validation and test parts.

    Args:
        X: Feature table.
        y: Labels, one per row of ``X``.
        train_val_test_split: Three positive fractions (train, val, test).
            The training part takes the first fraction of the data; the rest
            is divided between validation and test in the ratio of the last
            two fractions.
        random_state: Seed passed to both underlying splits.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.

    Raises:
        ValueError: If there are not exactly three fractions, any of them is
            not positive, or the training fraction is not below 1.
    """
    fractions = tuple(train_val_test_split)
    if len(fractions) != 3:
        raise ValueError("train_val_test_split must contain exactly three fractions")
    train_fraction, val_fraction, test_fraction = fractions
    if min(fractions) <= 0 or train_fraction >= 1:
        raise ValueError("fractions must be positive and the training fraction below 1")

    # First cut: training part versus everything held out.
    X_train, X_rest, y_train, y_rest = train_test_split(
        X, y, test_size = 1 - train_fraction, random_state = random_state)
    # Second cut: divide the held-out rows into validation and test.
    X_val, X_test, y_val, y_test = train_test_split(
        X_rest, y_rest, test_size = test_fraction / (val_fraction + test_fraction),
        random_state = random_state)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
# Split the data with the helper's default fractions.
X_train, X_val, X_test, y_train, y_val, y_test = get_train_val_test_split(X, y)

In [None]:
# Example tree limited to depth 5.
id3_tree = DecisionTreeID3(max_depth = 5)

In [None]:
# Train the example tree on the training split.
id3_tree.fit(X_train, y_train)

In [None]:
# Print the structure of the trained tree, starting from its root.
id3_tree.print_tree(id3_tree.root)

### Check different depths of the tree

In [None]:
# Candidate maximum depths to compare: 1 through 9.
depth_range = range(1, 10)

In [None]:
# Validation metrics collected per candidate depth, in the order of depth_range.
accuracy = []
cm = []

# Train a fresh tree for every depth and score it on the validation split.
for depth in depth_range:
    id3_tree = DecisionTreeID3(max_depth = depth)
    id3_tree.fit(X_train, y_train)
    y_pred = id3_tree.predict(X_val)
    accuracy.append(accuracy_score(y_val, y_pred))
    cm.append(confusion_matrix(y_val, y_pred))

In [None]:
def visualize_cm(cm, labels=None):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        cm: Confusion matrix of integer counts.
        labels: Tick labels for both axes. Defaults to the classes of the
            module-level label encoder ``le``.
    """
    if labels is None:
        labels = le.classes_
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
    plt.title("Macierz pomyłek")
    plt.xlabel("Predykcja")
    plt.ylabel("Prawdziwa wartość")
    plt.show()

In [None]:
def plot_accuracy(depth_range, accuracy):
    """Plot accuracy against tree depth on the current axes and show the figure."""
    axes = plt.gca()
    axes.plot(depth_range, accuracy)
    axes.set_title("Dokładność w zależności od głębokości drzewa")
    axes.set_xlabel("Głębokość drzewa")
    axes.set_ylabel("Dokładność")
    plt.show()

In [None]:
def plot_cm(cm):
    """Show every confusion matrix in ``cm``, each in its own new figure."""
    for matrix in cm:
        plt.figure()
        visualize_cm(matrix)

In [None]:
# Validation accuracy for each candidate depth.
plot_accuracy(depth_range, accuracy)

In [None]:
# One validation confusion matrix per candidate depth.
plot_cm(cm)

### Wyniki dla najlepszej głębokości drzewa na zbiorze testowym

In [None]:
# NOTE(review): depth 6 is hard-coded; presumably it was read off the validation
# results above -- confirm, or derive it from `accuracy` and `depth_range`.
id3_tree = DecisionTreeID3(max_depth = 6)

In [None]:
# Train the final tree on the training split.
id3_tree.fit(X_train, y_train)

In [None]:
# Predict labels for the test split.
test_pred = id3_tree.predict(X_test)

In [None]:
# Accuracy and confusion matrix of the final tree on the test split.
test_accuracy = accuracy_score(y_test, test_pred)
test_cm = confusion_matrix(y_test, test_pred)

In [None]:
# Report the test-split accuracy.
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# Show the test-split confusion matrix as a heatmap.
visualize_cm(test_cm)