## **Zadanie 4 - Drzewo decyzyjne ID3**

Cel zadania polega na implementacji drzewa decyzyjnego tworzonego algorytmem ID3 z ograniczeniem maksymalnej głębokości drzewa, jak również na stworzeniu i zbadaniu jakości klasyfikatora dla zbioru danych [Tic-Tac-Toe Endgame](https://archive.ics.uci.edu/dataset/101/tic+tac+toe+endgame).

**Kroki do wykonania:**
- Zaimplementuj drzewo decyzyjne ID3 (z ograniczeniem jego maksymalnej głębokości).
- Zbadaj skuteczność działania klasyfikatora dla zbioru danych Tic-Tac-Toe Endgame, obliczając dokładność i macierz pomyłek.

**Uwagi**
- Należy pamiętać o podziale danych na zbiory trenujący, walidacyjny i testowy.
- Zaimplementowana metoda powinna być uniwersalna - nie należy "zaszywać" na sztywno w kodzie np. nazwy pliku ze zbiorem danych czy wartości atrybutów.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List, Tuple

from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Download the Tic-Tac-Toe Endgame dataset (UCI repository id 101).
tic_tac_toe_endgame = fetch_ucirepo(id=101)

# Feature table and target table of the fetched dataset (pandas DataFrames).
X = tic_tac_toe_endgame.data.features
y = tic_tac_toe_endgame.data.targets

In [None]:
# Show the variable information shipped with the fetched dataset.
print(tic_tac_toe_endgame.variables)

In [None]:
# Encode the class labels as integers; `le` is kept so they can be decoded later.
le = LabelEncoder()
# `y` is a DataFrame at this point, so flatten it to 1-D: LabelEncoder expects
# a 1-D array and warns when it receives a column vector.
y = le.fit_transform(np.asarray(y).ravel())

# Reuse X's index so boolean masks computed on X select the matching labels.
y = pd.Series(y, index=X.index)

In [None]:
# Feature values used as the default `leaf_classes` of DecisionTreeID3, i.e. the
# values for which a child node is created at every split.
# NOTE(review): presumably 'x'/'o' are player marks and 'b' a blank cell —
# confirm against the dataset description.
ALL_VARIABLES_VALUES = ['x', 'o', 'b']

In [None]:
class Node:
    """One vertex of a decision tree.

    Attributes are all unset (``None`` / empty) on creation and are filled in
    through the setter methods while the tree is being built.
    """

    def __init__(self):
        # feature_name: feature this node splits on
        # value:        value of the parent's split feature that leads here
        # label:        class label stored in the node (used by leaf nodes)
        self.feature_name = self.value = self.label = None
        # One child per branch of the split; a fresh list for every node.
        self.children = list()

    def add_child(self, child: 'Node') -> None:
        """Attach ``child`` as the last branch of this node."""
        self.children += [child]

    def set_feature_name(self, feature_name: str) -> None:
        """Record the feature used for splitting at this node."""
        self.feature_name = feature_name

    def set_value(self, value: str) -> None:
        """Record the split-feature value this node represents."""
        self.value = value

    def set_label(self, label: str) -> None:
        """Record the class label of this node."""
        self.label = label

In [None]:
class DecisionTreeID3:
    """ID3 decision-tree classifier for categorical features with a depth limit.

    Every internal node splits on the feature with the highest information
    gain and receives one child per candidate feature value.  A branch stops
    growing when ``max_depth`` is reached, when all its samples share one
    class, or when no feature gives a positive information gain.
    """

    def __init__(self, max_depth: int = 5, leaf_classes: List[str] = ALL_VARIABLES_VALUES) -> None:
        """Configure an untrained tree.

        Args:
            max_depth: Maximum number of splits on any root-to-leaf path.
            leaf_classes: Feature values for which a child node is created at
                every split.  Pass ``None`` to use, for each feature, the
                values observed in the training data instead.
        """
        self.max_depth = max_depth
        self.leaf_classes = leaf_classes
        self.root = None
        # Per-feature values seen in the training data (only used when
        # ``leaf_classes`` is None).
        self._feature_values = {}
        # Majority class of the training set; prediction fallback for feature
        # values that have no matching branch.
        self._default_label = None

    def _calculate_entropy(self, y: pd.Series) -> float:
        """Return the Shannon entropy (in bits) of the labels in ``y``.

        Works for any label type accepted by ``np.unique`` (not only
        non-negative integers) and returns 0.0 for an empty ``y``.
        """
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        # Every count is >= 1, so log2 is always defined here.
        return float(-np.sum(probabilities * np.log2(probabilities)))

    def _calculate_information_gain(self, X: pd.DataFrame, y: pd.Series, feature: str) -> float:
        """Return the entropy reduction obtained by splitting on ``feature``.

        ``X`` and ``y`` must share the same index, because the labels of each
        subset are selected with a boolean mask computed on ``X``.
        """
        total_data_entropy = self._calculate_entropy(y)
        column = X[feature]
        unique_values, unique_value_counts = np.unique(column, return_counts=True)
        subset_entropy = 0
        for value, value_count in zip(unique_values, unique_value_counts):
            subset_y = y[column == value]
            # Weight each subset's entropy by its share of the samples.
            subset_entropy += value_count / len(X) * self._calculate_entropy(subset_y)

        return total_data_entropy - subset_entropy

    def _choose_best_feature(self, X: pd.DataFrame, y: pd.Series) -> Tuple[str, float]:
        """Return ``(feature, gain)`` for the feature with the highest information gain."""
        gains = {
            feature: self._calculate_information_gain(X, y, feature)
            for feature in X.columns
        }
        best_feature = max(gains, key=gains.get)
        return best_feature, gains[best_feature]

    def _branch_values(self, X: pd.DataFrame, feature: str) -> list:
        """Return the feature values for which children are created when splitting on ``feature``."""
        if self.leaf_classes is not None:
            return self.leaf_classes
        if feature in self._feature_values:
            return self._feature_values[feature]
        # Not recorded by fit(): fall back to the values present in this subset.
        return X[feature].unique().tolist()

    def fit(self, X_train: pd.DataFrame, y_train: pd.Series) -> None:
        """Build the tree from the training data.

        Args:
            X_train: Categorical feature table.
            y_train: Class labels, indexed identically to ``X_train``.

        Raises:
            ValueError: If the training set is empty.
        """
        if len(y_train) == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set.")

        if self.leaf_classes is None:
            # Branch values are a property of each feature, not of the target.
            self._feature_values = {
                feature: X_train[feature].unique().tolist()
                for feature in X_train.columns
            }
        self._default_label = y_train.mode()[0]

        self.root = Node()
        self._id3(self.root, X_train, y_train, 0)

    def _id3(self, node: Node, X: pd.DataFrame, y: pd.Series, depth: int) -> None:
        """Recursively grow the subtree rooted at ``node`` from the samples ``(X, y)``."""
        # Most common class of the samples reaching this node (ties resolve to
        # the first value returned by Series.mode()).
        majority_label = y.mode()[0]

        if depth >= self.max_depth or len(np.unique(y)) == 1 or X.shape[1] == 0:
            node.set_label(majority_label)
            return None

        split_feature, information_gain = self._choose_best_feature(X, y)

        if information_gain <= 0:
            # No feature separates the classes any further.
            node.set_label(majority_label)
            return None

        node.set_feature_name(split_feature)

        for value in self._branch_values(X, split_feature):
            child_node = Node()
            child_node.set_value(value)
            node.add_child(child_node)

            mask = X[split_feature] == value

            if not mask.any():
                # No training sample takes this value: inherit the parent's majority class.
                child_node.set_label(majority_label)
            else:
                self._id3(child_node, X[mask], y[mask], depth + 1)

    def predict(self, X: pd.DataFrame) -> List[str]:
        """Return the predicted label for every row of ``X``, in row order."""
        return [self.predict_single(self.root, row) for _, row in X.iterrows()]

    def predict_single(self, node: Node, x: pd.Series) -> str:
        """Return the label predicted for sample ``x`` by the subtree rooted at ``node``.

        When the sample's value of the split feature has no matching branch,
        the majority class of the training set is returned (``None`` if the
        tree was not built with ``fit``).
        """
        if node.label is not None:
            return node.label

        for child in node.children:
            if x[node.feature_name] == child.value:
                return self.predict_single(child, x)

        return getattr(self, "_default_label", None)

    def print_tree(self, node: Node, level: int = 0, prefix: str = "") -> None:
        """Print the subtree rooted at ``node``, one node per line, indented by depth."""
        indent = "   " * level
        if node.value is not None:
            prefix = f"{prefix} ({node.value})"

        if node.feature_name is None:
            print(f"{indent}{prefix}: Leaf: {node.label}")
        else:
            print(f"{indent}{prefix}: {node.feature_name}")

        for child in node.children:
            self.print_tree(child, level + 1, prefix="-")

In [None]:
def get_train_val_test_split(X, y, train_val_test_split = (0.8, 0.1, 0.1), random_state = 42):
    """Split ``X`` and ``y`` into training, validation and test subsets.

    Args:
        X: Feature table.
        y: Labels aligned with ``X``.
        train_val_test_split: Sequence ``(train, validation, test)``.  The
            first entry is the fraction of all data used for training; the
            remainder is divided between validation and test in proportion to
            the second and third entries.
        random_state: Seed passed to both underlying splits.

    Returns:
        ``X_train, X_val, X_test, y_train, y_val, y_test``.

    Raises:
        ValueError: If the validation and test weights do not add up to a
            positive number.
    """
    train_fraction = train_val_test_split[0]
    val_weight = train_val_test_split[1]
    test_weight = train_val_test_split[2]

    holdout_weight = val_weight + test_weight
    if holdout_weight <= 0:
        raise ValueError("Validation and test proportions must add up to a positive value.")

    # First cut: training set versus everything held out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, test_size = 1 - train_fraction, random_state = random_state)
    # Second cut: divide the held-out part into validation and test sets.
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, test_size = test_weight / holdout_weight, random_state = random_state)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
# Split the data using the function's default 80/10/10 train/validation/test proportions.
X_train, X_val, X_test, y_train, y_val, y_test = get_train_val_test_split(X, y)

In [None]:
# Create an (untrained) ID3 tree limited to depth 5.
id3_tree = DecisionTreeID3(max_depth = 5)

In [None]:
# Build the tree from the training subset.
id3_tree.fit(X_train, y_train)

In [None]:
# Print the fitted tree, starting from its root.
id3_tree.print_tree(id3_tree.root)

In [None]:
# Decode the integer labels 1 and 0 back to the original class names.
le.inverse_transform([1, 0])

### Check different depths of the tree

In [None]:
# Maximum tree depths to compare: 1 through 8.
DEPTH_RANGE = range(1, 9)

In [None]:
def check_different_depths(depth_range: List[int] = DEPTH_RANGE) -> Tuple[List[float], List[np.ndarray]]:
    """Train one tree per depth and score each of them on the validation set.

    Uses the module-level ``X_train``/``y_train`` for fitting and
    ``X_val``/``y_val`` for scoring.

    Args:
        depth_range: Maximum depths to evaluate, in order.

    Returns:
        Two lists aligned with ``depth_range``: validation accuracies and
        validation confusion matrices.
    """
    def evaluate(max_depth):
        # Fit a fresh tree for this depth and score its validation predictions.
        model = DecisionTreeID3(max_depth = max_depth)
        model.fit(X_train, y_train)
        predictions = model.predict(X_val)
        return accuracy_score(y_val, predictions), confusion_matrix(y_val, predictions)

    results = [evaluate(depth) for depth in depth_range]
    accuracy = [score for score, _ in results]
    cm = [matrix for _, matrix in results]
    return accuracy, cm

In [None]:
# Validation accuracy and confusion matrix for every depth in DEPTH_RANGE.
accuracy, cm = check_different_depths(DEPTH_RANGE)

In [None]:
def visualize_cm(cm: np.ndarray, title: str) -> None:
    """Draw ``cm`` as an annotated heatmap and show the figure.

    Both axes are ticked with the class names stored in the module-level
    label encoder ``le``.

    Args:
        cm: Confusion matrix to display.
        title: Title placed above the heatmap.
    """
    class_names = le.classes_
    heatmap_options = dict(
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
    )
    sns.heatmap(cm, **heatmap_options)
    plt.title(title)
    # Axis captions: predicted class on x, true class on y.
    plt.xlabel("Predykcja")
    plt.ylabel("Prawdziwa wartość")
    plt.show()

In [None]:
def plot_accuracy(depth_range: List[int], accuracy: List[float]) -> None:
    """Plot accuracy against tree depth as a line chart and show the figure.

    Args:
        depth_range: Tree depths (x axis).
        accuracy: Accuracy obtained for each depth (y axis).
    """
    plt.plot(depth_range, accuracy)
    # Title first, then the x and y axis captions.
    captions = (
        (plt.title, "Dokładność w zależności od głębokości drzewa"),
        (plt.xlabel, "Głębokość drzewa"),
        (plt.ylabel, "Dokładność"),
    )
    for apply_caption, text in captions:
        apply_caption(text)
    plt.show()

In [None]:
def plot_cm(depth_range: List[int], cm: List[np.ndarray]) -> None:
    """Show one confusion-matrix heatmap per entry of ``cm``.

    Args:
        depth_range: Tree depths; entry ``i`` labels matrix ``i``.
        cm: Confusion matrices, in the same order as ``depth_range``.
    """
    for position, matrix in enumerate(cm):
        # Open a new figure so each heatmap is drawn separately.
        plt.figure()
        depth = depth_range[position]
        visualize_cm(matrix, f"Macierz pomyłek dla głębokości drzewa {depth}")

In [None]:
# Line chart of validation accuracy for each evaluated depth.
plot_accuracy(DEPTH_RANGE, accuracy)

In [None]:
# One validation confusion-matrix heatmap per evaluated depth.
plot_cm(DEPTH_RANGE, cm)

### Wyniki dla najlepszej głębokości drzewa na zbiorze testowym

In [None]:
# Tree used for the final evaluation on the test set.
# NOTE(review): depth 6 is hard-coded here and in the title of the final
# confusion-matrix plot; presumably it was picked from the validation results
# above — confirm and keep both in sync.
id3_tree = DecisionTreeID3(max_depth = 6)

In [None]:
# Train the final tree on the training subset.
id3_tree.fit(X_train, y_train)

In [None]:
# Predict labels for the test subset.
test_pred = id3_tree.predict(X_test)

In [None]:
# Accuracy and confusion matrix of the test-set predictions.
test_accuracy = accuracy_score(y_test, test_pred)
test_cm = confusion_matrix(y_test, test_pred)

In [None]:
# Report the accuracy measured on the test subset.
print(f"Test Accuracy: {test_accuracy}")

In [None]:
# Heatmap of the test-set confusion matrix (the title states the tree depth used).
visualize_cm(test_cm, "Macierz pomyłek dla zbioru testowego, głębokość drzewa 6")