# ID3 from Scratch

Let us import some libraries

In [1]:
# Dataset loader utility for OpenML
from openml.datasets import get_dataset
# Working with data frames
import pandas as pd
# Matrices, n-dimensional arrays etc.
import numpy as np
# Encoder for labels --> integers
from sklearn.preprocessing import LabelEncoder

Different data sets have different IDs on OpenML
- we can load a dataset based on its id

In [2]:
# OpenML dataset IDs; one of these is passed to get_dataset(dataset_id=...)
MUSHROOM = 24
TICTACTOE = 50  # We are using this dataset today
MONK1 = 333
MONK2 = 334

## Utility functions and definitions

Shuffle dataframe by sampling from it with `frac=1`

In [3]:
def shuffle_df(df: pd.DataFrame):
    """Return a copy of *df* with its rows in random order.

    Sampling the full fraction of rows without replacement yields a random
    permutation; the index is then renumbered from 0 so that the old row
    positions are not carried along.
    """
    permuted = df.sample(frac=1)
    permuted = permuted.reset_index(drop=True)
    return permuted

### Decision tree utilities

Utilities for checking 
1. if all values in a series are the same
2. what the most common value in the given series is

In [4]:
def is_homogeneous(values: pd.Series):
    """Return True if *values* contains exactly one distinct value.

    An empty input has zero distinct values and therefore yields False.
    """
    distinct = np.unique(values)
    return distinct.size == 1


def most_common_label(values: pd.Series):
    """Return the value that occurs most often in *values*.

    Ties are resolved in favour of the smallest value, because the distinct
    values are sorted and the first maximal count wins.
    """
    distinct, frequencies = np.unique(values, return_counts=True)
    return distinct[frequencies.argmax()]

Entropy and information gain calculation

In [5]:
def entropy(values: pd.Series):
    """Return the Shannon entropy (in bits) of the values in *values*.

    The entropy is ``sum(p * log2(1 / p))`` over the relative frequency ``p``
    of every distinct value. An empty input has entropy 0.

    The frequencies are taken from a single ``np.unique`` pass, so a value
    reported by ``np.unique`` always has a non-zero probability (this also
    holds for NaN, which can never be matched with ``==``).
    """
    _, counts = np.unique(values, return_counts=True)
    if counts.size == 0:
        return 0
    probabilities = counts / counts.sum()
    # log2(1 / p) instead of -log2(p) avoids returning a negative zero
    return np.sum(probabilities * np.log2(1.0 / probabilities))


def information_gain(data: pd.DataFrame, attribute: str):
    """Return the information gain of splitting *data* on *attribute*.

    The gain is the entropy of the ``"class"`` column minus the entropy that
    remains after the split, i.e. the size-weighted sum of the class
    entropies of the groups that share one value of *attribute*.
    """
    n_rows = len(data)
    remaining_entropy = 0
    for _, class_column in data.groupby(attribute)["class"]:
        share = len(class_column) / n_rows
        remaining_entropy += share * entropy(class_column)
    return entropy(data["class"]) - remaining_entropy


def find_split_attribute(data: pd.DataFrame):
    """Return the name of the attribute with the highest information gain.

    Every column except ``"class"`` is a candidate. If several attributes
    reach the same maximal gain, the first one in column order is returned.
    """
    candidates = [column for column in data.columns if column != "class"]
    gains = [information_gain(data, candidate) for candidate in candidates]
    return candidates[np.argmax(gains)]

## Decision Tree

- our fundamental class is an `ID3` node in our decision tree
- each node has
    - children (if not a leaf node)
    - some data (we may discard after building the tree)
    - the assigned label (if a leaf node)
    - a flag indicating if the node is a leaf
    
Note that ID3 works only with categorical data. Newer methods can also handle numerical data.

In [82]:
NOLABEL = -1  # placeholder label of a node that is not a leaf


class ID3:
    """A node of an ID3 decision tree; the root node represents the tree.

    Attributes:
        children: dict mapping a value of ``split_attribute`` to the child
            node that handles instances with that value.
        split_attribute: name of the column this node splits on
            (``None`` for leaves and unfitted nodes).
        is_leaf: whether this node is a leaf.
        label: predicted label if the node is a leaf, ``NOLABEL`` otherwise.
        default_prediction: majority label of the training data seen by this
            node; returned for attribute values without a matching child.
    """

    def __init__(self):
        self._reset()

    def _reset(self):
        """Put the node into its unfitted state."""
        self.children = {}  # will take the form {att_value: child}
        self.split_attribute = None
        self.is_leaf = False
        self.label = NOLABEL
        self.default_prediction = None

    def fit(self, data: pd.DataFrame):
        """Build the (sub)tree rooted at this node with the ID3 algorithm.

        Args:
            data: training instances; the column ``"class"`` holds the
                labels, every other column is treated as an attribute.

        Raises:
            ValueError: if *data* contains no rows.
        """
        # Forget any earlier fit so that one instance can be refitted,
        # e.g. once per cross-validation fold.
        self._reset()

        labels = data["class"].to_numpy()
        if len(labels) == 0:
            raise ValueError("cannot fit an ID3 tree on an empty data set")
        attributes = data.columns[data.columns != "class"].to_numpy()

        # Pure node: every instance has the same label, no split needed.
        if is_homogeneous(labels):
            self.is_leaf = True
            self.label = labels[0]
            return

        majority_label = most_common_label(labels)

        # No attribute left to split on: fall back to the majority label.
        if len(attributes) == 0:
            self.is_leaf = True
            self.label = majority_label
            return

        self.default_prediction = majority_label
        self.split_attribute = find_split_attribute(data)
        for att_value, partition in data.groupby(self.split_attribute):
            child = ID3()
            if len(partition) == 0:
                # groupby may yield a value without any rows (presumably for
                # unobserved categories of a categorical column); such a
                # child predicts the majority label of this node.
                child.is_leaf = True
                child.label = majority_label
            else:
                # The split attribute is used up on this path; remove it
                # before expanding the subtree.
                child.fit(partition.drop(self.split_attribute, axis=1))
            self.children[att_value] = child

    def predict(self, data: pd.DataFrame):
        """Return an array with one predicted label per row of *data*."""
        return np.array([
            self.predict_instance(x) for _, x in data.iterrows()
        ])

    def predict_instance(self, x: pd.Series):
        """Return the predicted label for the single instance *x*.

        The instance is passed down the tree along the children that match
        its attribute values. If a node has no child for the value of *x*,
        that node's ``default_prediction`` is returned.
        """
        if self.is_leaf:
            return self.label
        att_value = x[self.split_attribute]
        if att_value in self.children:
            return self.children[att_value].predict_instance(x)
        return self.default_prediction

## Running our algorithm
- define dataset
- preprocess data
- build tree
- evaluation

We first load the data from OpenML

In [83]:
# Fetch the selected OpenML dataset; only the first element returned by
# get_data() is kept and used as the data table in the cells below.
data = get_dataset(dataset_id=TICTACTOE).get_data()[0]

### Preprocessing
1. shuffle data
2. rename target column (some are "Class" instead of "class")

In [89]:
# Shuffle the rows, then make sure the target column is called "class"
# (a column named "Class" is renamed; other columns are left untouched).
data = shuffle_df(data).rename(columns={"Class": "class"})

Encode labels as integers (zeros and ones for binary classification)
- we use the label encoder from scikit-learn

In [90]:
# Overwrite the target column with the encoder's integer codes; the fitted
# encoder itself is not kept.
data["class"] = LabelEncoder().fit_transform(data["class"])

Split the data into 10 folds

In [98]:
from sklearn.model_selection import KFold

# Number of cross-validation folds
n_folds = 10
# Splitter whose split(...) yields one (train indices, test indices) pair
# per fold; it is consumed by the evaluation loop below.
folds = KFold(n_splits=n_folds)

### Building the tree

Create the algorithm instances to evaluate (the tree itself is fitted once per fold in the loop below)

In [99]:
# Models to evaluate, keyed by the name shown in the "Algorithm" column of
# the results table.
algorithms = {"ID3": ID3()}

- Evaluate on training and test data
- Store the results in a nested list (we will afterwards convert it to a data frame)

In [101]:
# Cross-validation: fit every algorithm on the training part of each fold
# and record its accuracy on both the training and the test part.
df_columns = ["Algorithm", "Fold", "Data", "Accuracy"]
result = []
for i, (train_indices, test_indices) in enumerate(folds.split(data)):
    train = data.iloc[train_indices]
    test = data.iloc[test_indices]
    for alg_name, algorithm in algorithms.items():
        algorithm.fit(train)

        # One result row per split, "Train" first and "Test" second
        for split_name, split in (("Train", train), ("Test", test)):
            predictions = algorithm.predict(split.drop(columns="class"))
            accuracy = np.mean(predictions == split["class"])
            result.append([alg_name, i, split_name, accuracy])

# Mean accuracy over all folds per algorithm and split
result_df = pd.DataFrame(result, columns=df_columns)
result_df.groupby(["Algorithm", "Data"])[["Accuracy"]].mean()

Algorithm,Data,Accuracy
ID3,Test,0.858081
ID3,Train,1.0


- We see that this tree overfits the training data: its mean accuracy on the test folds (about 0.86) is much lower than on the training folds (1.0).
    - how can we solve this?