# Homework #1: Decision Trees using ID3

* This notebook should help guide you in writing code for your homework.
* Please submit this notebook along with your writeup.
* In this assignment, you will implement the ID3 algorithm to build a decision tree.
* Follow the steps below to complete your implementation. Remember to test your code thoroughly using the provided datasets and unit tests.
* Using any assistive tools to generate your code or write up is strictly prohibited per the course guidelines.

Good luck and have fun! <(^_^)>

In [None]:
import csv
import math
import random
from collections import Counter

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

random.seed(101)

* The above imports should have you covered.
* You may **not** use any additional external packages to complete this assignment. These include, but are not limited to, `numpy` and `pandas`.
* You may use `sklearn` for your confusion matrix.
* You may use built-in libraries like `collections`, `os`, `sys`, and so forth to read in files and handle your data structures.

In [None]:
def parse(filename):
    """
    Read a CSV file and return its rows as a list of dictionaries.

    The first row is treated as the header. Every following row becomes a
    dict mapping header name -> cell value (all values stay strings).

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one dict per data row, in file order. A completely
        empty file (no header row) yields an empty list.

    ------ Do not modify this function --------
    """
    # note: you may need to add encoding="utf-8" as a parameter
    # The context manager guarantees the file handle is closed, even if a row
    # fails to parse. newline="" leaves newline handling to the csv module,
    # as its documentation recommends.
    with open(filename, "r", newline="") as csvfile:
        file_to_read = csv.reader(csvfile)

        # A default avoids a bare StopIteration on an empty file.
        headers = next(file_to_read, None)
        if headers is None:
            return []

        # iterate through rows of actual data
        return [dict(zip(headers, row)) for row in file_to_read]

In [None]:
# Example usage: load the house-votes dataset with parse().
house_votes_data = parse("house_votes_84.data")
# parse() returns a list of dicts; as the last expression of the cell this
# shows the first record.
house_votes_data[0]  # list of dicts

## Implementing Data Structures

Start by writing up your node class.

In [None]:
class Node:
    """
    A single node of an ID3 decision tree.

    Attributes:
        attribute (str): Name of the attribute this node splits on.
        attribute_value (str): Attribute value associated with this node.
        class_label (str): Predicted class for a leaf; for an internal node,
                           the majority class of the examples it was built from.
        children (dict of Node): Child nodes keyed by attribute value,
                                 i.e. {attribute_value: child_node}.
        is_leaf (bool): True when the node is a leaf, False otherwise.
    """

    def __init__(
        self,
        attribute=None,
        attribute_value=None,
        class_label=None,
        is_leaf=False,
    ):
        # Every node starts without children; they are attached via add_child().
        self.children = {}
        self.is_leaf = is_leaf
        self.class_label = class_label
        self.attribute_value = attribute_value
        self.attribute = attribute

    def has_children(self):
        """
        Tell whether at least one child is attached to this node.
        """
        return bool(self.children)

    def get_children(self):
        """
        Return a view of (attribute_value, child_node) pairs.
        """
        return self.children.items()

    def add_child(self, attribute_value, child_node):
        """
        Attach child_node under the branch labelled attribute_value.
        """
        self.children[attribute_value] = child_node

    def get_attribute(self):
        """
        Return the attribute this node splits on.
        """
        return self.attribute

    def update_attribute(self, attribute):
        """
        Set the attribute this node splits on.
        """
        self.attribute = attribute

    def update_as_leaf(self, is_leaf=True):
        """
        Mark this node as a leaf (or as a non-leaf when is_leaf is False).
        """
        self.is_leaf = is_leaf

    def update_class_label(self, class_label):
        """
        Set the class label stored on this node.
        """
        self.class_label = class_label

    def display(self, indent=0):
        """
        (Debugging) Print this node and, recursively, everything below it.
        """
        pad = "  " * indent

        # Leaves only show their class.
        if self.is_leaf:
            print(f"| {pad}└─Class: {self.class_label}")
            return

        # The root (indent 0) is printed without the tree-drawing prefix.
        if indent != 0:
            header = f"|{pad}└─Attribute: {self.get_attribute()}"
        else:
            header = f"{pad}Attribute: {self.get_attribute()}"
        print(header)

        for value, child in self.get_children():
            print(f"|{pad} └─Value {value}:")
            child.display(indent + 1)

* Now implement the ID3 algorithm using the node data structure you created.
* You may overload the following functions and create more as you please.

In [None]:
def ID3(examples, default):
    """
    Takes in an array of examples, and returns a tree (an instance of Node)
    trained on the examples.  Each example is a dictionary of attribute:value
    pairs, and the target class variable is a special attribute with the name
    "Class". Any missing attributes are denoted with a value of "?".

    Args:
        examples: List of example dicts; each must contain a "Class" key.
        default: Class label to predict when there are no examples.

    Returns:
        The root Node of the trained tree. With no examples this is a single
        leaf whose class label is `default`.
    """
    # With no examples, return a leaf that predicts the default class. The
    # default must be passed as class_label and the node marked as a leaf;
    # passed positionally it would land in `attribute` and the tree would
    # predict None.
    if not examples:
        return Node(class_label=default, is_leaf=True)

    # Every key except the target is a candidate split attribute.
    attributes = {attribute for attribute in examples[0] if attribute != "Class"}
    return ID3_helper(examples, attributes)

In [None]:
def get_most_common_class(class_labels):
    """
    Pick the label that occurs most often in class_labels.
    """
    # most_common() orders labels by descending count; the first entry wins.
    ranked_labels = Counter(class_labels).most_common()
    top_label, _ = ranked_labels[0]
    return top_label


def get_entropy(examples):
    """
    Compute the entropy (in bits) of the "Class" values of the examples.
    """
    total = len(examples)
    label_counts = Counter(row["Class"] for row in examples)

    entropy = 0
    for count in label_counts.values():
        fraction = count / total
        # Each class contributes -p * log2(p).
        entropy -= fraction * math.log2(fraction)
    return entropy


def get_info_gain(examples, attribute):
    """
    Compute the information gain of splitting the examples on attribute.
    """
    entropy_before = get_entropy(examples)
    total = len(examples)

    # Entropy left after the split: each partition's entropy weighted by the
    # fraction of examples that fall into it.
    entropy_after = 0
    for value in set(row[attribute] for row in examples):
        partition = [row for row in examples if row[attribute] == value]
        entropy_after += (len(partition) / total) * get_entropy(partition)

    return entropy_before - entropy_after


def prune(node, examples):
    """
    Takes in a trained tree and a validation set of examples. Prunes nodes in
    order to improve accuracy on the validation data; the precise pruning
    strategy is up to you.

    The tree is modified in place and nothing is returned.
    """
    # The pruning strategy used here is accuracy_based_pruning().
    accuracy_based_pruning(node, examples)


def accuracy_based_pruning(node, examples):
    """
    Recursively prune a tree bottom-up, turning a subtree into a leaf whenever
    that strictly improves accuracy on the validation examples.

    Each node is judged only on the validation examples that actually reach
    it when the tree is evaluated. Examples routed elsewhere are classified
    identically before and after the change, so the local comparison is
    equivalent to comparing whole-tree validation accuracy.

    Args:
        node: Root of the (sub)tree to prune; modified in place.
        examples: Validation examples that reach `node`.

    Returns:
        None. A pruned node keeps its children but is flagged as a leaf, so
        evaluation stops there and returns its majority class label.
    """
    # A leaf cannot be pruned further, and with no validation examples reaching
    # this subtree there is no evidence to justify pruning it.
    if node.is_leaf or not examples:
        return

    # Prune the children first, handing each child only the examples that
    # follow its branch.
    split_attribute = node.get_attribute()
    for attribute_value, child_node in node.get_children():
        reaching_examples = [
            example
            for example in examples
            if example.get(split_attribute) == attribute_value
        ]
        accuracy_based_pruning(child_node, reaching_examples)

    # Tentatively collapse this node into a leaf and compare accuracies.
    pre_pruning_accuracy = test_tree(node, examples)
    node.is_leaf = True
    post_pruning_accuracy = test_tree(node, examples)

    # only keep the pruning if the accuracy is strictly better
    if post_pruning_accuracy <= pre_pruning_accuracy:
        node.is_leaf = False

In [None]:
def test_tree(node, examples, return_predictions=False):
    """
    Run a trained tree over a set of examples.

    When return_predictions is True, the list of predicted classes (one per
    example, in order) is returned. Otherwise the accuracy is returned, i.e.
    the fraction of examples whose prediction equals their "Class" value.
    """
    predictions = [evaluate(node, example) for example in examples]

    # Raw predictions are what the confusion matrix needs.
    if return_predictions:
        return predictions

    # Count the predictions that match the true label.
    hits = 0
    for predicted, example in zip(predictions, examples):
        if predicted == example["Class"]:
            hits += 1
    return hits / len(examples)  # accuracy


def evaluate(node, example):
    """
    Classify one example with a tree and return the assigned Class value.
    """
    current = node

    # Walk down the tree until a leaf is reached.
    while not current.is_leaf:
        branch_value = example.get(current.get_attribute())
        successor = current.children.get(branch_value)

        # No branch for this value (missing attribute value, or a value the
        # node has no child for): fall back to this node's class_label,
        # which is the majority class.
        if not successor:
            break
        current = successor

    return current.class_label

In [None]:
def ID3_helper(examples, attributes, missing_values="keep"):
    """
    Recursively creates a decision tree.

    Args:
        examples: List of example dicts, each containing a "Class" key.
        attributes: Collection of attribute names still available to split on.
        missing_values: How to treat the missing-value marker "?" of the
            chosen split attribute. "keep" gives "?" its own branch like any
            other value; "ignore" leaves examples with "?" out of the child
            nodes. The setting applies to the whole tree.

    Returns:
        The root Node of the (sub)tree. Every node stores the majority class
        of its examples as class_label, which is used when evaluation cannot
        follow a branch and when a node is pruned into a leaf.

    Raises:
        ValueError: If missing_values is neither "keep" nor "ignore".
    """
    if missing_values not in ("keep", "ignore"):
        raise ValueError(
            f"missing_values must be 'keep' or 'ignore', got {missing_values!r}"
        )

    node = Node()

    # With no examples there is no majority class to compute; return a bare
    # leaf instead of failing inside get_most_common_class().
    if not examples:
        node.update_as_leaf()
        return node

    class_labels = [example["Class"] for example in examples]

    # this class label would be useful during pruning
    node.update_class_label(get_most_common_class(class_labels))

    # Stop when all examples share one class or nothing is left to split on;
    # the majority class set above is the leaf's prediction.
    if len(set(class_labels)) == 1 or not attributes:
        node.update_as_leaf()
        return node

    # Pick the attribute with the highest information gain. Candidates are
    # visited in sorted order so that ties are broken the same way on every
    # run instead of depending on set iteration order.
    best_attribute = max(
        sorted(attributes, key=str),
        key=lambda attribute: get_info_gain(examples, attribute),
    )
    node.update_attribute(best_attribute)

    # Partition the examples by their value of the best attribute in one pass.
    examples_by_value = {}
    for example in examples:
        value = example[best_attribute]
        if missing_values == "ignore" and value == "?":
            continue
        examples_by_value.setdefault(value, []).append(example)

    # The chosen attribute is not available further down this branch.
    child_attributes = set(attributes) - {best_attribute}

    # recursively create child nodes, passing the missing-value policy along
    for value, child_examples in examples_by_value.items():
        child_node = ID3_helper(child_examples, child_attributes, missing_values)
        node.add_child(value, child_node)
    return node

## Testing Basic Implementation

* You can test your implementation of ID3 using the function below.
* If your code works as directed, all the test cases would pass.
* They test the following:
    * Case 1: A simple test with two examples that belong to the same class. The decision tree should correctly classify both examples.
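    * Case 2: A simple test with two examples that belong to different classes. The decision tree should correctly classify an example seen during training.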
    * Case 3: Involves different classes and multiple attribute values. The tree should be able to distinguish between different classes.
    * Case 4: Checks whether the implementation can handle missing attributes, denoted by "?". The tree should still classify the examples correctly even when some attributes are missing.

In [None]:
def mini_grader():
    """
    Run the basic ID3 sanity checks and print one result line per check.

    For every case a tree is trained with ID3(data, 0) and one or two
    examples are classified with evaluate(). A check prints
    "ID3 test <id> succeeded." or "ID3 test <id> failed."; a case whose
    training returns None or raises prints a single failure line instead.
    """
    # Each entry: (case id, training data, [(check id, example, expected class)])
    cases = [
        (
            "1",
            [dict(a=1, b=0, Class=1), dict(a=1, b=1, Class=1)],
            [("1", dict(a=1, b=0), 1)],
        ),
        (
            "2",
            [dict(a=1, b=0, Class=0), dict(a=1, b=1, Class=1)],
            [("2", dict(a=1, b=0), 0)],
        ),
        (
            "3",
            [
                dict(a=1, b=0, Class=2),
                dict(a=1, b=1, Class=1),
                dict(a=2, b=0, Class=2),
                dict(a=2, b=1, Class=3),
                dict(a=3, b=0, Class=1),
                dict(a=3, b=1, Class=3),
            ],
            [("3-1", dict(a=1, b=0), 2), ("3-2", dict(a=1, b=1), 1)],
        ),
        (
            "4",
            [
                dict(a=1, b=0, c="?", Class=1),
                dict(a=1, b=3, c=2, Class=1),
                dict(a=2, b="?", c=1, Class=2),
                dict(a=2, b=1, c=3, Class=2),
                dict(a=3, b=0, c=1, Class=3),
                dict(a=3, b=2, c="?", Class=3),
            ],
            [("4-1", dict(a=1, b=1, c=1), 1), ("4-2", dict(a=2, b=0, c=0), 2)],
        ),
    ]

    for case_id, data, checks in cases:
        # Catching Exception is deliberate here: the grader reports any
        # runtime error of a case and carries on with the next one.
        try:
            tree = ID3(data, 0)
            if tree is None:
                print(f"ID3 test {case_id} failed -- no tree returned")
                continue
            for check_id, example, expected in checks:
                ans = evaluate(tree, example)
                if ans != expected:
                    print(f"ID3 test {check_id} failed.")
                else:
                    print(f"ID3 test {check_id} succeeded.")
        except Exception as e:
            print(f"ID3 test {case_id} failed runtime error: {e}")

In [None]:
# Run the sanity checks; one result line is printed per check.
mini_grader()

## Plot Learning Curves

**Implement Training and Testing with and without Pruning**

* Implement the logic to train the decision tree on various training set sizes (ranging between 10 and 300 examples).
* For each training size:
    * Perform 100 random runs.
    * In each run, use the selected training examples to train the tree.
    * Test the tree on all examples not used for training.
    * Record the accuracy for each run.

**Plot Learning Curves**

* For each training size, calculate the average accuracy across the 100 runs.
* Plot the learning curves:
    * X-axis: Number of training examples.
    * Y-axis: Average accuracy on the test data.
* Create two lines on the plot:
    * One line representing accuracy with pruning and the other line representing accuracy without pruning.
    * Remember to connect the points for each line to visualize the trends.

In [None]:
def plot_learning_curve(
    training_sizes,
    avg_accuracies_with_pruning,
    avg_accuracies_without_pruning,
    dataset_name,
):
    """
    Plot learning curves with and without pruning and save the figure.

    The figure is written to images/learning_curve_<dataset_name>.png; the
    output directory is created if it does not exist yet.

    Args:
        training_sizes: List of training set sizes
        avg_accuracies_with_pruning: List of average accuracies with pruning
        avg_accuracies_without_pruning: List of average accuracies
                                        without pruning
        dataset_name: Name of the dataset, used in the plot title and in the
                      image filename.
    """
    import os  # local import: only needed to prepare the output directory

    output_path = f"images/learning_curve_{dataset_name}.png"
    # savefig() does not create missing directories, so make sure it exists.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    plt.figure(figsize=(10, 6))
    plt.plot(
        training_sizes,
        avg_accuracies_with_pruning,
        label="With Pruning",
        color="orange",
        marker="o",
    )
    plt.plot(
        training_sizes,
        avg_accuracies_without_pruning,
        label="Without Pruning",
        color="blue",
        marker="o",
    )
    plt.xlabel("Number of Training Examples")
    plt.ylabel("Average Accuracy on Test Data")
    plt.title(f"Learning Curves for {dataset_name} Data With and Without Pruning")
    plt.legend()
    plt.grid(True)
    plt.savefig(output_path, bbox_inches="tight")
    print(f"Saved image as {output_path}")

In [None]:
# Training-set sizes to evaluate: 10, 15, ..., 300.
training_sizes = range(10, 305, 5)
# Number of random runs per training-set size.
loop_size = 100

In [None]:
# Load the house-votes dataset and prepare the per-size result lists
# (one average accuracy is appended per training-set size).
data = parse("house_votes_84.data")
avg_test_accuracy_with_pruning = []
avg_test_accuracy_without_pruning = []

In [None]:
# For every training-set size, average the test accuracy of a pruned and an
# unpruned tree over `loop_size` random splits of the data.
for train_size in training_sizes:
    with_pruning = []
    without_pruning = []

    # Validation size for a roughly 80/10/10 split, at least one example.
    # It depends only on train_size, so it is computed once per size.
    validation_size = max(1, (int(train_size / 0.8) - train_size) // 2)

    for _ in range(loop_size):
        random.shuffle(data)
        train = data[:train_size]
        valid = data[train_size : train_size + validation_size]
        # Test on ALL examples that were not used for training or pruning.
        # A slice of only `validation_size` examples would leave a single
        # test example for the smallest training sizes.
        test = data[train_size + validation_size :]

        # pruned tree: trained on `train`, pruned against `valid`
        tree = ID3(train, 0)
        prune(tree, valid)
        acc = test_tree(tree, test)
        with_pruning.append(acc)

        # non-pruned tree: needs no validation set, so it also trains on `valid`
        tree = ID3(train + valid, 0)
        acc = test_tree(tree, test)
        without_pruning.append(acc)

    avg_accuracy_with_pruning = sum(with_pruning) / len(with_pruning)
    avg_accuracy_without_pruning = sum(without_pruning) / len(without_pruning)
    avg_test_accuracy_with_pruning.append(avg_accuracy_with_pruning)
    avg_test_accuracy_without_pruning.append(avg_accuracy_without_pruning)

In [None]:
# Plot the averaged house-votes accuracies; the dataset name is used in the
# plot title and in the saved image's filename.
plot_learning_curve(
    training_sizes,
    avg_test_accuracy_with_pruning,
    avg_test_accuracy_without_pruning,
    dataset_name="House Data",
)

## Random Forests


```txt
               ,@@@@@@@,
       ,,,.   ,@@@@@@/@@,  .oo8888o.
    ,&%%&%&&%,@@@@@/@@@@@@,8888\88/8o
   ,%&\%&&%&&%,@@@\@@@/@@@88\88888/88'
   %&&%&%&/%&&%@@\@@/ /@@@88888\88888'
   %&&%/ %&%%&&@@\ V /@@' `88\8 `/88'
   `&%\ ` /%&'    |.|        \ '|8'
       |o|        | |         | |
       |.|        | |         | |
jgs \\/ ._\//_/__/  ,\_//__\\/.  \_//__/_
```

* In this section you will be building and evaluating a Random Forest classifier.
* Ensure you have your ID3 implementation ready, as you will be using it to construct the trees in your Random Forest.

In [None]:
class RandomForest:
    """
    Random Forest - an ensemble learning technique that builds multiple trees.
    Here, each tree is built from a bootstrapped sample of the examples
    (sampling with replacement) and a random subset of the attributes.

    Attributes:
        random_forest_nodes (list): Root nodes of the decision trees in the
                                     random forest.
        num_trees (int): The number of decision trees in the random forest.
    """

    def __init__(self, num_trees):
        self.random_forest_nodes = []
        self.num_trees = num_trees

    def fit(self, examples):
        """
        Fits the random forest to a dataset using bootstrapped samples and
        creates decision trees.

        Any trees from a previous call are discarded, so after fitting the
        forest always holds exactly num_trees trees.

        Args:
            examples: Non-empty list of example dicts with a "Class" key.
        """
        # Start from an empty forest so that refitting does not accumulate trees.
        self.random_forest_nodes = []

        num_examples = len(examples)
        available_attributes = [
            attribute for attribute in examples[0] if attribute != "Class"
        ]
        # Each tree sees at least two attributes, or all of them when the
        # dataset has fewer than two.
        min_attributes = min(2, len(available_attributes))

        for _ in range(self.num_trees):
            # create a bootstrapped sample by randomly selecting an example n
            # times; with replacement, i.e., can select one example more than
            # once; n = len(examples)
            bootstrap_sample = [random.choice(examples) for _ in range(num_examples)]
            # random subset of the attributes for this tree
            subset_attributes = set(
                random.sample(
                    available_attributes,
                    random.randint(min_attributes, len(available_attributes)),
                )
            )
            random_forest_node = ID3_helper(bootstrap_sample, subset_attributes)
            self.random_forest_nodes.append(random_forest_node)

    def test_tree(self, examples, return_predictions=False):
        """
        Tests the accuracy of the random forest on a dataset.
        If return_predictions is True, it returns the predictions for all examples.
        Otherwise, it returns the accuracy (fraction of examples whose
        prediction equals their "Class" value).
        """
        predictions = [self.evaluate(example) for example in examples]

        if return_predictions:
            # Return the list of predictions for confusion matrix
            return predictions

        # Calculate accuracy if return_predictions is False
        num_correct_predictions = sum(
            pred == example["Class"] for pred, example in zip(predictions, examples)
        )
        return num_correct_predictions / len(examples)

    def evaluate(self, example):
        """
        Evaluates a single example using the random forest's ensemble of
        decision trees. Use majority voting to predict the class.
        """
        # One vote per tree; the most common vote is the prediction.
        predictions = [
            evaluate(random_forest_node, example)
            for random_forest_node in self.random_forest_nodes
        ]
        return get_most_common_class(predictions)

**Best Number of Trees:**

In [None]:
# Rebind `data` to the candy dataset and prepare the mapping
# {number of trees: average test accuracy}.
data = parse("candy.data")
num_trees_accuracies = {}

In [None]:
# For every forest size from 2 to 14 trees, average the test accuracy over
# 25 random 80/20 train/test splits.
# NOTE: after this loop finishes, `num_trees` stays bound to its last value
# (14) in the notebook namespace.
for num_trees in range(2, 15):
    accuracies = []
    for _ in range(25):
        random.shuffle(data)
        # first 80% of the shuffled data for training, the rest for testing
        split_index = int(0.8 * len(data))
        train = data[:split_index]
        test = data[split_index:]

        random_forest = RandomForest(num_trees)
        random_forest.fit(train)
        acc = random_forest.test_tree(test)
        accuracies.append(acc)
    num_trees_accuracies[num_trees] = sum(accuracies) / len(accuracies)

In [None]:
# Plot average test accuracy against the number of trees in the forest.
trees = list(num_trees_accuracies.keys())
accuracy = list(num_trees_accuracies.values())

plt.figure(figsize=(10, 6))
plt.plot(trees, accuracy, marker="o", linestyle="-")
plt.title("Random Forest Accuracy vs. Number of Trees")
plt.xlabel("Number of Trees")
plt.ylabel("Average Accuracy on Test Data")
plt.grid(True)
# NOTE(review): presumably the images/ directory already exists when this
# cell runs - confirm, since nothing in this cell creates it.
plt.savefig("images/rf_num_trees_accuracies.png", bbox_inches="tight")

In [None]:
# Forest size with the highest average test accuracy (on a tie, max() keeps
# the first such entry of the dict).
best_num_trees = max(num_trees_accuracies, key=lambda key: num_trees_accuracies[key])

**Comparison with ID3:**

In [None]:
# Per-training-size average accuracies for the three models (ID3 with
# pruning, ID3 without pruning, random forest), on test and on training data.
avg_test_accuracy_id3_pruning = []
avg_test_accuracy_id3_not_pruning = []
avg_test_accuracy_rf = []
avg_train_accuracy_id3_pruning = []
avg_train_accuracy_id3_not_pruning = []
avg_train_accuracy_rf = []

In [None]:
# Training-set sizes for the candy comparison: 5, 9, ..., 65.
training_sizes = range(5, 68, 4)

In [None]:
# For every training-set size, average train and test accuracy of ID3 with
# pruning, ID3 without pruning, and a random forest over 25 random splits.
for train_size in training_sizes:
    with_pruning_acc_test = []
    without_pruning_acc_test = []
    rf_acc_test = []
    with_pruning_acc_train = []
    without_pruning_acc_train = []
    rf_acc_train = []

    # Validation/test sizes for a roughly 80/10/10 split, at least one example
    # each. They depend only on train_size, so compute them once per size.
    validation_size = max(1, (int(train_size / 0.8) - train_size) // 2)
    test_size = validation_size

    for _ in range(25):
        random.shuffle(data)
        train = data[:train_size]
        valid = data[train_size : train_size + validation_size]
        test = data[
            train_size + validation_size : train_size + validation_size + test_size
        ]

        # ID3 with pruning: prune BEFORE measuring anything, so that the
        # training accuracy really belongs to the pruned tree.
        tree = ID3(train, 0)
        prune(tree, valid)
        acc = test_tree(tree, train)
        with_pruning_acc_train.append(acc)
        acc = test_tree(tree, test)
        with_pruning_acc_test.append(acc)

        # ID3 without pruning: no validation set needed, so train on both.
        tree = ID3(train + valid, 0)
        acc = test_tree(tree, train + valid)
        without_pruning_acc_train.append(acc)
        acc = test_tree(tree, test)
        without_pruning_acc_test.append(acc)

        # Random forest with the best forest size found earlier, not the
        # leftover loop variable of the tuning loop.
        random_forest = RandomForest(best_num_trees)
        random_forest.fit(train + valid)
        acc = random_forest.test_tree(train + valid)
        rf_acc_train.append(acc)
        acc = random_forest.test_tree(test)
        rf_acc_test.append(acc)

    avg_test_accuracy_id3_pruning.append(
        sum(with_pruning_acc_test) / len(with_pruning_acc_test)
    )
    avg_test_accuracy_id3_not_pruning.append(
        sum(without_pruning_acc_test) / len(without_pruning_acc_test)
    )
    avg_test_accuracy_rf.append(sum(rf_acc_test) / len(rf_acc_test))
    avg_train_accuracy_id3_pruning.append(
        sum(with_pruning_acc_train) / len(with_pruning_acc_train)
    )
    avg_train_accuracy_id3_not_pruning.append(
        sum(without_pruning_acc_train) / len(without_pruning_acc_train)
    )
    avg_train_accuracy_rf.append(sum(rf_acc_train) / len(rf_acc_train))

In [None]:
def plot_random_forest_id3_accuracies(
    training_sizes,
    avg_test_accuracy_id3_pruning,
    avg_test_accuracy_id3_not_pruning,
    avg_test_accuracy_rf,
    avg_train_accuracy_id3_pruning,
    avg_train_accuracy_id3_not_pruning,
    avg_train_accuracy_rf,
    dataset_name,
):
    """
    Plot train and test learning curves of ID3 (with and without pruning)
    and the random forest, and save the figure.

    The figure is written to images/id3_vs_rf_<dataset_name>.png; the output
    directory is created if it does not exist yet.

    Args:
        training_sizes: List of training set sizes (x-axis).
        avg_test_accuracy_id3_pruning: Average test accuracies, ID3 with pruning.
        avg_test_accuracy_id3_not_pruning: Average test accuracies, ID3
                                           without pruning.
        avg_test_accuracy_rf: Average test accuracies, random forest.
        avg_train_accuracy_id3_pruning: Average train accuracies, ID3 with pruning.
        avg_train_accuracy_id3_not_pruning: Average train accuracies, ID3
                                            without pruning.
        avg_train_accuracy_rf: Average train accuracies, random forest.
        dataset_name: Name of the dataset, used in the plot title and in the
                      image filename.
    """
    import os  # local import: only needed to prepare the output directory

    output_path = f"images/id3_vs_rf_{dataset_name}.png"
    # savefig() does not create missing directories, so make sure it exists.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # (values, legend label, color, extra plot options); test curves use the
    # default line style, train curves are dashed.
    dashed = {"linestyle": "--"}
    curves = [
        (avg_test_accuracy_id3_pruning, "[Test] ID3 (With Pruning)", "orange", {}),
        (
            avg_test_accuracy_id3_not_pruning,
            "[Test] ID3 (Without Pruning)",
            "blue",
            {},
        ),
        (avg_test_accuracy_rf, "[Test] Random Forest", "green", {}),
        (
            avg_train_accuracy_id3_pruning,
            "[Train] ID3 (With Pruning)",
            "orange",
            dashed,
        ),
        (
            avg_train_accuracy_id3_not_pruning,
            "[Train] ID3 (Without Pruning)",
            "blue",
            dashed,
        ),
        (avg_train_accuracy_rf, "[Train] Random Forest", "green", dashed),
    ]

    plt.figure(figsize=(10, 6))
    for accuracies, label, color, extra_options in curves:
        plt.plot(
            training_sizes,
            accuracies,
            label=label,
            color=color,
            marker="o",
            **extra_options,
        )
    plt.xlabel("Number of Training Examples")
    plt.ylabel("Average Accuracy")
    plt.title(
        f"Learning Curves for {dataset_name} Data - ID3 (With and Without Pruning) vs Random Forest"
    )
    plt.legend()
    plt.grid(True)
    plt.savefig(output_path, bbox_inches="tight")
    print(f"Saved image as {output_path}")

In [None]:
# Plot the averaged candy accuracies of all three models; "Candy" is used
# in the plot title and in the saved image's filename.
plot_random_forest_id3_accuracies(
    training_sizes,
    avg_test_accuracy_id3_pruning,
    avg_test_accuracy_id3_not_pruning,
    avg_test_accuracy_rf,
    avg_train_accuracy_id3_pruning,
    avg_train_accuracy_id3_not_pruning,
    avg_train_accuracy_rf,
    "Candy",
)

Finally, comparing accuracies and fetching a confusion matrix.

In [None]:
# Per-run test accuracies of each model.
with_pruning_acc = []
without_pruning = []
rf_acc = []

# Predictions of each model and the matching true labels, pooled over all
# runs (used for the confusion matrices).
with_pruning_pred = []
without_pruning_pred = []
rf_pred = []
actual_labels = []

In [None]:
# 25 random 50/25/25 train/validation/test splits; record per-run accuracy
# and pool the predictions of every model for the confusion matrices.
for _ in range(25):
    random.shuffle(data)
    train = data[: len(data) // 2]
    valid = data[len(data) // 2 : 3 * len(data) // 4]
    test = data[3 * len(data) // 4 :]

    # Store actual labels
    actual = [example["Class"] for example in test]
    actual_labels.extend(actual)

    # ID3 with pruning
    tree = ID3(train, 0)
    prune(tree, valid)
    predictions = test_tree(tree, test, return_predictions=True)
    acc = sum(
        pred == example["Class"] for pred, example in zip(predictions, test)
    ) / len(test)
    with_pruning_acc.append(acc)
    with_pruning_pred.extend(predictions)

    # ID3 without pruning (no validation set needed, so train on both)
    tree = ID3(train + valid, 0)
    predictions = test_tree(tree, test, return_predictions=True)
    acc = sum(
        pred == example["Class"] for pred, example in zip(predictions, test)
    ) / len(test)
    without_pruning.append(acc)
    without_pruning_pred.extend(predictions)

    # Random Forest with the best forest size found earlier, not the
    # leftover loop variable of the tuning loop.
    random_forest = RandomForest(best_num_trees)
    random_forest.fit(train + valid)
    predictions = random_forest.test_tree(test, return_predictions=True)
    acc = sum(
        pred == example["Class"] for pred, example in zip(predictions, test)
    ) / len(test)
    rf_acc.append(acc)
    rf_pred.extend(predictions)

In [None]:
# Calculate and print average accuracy for each model
# Average of the per-run test accuracies for each model.
print("Avg accuracy of ID3 with pruning: ", sum(with_pruning_acc) / len(with_pruning_acc))
print(
    "Avg accuracy of ID3 without pruning: ", sum(without_pruning) / len(without_pruning)
)
print("Avg accuracy of Random Forest: ", sum(rf_acc) / len(rf_acc))

# Generate and print confusion matrices; each one compares the true labels
# with one model's predictions, both pooled over all runs.
print("Confusion Matrix for ID3 with Pruning:")
conf_matrix_with_pruning = confusion_matrix(actual_labels, with_pruning_pred)
print(conf_matrix_with_pruning)

print("Confusion Matrix for ID3 without Pruning:")
conf_matrix_without_pruning = confusion_matrix(actual_labels, without_pruning_pred)
print(conf_matrix_without_pruning)

print("Confusion Matrix for Random Forest:")
conf_matrix_rf = confusion_matrix(actual_labels, rf_pred)
print(conf_matrix_rf)