# Homework - 1

In [None]:
import math
import random

import ID3
import matplotlib.pyplot as plt
import parse

# Seed Python's RNG once so the shuffles and random samples drawn in the cells
# below are reproducible when the notebook is run top to bottom.
random.seed(101)

## Part 1

In [None]:
# IPython shell escape: runs mini_auto_grader.py with the `python` found on PATH.
!python mini_auto_grader.py

## Part 2

In [None]:
def plot_learning_curve(
    training_sizes,
    avg_accuracies_with_pruning,
    avg_accuracies_without_pruning,
    dataset_name,
):
    """
    Plot the learning curves with and without pruning and save the figure.

    The figure is saved as ``images/learning_curve_{dataset_name}.png``; the
    ``images`` directory is created first if it does not exist.

    Args:
        training_sizes: List of training set sizes (x-axis).
        avg_accuracies_with_pruning: List of average accuracies with pruning,
                                     one per training size.
        avg_accuracies_without_pruning: List of average accuracies without
                                        pruning, one per training size.
        dataset_name: Name of the dataset, used in the plot title and in the
                      image filename.
    """
    import os

    output_path = f"images/learning_curve_{dataset_name}.png"
    # savefig does not create missing directories, so make sure the target
    # directory exists instead of failing after the figure has been built.
    os.makedirs("images", exist_ok=True)

    plt.figure(figsize=(10, 6))
    plt.plot(
        training_sizes,
        avg_accuracies_with_pruning,
        label="With Pruning",
        color="orange",
        marker="o",
    )
    plt.plot(
        training_sizes,
        avg_accuracies_without_pruning,
        label="Without Pruning",
        color="blue",
        marker="o",
    )
    plt.xlabel("Number of Training Examples")
    plt.ylabel("Average Accuracy on Test Data")
    plt.title(f"Learning Curves for {dataset_name} Data With and Without Pruning")
    plt.legend()
    plt.grid(True)
    plt.savefig(output_path, bbox_inches="tight")
    print(f"Saved image as {output_path}")

In [None]:
# Training-set sizes swept by the learning curve: 10, 15, ..., 300.
training_sizes = range(10, 305, 5)
# Number of random shuffles/splits averaged for every training size.
loop_size = 100

In [None]:
# Load the house-votes dataset through the project's parse module.
data = parse.parse("house_votes_84.data")
# The experiment loop below appends one averaged test accuracy per training size.
avg_test_accuracy_with_pruning = []
avg_test_accuracy_without_pruning = []

In [None]:
# For every training size, average the test accuracy of a pruned and an
# unpruned ID3 tree over `loop_size` random shuffles of the data.
for train_size in training_sizes:
    with_pruning = []
    without_pruning = []

    # train_size is treated as 80% of the examples used in one trial; the
    # remaining 20% is split evenly between validation and test. True division
    # is required here: math.ceil() after "//" is a no-op because the value is
    # already floored. max() keeps at least one held-out example. The sizes
    # depend only on train_size, so they are computed once per size.
    validation_size = max(1, math.ceil((int(train_size / 0.8) - train_size) / 2))
    test_size = validation_size
    valid_end = train_size + validation_size
    test_end = valid_end + test_size

    for _ in range(loop_size):
        random.shuffle(data)
        train = data[:train_size]
        valid = data[train_size:valid_end]
        test = data[valid_end:test_end]

        # pruned tree: grown on train, pruned against the validation split
        tree = ID3.ID3(train, 0)
        ID3.prune(tree, valid)
        acc = ID3.test(tree, test)
        with_pruning.append(acc)

        # non-pruned tree: no validation split is needed, so it is grown on
        # train + valid
        tree = ID3.ID3(train + valid, 0)
        acc = ID3.test(tree, test)
        without_pruning.append(acc)

    avg_accuracy_with_pruning = sum(with_pruning) / len(with_pruning)
    avg_accuracy_without_pruning = sum(without_pruning) / len(without_pruning)
    avg_test_accuracy_with_pruning.append(avg_accuracy_with_pruning)
    avg_test_accuracy_without_pruning.append(avg_accuracy_without_pruning)

In [None]:
# The plot title already appends the word "Data" after the dataset name, so
# only the bare name is passed (avoids "House Data Data" in the title).
plot_learning_curve(
    training_sizes,
    avg_test_accuracy_with_pruning,
    avg_test_accuracy_without_pruning,
    dataset_name="House",
)

## Part 3

In [None]:
class RandomForest:
    """
    Random Forest - an ensemble learning technique that builds multiple trees.
    Each tree is built from a bootstrapped sample of the examples (sampling
    with replacement) and a random subset of the attributes; predictions are
    combined by majority vote.

    Attributes:
        random_forest_nodes (list): A list to store the decision tree nodes
                                     in the random forest.
        num_trees (int): The number of decision trees in the random forest.
    """

    def __init__(self, num_trees):
        self.random_forest_nodes = []
        self.num_trees = num_trees

    @staticmethod
    def _bootstrap_sample(examples):
        """
        Draw len(examples) examples uniformly at random, with replacement,
        so the same example can appear more than once.
        """
        return [random.choice(examples) for _ in range(len(examples))]

    @staticmethod
    def _sample_attributes(attributes):
        """
        Return a random subset of `attributes` as a set. The subset holds at
        least two attributes when that many exist, otherwise as many as exist.
        """
        # min() keeps randint's bounds valid when fewer than two attributes
        # are available.
        smallest_subset = min(2, len(attributes))
        subset_size = random.randint(smallest_subset, len(attributes))
        return set(random.sample(attributes, subset_size))

    def fit(self, examples):
        """
        Fits the random forest to a dataset using bootstrapped samples and
        creates decision trees.

        Trees from a previous call are discarded, so the forest always holds
        exactly `num_trees` trees after fitting.

        Args:
            examples: Non-empty list of dict-like examples; the "Class" key is
                      excluded from the candidate attributes.

        Raises:
            ValueError: If `examples` is empty.
        """
        if not examples:
            raise ValueError("cannot fit a random forest on an empty dataset")

        # The candidate attributes are the same for every tree.
        available_attributes = [
            attribute for attribute in examples[0] if attribute != "Class"
        ]

        self.random_forest_nodes = []
        for _ in range(self.num_trees):
            bootstrap_sample = self._bootstrap_sample(examples)
            subset_attributes = self._sample_attributes(available_attributes)
            random_forest_node = ID3.ID3_helper(bootstrap_sample, subset_attributes)
            self.random_forest_nodes.append(random_forest_node)

    def test(self, examples):
        """
        Tests the accuracy of the random forest on a dataset.

        Returns:
            Fraction of examples whose predicted class equals example["Class"].

        Raises:
            ZeroDivisionError: If `examples` is empty.
        """
        num_correct_predictions = sum(
            self.evaluate(example) == example["Class"] for example in examples
        )
        return num_correct_predictions / len(examples)

    def evaluate(self, example):
        """
        Evaluates a single example using the random forest's ensemble of
        decision trees. Use majority voting to predict the class.
        """
        predictions = [
            ID3.evaluate(random_forest_node, example)
            for random_forest_node in self.random_forest_nodes
        ]
        return ID3.get_most_common_class(predictions)

### Best Number of Trees

In [None]:
# Load the candy dataset; this rebinds `data`, replacing the previously loaded examples.
data = parse.parse("candy.data")
# Maps a number of trees to the average test accuracy measured for that forest size.
num_trees_accuracies = {}

In [None]:
# Sweep forest sizes 2..14. For each size, average the test accuracy over 25
# random 80/20 train/test splits of the data.
# NOTE: once this loop finishes, `num_trees` stays bound to its last value (14).
for num_trees in range(2, 15):
    accuracies = []
    for _ in range(25):
        random.shuffle(data)
        split_index = int(0.8 * len(data))
        train, test = data[:split_index], data[split_index:]

        random_forest = RandomForest(num_trees)
        random_forest.fit(train)
        acc = random_forest.test(test)
        accuracies.append(acc)
    num_trees_accuracies[num_trees] = sum(accuracies) / len(accuracies)

In [None]:
# Plot the average test accuracy measured for each forest size and save it.
trees = list(num_trees_accuracies)
accuracy = [num_trees_accuracies[tree_count] for tree_count in trees]

plt.figure(figsize=(10, 6))
plt.plot(trees, accuracy, marker="o", linestyle="-")
plt.xlabel("Number of Trees")
plt.ylabel("Average Accuracy on Test Data")
plt.title("Random Forest Accuracy vs. Number of Trees")
plt.grid(True)
plt.savefig("images/rf_num_trees_accuracies.png", bbox_inches="tight")

In [None]:
# Forest size with the highest average test accuracy (first one wins on ties).
best_num_trees = max(num_trees_accuracies, key=num_trees_accuracies.get)

### Comparison with ID3

In [None]:
# Accumulators for the ID3 vs. random forest comparison; the experiment loop
# below appends one averaged accuracy per training size to each list.
# Test-set accuracies:
avg_test_accuracy_id3_pruning = []
avg_test_accuracy_id3_not_pruning = []
avg_test_accuracy_rf = []
# Training-set accuracies:
avg_train_accuracy_id3_pruning = []
avg_train_accuracy_id3_not_pruning = []
avg_train_accuracy_rf = []

In [None]:
# Training-set sizes for the comparison: 5, 9, ..., 65 (rebinds the earlier sweep).
training_sizes = range(5, 68, 4)

In [None]:
# For every training size, average train and test accuracy of a pruned ID3
# tree, an unpruned ID3 tree and a random forest over 25 random shuffles.
for train_size in training_sizes:
    with_pruning_acc_test = []
    without_pruning_acc_test = []
    rf_acc_test = []
    with_pruning_acc_train = []
    without_pruning_acc_train = []
    rf_acc_train = []

    # train_size is treated as 80% of the examples used in one trial; the
    # remaining 20% is split evenly between validation and test. True division
    # is required here: math.ceil() after "//" is a no-op because the value is
    # already floored. max() keeps at least one held-out example. The sizes
    # depend only on train_size, so they are computed once per size.
    validation_size = max(1, math.ceil((int(train_size / 0.8) - train_size) / 2))
    test_size = validation_size
    valid_end = train_size + validation_size
    test_end = valid_end + test_size

    for _ in range(25):
        random.shuffle(data)
        train = data[:train_size]
        valid = data[train_size:valid_end]
        test = data[valid_end:test_end]

        # Pruned ID3: both accuracies are measured AFTER pruning so that the
        # "with pruning" training curve describes the pruned tree.
        tree = ID3.ID3(train, 0)
        ID3.prune(tree, valid)
        acc = ID3.test(tree, train)
        with_pruning_acc_train.append(acc)
        acc = ID3.test(tree, test)
        with_pruning_acc_test.append(acc)

        # Unpruned ID3: no validation split is needed, so it is grown on
        # train + valid.
        tree = ID3.ID3(train + valid, 0)
        acc = ID3.test(tree, train + valid)
        without_pruning_acc_train.append(acc)
        acc = ID3.test(tree, test)
        without_pruning_acc_test.append(acc)

        # Random forest with the best forest size found earlier, not the
        # leftover loop variable `num_trees` from the sweep.
        random_forest = RandomForest(best_num_trees)
        random_forest.fit(train + valid)
        acc = random_forest.test(train + valid)
        rf_acc_train.append(acc)
        acc = random_forest.test(test)
        rf_acc_test.append(acc)

    avg_test_accuracy_id3_pruning.append(
        sum(with_pruning_acc_test) / len(with_pruning_acc_test)
    )
    avg_test_accuracy_id3_not_pruning.append(
        sum(without_pruning_acc_test) / len(without_pruning_acc_test)
    )
    avg_test_accuracy_rf.append(sum(rf_acc_test) / len(rf_acc_test))
    avg_train_accuracy_id3_pruning.append(
        sum(with_pruning_acc_train) / len(with_pruning_acc_train)
    )
    avg_train_accuracy_id3_not_pruning.append(
        sum(without_pruning_acc_train) / len(without_pruning_acc_train)
    )
    avg_train_accuracy_rf.append(sum(rf_acc_train) / len(rf_acc_train))

In [None]:
def plot_random_forest_id3_accuracies(
    training_sizes,
    avg_test_accuracy_id3_pruning,
    avg_test_accuracy_id3_not_pruning,
    avg_test_accuracy_rf,
    avg_train_accuracy_id3_pruning,
    avg_train_accuracy_id3_not_pruning,
    avg_train_accuracy_rf,
    dataset_name,
):
    """
    Plot train and test learning curves for ID3 (with and without pruning)
    and the random forest on one figure, then save the figure.

    Test curves are drawn with the default line style and train curves are
    dashed; each model keeps the same color in both. The figure is saved as
    ``images/id3_vs_rf_{dataset_name}.png``; the ``images`` directory is
    created first if it does not exist.

    Args:
        training_sizes: List of training set sizes (x-axis).
        avg_test_accuracy_id3_pruning: Average test accuracies, pruned ID3.
        avg_test_accuracy_id3_not_pruning: Average test accuracies, unpruned ID3.
        avg_test_accuracy_rf: Average test accuracies, random forest.
        avg_train_accuracy_id3_pruning: Average train accuracies, pruned ID3.
        avg_train_accuracy_id3_not_pruning: Average train accuracies,
                                            unpruned ID3.
        avg_train_accuracy_rf: Average train accuracies, random forest.
        dataset_name: Name of the dataset, used in the plot title and in the
                      image filename.
    """
    import os

    dashed = {"linestyle": "--"}
    # (accuracies, legend label, color, extra plot kwargs), in legend order.
    curves = [
        (avg_test_accuracy_id3_pruning, "[Test] ID3 (With Pruning)", "orange", {}),
        (avg_test_accuracy_id3_not_pruning, "[Test] ID3 (Without Pruning)", "blue", {}),
        (avg_test_accuracy_rf, "[Test] Random Forest", "green", {}),
        (avg_train_accuracy_id3_pruning, "[Train] ID3 (With Pruning)", "orange", dashed),
        (
            avg_train_accuracy_id3_not_pruning,
            "[Train] ID3 (Without Pruning)",
            "blue",
            dashed,
        ),
        (avg_train_accuracy_rf, "[Train] Random Forest", "green", dashed),
    ]

    output_path = f"images/id3_vs_rf_{dataset_name}.png"
    # savefig does not create missing directories, so make sure the target
    # directory exists instead of failing after the figure has been built.
    os.makedirs("images", exist_ok=True)

    plt.figure(figsize=(10, 6))
    for accuracies, label, color, extra_kwargs in curves:
        plt.plot(
            training_sizes,
            accuracies,
            label=label,
            color=color,
            marker="o",
            **extra_kwargs,
        )
    plt.xlabel("Number of Training Examples")
    plt.ylabel("Average Accuracy")
    plt.title(
        f"Learning Curves for {dataset_name} Data - ID3 (With and Without Pruning) vs Random Forest"
    )
    plt.legend()
    plt.grid(True)
    plt.savefig(output_path, bbox_inches="tight")
    print(f"Saved image as {output_path}")

In [None]:
# Draw and save the ID3 vs. random forest comparison; keyword arguments keep
# the six same-shaped accuracy lists from being passed in the wrong slot.
plot_random_forest_id3_accuracies(
    training_sizes=training_sizes,
    avg_test_accuracy_id3_pruning=avg_test_accuracy_id3_pruning,
    avg_test_accuracy_id3_not_pruning=avg_test_accuracy_id3_not_pruning,
    avg_test_accuracy_rf=avg_test_accuracy_rf,
    avg_train_accuracy_id3_pruning=avg_train_accuracy_id3_pruning,
    avg_train_accuracy_id3_not_pruning=avg_train_accuracy_id3_not_pruning,
    avg_train_accuracy_rf=avg_train_accuracy_rf,
    dataset_name="Candy",
)

### Comparing Accuracies

In [None]:
# Compare pruned ID3, unpruned ID3 and the random forest on 25 random
# 50% / 25% / 25% train / validation / test splits and print the averages.
with_pruning_acc = []
without_pruning = []
rf_acc = []

# Split boundaries depend only on the dataset size, so compute them once.
train_end = len(data) // 2
valid_end = 3 * len(data) // 4

for _ in range(25):
    random.shuffle(data)
    train = data[:train_end]
    valid = data[train_end:valid_end]
    test = data[valid_end:]

    # pruned ID3: grown on train, pruned against the validation split
    tree = ID3.ID3(train, 0)
    ID3.prune(tree, valid)
    acc = ID3.test(tree, test)
    with_pruning_acc.append(acc)

    # unpruned ID3: grown on train + valid
    tree = ID3.ID3(train + valid, 0)
    acc = ID3.test(tree, test)
    without_pruning.append(acc)

    # Random forest with the best forest size found earlier, not the leftover
    # loop variable `num_trees` from the sweep.
    random_forest = RandomForest(best_num_trees)
    random_forest.fit(train + valid)
    acc = random_forest.test(test)
    rf_acc.append(acc)
print(
    "Avg accuracy of ID3 with pruning: ",
    sum(with_pruning_acc) / len(with_pruning_acc),
)
print(
    "Avg accuracy of ID3 without pruning: ",
    sum(without_pruning) / len(without_pruning),
)
print(
    "Avg accuracy of RF: ",
    sum(rf_acc) / len(rf_acc),
)