### READ DATA

In [49]:
import random
import csv
import numpy as np

#####################################################################################################################
# Data Processing Section
# Helper function for preparing data for a decision tree classifiction problem. Parsing the data such
# that for each feature, the property can only either be True or False. Label can only be 1 or 0.
# For the chess.csv dataset won=1, nowin=0
# In more detail:
# Dataset with n instances, for each instance, there are m attributes. For the i-th attribute,
# the property should be chosen from a set with size of m_i to represent the information.
# Input: array with size of n*(m+1), the first column is the label
# Output: array with size of n*(m_1 + m_2 + ... + m_m + 1), the first column is 1 or 0 corresponding to label
#####################################################################################################################

def get_data(filename, class_name, num_training, num_validation):
    data = read_data(filename)
    data = convert_to_binary_features(data, class_name)
    return np.array(split_data(data, num_training, num_validation), dtype=object)

def read_data(filename):
    data = []
    with open(filename) as f:
        reader = csv.reader(f)
        for row in reader:
            data.append(row)
    return data

def convert_to_binary_features(data, class_name):
    features = []
    for feature_index in range(0, len(data[0])-1):
        feature_values = list(set([obs[feature_index] for obs in data]))
        feature_values.sort()
        if len(feature_values) > 2: features.append(feature_values[:-1])
        else: features.append([feature_values[0]])
    new_data = []
    for obs in data:
        new_obs = [1 if obs[-1] == class_name else 0] # label = 1 if label in the dataset is won
        for feature_index in range(0, len(data[0]) - 1):
            current_feature_value = obs[feature_index]
            for possible_feature_value in features[feature_index]:
                new_obs.append(current_feature_value == possible_feature_value)
        new_data.append(new_obs)

    return new_data

def split_data(data, num_training, num_validation):
    random.shuffle(data)
    # casting to a numpy array
    data = np.array(data)
    return data[0:num_training], data[num_training:num_training + num_validation], data[num_training + num_validation:len(data)]## **Model**

### Heart

In [81]:
import copy
import numpy as np
from collections import Counter
from graphviz import Digraph
from sklearn.tree import DecisionTreeClassifier, export_graphviz
import pandas as pd
import matplotlib.pyplot as plt
from graphviz import Source

def node_score_gini(probabilities):
    """
    Calculate the node score using the Gini index.
    For datasets with multiple classes, Gini(p) = (1 - sum(p_i^2)) / (1 - 1/n_classes), normalized.
    """
    gini = 1 - sum(p ** 2 for p in probabilities)
    n_classes = len(probabilities)
    if n_classes > 1:
        gini /= (1 - 1 / n_classes)
    return gini


def visualize_tree(tree, filename="tree"):
    """
    Visualize the decision tree using Graphviz.
    """
    def add_nodes_edges(dot, node, parent_id=None, edge_label=""):
        if node is None:
            return

        node_id = id(node)
        if node.isleaf:
            label = f"Leaf: {node.label}"
        else:
            label = f"X[{node.index_split_on}] <= {node.threshold:.2f}\nGain: {node.info.get('gain', 0):.4f}"

        dot.node(str(node_id), label)
        if parent_id is not None:
            dot.edge(str(parent_id), str(node_id), label=edge_label)

        # Recursively add child nodes
        add_nodes_edges(dot, node.left, node_id, edge_label="True")
        add_nodes_edges(dot, node.right, node_id, edge_label="False")

    # Create a Digraph object
    dot = Digraph()
    add_nodes_edges(dot, tree.root)

    # Save and render the tree
    dot.render(filename, format="png", cleanup=True)
    print(f"Tree visualization saved as {filename}.png")

class Node:
    """
    Helper to construct the tree structure.
    """
    def __init__(self, left=None, right=None, depth=0, index_split_on=0, threshold=None, isleaf=False, label=None):
        self.left = left
        self.right = right
        self.depth = depth
        self.index_split_on = index_split_on
        self.threshold = threshold
        self.isleaf = isleaf
        self.label = label
        self.info = {}

    def set_info(self, gain, num_samples):
        """
        Helper function to set node information for visualization.
        """
        self.info['gain'] = gain
        self.info['num_samples'] = num_samples

class DecisionTree:

    def __init__(self, gain_function=node_score_gini, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.gain_function = gain_function
        self.root = None

    def fit(self, X, y):
        data = [list(row) + [label] for row, label in zip(X, y)]
        self.root = Node()
        self._split_recursively(self.root, data, list(range(len(X[0]))))

    def predict(self, X):
        return [self._predict_recursive(self.root, row) for row in X]

    def _predict_recursive(self, node, row):
        if node.isleaf:
            return node.label

        if node.threshold is not None:
            if row[node.index_split_on] <= node.threshold:
                return self._predict_recursive(node.left, row)
            else:
                return self._predict_recursive(node.right, row)
        else:
            return self._predict_recursive(node.left if not row[node.index_split_on] else node.right, row)

    def _is_terminal(self, node, data):
        labels = [row[-1] for row in data]

        # Stop splitting if one of the termination criteria is met
        if len(set(labels)) == 1 or len(data) < self.min_samples_split or (self.max_depth is not None and node.depth >= self.max_depth):
            node.isleaf = True
            node.label = Counter(labels).most_common(1)[0][0]
            return True

        return False

    def _split_recursively(self, node, data, indices):
        if self._is_terminal(node, data):
            return

        best_gain, best_index, best_threshold = -float('inf'), None, None

        for index in indices:
            gain, threshold = self._find_best_split(data, index)
            if gain > best_gain:
                best_gain, best_index, best_threshold = gain, index, threshold

        if best_gain == -float('inf'):
            node.isleaf = True
            node.label = Counter(row[-1] for row in data).most_common(1)[0][0]
            return

        node.index_split_on = best_index
        node.threshold = best_threshold

        # Set node info with calculated gain and number of samples
        node.set_info(gain=best_gain, num_samples=len(data))

        left_data, right_data = self._split_data(data, best_index, best_threshold)

        node.left = Node(depth=node.depth + 1)
        node.right = Node(depth=node.depth + 1)
        self._split_recursively(node.left, left_data, indices)
        self._split_recursively(node.right, right_data, indices)

    def _split_data(self, data, split_index, threshold):
        if threshold is not None:
            left_data = [row for row in data if row[split_index] <= threshold]
            right_data = [row for row in data if row[split_index] > threshold]
        else:
            left_data = [row for row in data if row[split_index] == 0]
            right_data = [row for row in data if row[split_index] == 1]
        return left_data, right_data

    def _find_best_split(self, data, split_index):
        unique_values = sorted(set(row[split_index] for row in data))
        best_gain, best_threshold = -float('inf'), None

        if len(unique_values) > 10:
            thresholds = np.linspace(unique_values[0], unique_values[-1], num=10)
        else:
            thresholds = [(unique_values[i - 1] + unique_values[i]) / 2 for i in range(1, len(unique_values))]

        for threshold in thresholds:
            left_data, right_data = self._split_data(data, split_index, threshold)

            if left_data and right_data:
                gain = self._calculate_gain(data, left_data, right_data)
                if gain > best_gain:
                    best_gain, best_threshold = gain, threshold

        return best_gain, best_threshold

    def _calculate_gain(self, parent_data, left_data, right_data):
        total_count = len(parent_data)
        left_count, right_count = len(left_data), len(right_data)

        parent_probabilities = np.bincount([row[-1] for row in parent_data], minlength=2) / total_count
        left_probabilities = np.bincount([row[-1] for row in left_data], minlength=2) / left_count
        right_probabilities = np.bincount([row[-1] for row in right_data], minlength=2) / right_count

        gain = self.gain_function(parent_probabilities)
        gain -= (left_count / total_count) * self.gain_function(left_probabilities)
        gain -= (right_count / total_count) * self.gain_function(right_probabilities)

        return gain

    def _prune_recursively(self, node, validation_data):
        """
        Recursively prune the decision tree based on validation data.
        """
        if node.isleaf:
            return

        # Prune children nodes first
        if node.left:
            self._prune_recursively(node.left, validation_data)
        if node.right:
            self._prune_recursively(node.right, validation_data)

        # If both children are leaves, consider pruning them
        if node.left.isleaf and node.right.isleaf:
            current_loss = self._calculate_loss(validation_data)

            # Temporarily make the current node a leaf
            original_left, original_right = node.left, node.right
            node.isleaf = True
            node.label = Counter([row[-1] for row in validation_data]).most_common(1)[0][0]
            node.left, node.right = None, None

            new_loss = self._calculate_loss(validation_data)

            # Revert if pruning increases loss
            if new_loss > current_loss:
                node.isleaf = False
                node.left, node.right = original_left, original_right

    def _calculate_loss(self, validation_data):
        """
        Calculate the misclassification loss for the validation data.
        """
        X_val = [row[:-1] for row in validation_data]
        y_val = [row[-1] for row in validation_data]
        predictions = self.predict(X_val)
        return sum(pred != actual for pred, actual in zip(predictions, y_val)) / len(y_val)

def explore_dataset(filename, class_name, train_ratio, validation_ratio):
    def get_data_by_ratio(filename, class_name, train_ratio, validation_ratio):
        df = pd.read_csv(filename)
        df = df.sample(frac=1, random_state=42).reset_index(drop=True)

        total_rows = len(df)
        train_end = int(total_rows * train_ratio)
        validation_end = train_end + int(total_rows * validation_ratio)

        train_data = df.iloc[:train_end].values.tolist()
        validation_data = df.iloc[train_end:validation_end].values.tolist()
        test_data = df.iloc[validation_end:].values.tolist()

        return train_data, validation_data, test_data

    train_data, validation_data, test_data = get_data_by_ratio(filename, class_name, train_ratio, validation_ratio)

    gain_functions = {"Gini Index": node_score_gini}

    print(f'Exploring dataset: {filename}')
    for gain_name, gain_function in gain_functions.items():
        print(f'Gain Function: {gain_name}')

        # Build and evaluate unpruned tree
        X_train = [row[:-1] for row in train_data]
        y_train = [row[-1] for row in train_data]
        X_test = [row[:-1] for row in test_data]
        y_test = [row[-1] for row in test_data]

        tree_unpruned = DecisionTree(gain_function=gain_function, max_depth=10)
        tree_unpruned.fit(X_train, y_train)

        train_accuracy_unpruned = sum(tree_unpruned.predict(X_train) == np.array(y_train)) / len(y_train)
        test_accuracy_unpruned = sum(tree_unpruned.predict(X_test) == np.array(y_test)) / len(y_test)

        print(f'  Unpruned Training Accuracy: {train_accuracy_unpruned:.4f}')
        print(f'  Unpruned Test Accuracy: {test_accuracy_unpruned:.4f}')

        # Build and evaluate pruned tree
        X_val = [row[:-1] for row in validation_data]
        y_val = [row[-1] for row in validation_data]

        tree_pruned = DecisionTree(gain_function=gain_function, max_depth=10)
        tree_pruned.fit(X_train, y_train)
        tree_pruned._prune_recursively(tree_pruned.root, validation_data)

        train_accuracy_pruned = sum(tree_pruned.predict(X_train) == np.array(y_train)) / len(y_train)
        test_accuracy_pruned = sum(tree_pruned.predict(X_test) == np.array(y_test)) / len(y_test)
        visualize_tree(tree_pruned)
        print(f'  Pruned Training Accuracy: {train_accuracy_pruned:.4f}')
        print(f'  Pruned Test Accuracy: {test_accuracy_pruned:.4f}')

    # Train and visualize with sklearn DecisionTreeClassifier
    print("\nUsing sklearn's DecisionTreeClassifier")
    clf = DecisionTreeClassifier(max_depth=10, random_state=42)
    clf.fit(X_train, y_train)

    # Visualize sklearn decision tree
    dot_data = export_graphviz(
        clf,
        out_file=None,
        feature_names=[f"Feature {i}" for i in range(len(X_train[0]))],
        class_names=[str(cls) for cls in np.unique(y_train)],
        filled=True,
        rounded=True,
        special_characters=True
    )

    graph = Source(dot_data)
    graph.format = 'png'
    graph.render('sklearn_tree', cleanup=True)
    
    test_accuracy_sklearn = clf.score(X_test, y_test)
    print(f"Sklearn Test Accuracy: {test_accuracy_sklearn:.4f}")
    print("Sklearn decision tree saved as sklearn_tree.png.")

# Usage example:
explore_dataset('D:/DATA_2060/project/DATA2060_Final_Project/data/heartdvd.csv', 'target', train_ratio=0.5, validation_ratio=0.2)


Exploring dataset: D:/DATA_2060/project/DATA2060_Final_Project/data/heartdvd.csv
Gain Function: Gini Index
  Unpruned Training Accuracy: 1.0000
  Unpruned Test Accuracy: 0.9416
Tree visualization saved as tree.png
  Pruned Training Accuracy: 0.9707
  Pruned Test Accuracy: 0.9221

Using sklearn's DecisionTreeClassifier
Sklearn Test Accuracy: 0.9578
Sklearn decision tree saved as sklearn_tree.png.


### Iris