### READ DATA

In [42]:
import random
import csv
import numpy as np

#####################################################################################################################
# Data Processing Section
# Helper function for preparing data for a decision tree classifiction problem. Parsing the data such
# that for each feature, the property can only either be True or False. Label can only be 1 or 0.
# For the chess.csv dataset won=1, nowin=0
# In more detail:
# Dataset with n instances, for each instance, there are m attributes. For the i-th attribute,
# the property should be chosen from a set with size of m_i to represent the information.
# Input: array with size of n*(m+1), the first column is the label
# Output: array with size of n*(m_1 + m_2 + ... + m_m + 1), the first column is 1 or 0 corresponding to label
#####################################################################################################################

def get_data(filename, class_name, num_training, num_validation):
    data = read_data(filename)
    data = convert_to_binary_features(data, class_name)
    return np.array(split_data(data, num_training, num_validation), dtype=object)

def read_data(filename):
    data = []
    with open(filename) as f:
        reader = csv.reader(f)
        for row in reader:
            data.append(row)
    return data

def convert_to_binary_features(data, class_name):
    features = []
    for feature_index in range(0, len(data[0])-1):
        feature_values = list(set([obs[feature_index] for obs in data]))
        feature_values.sort()
        if len(feature_values) > 2: features.append(feature_values[:-1])
        else: features.append([feature_values[0]])
    new_data = []
    for obs in data:
        new_obs = [1 if obs[-1] == class_name else 0] # label = 1 if label in the dataset is won
        for feature_index in range(0, len(data[0]) - 1):
            current_feature_value = obs[feature_index]
            for possible_feature_value in features[feature_index]:
                new_obs.append(current_feature_value == possible_feature_value)
        new_data.append(new_obs)

    return new_data

def split_data(data, num_training, num_validation):
    random.shuffle(data)
    # casting to a numpy array
    data = np.array(data)
    return data[0:num_training], data[num_training:num_training + num_validation], data[num_training + num_validation:len(data)]## **Model**

### MODEL

In [3]:
import copy
import math
import numpy as np

def node_score_gini(prob):
    '''
    Calculate the node score using the gini index of the subdataset and return it.
    For dataset with 2 classes, C(p) = 2 * p * (1-p)
    '''
    gini_index = 2 * prob * (1 - prob)
    return gini_index

class Node:
    '''
    Helper to construct the tree structure.
    '''
    def __init__(self, left=None, right=None, depth=0, index_split_on=0, threshold=None, isleaf=False, label=1):
        self.left = left
        self.right = right
        self.depth = depth
        self.index_split_on = index_split_on
        self.threshold = threshold  # Add threshold attribute for continuous features
        self.isleaf = isleaf
        self.label = label
        self.info = {}  # used for visualization

    def _set_info(self, gain, num_samples):
        '''
        Helper function to add to info attribute.
        You do not need to modify this.
        '''
        self.info['gain'] = gain
        self.info['num_samples'] = num_samples

class DecisionTree:

    def __init__(self, data, validation_data=None, gain_function=node_score_gini, max_depth=40):
        # Find majority class; set to class 0 if exactly balanced.
        labels, counts = np.unique([row[0] for row in data], return_counts=True)
        self.majority_class = labels[np.argmax(counts)]
        
        labels, counts = np.unique([row[0] for row in data], return_counts=True)
        if np.sum(counts == np.max(counts)) > 1:
            self.majority_class = 0
        else:
            self.majority_class = labels[np.argmax(counts)]

        self.max_depth = max_depth
        self.root = Node(label=self.majority_class)
        self.gain_function = gain_function

        indices = list(range(1, len(data[0])))

        self._split_recurs(self.root, data, indices)

        # Pruning
        if validation_data is not None:
            self._prune_recurs(self.root, validation_data)

    def predict(self, features):
        return self._predict_recurs(self.root, features)

    def accuracy(self, data):
        return 1 - self.loss(data)

    def loss(self, data):
        cnt = 0.0
        test_Y = [row[0] for row in data]
        for i in range(len(data)):
            prediction = self.predict(data[i])
            if prediction != test_Y[i]:
                cnt += 1.0
        return cnt / len(data)

    def _predict_recurs(self, node, row):
        if node.isleaf or node.index_split_on == 0:
            return node.label
        split_index = node.index_split_on

        if node.threshold is not None:
            # For continuous feature, compare with threshold
            if row[split_index] <= node.threshold:
                return self._predict_recurs(node.left, row)
            else:
                return self._predict_recurs(node.right, row)
        else:
            # For binary feature
            if not row[split_index]:
                return self._predict_recurs(node.left, row)
            else:
                return self._predict_recurs(node.right, row)

    def _prune_recurs(self, node, validation_data):
        if not node.isleaf:
            if node.left is not None:
                self._prune_recurs(node.left, validation_data)
            if node.right is not None:
                self._prune_recurs(node.right, validation_data)

            if node.left.isleaf and node.right.isleaf:
                current_loss = self.loss(validation_data)
                original_left, original_right = node.left, node.right
                node.isleaf = True
                node.left, node.right = None, None
                new_loss = self.loss(validation_data)

                if new_loss > current_loss:
                    node.isleaf = False
                    node.left, node.right = original_left, original_right

    def _is_terminal(self, node, data, indices):
        y = [row[0] for row in data]

        if len(data) == 0:
            return True, self.majority_class
        if len(indices) == 0:
            Labels, Counts = np.unique(y, return_counts=True)
            Label = Labels[np.argmax(Counts)]
            return True, Label
        if len(np.unique(y)) == 1:
            return True, y[0]
        if node.depth >= self.max_depth:
            Labels, Counts = np.unique(y, return_counts=True)
            Label = Labels[np.argmax(Counts)]
            return True, Label

        if y.count(0) == y.count(1):
            return False, self.majority_class

        Labels, Counts = np.unique(y, return_counts=True)
        Label = Labels[np.argmax(Counts)]
        return False, Label

    def _split_recurs(self, node, data, indices):
        is_terminal, label = self._is_terminal(node, data, indices)
        if is_terminal:
            node.isleaf = True
            node.label = label
            return

        best_gain = -float('inf')
        best_index = -1
        best_threshold = None

        for index in indices:
            # Find the best threshold for the current feature
            gain, threshold = self._find_best_split(data, index, self.gain_function)
            if gain > best_gain:
                best_gain = gain
                best_index = index
                best_threshold = threshold

        node.index_split_on = best_index
        node.threshold = best_threshold
        node._set_info(best_gain, len(data))

        # Split the data based on the best threshold
        if best_threshold is not None:
            left_data = [row for row in data if row[best_index] <= best_threshold]
            right_data = [row for row in data if row[best_index] > best_threshold]
        else:
            left_data = [row for row in data if row[best_index] == 0]
            right_data = [row for row in data if row[best_index] == 1]

        node.left = Node(depth=node.depth + 1)
        node.right = Node(depth=node.depth + 1)
        remaining_indices = copy.deepcopy(indices)
        self._split_recurs(node.left, left_data, remaining_indices)
        self._split_recurs(node.right, right_data, remaining_indices)

    def _find_best_split(self, data, split_index, gain_function):
        """
        Find the best threshold for a given feature.
        Returns the best gain and corresponding threshold.
        """
        unique_values = sorted(set([row[split_index] for row in data]))
        best_gain = -float('inf')
        best_threshold = None

        for i in range(1, len(unique_values)):
            threshold = (unique_values[i - 1] + unique_values[i]) / 2
            left_data = [row for row in data if row[split_index] <= threshold]
            right_data = [row for row in data if row[split_index] > threshold]

            if len(left_data) > 0 and len(right_data) > 0:
                left_prob = sum([row[0] for row in left_data]) / len(left_data)
                right_prob = sum([row[0] for row in right_data]) / len(right_data)

                gain = gain_function(sum([row[0] for row in data]) / len(data))
                gain -= len(left_data) / len(data) * gain_function(left_prob)
                gain -= len(right_data) / len(data) * gain_function(right_prob)

                if gain > best_gain:
                    best_gain = gain
                    best_threshold = threshold

        return best_gain, best_threshold

    def _calc_gain(self, data, split_index, gain_function):
        y = [row[0] for row in data]
        xi = [row[split_index] for row in data]

        if len(y) != 0 and len(xi) != 0:
            Py1 = sum(y) / len(y)
            Pxi1 = sum(xi) / len(xi)
            Pxi0 = 1 - Pxi1

            if sum(xi) != 0:
                Py1_given_xi1 = sum([1 for i in range(len(y)) if y[i] == 1 and xi[i] == 1]) / sum(xi)
            else:
                Py1_given_xi1 = 0

            if sum([1 for _ in xi if _ == 0]) != 0:
                Py0_given_xi0 = sum([1 for i in range(len(y)) if y[i] == 0 and xi[i] == 0]) / sum([1 for _ in xi if _ == 0])
            else:
                Py0_given_xi0 = 0

            parent_cost = gain_function(Py1)
            left_cost = Pxi1 * gain_function(Py1_given_xi1)
            right_cost = Pxi0 * gain_function(Py0_given_xi0)

            gain = parent_cost - left_cost - right_cost
        else:
            gain = 0
        return gain



In [64]:
import copy
import math
import numpy as np


def node_score_gini(probabilities):
    '''
    Calculate the node score using the gini index of the subdataset and return it.
    For datasets with multiple classes, Gini(p) = 1 - sum(p_i^2)
    '''
    return 1 - sum([p ** 2 for p in probabilities])


class Node:
    '''
    Helper to construct the tree structure.
    '''
    def __init__(self, left=None, right=None, depth=0, index_split_on=0, threshold=None, isleaf=False, label=1):
        self.left = left
        self.right = right
        self.depth = depth
        self.index_split_on = index_split_on
        self.threshold = threshold  # Add threshold attribute for continuous features
        self.isleaf = isleaf
        self.label = label
        self.info = {}  # used for visualization

    def set_info(self, gain, num_samples):
        '''
        Helper function to add to info attribute.
        You do not need to modify this.
        '''
        self.info['gain'] = gain
        self.info['num_samples'] = num_samples


class DecisionTree:

    def __init__(self, data, validation_data=None, gain_function=node_score_gini, max_depth=40):
        labels, counts = np.unique([row[0] for row in data], return_counts=True)
        self.majority_class = labels[np.argmax(counts)] if len(counts) > 0 else 0
        
        self.max_depth = max_depth
        self.root = Node(label=self.majority_class)
        self.gain_function = gain_function

        indices = list(range(1, len(data[0])))
        self._split_recursively(self.root, data, indices)

        # Pruning
        if validation_data is not None:
            self._prune_recursively(self.root, validation_data)

    def predict(self, features):
        return self._predict_recursive(self.root, features)

    def accuracy(self, data):
        return 1 - self.loss(data)

    def loss(self, data):
        cnt = 0.0
        for row in data:
            prediction = self.predict(row)
            if prediction != row[0]:
                cnt += 1.0
        return cnt / len(data)

    def _predict_recursive(self, node, row):
        if node.isleaf or node.index_split_on == 0:
            return node.label

        split_index = node.index_split_on

        if node.threshold is not None:
            # For continuous feature, compare with threshold
            if row[split_index] <= node.threshold:
                return self._predict_recursive(node.left, row)
            else:
                return self._predict_recursive(node.right, row)
        else:
            # For binary feature
            if not row[split_index]:
                return self._predict_recursive(node.left, row)
            else:
                return self._predict_recursive(node.right, row)

    def _prune_recursively(self, node, validation_data):
        if not node.isleaf:
            if node.left:
                self._prune_recursively(node.left, validation_data)
            if node.right:
                self._prune_recursively(node.right, validation_data)

            if node.left.isleaf and node.right.isleaf:
                current_loss = self.loss(validation_data)
                original_left, original_right = node.left, node.right
                node.isleaf = True
                node.left, node.right = None, None
                new_loss = self.loss(validation_data)

                if new_loss > current_loss:
                    node.isleaf = False
                    node.left, node.right = original_left, original_right

    def _is_terminal(self, node, data, indices):
        y = [row[0] for row in data]

        if len(data) == 0:
            return True, self.majority_class
        if len(indices) == 0 or len(np.unique(y)) == 1 or node.depth >= self.max_depth:
            label = max(set(y), key=y.count) if len(y) > 0 else self.majority_class
            return True, label
        return False, max(set(y), key=y.count)

    def _split_recursively(self, node, data, indices):
        is_terminal, label = self._is_terminal(node, data, indices)
        if is_terminal:
            node.isleaf = True
            node.label = label
            return

        best_gain, best_index, best_threshold = -float('inf'), -1, None

        for index in indices:
            gain, threshold = self._find_best_split(data, index)
            if gain > best_gain:
                best_gain, best_index, best_threshold = gain, index, threshold

        node.index_split_on = best_index
        node.threshold = best_threshold
        node.set_info(best_gain, len(data))

        left_data, right_data = self._split_data(data, best_index, best_threshold)

        node.left = Node(depth=node.depth + 1)
        node.right = Node(depth=node.depth + 1)
        self._split_recursively(node.left, left_data, copy.deepcopy(indices))
        self._split_recursively(node.right, right_data, copy.deepcopy(indices))

    def _split_data(self, data, split_index, threshold):
        if threshold is not None:
            left_data = [row for row in data if row[split_index] <= threshold]
            right_data = [row for row in data if row[split_index] > threshold]
        else:
            left_data = [row for row in data if row[split_index] == 0]
            right_data = [row for row in data if row[split_index] == 1]
        return left_data, right_data

    def _find_best_split(self, data, split_index):
        unique_values = sorted(set([row[split_index] for row in data]))
        best_gain, best_threshold = -float('inf'), None

        # Use a more efficient approach to find the best threshold
        if len(unique_values) > 10:
            thresholds = np.linspace(unique_values[0], unique_values[-1], num=10)
        else:
            thresholds = [(unique_values[i - 1] + unique_values[i]) / 2 for i in range(1, len(unique_values))]

        for threshold in thresholds:
            left_data, right_data = self._split_data(data, split_index, threshold)

            if len(left_data) > 0 and len(right_data) > 0:
                gain = self._calculate_gain(data, left_data, right_data)
                if gain > best_gain:
                    best_gain, best_threshold = gain, threshold

        return best_gain, best_threshold

    def _calculate_gain(self, parent_data, left_data, right_data):
        # Calculate the probability distribution for parent, left, and right nodes
        parent_labels, parent_counts = np.unique([row[0] for row in parent_data], return_counts=True)
        parent_probabilities = parent_counts / len(parent_data)

        left_labels, left_counts = np.unique([row[0] for row in left_data], return_counts=True)
        left_probabilities = left_counts / len(left_data)

        right_labels, right_counts = np.unique([row[0] for row in right_data], return_counts=True)
        right_probabilities = right_counts / len(right_data)

        # Calculate Gini gain
        gain = self.gain_function(parent_probabilities)
        gain -= (len(left_data) / len(parent_data)) * self.gain_function(left_probabilities)
        gain -= (len(right_data) / len(parent_data)) * self.gain_function(right_probabilities)

        return gain


### Heart

In [67]:
def explore_dataset(filename, class_name, num_training, num_validation):
    train_data, validation_data, test_data = get_data(filename, class_name, num_training, num_validation)

    # TODO: Print 12 loss values associated with the dataset.
    # For each measure of gain (training error, entropy, gini):
    #      (a) Print average training loss (not-pruned)
    #      (b) Print average test loss (not-pruned)
    #      (c) Print average training loss (pruned)
    #      (d) Print average test loss (pruned)

    gain_functions = {"Gini Index": node_score_gini}

    results = []
    print(f'{filename}')
    for gain_name, gain_function in gain_functions.items():
        tree_not_pruned = DecisionTree(data=train_data, gain_function=gain_function, max_depth=10)
        train_loss_not_pruned = tree_not_pruned.loss(train_data)
        test_loss_not_pruned = tree_not_pruned.loss(test_data)
        tree_pruned = DecisionTree(data=train_data, validation_data=validation_data, gain_function=gain_function, max_depth=10)
        train_loss_pruned = tree_pruned.loss(train_data)
        test_loss_pruned = tree_pruned.loss(test_data)
        print(f'{gain_name}', '\n' , 'train_loss_not_pruned is', f'{train_loss_not_pruned}', '\n' , 'test_loss_not_pruned is', f'{test_loss_not_pruned}', '\n' , 'train_loss_pruned is', f'{train_loss_pruned}', '\n' , 'test_loss_pruned is', f'{test_loss_pruned}', '\n')
        results.append((gain_name, train_loss_not_pruned, test_loss_not_pruned, train_loss_pruned, test_loss_pruned))

explore_dataset('archive/heart.csv', '1', num_training=120, num_validation=101)

archive/heart.csv
Gini Index 
 train_loss_not_pruned is 0.0 
 test_loss_not_pruned is 0.1927710843373494 
 train_loss_pruned is 0.2 
 test_loss_pruned is 0.21686746987951808 



### Iris

In [68]:
import csv

def data_transform(file_name, new_file_name):
    with open(file_name, mode='r', encoding='utf-8') as file:
        reader = csv.reader(file)
        headers = next(reader)
        data = []
        labels = []
        
        for row in reader:
            labels.append(row[-1])
            data.append(row[1:-1])
            
    unique_labels = list(set(labels))
    label_to_number = {label: idx for idx, label in enumerate(unique_labels)}
    encoded_labels = [label_to_number[label] for label in labels]
    for i in range(len(data)):
        data[i].append(encoded_labels[i])
        
    with open(new_file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(headers[1:])
        print(headers[1:])
        writer.writerows(data)
    

data_transform('DATA2060_Final_Project/data/iris.csv', 'DATA2060_Final_Project/data/transformed_iris.csv')
explore_dataset('DATA2060_Final_Project/data/transformed_iris.csv', 'Species', num_training=70, num_validation=30)

['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']
DATA2060_Final_Project/data/transformed_iris.csv
Gini Index 
 train_loss_not_pruned is 0.014285714285714285 
 test_loss_not_pruned is 0.0 
 train_loss_pruned is 0.014285714285714285 
 test_loss_pruned is 0.0 

