# Zadaća 2: Stabla odlučivanja

## Zadatak 1: Modifikacija stabla odlučivanja

Modifikovati postojeću implementaciju algoritma tako da se dodaju sljedeće stavke:
- redukovani kandidatski splitovi
- random subspace sampling
- opcioni cost-complexity pruning

Potrebno je omogućiti da upotreba dodatnih stavki bude opciona, kroz prosljeđivanje odgovarajućih vrijednosti u konstruktor. Testirati brzinu i F1 metriku rada klasifikatora na proizvoljnom skupu podataka, sa različitim kombinacijama novih opcija.

In [2]:
import random
from collections import Counter

import numpy as np

class DecisionTreeClassifier:
    """CART-style decision tree classifier that splits on Gini impurity.

    Internal nodes are dicts with the keys ``'index'`` (feature column),
    ``'value'`` (threshold), ``'left'`` and ``'right'``; a leaf is stored as
    the bare class label. Rows with ``row[index] < value`` go left, all
    other rows go right.

    Three optional extensions are controlled through the constructor:
    random subspace sampling (``max_features``), reduced candidate splits
    (``n_candidate_splits``) and cost-complexity pruning
    (``cost_complexity_pruning``).
    """

    # Slack added to the pruning comparison so that float rounding cannot
    # keep a split whose real impurity gain is zero.
    _PRUNE_TOLERANCE = 1e-12

    def __init__(
        self,
        max_depth=5,
        min_size=10,
        max_features=None,
        n_candidate_splits=None,
        cost_complexity_pruning=False
    ):
        """Store the hyper-parameters; no training happens here.

        Args:
            max_depth: Depth at which both children of a split are forced
                to be leaves (the root split is at depth 1).
            min_size: A child group with at most this many rows becomes a
                leaf instead of being split further.
            max_features: If set and smaller than the number of features,
                each split only considers that many randomly drawn features.
            n_candidate_splits: If set and smaller than the number of
                distinct values of a feature, only that many evenly spaced
                thresholds are evaluated for the feature.
            cost_complexity_pruning: Falsy disables pruning. ``True`` prunes
                with ``alpha = 0`` (only splits without impurity gain are
                collapsed). Any other truthy number is used as ``alpha``.
        """
        self.max_depth = max_depth
        self.min_size = min_size
        self.max_features = max_features
        self.n_candidate_splits = n_candidate_splits
        self.cost_complexity_pruning = cost_complexity_pruning
        self.root = None

    def fit(self, X, y):
        """Grow the tree on rows ``X`` with labels ``y`` and return ``self``."""
        # Each training row carries its label as the last element.
        dataset = [list(row) + [label] for row, label in zip(X, y)]
        self.root = self._build_tree(dataset, depth=1)
        if self.cost_complexity_pruning:
            self.root = self._prune_cost_complexity(self.root, dataset)
        return self

    def predict(self, X):
        """Return the list of predicted labels, one per row of ``X``."""
        return [self._predict_row(self.root, list(row)) for row in X]

    def _build_tree(self, train, depth):
        """Return the subtree (node dict or leaf label) grown from ``train``."""
        node = self._get_best_split(train)
        if node['index'] is None:
            # No candidate lowers the impurity: stop with a leaf.
            return self._to_terminal(train)
        self._split(node, depth)
        return node

    def _get_best_split(self, dataset):
        """Find the split with the lowest weighted Gini impurity.

        Returns:
            A dict with ``'index'``, ``'value'`` and ``'groups'``.
            ``'index'`` is ``None`` when no candidate strictly improves on
            the impurity of the unsplit ``dataset``.
        """
        class_values = list(set(row[-1] for row in dataset))
        features = self._select_features(len(dataset[0]) - 1)
        best = {'index': None, 'value': None, 'groups': None}
        # Baseline: a split has to beat the impurity of not splitting.
        best_score = self._gini_index([dataset], class_values)
        for index in features:
            for value in self._generate_split_candidates(dataset, index):
                groups = self._test_split(index, value, dataset)
                if not groups[0] or not groups[1]:
                    # Nothing is separated; not a usable split.
                    continue
                gini = self._gini_index(groups, class_values)
                if gini < best_score:
                    best_score = gini
                    best = {'index': index, 'value': value, 'groups': groups}
        return best

    def _select_features(self, n_features):
        """Return the feature indices to examine for one split."""
        features = list(range(n_features))
        if self.max_features is not None and self.max_features < n_features:
            features = random.sample(features, self.max_features)
        return features

    def _generate_split_candidates(self, dataset, index):
        """Return the thresholds to try for feature ``index``.

        The smallest observed value is never returned: no row is strictly
        below it, so it cannot separate anything.
        """
        unique_vals = sorted({row[index] for row in dataset})
        n_splits = self.n_candidate_splits
        if n_splits and n_splits < len(unique_vals):
            # n_splits evenly spaced thresholds strictly between min and max.
            grid = np.linspace(unique_vals[0], unique_vals[-1], n_splits + 2)
            return grid[1:-1].tolist()
        return unique_vals[1:]

    def _split(self, node, depth):
        """Attach ``'left'``/``'right'`` children to ``node`` in place."""
        left, right = node['groups']
        del node['groups']
        if not left or not right:
            # Defensive: a one-sided split gets the same leaf on both sides.
            terminal = self._to_terminal(left + right)
            node['left'] = node['right'] = terminal
            return

        if depth >= self.max_depth:
            node['left'] = self._to_terminal(left)
            node['right'] = self._to_terminal(right)
            return

        if len(left) <= self.min_size:
            node['left'] = self._to_terminal(left)
        else:
            node['left'] = self._build_tree(left, depth + 1)
        if len(right) <= self.min_size:
            node['right'] = self._to_terminal(right)
        else:
            node['right'] = self._build_tree(right, depth + 1)

    def _test_split(self, index, value, dataset):
        """Partition ``dataset`` into ``(left, right)`` by ``row[index] < value``."""
        left, right = [], []
        for row in dataset:
            if row[index] < value:
                left.append(row)
            else:
                right.append(row)
        return left, right

    def _gini_index(self, groups, class_values):
        """Return the size-weighted Gini impurity of ``groups``.

        Empty groups contribute nothing; the result is 0.0 when every group
        is empty.
        """
        n_instances = float(sum(len(g) for g in groups))
        gini = 0.0
        for group in groups:
            size = float(len(group))
            if size == 0:
                continue
            # Count the labels once instead of rescanning the group per class.
            counts = Counter(row[-1] for row in group)
            score = 0.0
            for class_val in class_values:
                p = counts[class_val] / size
                score += p * p
            gini += (1.0 - score) * (size / n_instances)
        return gini

    def _to_terminal(self, group):
        """Return the most frequent label in ``group``.

        Ties go to the label that appears first in ``group``, so the result
        does not depend on hash ordering. Raises ``ValueError`` for an empty
        group.
        """
        counts = Counter(row[-1] for row in group)
        return max(counts, key=counts.get)

    def _predict_row(self, node, row):
        """Walk from ``node`` down to a leaf and return its label."""
        while isinstance(node, dict):
            if row[node['index']] < node['value']:
                node = node['left']
            else:
                node = node['right']
        return node

    def _prune_cost_complexity(self, node, dataset):
        """Prune the subtree at ``node`` using the rows in ``dataset``.

        A subtree is replaced by a leaf when the impurity it removes does
        not exceed ``alpha`` per additional leaf. Impurities are weighted by
        the share of ``dataset`` that reaches each node.

        Returns:
            The pruned subtree (node dict or leaf label).
        """
        alpha = self._pruning_alpha()
        pruned, _, _ = self._prune_subtree(node, dataset, alpha, len(dataset))
        return pruned

    def _pruning_alpha(self):
        """Translate ``cost_complexity_pruning`` into a numeric alpha."""
        setting = self.cost_complexity_pruning
        if setting is None or isinstance(setting, bool):
            # A plain on/off flag only removes splits without any gain.
            return 0.0
        return float(setting)

    def _prune_subtree(self, node, dataset, alpha, n_total):
        """Prune bottom-up; return ``(subtree, risk, n_leaves)``.

        ``risk`` is the summed weighted Gini impurity of the returned
        subtree's leaves, measured on the rows of ``dataset`` reaching them.
        """
        class_vals = list(set(r[-1] for r in dataset))
        weight = len(dataset) / n_total if n_total else 0.0
        leaf_risk = self._gini_index([dataset], class_vals) * weight
        if not isinstance(node, dict):
            return node, leaf_risk, 1

        left_data, right_data = self._test_split(
            node['index'], node['value'], dataset
        )
        node['left'], left_risk, left_leaves = self._prune_subtree(
            node['left'], left_data, alpha, n_total
        )
        node['right'], right_risk, right_leaves = self._prune_subtree(
            node['right'], right_data, alpha, n_total
        )
        subtree_risk = left_risk + right_risk
        n_leaves = left_leaves + right_leaves

        gain = leaf_risk - subtree_risk
        allowed = alpha * (n_leaves - 1) + self._PRUNE_TOLERANCE
        # Without rows there is no majority label to collapse to.
        if dataset and gain <= allowed:
            return self._to_terminal(dataset), leaf_risk, 1
        return node, subtree_risk, n_leaves
