<a href="https://colab.research.google.com/github/inderpreetsingh01/ml_machine_coding/blob/main/Decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

class Node:
    """One node of a binary decision tree.

    An internal node carries a split rule (``feature``, ``threshold``) and
    two children; a leaf carries only ``value``. ``value`` is keyword-only
    so a leaf is always built explicitly as ``Node(value=...)``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Split rule: index of the feature tested and the cut-off it is compared to.
        self.feature, self.threshold = feature, threshold
        # Child subtrees (both stay None on a leaf built with only `value`).
        self.left, self.right = left, right
        # Class label stored on a leaf; None marks an internal node.
        self.value = value

    def is_leaf(self):
        """Return True when this node stores a class label."""
        return not (self.value is None)

class DecisionTreeClassifier:
    """Binary decision tree classifier that splits on Gini impurity.

    Every internal node tests ``x[feature] < threshold``: samples that pass
    go to the left child, all others go to the right child. Class labels may
    be any values ``np.unique`` can sort (non-negative or negative ints,
    floats, strings); leaves store the original label.

    Args:
        max_depth: Depth at which a node is turned into a leaf.
        min_samples_split: A node with fewer samples than this becomes a leaf.
    """

    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        """Grow the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_features).
            y: 1-D array-like of n_samples class labels.

        Raises:
            ValueError: If ``X`` is not 2-D, ``y`` is not 1-D, their lengths
                differ, or there are no samples.
        """
        # Accept lists / other array-likes, not only ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        if y.ndim != 1:
            raise ValueError(f"y must be 1-dimensional, got {y.ndim} dimension(s)")
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                f"X and y have inconsistent lengths: {X.shape[0]} != {y.shape[0]}"
            )
        if y.shape[0] == 0:
            raise ValueError("cannot fit a tree on zero samples")

        self.classes_ = np.unique(y)
        self.n_classes_ = len(self.classes_)
        self.root = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``X``/``y``.

        Returns a leaf holding the majority label when a stop condition is
        met or no valid split exists; otherwise an internal node.
        """
        n_samples, n_features = X.shape
        num_labels = len(np.unique(y))

        # Stop conditions: depth limit, pure node, or too few samples.
        if (depth >= self.max_depth or num_labels == 1 or n_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return Node(value=leaf_value)

        best_feature, best_thresh = self._best_split(X, y, n_features)
        if best_feature is None:
            # Every feature is constant on this node, so nothing can be split.
            return Node(value=self._most_common_label(y))

        # Split: strictly-below-threshold samples go left.
        left_idx = X[:, best_feature] < best_thresh
        right_idx = ~left_idx
        left = self._build_tree(X[left_idx], y[left_idx], depth + 1)
        right = self._build_tree(X[right_idx], y[right_idx], depth + 1)
        return Node(feature=best_feature, threshold=best_thresh, left=left, right=right)

    def _best_split(self, X, y, n_features):
        """Search all features and observed values for the best split.

        Returns:
            ``(feature_index, threshold)`` of the split with the highest
            impurity decrease (first one found wins ties), or
            ``(None, None)`` when no split leaves both sides non-empty.
        """
        best_gain = -1
        split_idx, split_thresh = None, None
        # The parent's impurity is the same for every candidate; compute it once.
        parent_impurity = self._gini(y)

        for feature in range(n_features):
            column = X[:, feature]
            # np.unique returns sorted values; the smallest one can never be a
            # threshold because no sample is strictly below it.
            for thresh in np.unique(column)[1:]:
                left_idx = column < thresh
                y_left = y[left_idx]
                y_right = y[~left_idx]
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gain = self._information_gain(y, y_left, y_right, parent_impurity)
                if gain > best_gain:
                    best_gain = gain
                    split_idx = feature
                    split_thresh = thresh
        return split_idx, split_thresh

    def _gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``."""
        # np.unique (unlike np.bincount) also handles negative, float and
        # string labels, and does not allocate one slot per integer value.
        _, counts = np.unique(y, return_counts=True)
        probs = counts / len(y)
        return 1 - np.sum(probs ** 2)

    def _information_gain(self, parent, left, right, parent_impurity=None):
        """Return the Gini impurity decrease of splitting ``parent``.

        Args:
            parent: Labels at the node being split.
            left: Labels sent to the left child.
            right: Labels sent to the right child.
            parent_impurity: Optional precomputed ``_gini(parent)``; computed
                here when omitted.
        """
        if parent_impurity is None:
            parent_impurity = self._gini(parent)
        weight_l = len(left) / len(parent)
        weight_r = len(right) / len(parent)
        return parent_impurity - (weight_l * self._gini(left) + weight_r * self._gini(right))

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        labels, counts = np.unique(y, return_counts=True)
        # labels are sorted and argmax returns the first maximum, so ties go
        # to the smallest label.
        return labels[np.argmax(counts)]

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``.

        ``fit`` must have been called first so that ``self.root`` is a tree.
        """
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Follow the split rules from ``node`` down to a leaf for sample ``x``."""
        if node.is_leaf():
            return node.value
        if x[node.feature] < node.threshold:
            return self._traverse_tree(x, node.left)
        else:
            return self._traverse_tree(x, node.right)

    def score(self, X, y):
        """Return the fraction of rows in ``X`` whose prediction equals ``y``."""
        return np.mean(self.predict(X) == np.asarray(y))

In [2]:
# Simulated 3-class data: 50 standard-normal 2-D points per class, with
# class 1 shifted by +2 and class 2 shifted by -2 along both axes.
np.random.seed(42)  # fixed seed so the run is reproducible
y = np.repeat([0, 1, 2], 50)
X = np.random.randn(150, 2)
X[y == 2] -= 2
X[y == 1] += 2

# Fit a depth-limited tree on the full data set.
tree = DecisionTreeClassifier(max_depth=5)
tree.fit(X, y)

# Evaluate on the same data the tree was trained on (training accuracy).
acc = tree.score(X, y)
preds = tree.predict(X)
print("Accuracy:", acc)

Accuracy: 0.96
