In [2]:
import numpy as np
from collections import Counter
import random


In [4]:
def gini(y):
    """Return the Gini impurity of a collection of labels.

    The impurity is ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of
    entries in ``y`` that carry label ``k``.

    Args:
        y: Sized iterable of hashable class labels.

    Returns:
        float: ``0.0`` when every entry shares one label, larger values for
        more mixed label sets. An empty ``y`` yields ``1.0`` because there
        are no class shares to subtract.
    """
    total = len(y)
    impurity = 1.0
    # Subtract each squared class share one at a time, in first-seen label order.
    for count in Counter(y).values():
        share = count / total
        impurity -= share ** 2
    return impurity

def split_dataset(X, y, feature_idx, threshold):
    """Partition row positions of ``X`` around a threshold on one feature.

    Args:
        X: Indexable sequence of rows; each row is indexable by ``feature_idx``.
        y: Labels. Accepted as part of the signature but not read here.
        feature_idx: Position of the feature to compare within each row.
        threshold: Value the selected feature is compared against.

    Returns:
        tuple[list[int], list[int]]: ``(left_idxs, right_idxs)`` where left
        holds positions whose feature value is ``<= threshold`` and right
        holds positions whose value is ``> threshold``. A value that
        satisfies neither comparison (e.g. NaN) lands in neither list.
    """
    left_idxs, right_idxs = [], []
    for row_no in range(len(X)):
        cell = X[row_no][feature_idx]
        if cell <= threshold:
            left_idxs.append(row_no)
        elif cell > threshold:
            right_idxs.append(row_no)
    return left_idxs, right_idxs


In [6]:
class TreeNode:
    """One node of a binary decision tree.

    A node built with a ``value`` acts as a leaf; a node built with
    ``feature``/``threshold``/``left``/``right`` acts as an internal
    decision node.

    Attributes:
        feature: Index of the feature tested at this node (``None`` by default).
        threshold: Value the tested feature is compared against.
        left: Child ``TreeNode`` stored as the left branch.
        right: Child ``TreeNode`` stored as the right branch.
        value: Prediction stored at a leaf; ``None`` when not supplied.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Decision rule carried by this node.
        self.feature, self.threshold = feature, threshold
        # Child subtrees.
        self.left, self.right = left, right
        # Keyword-only so positional arguments can never be mistaken for a leaf value.
        self.value = value

    def is_leaf_node(self):
        """Return ``True`` when this node stores a prediction (``value`` is not ``None``)."""
        if self.value is None:
            return False
        return True


In [8]:
class DecisionTree:
    """Classification tree that greedily splits on Gini impurity gain.

    Relies on the notebook-level helpers ``gini``, ``split_dataset`` and
    ``TreeNode``. Feature sub-sampling draws from the global ``random``
    module, so seed it (``random.seed(...)``) for reproducible trees.

    Args:
        max_depth: Maximum depth at which a node may still be split.
        min_samples_split: Nodes with fewer samples than this become leaves.
        n_features: How many features to sample (without replacement) as
            split candidates at each node. ``None`` (or ``0``) means "use
            every feature". Values larger than the number of columns are
            clamped to the number of columns.

    Attributes:
        root: Root ``TreeNode`` after ``fit``; ``None`` before.
    """

    def __init__(self, max_depth=10, min_samples_split=2, n_features=None):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        # Requested subset size; left untouched by fit() so the same instance
        # can be refit on data with a different number of columns.
        self.n_features = n_features
        self.root = None

    def fit(self, X, y):
        """Build the tree from training rows ``X`` and labels ``y``.

        Args:
            X: Sequence of equally long, indexable feature rows.
            y: Sequence of hashable labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        if len(X) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length (got {len(X)} and {len(y)})"
            )
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples ``(X, y)``."""
        num_samples = len(y)
        num_labels = len(set(y))

        # Stop when the node is too deep, too small, or already pure.
        if depth >= self.max_depth or num_samples < self.min_samples_split or num_labels == 1:
            return TreeNode(value=self._most_common_label(y))

        # Resolve the subset size per call instead of caching it on the
        # instance, and clamp it: random.sample raises ValueError when asked
        # for more items than the population holds.
        total_features = len(X[0])
        subset_size = min(self.n_features or total_features, total_features)
        feat_idxs = random.sample(range(total_features), subset_size)

        # Find best split
        best_feat, best_thresh = self._best_split(X, y, feat_idxs)
        if best_feat is None:
            # No candidate produced two non-empty sides.
            return TreeNode(value=self._most_common_label(y))

        left_idxs, right_idxs = split_dataset(X, y, best_feat, best_thresh)
        left = self._grow_tree([X[i] for i in left_idxs], [y[i] for i in left_idxs], depth + 1)
        right = self._grow_tree([X[i] for i in right_idxs], [y[i] for i in right_idxs], depth + 1)
        return TreeNode(best_feat, best_thresh, left, right)

    def _best_split(self, X, y, feat_idxs):
        """Return ``(feature_idx, threshold)`` of the highest-gain split.

        Every distinct value of every candidate feature is tried as a
        threshold. Returns ``(None, None)`` when no threshold separates the
        samples into two non-empty groups.
        """
        # Starting below any attainable gain means a zero-gain split is still
        # accepted as long as both sides are non-empty.
        best_gain = -1
        split_idx, split_thresh = None, None
        current_impurity = gini(y)
        n_total = len(y)

        for feat in feat_idxs:
            thresholds = {row[feat] for row in X}
            for threshold in thresholds:
                left_idxs, right_idxs = split_dataset(X, y, feat, threshold)
                if not left_idxs or not right_idxs:
                    continue

                left_y = [y[i] for i in left_idxs]
                right_y = [y[i] for i in right_idxs]
                # Weight each child's impurity by its share of the samples.
                p = len(left_y) / n_total
                gain = current_impurity - (p * gini(left_y) + (1 - p) * gini(right_y))

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat
                    split_thresh = threshold

        return split_idx, split_thresh

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (first-seen wins ties)."""
        return Counter(y).most_common(1)[0][0]

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``.

        ``fit`` must have been called first; otherwise ``root`` is ``None``
        and traversal fails with ``AttributeError``.
        """
        return [self._traverse_tree(x, self.root) for x in X]

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        # Iterative descent: values <= threshold go left, everything else right.
        while not node.is_leaf_node():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value


In [20]:
class rf:
    """Random-forest classifier: bootstrap-trained ``DecisionTree``s with majority vote.

    Bootstrap sampling uses the global ``random`` module, so seed it
    (``random.seed(...)``) for reproducible forests.

    Args:
        n_trees: Number of trees to train.
        max_depth: Forwarded to every ``DecisionTree``.
        min_samples_split: Forwarded to every ``DecisionTree``.
        n_features: Forwarded to every ``DecisionTree`` (size of the random
            feature subset considered at each split).

    Attributes:
        trees: List of fitted ``DecisionTree`` instances; empty before ``fit``.
    """

    def __init__(self, n_trees=10, max_depth=10, min_samples_split=2, n_features=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_features = n_features
        self.trees = []

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap resample of ``(X, y)``.

        Any previously trained trees are discarded.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` differ in length.
        """
        n_samples = len(X)
        if n_samples == 0:
            raise ValueError("Cannot fit a random forest on an empty dataset")
        if n_samples != len(y):
            raise ValueError(
                f"X and y must have the same length (got {n_samples} and {len(y)})"
            )

        self.trees = []
        for _ in range(self.n_trees):
            # Bootstrap: draw n_samples row positions with replacement.
            idxs = [random.randint(0, n_samples - 1) for _ in range(n_samples)]
            X_sample = [X[j] for j in idxs]
            y_sample = [y[j] for j in idxs]

            tree = DecisionTree(max_depth=self.max_depth,
                                min_samples_split=self.min_samples_split,
                                n_features=self.n_features)

            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, x):
        """Return the majority-vote label for every row of ``x``.

        Votes are tallied on the labels exactly as the trees return them, so
        label types are preserved (no NumPy dtype coercion). Ties go to the
        label voted first in tree order.

        Raises:
            RuntimeError: If the forest has no trees (``fit`` not called, or
                ``n_trees`` was 0).
        """
        if not self.trees:
            raise RuntimeError("rf.predict() called before fit(): the forest has no trees")

        per_tree = [tree.predict(x) for tree in self.trees]
        # zip(*per_tree) regroups the tree-major predictions by sample.
        return [Counter(votes).most_common(1)[0][0] for votes in zip(*per_tree)]



In [22]:
if __name__ == "__main__":
    # The forest draws bootstrap rows and per-node feature subsets from the
    # global `random` module; seeding it makes a fresh "Restart & Run All"
    # print the same prediction every time.
    random.seed(42)

    # Toy training set. Going by the test-row comment below, each row is
    # [temperature in °C, age].
    X = [
        [30, 25],
        [28, 30],
        [22, 40],
        [21, 35],
        [27, 22],
        [20, 29],
    ]
    y = ["Yes", "Yes", "No", "No", "Yes", "No"]

    # Three shallow trees, each looking at one randomly chosen feature per split.
    clf = rf(n_trees=3, max_depth=3, n_features=1)
    clf.fit(X, y)

    X_test = [[26, 28]]  # New person: 26°C, age 28
    prediction = clf.predict(X_test)
    print(f"Prediction: {prediction[0]}")


Prediction: Yes
