# Implementation of decision tree in Numpy

### Import numpy

In [None]:
import numpy as np

### Load data

In [None]:
# Training split: feature matrix from the PCA feature file, plus its labels.
train_features = np.load(file='data/features/train_features_pca.npy')
train_labels = np.load(file='data/labels/train_labels.npy')

# Test split, loaded the same way.
# NOTE(review): the doubled separator in the labels path looks like a typo;
# it is left unchanged here -- confirm before normalising it.
test_features = np.load(file='data/features/test_features_pca.npy')
test_labels = np.load(file='data//labels/test_labels.npy')

### Define Decision Tree class

In [None]:
class DecisionTreeClassifier:
    """Decision tree classifier implemented with NumPy.

    Every internal node tests ``x[feature] <= threshold``; samples passing the
    test go to the left child, the others to the right child. Splits are
    chosen to minimise the weighted Gini impurity of the two children.

    The fitted tree is stored in ``self.tree`` as nested dicts with the keys
    ``'feature'``, ``'threshold'``, ``'left'`` and ``'right'``. A leaf is not a
    dict: it is the predicted class label itself.

    Labels must be non-negative integers because leaves are computed with
    ``np.bincount``.
    """

    def __init__(self, max_depth=50):  # 50 by default as suggested by the assignment
        self.max_depth = max_depth
        self.tree = None

    def gini(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probability = counts / len(y)
        return 1 - np.sum(probability ** 2)

    def split(self, X, y, feature, threshold):
        """Partition the labels ``y`` by the test ``X[:, feature] <= threshold``.

        Returns ``(left, right)``: the labels of the samples that pass the test
        and of those that are strictly above the threshold.

        The ``<=`` / ``>`` convention is deliberately the same one used by
        ``build_tree`` and ``predict``, so the impurity scored in
        ``best_split`` describes the partition that is actually built.
        """
        column = X[:, feature]
        return y[column <= threshold], y[column > threshold]

    def best_split(self, X, y):
        """Find the ``(feature, threshold)`` pair with the lowest weighted Gini.

        Every distinct value of every feature column is tried as a threshold.
        Candidates that leave one side empty are ignored. Returns
        ``(None, None)`` when no candidate separates the samples.
        """
        best_gini = float('inf')
        best_feature = None
        best_threshold = None
        n_samples = len(y)

        for feature in range(X.shape[1]):
            for threshold in np.unique(X[:, feature]):
                left, right = self.split(X, y, feature, threshold)
                # a split that sends everything to one side is useless
                if len(left) == 0 or len(right) == 0:
                    continue

                # impurity of the children, weighted by their share of samples
                gini = ((len(left) / n_samples) * self.gini(left)
                        + (len(right) / n_samples) * self.gini(right))
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold
        return best_feature, best_threshold

    def _majority_class(self, y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        return np.argmax(np.bincount(y))

    def build_tree(self, X, y, depth=0):
        """Recursively grow the tree for the samples ``X`` with labels ``y``.

        Returns a leaf (the majority label) when ``depth`` has reached
        ``max_depth``, when the node is pure, or when no valid split exists.
        Otherwise returns a dict node whose children are built from the two
        sides of the best split.
        """
        if depth >= self.max_depth or len(np.unique(y)) == 1:
            return self._majority_class(y)

        feature, threshold = self.best_split(X, y)
        if feature is None:
            return self._majority_class(y)

        # Same masks that best_split scored, so both sides are non-empty.
        left = X[:, feature] <= threshold
        right = X[:, feature] > threshold

        return {
            'feature': feature,
            'threshold': threshold,
            'left': self.build_tree(X[left], y[left], depth + 1),
            'right': self.build_tree(X[right], y[right], depth + 1)
        }

    def fit(self, X, y):
        """Build the tree from a 2-D feature array ``X`` and integer labels ``y``.

        Raises:
            ValueError: if ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError("cannot fit a decision tree on an empty training set")
        self.tree = self.build_tree(X, y)

    def predict(self, X, tree):
        """Predict the label of one sample ``X`` by walking down ``tree``.

        ``X`` is a single feature vector and ``tree`` is the (sub)tree to
        descend, normally ``self.tree``. Anything that is not a dict is treated
        as a leaf and returned unchanged.
        """
        if not isinstance(tree, dict):
            return tree

        if X[tree['feature']] <= tree['threshold']:
            return self.predict(X, tree['left'])
        return self.predict(X, tree['right'])

    def predict_all(self, X):
        """Return a list with the predicted label of every row of ``X``."""
        return [self.predict(x, self.tree) for x in X]


### Train Decision Tree model

In [None]:
# Fit a single tree with the default depth limit of 50 on the training split.
decision_tree = DecisionTreeClassifier(max_depth=50)
decision_tree.fit(train_features, train_labels)

# The classifier is a plain Python object, so np.save stores it pickled;
# reading it back needs np.load(..., allow_pickle=True).
np.save(file='data/models/dt/dt_numpy.npy', arr=decision_tree)

### Test Decision Tree model

In [None]:
# Predict a label for every test sample with the fitted tree and store the result.
test_predictions = decision_tree.predict_all(test_features)
np.save(file='data/predictions/dt/dt_numpy.npy', arr=test_predictions)

### Train and test Decision Tree models with varying `max_depth` (10, 25, 50, 100)

In [None]:
# Depth limits to compare; one tree is trained and evaluated per entry.
depths = [10, 25, 50, 100]
models, results = [], []

# NOTE: the loop rebinds the notebook-level `decision_tree` and
# `test_predictions`; after it finishes they refer to the last depth.
for depth in depths:
    decision_tree = DecisionTreeClassifier(max_depth=depth)
    decision_tree.fit(train_features, train_labels)
    models.append(decision_tree)

    test_predictions = decision_tree.predict_all(test_features)
    results.append(test_predictions)

# Both lists are in the same order as `depths`.
np.save(file='data/models/dt/dt_numpy_experiment.npy', arr=models)
np.save(file='data/predictions/dt/dt_numpy_experiment.npy', arr=results)