The code below implements a CART tree for classification using gini impurity to decide on splitting.
* The tree only supports numerical data (float, int etc.)
* The tree only supports categorical data if it is one hot encoded. This way a categorical feature can be used to split based on just two categories (0 or 1), which makes it a lot easier
* The stopping criterion for splitting:
    * If either a specified max depth has been reached or the number of samples at the node is less than or equal to a specified minimum required to split
    * If all the labels(predictions) at the node are of the same class
    * If the gini cost is not calculated because a split is not possible (every split point results in a situation where one side of the split has no samples)
* Logic
    1. At the root create a node whose gini is calculated utilizing the whole training set. Assign the entire dataset (X and y values) to the root node. Set the root node as the current node
    2. Loop through all features for the dataset portion for this node and try each one as a possible candidate for splitting
    3. For each such feature if the feature is categorical (one hot encoded so values are 0 or 1) try splitting between all 0's and 1's and calculate cost
    4. If the feature is numeric, then try each value in the column as a possible splitting point and calculate cost
    5. From steps 3 and 4 pick the split with minimum cost. Create the left and the right children nodes for the current node based on this split
    6. Then recursively repeat from step 2 for the left and right node
    7. Stop when stopping criteria is reached
    




In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score,recall_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import os
from collections import Counter



Below is a class that represents a node. The node class at minimum needs the following properties:
1. The portions of the training dataset (X and y) associated with it. For the root node this will be the entire X and y.
2. The parent node (None if node is root)
3. The node depth
4. The right child node
5. The left child node
6. The dominant class (the class in y that has the highest count). This will be used to predict the class of a test sample if this node becomes a leaf node and is used for prediction
7. The feature index (column index from X) on which this node is being split
8. Property indicating whether the feature used to split is categorical or not
9. The feature value at which the split occurs (only applicable for non-categorical columns. For categorical (one hot encoded) columns the split always occurs such that all 0's go left and the 1's go right)
10. The gini score (this is not required - but good to keep around for each node for future visualization etc.)

In [None]:
class Node:
    """One node of the tree together with the slice of training data that reached it.

    Attributes set at construction:
        X, y          - the feature rows and labels assigned to this node
        predict_class - the most frequent label in y (first one wins on a tie)
        gini          - gini impurity of y
        parent        - the node this one was split from (None for the root)
        depth         - 1 for the root, parent's depth + 1 otherwise

    Attributes left as None until the node is split:
        left, right   - the child nodes
        feature_idx   - column of X used for the split
        categorical   - whether that column is treated as one hot encoded
        split_at      - threshold used for a numeric split
    """

    def __init__(self, X: np.ndarray, y: np.ndarray, parent = None):
        self.X = X
        self.y = y

        # distinct labels present at this node and how often each occurs
        labels, label_counts = np.unique(y, return_counts=True)
        # a leaf answers with its majority label
        self.predict_class = labels[np.argmax(label_counts)]

        # gini = 1 - sum over labels of (share of that label)^2
        n_samples = y.shape[0]
        squared_shares = [(np.count_nonzero(y == label) / n_samples) ** 2 for label in labels]
        self.gini = 1 - np.sum(squared_shares)

        self.parent = parent
        # the root sits at depth 1; every child is one level below its parent
        if parent is None:
            self.depth = 1
        else:
            self.depth = parent.depth + 1

        # split description - filled in only if this node ends up being split
        self.right: Node = None
        self.left: Node = None
        self.feature_idx = None
        self.categorical = None
        self.split_at = None

The class below is the main tree class. 
1. The stopping parameters (if any) are passed in the constructor and stored in properties
2. The tree also stores a pointer to the root node
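3. The tree also stores the list of unique classes found in y during fit (used when calculating gini for candidate splits)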

In [None]:
        

class DecisionTreeClassifier:
    """CART classification tree that chooses splits by minimum weighted gini impurity.

    Numeric features are split as ``x < threshold`` (left) versus ``x >= threshold``
    (right). Features listed as categorical are expected to be one hot encoded and
    are split as ``x == 0`` (left) versus ``x == 1`` (right).
    """

    # constructor
    def __init__(self, max_depth = None, min_sample_split = None):
        """Store the stopping parameters; either may be None to disable that check.

        Arguments:
            max_depth - a node whose depth is greater than this is never split (the root has depth 1
                        and is exempt from this check)
            min_sample_split - a node holding this many samples or fewer is never split
        """
        # initialize root node - we do not have one yet
        self.head = None
        # save the stopping parameters
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split
        # initialize the class list (we do not have this yet)
        self.clsarr = None

    def fit(self, X: np.ndarray, y: np.ndarray, categorical_col_idx: "set[int] | None" = None):
        """Build the tree from the training data.

        Arguments:
            X - 2-D array of features, one row per sample
            y - labels, one per row of X
            categorical_col_idx - column indices of X that are one hot encoded (values 0 or 1);
                                  any iterable of ints is accepted, None means "no categorical columns"
        """
        # materialise once so the many membership tests during splitting are cheap for any iterable
        categorical = frozenset(categorical_col_idx) if categorical_col_idx is not None else frozenset()
        self.clsarr = np.unique(y)
        self.head = Node(X, y)
        self.__split(self.head, categorical, None)

    def __split(self, cur_node: "Node", categorical_col_idx: set[int], parent: "Node" = None):
        """Recursively split cur_node until one of the stopping criteria is met.

        Arguments:
            cur_node - the node to split; its left/right/feature_idx/categorical/split_at are set on success
            categorical_col_idx - column indices of X that are one hot encoded
            parent - the node cur_node was created from (None for the root)
        """
        # stopping criterion: depth limit (never applied to the root, which has no parent)
        if parent is not None and self.max_depth is not None and parent.depth + 1 > self.max_depth:
            return
        # stopping criterion: too few samples left to split
        if self.min_sample_split is not None and cur_node.X.shape[0] <= self.min_sample_split:
            return
        # stopping criterion: the node is pure
        if np.unique(cur_node.y).shape[0] == 1:
            return

        cost, _, _, feature_idx, split_at = self.__calculate_feature_to_split_based_on_min_cost(
            cur_node.X, cur_node.y, categorical_col_idx
        )
        # stopping criterion: no candidate produced two non-empty sides
        if cost is None:
            return

        column = cur_node.X[:, feature_idx]
        is_categorical = feature_idx in categorical_col_idx
        if is_categorical:
            left_mask = column == 0
            right_mask = column == 1
        else:
            left_mask = column < split_at
            right_mask = column >= split_at

        # if split is going to result in one side being "no samples" do not split
        if not left_mask.any() or not right_mask.any():
            return

        cur_node.feature_idx = feature_idx
        cur_node.categorical = is_categorical
        cur_node.split_at = split_at

        cur_node.left = Node(cur_node.X[left_mask, :], cur_node.y[left_mask], cur_node)
        cur_node.right = Node(cur_node.X[right_mask, :], cur_node.y[right_mask], cur_node)

        self.__split(cur_node.left, categorical_col_idx, cur_node)
        self.__split(cur_node.right, categorical_col_idx, cur_node)

    def __calculate_feature_to_split_based_on_min_cost(self, X: np.ndarray, y: np.ndarray, categorical_col_idx: set[int]):
        """Try every feature (and every distinct value of a numeric feature) and return the cheapest split.

        Arguments:
            X - feature rows at the node being split
            y - labels for those rows
            categorical_col_idx - column indices of X that are one hot encoded
        Returns:
            tuple of (cost, gini_left, gini_right, feature_idx, split_value); all five are None when
            no candidate yields two non-empty sides. When several candidates share the minimum cost,
            the one met first wins (features in column order, thresholds in row order).
        """
        best = None
        for feature_idx in range(X.shape[1]):
            xy = np.column_stack((X[:, feature_idx], y))

            if feature_idx in categorical_col_idx:
                # a one hot encoded column has exactly one possible split: 0's versus 1's
                split_points = [None]
            else:
                # Rows with the same feature value give the same split, so score each distinct
                # value once. Visiting the first occurrence of every value in row order keeps
                # the tie-breaking identical to trying every row.
                _, first_rows = np.unique(xy[:, 0], return_index=True)
                split_points = [int(row) for row in np.sort(first_rows)]

            for split_point in split_points:
                candidate = self.__calculate_cost_and_gini(
                    xy, feature_idx, categorical=split_point is None, split_point=split_point
                )
                if candidate is None:
                    continue
                # strict "<" keeps the earliest candidate on ties
                if best is None or candidate[0] < best[0]:
                    best = candidate

        if best is None:
            return (None, None, None, None, None)
        return best

    def __gini(self, y: np.ndarray):
        """Return the gini impurity of y: 1 minus the sum of squared class shares."""
        return 1 - np.sum([(np.count_nonzero(y == cls) / y.shape[0]) ** 2 for cls in self.clsarr])

    def __calculate_cost_and_gini(self, xy: np.ndarray, feature_idx: int, categorical: bool, split_point: int):
        """Calculate the cost and gini impurity for a split.

        Arguments:
            xy - n by 2 dataset where the first column is some feature(x) and the second column is the ground truth label (y)
            feature_idx - column index of the feature(x) in the original training data
            categorical - is the feature categorical ?
            split_point - the row index pointing to the row where we want to split (ignored for categorical features)
        Returns:
            tuple of (cost, gini_left, gini_right, feature_idx, split_value), where split_value is the
            x value pointed to by split_point and is None if the feature was categorical.
            Returns None if the split would leave one side without samples.
        """
        feature_values = xy[:, 0]
        labels = xy[:, 1]

        # to calculate gini and cost we only need the y values - so we split them
        if categorical:
            # x is one hot encoded - so possible x values are 0 or 1: 0's go left, 1's go right
            left_mask = feature_values == 0
            right_mask = feature_values == 1
            split_value = None
        else:
            # left is all rows where x < threshold, right is all rows where x >= threshold
            threshold = feature_values[split_point].item()
            left_mask = feature_values < threshold
            right_mask = feature_values >= threshold
            split_value = float(threshold)

        y_left = labels[left_mask]
        y_right = labels[right_mask]

        # a split is not possible here because one side has no samples satisfying the split condition
        if y_left.shape[0] == 0 or y_right.shape[0] == 0:
            return None

        gini_left = self.__gini(y_left)
        gini_right = self.__gini(y_right)
        # The cost is a weighted sum of the two gini values where the weights are the proportion
        # of the number of instances in the left/right subset to the total number of samples
        cost = (gini_left * y_left.shape[0] / xy.shape[0]) + (gini_right * y_right.shape[0] / xy.shape[0])
        return (cost, gini_left, gini_right, feature_idx, split_value)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Predict a class for every row of X by walking the fitted tree.

        Arguments:
            X - 2-D array with the same column layout as the data passed to fit
        Returns:
            1-D array with one predicted class per row (empty when X has no rows)
        """
        return np.array([self.__predict_internal(row, self.head) for row in X])

    def __predict_internal(self, X: np.ndarray, use_node: "Node") -> float:
        """Walk from use_node down to a leaf for a single sample X and return that leaf's class."""
        node = use_node
        # a node with no children is a leaf
        while not (node.left is None and node.right is None):
            value = X[node.feature_idx].item()
            if node.categorical:
                goes_left = value == 0
            else:
                goes_left = value < node.split_at
            node = node.left if goes_left else node.right
        return node.predict_class
    

In [None]:
# Load the Titanic training data, drop the identifier / free-text columns and any row with a missing value
df_data = pd.read_csv("c:/pix/ml/titanic/train.csv")
df_data = df_data.drop(["PassengerId","Name","Ticket","Cabin"],axis=1).dropna()

# Separate the features from the label and one hot encode the categorical features
df_X = df_data.drop("Survived",axis=1)
df_y = df_data["Survived"]
df_X = pd.get_dummies(df_X,columns=["Pclass","Sex","Embarked"])
df_X = df_X.astype(float)

train_X,test_X,train_y,test_y = train_test_split(df_X,df_y,test_size=0.15,stratify=df_y,shuffle=True,random_state=1234)

tree = DecisionTreeClassifier(min_sample_split=12, max_depth=16)
# NOTE(review): range(4,12) assumes the one hot encoded columns sit at positions 4..11 of df_X
# (i.e. four untouched numeric columns come first) - verify against df_X.columns
tree.fit(train_X.values,train_y.values,categorical_col_idx=range(4,12))

# sklearn metrics take (y_true, y_pred) in that order; passing them the other way round
# swaps what is reported as precision and recall
predictions_train = tree.predict(train_X.values)
print(f"train accuracy = {accuracy_score(train_y.values,predictions_train)}")
print(f"train precision = {precision_score(train_y.values,predictions_train)}")
print(f"train recall = {recall_score(train_y.values,predictions_train)}")
predictions_test = tree.predict(test_X.values)
print(f"test accuracy = {accuracy_score(test_y.values,predictions_test)}")
print(f"test precision = {precision_score(test_y.values,predictions_test)}")
print(f"test recall = {recall_score(test_y.values,predictions_test)}")

