In [3]:
import numpy as np
import pandas as pd

In [241]:
class MyDecisionTree:
    """Entropy-based decision tree over categorical features.

    The training frame must hold the target in its LAST column; every other
    column is treated as a categorical feature. An internal node splits on the
    feature with the lowest weighted entropy and creates one child per distinct
    value of that feature (the feature column is dropped from the child's
    frame). A node becomes a leaf when its targets are pure, when no feature
    columns remain, or when ``max_depth`` reaches 0.

    Attributes:
        max_depth: remaining number of splits allowed below this node.
        df: training rows that reached this node.
        prediction: leaf prediction, or None for an internal node.
        childs: object array of child trees, aligned with ``labelsOfRoot``.
        root: name of the feature this node splits on (internal nodes only).
        labelsOfRoot: sorted distinct values of ``root`` seen at this node.
        fallback: majority target at an internal node, returned when a sample
            carries a feature value that was not seen during training.
    """

    def __init__(self, df, max_depth = 3):
        self.max_depth = max_depth
        self.df = df
        self.prediction = None
        self.childs = None
        self.root = None
        self.labelsOfRoot = None
        self.fallback = None

    def __fit__(self):
        """Recursively build the tree from ``self.df``."""
        # Leaf: nothing left to split on, or the depth budget is exhausted.
        if self.df.columns.size == 1 or self.max_depth == 0:
            self.prediction = self.majority_vote()
            return

        classes = np.unique(self.df.iloc[:, -1].to_numpy())
        # Leaf: every remaining row has the same target.
        if len(classes) == 1:
            self.prediction = classes[0]
            return

        self.root = self.find_best_feature()
        self.labelsOfRoot = np.unique(self.df[self.root].to_numpy())
        # Remember the node-level majority so prediction can degrade
        # gracefully on feature values absent from the training rows.
        self.fallback = self.majority_vote()
        self.childs = np.empty((len(self.labelsOfRoot), ), dtype=object)
        for index, label in enumerate(self.labelsOfRoot):
            subset = self.df[self.df[self.root] == label].drop(columns = self.root)
            child = MyDecisionTree(subset, max_depth=self.max_depth - 1)
            child.__fit__()
            self.childs[index] = child

    def majority_vote(self):
        """Return the most frequent target value at this node.

        Works for any sortable label type (ints, strings, ...). Ties resolve
        to the smallest label, because ``np.unique`` returns sorted values and
        ``argmax`` picks the first maximum.
        """
        values, counts = np.unique(self.df.iloc[:, -1].to_numpy(), return_counts=True)
        return values[counts.argmax()]

    def find_best_feature(self):
        """Return the name of the feature with the lowest weighted entropy."""
        features = self.df.columns.to_numpy()[:-1]
        entropies = [self.calc_entropy(feature) for feature in features]
        return features[int(np.argmin(entropies))]

    def calc_entropy(self, feature):
        """Return the target entropy after splitting on ``feature``.

        Each distinct value's subset entropy is weighted by the fraction of
        rows that fall into that subset.
        """
        total = len(self.df)
        entropy = 0
        for label in np.unique(self.df[feature].to_numpy()):
            classDf = self.df[self.df[feature] == label]
            entropy += (len(classDf) / total) * self.calc_class_entropy(classDf)
        return entropy

    def calc_class_entropy(self, classDf):
        """Return the entropy (natural log) of the target column of ``classDf``."""
        _, counts = np.unique(classDf.iloc[:, -1].to_numpy(), return_counts=True)
        probabilities = counts / len(classDf)
        return np.sum(-probabilities * np.log(probabilities))

    def __predict__(self, inputDict):
        """Predict the target for one sample.

        Args:
            inputDict: mapping of feature name -> value. It is NOT modified.

        Returns:
            The leaf prediction reached by following the sample's feature
            values, or the majority target of the deepest node reached when
            the sample has a value that node never saw during training.

        Raises:
            KeyError: if ``inputDict`` lacks a feature the tree splits on.
        """
        if self.prediction is not None:
            return self.prediction
        matches = np.where(self.labelsOfRoot == inputDict[self.root])[0]
        if len(matches) == 0:
            # Unseen category at this node: no child to descend into.
            return self.fallback
        # Children never look up this node's feature, so the dict can be
        # passed down untouched instead of popping the key.
        return self.childs[matches[0]].__predict__(inputDict)
            

In [5]:
# Load the prison dataset from the CSV (path relative to the notebook)
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Data/prison_dataset.csv')
df.head(n=5)

Unnamed: 0,Fiscal Year Released,Recidivism Reporting Year,Race - Ethnicity,Age At Release,Convicting Offense Classification,Convicting Offense Type,Convicting Offense Subtype,Main Supervising District,Release Type,Part of Target Population,Recidivism - Return to Prison numeric
0,2010,2013,White,<45,D Felony,Violent,Other,3JD,Parole,Yes,1
1,2010,2013,White,>45,D Felony,Other,Other,3JD,Parole,Yes,1
2,2010,2013,White,<45,D Felony,Other,Other,5JD,Parole,Yes,1
3,2010,2013,White,>45,Other Felony,Drug,Trafficking,3JD,Parole,Yes,1
4,2010,2013,Black,<45,D Felony,Drug,Trafficking,3JD,Parole,Yes,1


In [263]:
from sklearn.model_selection import train_test_split

# Split row positions rather than the frame itself, so the held-out
# positions (testIndex) stay available for building the test set.
trainIndex, testIndex = train_test_split(
    np.arange(df.shape[0]),
    random_state=42,
    test_size=0.2,
)
trainDf = df.iloc[trainIndex]

In [271]:
# Build the tree on the training rows; max_depth=3 is the class default,
# spelled out here so the depth limit is visible at the call site.
tree = MyDecisionTree(trainDf, max_depth=3)
tree.__fit__()

In [266]:
# Held-out rows: every column except the last is a feature,
# and the last column is the target.
X_test = df.iloc[testIndex].iloc[:, :-1]
Y_test = df.iloc[testIndex].iloc[:, -1].to_numpy()

In [272]:
# Predict one held-out row at a time; each row is handed to the tree
# as a fresh {feature name: value} dict.
predictions = [
    tree.__predict__(X_test.iloc[row].to_dict())
    for row in range(len(X_test))
]

In [273]:
from sklearn.metrics import classification_report

# Summarise the tree's predictions against the held-out labels.
print(classification_report(y_true=Y_test, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.65      0.80      0.72      1357
           1       0.81      0.66      0.73      1728

    accuracy                           0.72      3085
   macro avg       0.73      0.73      0.72      3085
weighted avg       0.74      0.72      0.72      3085

