# Decision Tree

## Imports

In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Loading Data

In [33]:
hep = pd.read_csv("data\\part2\\hepatitis", delimiter=" ")
hep_train = pd.read_csv("data\\part2\\hepatitis-test", delimiter=" ")
hep_test = pd.read_csv("data\\part2\\hepatitis-training", delimiter=" ")

display(hep.head(2))
display(hep_train.head(2))
display(hep_test.head(2))

Unnamed: 0,Class,AGE,FEMALE,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,BIGLIVER,FIRMLIVER,SPLEENPALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,SGOT,HISTOLOGY
0,live,False,False,True,True,False,False,False,False,False,False,False,True,True,True,False,False
1,live,True,False,False,True,False,True,True,False,False,True,True,True,True,True,False,False


Unnamed: 0,Class,AGE,FEMALE,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,BIGLIVER,FIRMLIVER,SPLEENPALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,SGOT,HISTOLOGY
0,live,True,True,False,True,False,True,True,True,True,False,False,True,False,True,False,True
1,die,False,False,False,True,False,False,False,True,True,False,False,True,False,True,False,True


Unnamed: 0,Class,AGE,FEMALE,STEROID,ANTIVIRALS,FATIGUE,MALAISE,ANOREXIA,BIGLIVER,FIRMLIVER,SPLEENPALPABLE,SPIDERS,ASCITES,VARICES,BILIRUBIN,SGOT,HISTOLOGY
0,live,False,False,False,True,False,False,False,True,False,True,True,True,True,True,True,False
1,die,False,False,False,True,False,False,True,True,True,True,True,False,True,False,False,True


## Data Exploration

Confirming the even split between train and test datasets.

In [38]:
print(len(hep), len(hep_train), len(hep_test))

137 25 112


No null or NA values in our dataset.

In [44]:
# check for missing values
display(np.where(pd.isnull(hep)))
display(np.where(pd.isna(hep)))

(array([], dtype=int64), array([], dtype=int64))

(array([], dtype=int64), array([], dtype=int64))

The data has loaded in as the correct types, although Class may need to be changed to factor.

In [45]:
print(hep.dtypes)
print(hep.shape)

Class             object
AGE                 bool
FEMALE              bool
STEROID             bool
ANTIVIRALS          bool
FATIGUE             bool
MALAISE             bool
ANOREXIA            bool
BIGLIVER            bool
FIRMLIVER           bool
SPLEENPALPABLE      bool
SPIDERS             bool
ASCITES             bool
VARICES             bool
BILIRUBIN           bool
SGOT                bool
HISTOLOGY           bool
dtype: object
(137, 17)


After further investigating class, it is clear this is a binary classification problem.

In [46]:
hep["Class"].unique()

array(['live', 'die'], dtype=object)

In [49]:
hep["Class"] = hep["Class"].astype('category')
hep["Class"].dtype

CategoricalDtype(categories=['die', 'live'], ordered=False)

## Decision Tree Algorithm

In [157]:
class DecisionTree:
    def __init__(self):
        import numpy as np
        return None
    
    def train(self, train_X: pd.DataFrame, train_y: pd.DataFrame):
        print([self.__probability_based_impurity(train_X[col]) for col in train_X])
        print([self.__gini_impurity1(train_X[col]) for col in train_X])
        print([self.__gini_impurity2(train_X[col]) for col in train_X])
        print([self.__entropy(train_X[col]) for col in train_X])
        
        return 0 
    
    def test(self, test_X):
        return 0
    
    def __build_tree(self, train_X, train_y):
        """Builds an optimal structured decision tree with decision nodes and leaf nodes."""
        # to determine the root node, we find the first question with the optimal value
        info, question = self.__optimal_question()

        return 0
    
    def __optimal_question(self):
        """Calculates the impurity and returns the optimal question and information gained from asking a question."""
        info = 0
        question = 0
        return info, question
    
    def __leaf(self, train_y):
        """Leaf nodes return the predicted class."""
        return 0
    
    def __decision_node(self, optimal_question, true_branch, false_branch):
        """Decision nodes hold the optimal question at this level and two child nodes."""
        return 0
    
    def __probability_based_impurity(self, labels):
        """Calculates the probability based impurity for the given labels."""
        freq = self.__frequency(labels)
        pbi = freq[0]
        for i in range(1, len(freq)):
            print(freq[i-1], freq[i])
            pbi = pbi * freq[i]
        return pbi
    
    def __gini_impurity1(self, labels):
        """From testing, this version seems to have less precision than option 12."""
        freq = self.__frequency(labels)
        return np.sum([f * (1 - f) for f in freq])
    
    def __gini_impurity2(self, labels):
        """Calculates the gini impurity for the given labels."""
        freq = self.__frequency(labels)
        return 1 - np.sum([f ** 2 for f in freq])

    def __entropy(self, labels):
        """Calculates the entropy impurity for the given labels."""
        freq = self.__frequency(labels)
        return -np.sum([f * np.log2(f) for f in freq])
    
    def __frequency(self, labels: pd.Series):
        """Returns a list of the frequencies of each label."""
        return [np.mean(label == labels) for label in labels.unique()]


dt = DecisionTree()
dt.train(hep_train, hep_test)

hep_train["Class"].value_counts()

0.8 0.2
0.32 0.68
0.08 0.92
0.6 0.4
0.8 0.2
0.64 0.36
0.64 0.36
0.84 0.16
0.88 0.12
0.72 0.28
0.28 0.72
0.4 0.6
0.92 0.08
0.2 0.8
0.56 0.44
0.72 0.28
0.56 0.44
[0.16000000000000003, 0.21760000000000002, 0.0736, 0.24, 0.16000000000000003, 0.2304, 0.2304, 0.1344, 0.1056, 0.2016, 0.2016, 0.24, 0.0736, 0.16000000000000003, 0.24640000000000004, 0.2016, 0.24640000000000004]
[0.32, 0.4352, 0.14719999999999997, 0.48, 0.32, 0.4608, 0.4608, 0.26880000000000004, 0.2112, 0.4032, 0.4032, 0.48, 0.14719999999999997, 0.32, 0.4928, 0.4032, 0.4928]
[0.31999999999999984, 0.4351999999999999, 0.1472, 0.48, 0.31999999999999984, 0.4608, 0.4608, 0.26880000000000015, 0.21120000000000005, 0.4032, 0.4032, 0.48, 0.1472, 0.31999999999999984, 0.4927999999999999, 0.4032, 0.4927999999999999]
[0.7219280948873623, 0.9043814577244937, 0.4021791902022728, 0.9709505944546686, 0.7219280948873623, 0.9426831892554922, 0.9426831892554922, 0.6343095546405662, 0.5293608652873644, 0.8554508105601306, 0.8554508105601306, 0.970950

live    20
die      5
Name: Class, dtype: int64