## Testing out the split criteria for decision trees

In [1]:
import numpy as np

### Functions to compute split quality

In [44]:
#### Gini index ####
# Assume a flat vector
def GiniIndex(node):
    """Return the Gini impurity of a single node.

    Parameters
    ----------
    node : array-like of numbers
        Per-class element counts for one node. The input is flattened,
        so it is treated as one vector of class counts.

    Returns
    -------
    float
        ``1 - sum_k p_k**2`` where ``p_k`` is the fraction of the node's
        elements that belong to class ``k``. A node with no elements
        (total count 0) yields ``0.0`` instead of ``nan``.
    """
    counts = np.asarray(node, dtype=float).ravel()
    total = counts.sum()
    # The class fractions are undefined for an empty node; report zero
    # impurity rather than dividing by zero.
    if total == 0:
        return 0.0
    fractions = counts / total
    return float(1.0 - np.sum(fractions ** 2))
    
    
#### Entropy ####
def Entropy(node):
    """Return the Shannon entropy (natural log, i.e. in nats) of a node.

    Parameters
    ----------
    node : array-like of numbers
        Per-class element counts for one node. The input is flattened,
        so it is treated as one vector of class counts.

    Returns
    -------
    float
        ``-sum_k p_k * ln(p_k)`` over the classes that actually occur in
        the node. Classes with a zero count contribute nothing (the
        ``0 * log(0)`` term is taken as 0), and a node with no elements
        yields ``0.0``.
    """
    counts = np.asarray(node, dtype=float).ravel()
    total = counts.sum()
    # The class fractions are undefined for an empty node.
    if total == 0:
        return 0.0
    # Drop absent classes: evaluating 0 * log(0) directly gives nan.
    freqs = counts[counts > 0] / total
    # max() turns the -0.0 produced for a pure node into a plain 0.0.
    return max(0.0, float(-np.sum(freqs * np.log(freqs))))

## Get the split info
def splitInfo(Nodes):
    # Get the overall shape and number of elements
    nodeShape = np.shape(Nodes)
    totNumElements = np.sum(Nodes)
    
    negSplitInfo = 0.0
    
    # Iterate over each node
    for nIdx in range(0,nodeShape[0]):
        nodeFrac = np.sum(Nodes[nIdx,:]) / totNumElements
        negSplitInfo += nodeFrac * np.log(nodeFrac)
    
    return (-1.0)*negSplitInfo
    
    


#### Compute a weighted sum for some criteria over multiple nodes ####
def WeightedNodesAvg(Nodes, Func):
    """Average a per-node metric over several nodes, weighted by node size.

    Parameters
    ----------
    Nodes : 2-D array-like of numbers
        Count matrix with one row per node and one column per class.
    Func : callable
        Called with one row of ``Nodes`` (the class counts of a single
        node) and expected to return that node's metric.

    Returns
    -------
    float
        ``sum_i (n_i / N) * Func(Nodes[i])`` where ``n_i`` is the number
        of elements in node ``i`` and ``N`` the number of elements over
        all nodes. Empty nodes are skipped, and ``0.0`` is returned when
        there are no elements at all.
    """
    counts = np.asarray(Nodes)
    totNumElements = np.sum(counts)
    # No elements means every weight would be 0/0.
    if totNumElements == 0:
        return 0.0

    totalMetric = 0.0
    for nodeCounts in counts:
        nodeNumElement = np.sum(nodeCounts)
        # An empty node has weight zero. Skip it so Func is never asked
        # to evaluate it: a nan/inf result times 0 would poison the sum.
        if nodeNumElement == 0:
            continue
        nodeMetric = Func(nodeCounts)
        totalMetric += (nodeNumElement / totNumElements) * nodeMetric

    return totalMetric


### Defining the data matrix
```
        | class 1 | class 2  ...
----------------------------------
 node 1 |         |
 node 2 |         |
...
```

Below we consider a simple example of a parent-children binary split

In [47]:
# Count matrix for the node before the split: one row, one column per class
parentNode = np.array([[6, 6]])
# Count matrix after the split: one row per child node
childrenNodes = np.array([[5, 2], [1, 4]])

# Show both matrices with a blank line between them
print(parentNode, end="\n\n")
print(childrenNodes)

[[6 6]]

[[5 2]
 [1 4]]


#### Compute Gini Index

In [32]:
# Compute the gini of the pre and post split
giniParent = WeightedNodesAvg(parentNode, GiniIndex)
giniChildren = WeightedNodesAvg(childrenNodes, GiniIndex)
print("Gini of parent: %f" % giniParent)
print("Gini of children: %f" % giniChildren)

# Difference in gini
print((giniParent - giniChildren))

Gini of parent: 0.500000
Gini of children: 0.371429
0.12857142857142861


#### Compute entropy

In [46]:
# Size-weighted entropy before the split (parent) and after it (children)
entropyParent = WeightedNodesAvg(Nodes=parentNode, Func=Entropy)
entropyChildren = WeightedNodesAvg(Nodes=childrenNodes, Func=Entropy)
print("Entropy of parent: %f" % entropyParent)
print("Entropy of children: %f" % entropyChildren, end="\n\n")

# Information gain is the drop in weighted entropy caused by the split
infoGain = entropyParent - entropyChildren
print("Information gain: %f" % infoGain, end="\n\n")

# Gain ratio divides the information gain by the split info of the children
split_info = splitInfo(childrenNodes)
print("Split info: %f" % split_info)
print("Gain ratio: %f" % (infoGain / split_info))

Entropy of parent: 0.693147
Entropy of children: 0.557492

Information gain: 0.135656

Split info: 0.679193
Gain ratio: 0.199730
