In [52]:
import numpy as np
import pandas as pd
eps = np.finfo(float).eps
import pprint
from scipy import stats

In [62]:
def findEntropy(data):
    """Return the base-2 Shannon entropy of the target column.

    The target is the last column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the class labels.

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed classes, where ``p`` is the
        class count divided by the number of rows; ``0.0`` for an empty table.
    """
    target = data.keys()[-1]   # the last column is treated as the class label
    labels = data[target]
    # Count every class once instead of re-running value_counts() per class.
    # value_counts() skips missing labels by default, so they only count
    # toward the denominator.
    probs = labels.value_counts().to_numpy() / len(labels)
    # Only classes that actually occur contribute; dropping zero entries keeps
    # log2 finite without adding an epsilon fudge term.
    probs = probs[probs > 0]
    entropy = -(probs * np.log2(probs)).sum()
    # Normalise -0.0 (pure node) and the empty case to a plain 0.0.
    return float(entropy) if entropy > 0 else 0.0

def findEntropyCat(data,cat):
    """Return the entropy of the target column conditioned on column ``cat``.

    For every distinct value of ``cat`` the entropy of the target (last
    column of ``data``) is computed on the matching rows, and the per-value
    entropies are averaged with weights ``rows_in_group / len(data)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the class labels.
    cat : column label
        Feature column to condition on.

    Returns
    -------
    float
        The weighted (conditional) entropy in bits; ``0.0`` for an empty table.
    """
    target = data.keys()[-1]
    total = len(data)
    conditional = 0.0
    # A single groupby pass replaces the nested loops that re-filtered the
    # whole table for every (feature value, class) pair.
    for _, labels in data.groupby(cat, sort=False)[target]:
        probs = labels.value_counts().to_numpy() / len(labels)
        # Classes absent from this group contribute nothing; skipping them
        # avoids log2(0) without an epsilon term.
        probs = probs[probs > 0]
        group_entropy = -(probs * np.log2(probs)).sum()
        conditional += (len(labels) / total) * group_entropy
    # Normalise -0.0 (all groups pure) and the empty case to a plain 0.0.
    return float(conditional) if conditional > 0 else 0.0

def findME(data):
    """Return one minus the share of the most frequent target class.

    The target is the last column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the class labels.

    Returns
    -------
    float
        ``1 - max_class_count / number_of_rows``; ``1.0`` when there are no
        labels to count.
    """
    target = data.keys()[-1]   # the last column is treated as the class label
    labels = data[target]
    # Count every class once; value_counts() skips missing labels by default,
    # so they only count toward the denominator.
    counts = labels.value_counts()
    if counts.empty:
        # Nothing to count: the largest class share is 0, so the error is 1.
        return 1.0
    return float(1 - counts.max() / len(labels))

def findMECat(data,cat):
    """Return the majority-class error of the target after splitting on ``cat``.

    Within every group of rows sharing a value of ``cat`` the most frequent
    target class (last column of ``data``) is counted; the result is one
    minus the sum of those majority counts divided by ``len(data)``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the class labels.
    cat : column label
        Feature column to split on.

    Returns
    -------
    float
        Weighted error in ``[0, 1]``; ``1.0`` for an empty table.
    """
    target = data.keys()[-1]
    total = len(data)
    if total == 0:
        # No rows: no majority count at all, so the error is 1.
        return 1.0
    majority = 0
    # A single groupby pass replaces re-filtering the table for every
    # (feature value, class) pair.
    for _, labels in data.groupby(cat, sort=False)[target]:
        counts = labels.value_counts()
        if not counts.empty:
            majority += counts.max()
    # Summing integer counts first keeps pure splits at exactly 0.0, which an
    # epsilon-padded denominator does not.
    return float(1 - majority / total)

def findGI(data):
    """Return the Gini impurity of the target column.

    The target is the last column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the class labels.

    Returns
    -------
    float
        ``1 - sum(p ** 2)`` over the observed classes, where ``p`` is the
        class count divided by the number of rows; ``1.0`` for an empty table.
    """
    target = data.keys()[-1]   # the last column is treated as the class label
    labels = data[target]
    # Count every class once; value_counts() skips missing labels by default,
    # so they only count toward the denominator.
    probs = labels.value_counts().to_numpy() / len(labels)
    return float(1 - np.sum(probs ** 2))

def findGICat(data,cat):
    """Return the Gini impurity of the target after splitting on ``cat``.

    For every distinct value of ``cat`` the sum of squared class shares of
    the target (last column of ``data``) is computed on the matching rows and
    weighted by ``rows_in_group / len(data)``; the result is one minus the
    weighted total.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column holds the class labels.
    cat : column label
        Feature column to split on.

    Returns
    -------
    float
        Weighted Gini impurity, never negative; ``1.0`` for an empty table.
    """
    target = data.keys()[-1]
    total = len(data)
    if total == 0:
        # No rows: nothing contributes to the purity sum, so the result is 1.
        return 1.0
    purity = 0.0
    # A single groupby pass replaces re-filtering the table for every
    # (feature value, class) pair.
    for _, labels in data.groupby(cat, sort=False)[target]:
        size = len(labels)
        probs = labels.value_counts().to_numpy() / size
        purity += (size / total) * np.sum(probs ** 2)
    # Clamp so floating-point round-off can never yield a negative impurity.
    return max(0.0, float(1 - purity))

def winner(data,func1,func2):
    """Return the feature column with the largest impurity reduction.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column is the target; every other column is a
        candidate feature.
    func1 : callable
        ``func1(data)`` gives the impurity of the whole table.
    func2 : callable
        ``func2(data, feature)`` gives the impurity after splitting on
        ``feature``.

    Returns
    -------
    column label
        The feature maximising ``func1(data) - func2(data, feature)``; ties
        go to the earliest column.

    Raises
    ------
    ValueError
        If ``data`` has no feature columns.
    """
    features = data.keys()[:-1]
    if len(features) == 0:
        raise ValueError("winner() needs at least one feature column besides the target")
    # The parent impurity does not depend on the feature, so compute it once
    # rather than once per candidate.
    parent = func1(data)
    gains = [parent - func2(data, feature) for feature in features]
    return features[np.argmax(gains)]

def getSubtable(data, node, value):
    """Return the rows of ``data`` where column ``node`` equals ``value``.

    The result is re-indexed from 0; the old index is discarded.
    """
    matches = data[node] == value
    subset = data.loc[matches]
    return subset.reset_index(drop=True)

def buildTree(data, func1, func2, d = 0, depth = 100000, tree=None):
    """Recursively build a decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose last column is the target; every other column is a
        candidate feature.
    func1, func2 : callable
        Impurity of the whole table and impurity after a split, passed
        straight through to ``winner``.
    d : int
        Depth of the current call (0 at the root).
    depth : int
        Depth at which every branch is turned into a leaf.
    tree : dict or None
        Existing dictionary to add this node to; a new one is created when
        omitted.

    Returns
    -------
    dict
        ``{feature: {value: subtree_or_label, ...}}``, where a leaf holds the
        most frequent target label of its rows.
    """
    target = data.keys()[-1]
    node = winner(data,func1,func2)
    if tree is None:
        tree = {}
    # Create the branch dict even when the caller passed in its own `tree`;
    # creating it only for tree=None leaves tree[node] missing otherwise.
    branches = tree.setdefault(node, {})
    for value in np.unique(data[node]):
        subtable = getSubtable(data,node,value)
        labels = subtable[target]
        # The split feature is constant inside the subtable, so drop it.
        # If it stayed, it could win again on a zero-gain tie and the
        # recursion would never shrink the data.
        remaining = subtable.drop(columns=[node])
        is_pure = len(np.unique(labels)) == 1
        no_features_left = remaining.shape[1] <= 1   # only the target remains
        if is_pure or d == depth or no_features_left:
            branches[value] = labels.mode()[0]
        else:
            branches[value] = buildTree(remaining,func1,func2,d+1,depth)
    return tree

In [63]:
# Dataset folder; it is expected to hold categories.txt and train.csv.
# (An earlier note here named 'car' and 'bank' - presumably other dataset folders.)
file = 'p1'
# The first line of categories.txt lists the column names, separated by ', '.
with open(file+'/categories.txt') as f:
    categories = f.readlines()[0].strip().split(', ')
# train.csv is read with those column names supplied via `names`.
tData = pd.read_csv(file+'/train.csv', names = categories)
numCats = len(categories)
print(tData)

   x1  x2  x3  x4  y
0   0   0   1   0  0
1   0   1   0   0  0
2   0   0   1   1  1
3   1   0   0   1  1
4   0   1   1   0  0
5   1   1   0   0  0
6   0   1   0   1  0


In [64]:
# Grow the tree with entropy as the impurity measure (information gain).
tree = buildTree(tData, func1=findEntropy, func2=findEntropyCat)
pprint.pprint(tree)

{'x2': {0: {'x4': {0: 0, 1: 1}}, 1: 0}}


In [65]:
# Grow the tree with the findME / findMECat pair (one minus the largest class share).
tree = buildTree(tData, func1=findME, func2=findMECat)
pprint.pprint(tree)

{'x2': {0: {'x4': {0: 0, 1: 1}}, 1: 0}}


In [66]:
# Grow the tree with the findGI / findGICat pair (one minus the sum of squared class shares).
tree = buildTree(tData, func1=findGI, func2=findGICat)
pprint.pprint(tree)

{'x2': {0: {'x4': {0: 0, 1: 1}}, 1: 0}}
