In [15]:
import numpy as np
import pandas as pd
# Machine epsilon of Python's float type. The functions in this notebook add it
# inside log2() calls and to a denominator, and subtract it from column medians.
eps = np.finfo(float).eps
import pprint
from scipy import stats
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
from pandas.api.types import is_bool_dtype
# Display settings: allow up to 10000 rows and a 300-character width when
# DataFrames are printed.
pd.set_option('display.max_rows', 10000)
pd.options.display.width = 300

In [26]:
def getBins(data,cat):
    """Return one column of ``data`` together with its distinct values.

    Legacy helper that the entropy functions still rely on.

    Args:
        data: table indexable by column label (a DataFrame in this notebook).
        cat: label of the column to extract.

    Returns:
        Tuple ``(column, values)`` where ``column`` is ``data[cat]`` and
        ``values`` is the result of calling ``.unique()`` on it.
    """
    selected = data[cat]
    return selected, selected.unique()
    
def findEntropy(data):
    """Entropy (base 2) of the target column, i.e. the last column of ``data``.

    Args:
        data: DataFrame whose last column holds the target labels.

    Returns:
        Sum over the distinct target values of ``-p * log2(p + eps)``, where
        ``p`` is the fraction of rows carrying that value. Returns 0 when the
        column has no values.

    Note:
        ``eps`` is added inside the logarithm, so the result can differ from
        the textbook entropy by a rounding-sized amount.
    """
    target = data.keys()[-1]
    column,values = getBins(data,target)
    # The counts do not depend on the loop variable: compute them once rather
    # than re-counting the whole column for every distinct value.
    counts = column.value_counts()
    total = len(column)
    entropy = 0
    # Iterate in the order returned by unique() so the terms are summed in the
    # same order as before.
    for value in values:
        frac = counts[value]/total
        entropy += -frac*np.log2(frac+eps)
    return entropy

def findEntropyCat(data,cat):
    """Weighted entropy of the target after splitting ``data`` on column ``cat``.

    For every distinct value of ``cat`` the entropy of the target (last column
    of ``data``) is computed over the rows carrying that value; the per-value
    entropies are then averaged, weighted by the share of rows in each group.

    Args:
        data: DataFrame whose last column holds the target labels.
        cat: label of the attribute column to split on.

    Returns:
        The absolute value of the weighted entropy sum (0 when ``cat`` has no
        values).
    """
    target = data.keys()[-1]
    tColumn,targetVals = getBins(data,target)
    column,values = getBins(data,cat)
    total = len(data)
    # One boolean mask per target class, built once and reused for every
    # attribute value instead of being rebuilt in the inner loop.
    targetMasks = [tColumn==targetVal for targetVal in targetVals]
    weighted = 0
    for value in values:
        inValue = column==value
        # Group size is the same for every target class, so count it once.
        den = int(inValue.sum())
        entropy = 0
        for targetMask in targetMasks:
            # Combine both conditions on the full-length column rather than
            # indexing an already-filtered Series with a full-length mask.
            num = int((inValue & targetMask).sum())
            frac = num/(den+eps)
            entropy += -frac*np.log2(frac+eps)
        weighted += (den/total)*entropy
    # abs() is kept: a pure group contributes a tiny negative term because of
    # the eps inside the logarithm.
    return abs(weighted)

def winner(data,func1,func2):
    """Pick the attribute column with the largest gain and print all gains.

    The gain of a column is ``func1(data) - func2(data, column)``. Every column
    except the last one (the target) is a candidate.

    Args:
        data: DataFrame whose last column is the target.
        func1: callable ``func1(data)`` giving the error/entropy of the whole
            table.
        func2: callable ``func2(data, column)`` giving the error/entropy that
            remains after splitting on ``column``.

    Returns:
        Label of the column with the highest gain (the first one on ties, as
        chosen by ``np.argmax``).

    Raises:
        ValueError: if ``data`` has no attribute column before the target.
    """
    features = data.keys()[:-1]
    if len(features) == 0:
        raise ValueError("winner() needs at least one attribute column before the target column")
    # The whole-table error does not depend on the candidate column, so it is
    # evaluated once instead of once per column.
    dEnt = func1(data)
    gains = [dEnt-func2(data,value) for value in features]
    print("Gains:",gains)
    return features[np.argmax(gains)]

def getSubtable(data, node, value):
    """Select the rows of ``data`` whose ``node`` column equals ``value``.

    Args:
        data: DataFrame to filter.
        node: label of the column to compare.
        value: value the column must equal.

    Returns:
        A new DataFrame with the matching rows, re-indexed from 0 (the old
        index is discarded).
    """
    matches = data[node] == value
    subset = data[matches]
    return subset.reset_index(drop=True)

def buildTree(data, func1, func2, depth = 100000, d = 1, tree=None):
    """Recursively build a decision tree as nested dicts.

    The result has the shape ``{attribute: {attribute_value: subtree_or_label}}``.
    At each level the attribute returned by ``winner`` is used for the split.

    Args:
        data: DataFrame whose last column is the target.
        func1: whole-table error function, passed through to ``winner``.
        func2: per-attribute error function, passed through to ``winner``.
        depth: level at which every branch becomes a leaf.
        d: level of the current call (1 for the root).
        tree: optional dict to add this node to; a new dict is created when
            omitted.

    Returns:
        The dict for this node. A branch becomes a leaf, labelled with the
        most frequent target value of its rows, when its rows share a single
        target value, when ``d == depth``, or when the split did not remove
        any rows.

    Side effects:
        Prints the target value counts of every leaf (and ``winner`` prints
        the gains of every split).
    """
    target = data.keys()[-1]
    node = winner(data,func1,func2)
    catValue = np.unique(data[node])
    if tree is None:
        tree={}
    # Also covers a caller-supplied dict that does not contain this node yet.
    tree.setdefault(node, {})
    for value in catValue:
        subtable = getSubtable(data,node,value)
        isPure = len(np.unique(subtable[target])) == 1
        # If the branch still holds every row, the best attribute could not
        # separate anything; recursing would repeat the same split forever.
        noProgress = len(subtable) == len(data)
        if isPure or d == depth or noProgress:
            print(subtable[target].value_counts())
            tree[node][value] = subtable[target].mode()[0]
        else:
            tree[node][value] = buildTree(subtable,func1,func2,depth,d+1)
    return tree

def predict(instance, tree):
    """Walk a decision tree for one input and return the predicted label.

    Args:
        instance: mapping from attribute name to value (anything supporting
            ``instance[name]``).
        tree: nested dict of the form ``{attribute: {value: subtree_or_label}}``,
            or a bare label.

    Returns:
        ``tree`` itself when it is not a dict (a leaf). Otherwise the label
        reached by following the instance's attribute values, or the string
        "Unable to Traverse Tree With Given Input" when the tree has no branch
        for one of those values or the dict is empty.

    Note:
        An attribute missing from ``instance`` is not caught; whatever
        ``instance[name]`` raises propagates to the caller.
    """
    if type(tree) is not dict:
        return tree
    # Default result, so an empty dict no longer leaves `prediction` unbound.
    prediction = "Unable to Traverse Tree With Given Input"
    for nodes in tree.keys():
        value = instance[nodes]
        try:
            nextNode = tree[nodes][value]
        except KeyError:
            return "Unable to Traverse Tree With Given Input"
        # `tree` is known to be a dict here, so always descend; the recursive
        # call returns immediately when nextNode is a leaf.
        prediction = predict(instance, nextNode)
    return prediction
    
def genNumBuckets(data,cats):
    """Build the bin edges used to discretise non-string columns.

    Args:
        data: DataFrame holding the columns named in ``cats``.
        cats: column labels, in the order the buckets should be returned.

    Returns:
        One list per entry of ``cats``: ``[-inf, median - eps, inf]`` for a
        column that is not of string dtype, and an empty list for a string
        column.

    Note:
        ``eps`` is an absolute offset, so it only shifts the boundary when the
        median is small in magnitude; for larger medians the subtraction is
        lost to rounding.
    """
    def edgesFor(cat):
        # String columns are already categorical: no edges needed.
        if is_string_dtype(data[cat]):
            return []
        cut = data[cat].median()-eps
        return [float('-inf'),cut,float('inf')]

    return [edgesFor(cat) for cat in cats]

def cleanNumeric(data,cats,buckets):
    """Replace every non-string column by the bucket each value falls into.

    Args:
        data: DataFrame to convert; it is modified in place.
        cats: column labels to process.
        buckets: bin edges per column, aligned with ``cats`` by position.
            Entries belonging to string columns are never read.

    Returns:
        The same ``data`` object, with each non-string column replaced by the
        result of ``pd.cut`` on it.
    """
    for position,cat in enumerate(cats):
        if is_string_dtype(data[cat]):
            continue
        data[cat] = pd.cut(data[cat],bins=buckets[position],include_lowest=True,duplicates='drop')
    return data

In [27]:
file = "bank"

# The first line of categories.txt lists the column names, comma separated;
# spaces are removed before splitting.
with open(file+'/categories.txt') as f:
    categories = f.readlines()[0].strip().replace(' ','').split(',')
numCats = len(categories)

# Read the training set with those column names, derive the bucket edges from
# it, then replace its non-string columns by their buckets.
tData = pd.read_csv(file+'/train.csv', names = categories)
buckets = genNumBuckets(tData,categories)
tData = cleanNumeric(tData,categories,buckets)
print(tData)

               age            job   marital  education default        balance housing loan    contact           day month       duration     campaign        pdays          previous poutcome target
0      (38.0, inf]       services   married  secondary      no  (-inf, 452.5]     yes   no    unknown  (-inf, 16.0]   may  (-inf, 180.0]   (2.0, inf]  (-1.0, inf]  (-2.22e-16, inf]  unknown     no
1      (38.0, inf]    blue-collar    single  secondary      no  (-inf, 452.5]     yes  yes   cellular  (-inf, 16.0]   feb   (180.0, inf]   (2.0, inf]  (-1.0, inf]  (-2.22e-16, inf]  unknown     no
2      (38.0, inf]     technician   married  secondary      no   (452.5, inf]      no  yes   cellular   (16.0, inf]   aug   (180.0, inf]  (-inf, 2.0]  (-1.0, inf]  (-2.22e-16, inf]  success    yes
3      (38.0, inf]         admin.   married   tertiary      no  (-inf, 452.5]     yes   no   cellular  (-inf, 16.0]   jul   (180.0, inf]  (-inf, 2.0]  (-1.0, inf]  (-2.22e-16, inf]  unknown     no
4     (-inf, 38

In [28]:
# Read the test set with the same column names and bucket it with the edges
# derived from the training data.
testData = cleanNumeric(
    pd.read_csv(file+'/test.csv',names=categories),
    categories,
    buckets,
)
print(testData)

               age            job   marital  education default        balance housing loan    contact           day month       duration     campaign        pdays          previous poutcome target
0      (38.0, inf]     management    single  secondary      no   (452.5, inf]      no   no   cellular  (-inf, 16.0]   jun   (180.0, inf]   (2.0, inf]  (-1.0, inf]  (-2.22e-16, inf]  unknown     no
1      (38.0, inf]    blue-collar   married  secondary      no  (-inf, 452.5]     yes   no   cellular  (-inf, 16.0]   may   (180.0, inf]  (-inf, 2.0]  (-1.0, inf]  (-2.22e-16, inf]  failure     no
2      (38.0, inf]        retired   married    primary      no  (-inf, 452.5]      no   no  telephone   (16.0, inf]   jul  (-inf, 180.0]   (2.0, inf]  (-1.0, inf]  (-2.22e-16, inf]  unknown     no
3     (-inf, 38.0]   entrepreneur    single   tertiary      no  (-inf, 452.5]     yes  yes    unknown  (-inf, 16.0]   jun   (180.0, inf]  (-inf, 2.0]  (-1.0, inf]  (-2.22e-16, inf]  unknown     no
4     (-inf, 38

In [29]:
# Build a tree limited to a single split on the training data, using the
# entropy functions as error measures, and pretty-print it.
tree = buildTree(tData, func1=findEntropy, func2=findEntropyCat, depth=1)
pprint.pprint(tree)

Gains: [0.0007604567497105919, 0.013310658043091395, 0.00452142294942548, 0.006626151027877558, 0.0007237854700453683, 0.0037189338644759706, 0.013953305731499444, 0.0048523948100474, 0.018113032142454077, 0.0008070074610828648, 0.03646957704202247, 0.05960230138776068, 0.004748460695890078, 0.0, 0.0, 0.03964955279086213]
no     2426
yes      77
Name: target, dtype: int64
no     1978
yes     519
Name: target, dtype: int64
{'duration': {Interval(-inf, 180.0, closed='right'): 'no',
              Interval(180.0, inf, closed='right'): 'no'}}
