In [9]:
import pandas as pd
# Names assigned positionally to the columns of the file; because explicit
# names are passed, read_csv treats the first line of the file as data.
columns = [
    'age', 'workclass', 'fnlwgt', 'education', 'education_num',
    'marital_status', 'occupation', 'relationship', 'race', 'sex',
    'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
    'high_income',
]
income = pd.read_csv('adult.data', names=columns)

In [16]:
# Columns whose values are replaced by integer category codes so that the
# entropy code below (which counts integer values) can work on them.
target_col = ['workclass', 'education', 'marital_status', 'occupation',
              'relationship', 'race', 'sex', 'native_country', 'high_income']

for target in target_col:
    # The pd.Categorical constructor replaces Categorical.from_array, which
    # was deprecated and later removed from pandas; the codes are unchanged.
    col = pd.Categorical(income[target])
    income[target] = col.codes

In [17]:
import numpy as np
import math

def calc_entropy(column):
    """Return the base-2 Shannon entropy of a column of integer values.

    Parameters
    ----------
    column : sequence of non-negative integers
        Values are tallied with ``np.bincount``, which only accepts
        non-negative integers.

    Returns
    -------
    The entropy in bits: the negated sum of ``p * log2(p)`` over every
    value that occurs at least once.
    """
    # Tally how often each integer from 0 up to the column maximum occurs
    value_counts = np.bincount(column)
    total = float(len(column))
    proportions = value_counts / total

    # Values that never occur are skipped, since log(0) is undefined
    weighted_logs = (p * math.log(p, 2) for p in proportions if p > 0)
    return -sum(weighted_logs)

def calc_information_gain(data,split_name,target_name):
    """Return the information gain of splitting ``data`` at a column's median.

    Rows whose ``split_name`` value is less than or equal to the column
    median form one subset and the remaining rows form the other. The gain
    is the entropy of ``target_name`` over all rows minus the size-weighted
    entropy of ``target_name`` within the two subsets.
    """
    # Entropy of the target before any split
    entropy_before = calc_entropy(data[target_name])

    # Divide the rows at the median of the column being evaluated
    split_values = data[split_name]
    threshold = split_values.median()
    at_or_below = data[split_values <= threshold]
    above = data[split_values > threshold]

    # Each subset's entropy is weighted by its share of the rows
    total_rows = float(data.shape[0])
    entropy_after = 0
    for part in (at_or_below, above):
        weight = part.shape[0] / total_rows
        entropy_after += weight * calc_entropy(part[target_name])

    return entropy_before - entropy_after

# 2: ID3 Algorithm
We'll use the ID3 algorithm to construct decision trees. This algorithm involves recursion.

In general, recursion is the process of splitting a large problem into small chunks. Recursive functions will call themselves, then combine the results to create a final result.

# 3: Algorithm Example
*(Figure: algorithm example — the image is not available here.)*

# 4: Column split selection
We now need a function to return the name of the column to split a dataset on. The function should take the dataset, the target column, and a list of columns we might want to split on as input.

In [19]:
# Candidate columns that the tree is allowed to split on.
columns = [
    "age",
    "workclass",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "sex",
    "hours_per_week",
    "native_country",
]

def find_best_column(data,target_name,columns):
    """Return the name of the column whose median split has the highest information gain.

    Parameters
    ----------
    data : table passed through to ``calc_information_gain``
    target_name : name of the column holding the labels
    columns : list of candidate column names to evaluate

    Returns
    -------
    The entry of ``columns`` with the largest information gain; when several
    columns tie, the one listed first is returned.

    Raises
    ------
    ValueError
        If ``columns`` is empty (``max`` of an empty list).
    """
    # Score every candidate against the requested target column (previously
    # the target was hard-coded to 'high_income' and target_name was ignored).
    information_gains = [
        calc_information_gain(data, col, target_name) for col in columns
    ]

    highest_gain_index = information_gains.index(max(information_gains))
    return columns[highest_gain_index]

# Best column for the first split of the full income data.
income_split = find_best_column(income, 'high_income', columns)

# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the print calls used in the other cells.
print(income_split)

marital_status


# 5: Creating a simple recursive algorithm

In [25]:
# Small hand-made dataset used to exercise the recursive algorithm.
data = pd.DataFrame({
    'high_income': [0, 0, 0, 1, 1, 1],
    'age': [20, 60, 40, 25, 35, 55],
    'marital_status': [0, 2, 1, 1, 2, 1],
})

# Accumulators that id3 appends to each time it reaches a leaf.
label_0s, label_1s = [], []

def id3(data,target,columns):
    """Recursively split ``data`` and record the label of every leaf reached.

    At each step the rows are divided at the median of the column chosen by
    ``find_best_column``. When every row in a subset has the same ``target``
    value the branch ends: a 0 is appended to the module-level ``label_0s``
    list or a 1 to ``label_1s``.

    Parameters
    ----------
    data : pandas.DataFrame of rows still to be split
    target : name of the label column
    columns : list of column names that may be split on

    Returns
    -------
    None; results are recorded in ``label_0s`` and ``label_1s``.
    """
    unique_targets = pd.unique(data[target])

    # Every row has the same label: this is a leaf
    if len(unique_targets) == 1:
        if 0 in unique_targets:
            label_0s.append(0)
        elif 1 in unique_targets:
            label_1s.append(1)
        return

    best_column = find_best_column(data,target,columns)
    column_median = data[best_column].median()

    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]

    # If the median split leaves one side empty, the other side is the same
    # set of rows again and recursing would never terminate. End the branch
    # here and record the most frequent label instead.
    if len(left_split) == 0 or len(right_split) == 0:
        majority_label = data[target].value_counts().idxmax()
        if majority_label == 0:
            label_0s.append(0)
        elif majority_label == 1:
            label_1s.append(1)
        return

    for split in [left_split,right_split]:
        id3(split,target,columns)

# Run the recursion on the toy dataset, allowing splits on both feature columns.
id3(data, target='high_income', columns=['age', 'marital_status'])

# 6: Storing the tree
We can now store the entire tree instead of just the labels at the leaves. To do this, we'll use nested dictionaries: the root node is a dictionary, and its branches are stored under the keys `left` and `right`.

In [26]:
# Root node of the stored tree; filled in place by id3.
tree = dict()
# One entry per node visited; its length supplies the next node number.
nodes = list()

def id3(data,target,columns,tree):
    """Recursively build a decision tree for ``data`` inside the dict ``tree``.

    Each node dictionary receives a ``'number'`` (its visit order, taken from
    the module-level ``nodes`` list). Leaves additionally receive a
    ``'label'``; internal nodes receive the ``'column'`` and ``'median'`` used
    for the split plus ``'left'`` and ``'right'`` child dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame of rows still to be split
    target : name of the label column
    columns : list of column names that may be split on
    tree : dict to populate for the current node (modified in place)

    Returns
    -------
    None; the result is written into ``tree``.
    """
    unique_targets = pd.unique(data[target])

    # Number nodes in the order they are visited
    nodes.append(len(nodes)+1)
    tree['number'] = nodes[-1]

    # Every row has the same label: this is a leaf
    if len(unique_targets) == 1:
        if 0 in unique_targets:
            tree['label'] = 0
        elif 1 in unique_targets:
            tree['label'] = 1
        return

    best_column = find_best_column(data,target,columns)
    column_median = data[best_column].median()

    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]

    # If the median split leaves one side empty, the other side is the same
    # set of rows again and recursing would never terminate. Turn this node
    # into a leaf carrying the most frequent label instead. Labels are integer
    # codes (the entropy calculation counts them with np.bincount), so the
    # value is stored as a plain int like the literal labels above.
    if len(left_split) == 0 or len(right_split) == 0:
        tree['label'] = int(data[target].value_counts().idxmax())
        return

    tree['column'] = best_column
    tree['median'] = column_median

    split_dict = [['left',left_split],['right',right_split]]

    for name,split in split_dict:
        tree[name] = {}
        id3(split,target,columns,tree[name])

# Build the tree for the toy dataset; the result is stored in `tree`.
id3(data, 'high_income', ['age', 'marital_status'], tree=tree)

# 7: A prettier tree
The tree dictionary shows all the relevant information, but it doesn't look very good. We can fix this by printing out our dictionary in a nicer way.

If we find a dictionary with a label key, then we know it's a leaf, so we print out the label of the leaf. We'll also need to keep track of a depth variable so we can indent the nodes properly to indicate which nodes come before others. When we print out anything, we'll take the depth variable into account by adding space beforehand.

In [30]:
def print_with_depth(string,depth):
    """Print ``string`` indented by four spaces for each level of ``depth``."""
    indent = depth * "    "
    print(indent + '{0}'.format(string))

def print_node(tree,depth):
    """Print the (sub)tree stored in ``tree``, one node per line.

    A node with a ``'label'`` key is printed as a leaf. Any other node is
    printed as ``column>median`` and is followed by its ``'left'`` and
    ``'right'`` children, each indented one level deeper than ``depth``.
    """
    if 'label' not in tree:
        # Internal node: show the split, then descend into both children
        split_text = '{0}>{1}'.format(tree['column'],tree['median'])
        print_with_depth(split_text,depth)

        children = (tree['left'],tree['right'])
        for child in children:
            print_node(child,depth+1)
    else:
        leaf_text = 'Leaf: label{0}'.format(tree['label'])
        print_with_depth(leaf_text,depth)

# Print the whole tree, starting at the root with no indentation.
print_node(tree, depth=0)

age>37.5
    age>25.0
        age>22.5
            Leaf: label0
            Leaf: label1
        Leaf: label1
    age>55.0
        age>47.5
            Leaf: label0
            Leaf: label1
        Leaf: label0


# 9: Automatic predictions

In [36]:
def predict(tree,row):
    """Return the label that ``tree`` assigns to ``row``.

    Starting at the root, follow the ``'left'`` child when the row's value in
    the node's ``'column'`` is less than or equal to the node's ``'median'``,
    otherwise the ``'right'`` child, until a node with a ``'label'`` key is
    reached; that label is returned.
    """
    node = tree
    while 'label' not in node:
        split_column = node['column']
        threshold = node['median']
        branch = 'left' if row[split_column] <= threshold else 'right'
        node = node[branch]
    return node['label']

# Predict the label of the last row (position 5) of the toy dataset.
print(predict(tree, data.iloc[5]))

1


# 10: Making multiple predictions
We can use the `apply` method on pandas DataFrames to apply a function across each row.

In [39]:
# Unseen rows to classify. The column names must match the ones the tree was
# built from, so the second column is 'marital_status' (it was previously
# misspelled 'martial_status').
new_data = pd.DataFrame(
    [
        [40, 0],
        [20, 2],
        [80, 1],
        [15, 1],
        [27, 2],
        [38, 1],
    ],
    columns=['age', 'marital_status'],
)

def batch_predict(tree,df):
    """Apply ``predict`` with ``tree`` to every row of ``df``.

    Returns the result of ``df.apply(..., axis=1)``: one prediction per row.
    """
    def predict_row(row):
        return predict(tree,row)

    return df.apply(predict_row,axis=1)

# Classify every row of new_data with the stored tree.
prediction = batch_predict(tree, new_data)
# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the print calls used in the other cells.
print(prediction)

0    0
1    0
2    0
3    0
4    1
5    0
dtype: int64
