In [2]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

income = pandas.read_csv('income.csv',index_col=False)
columns = [['age','workclass','fnlwgt','education','education_num','marital_status','occupation','relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country','high_income']]
income.columns = columns
print(income.head(5))

   age          workclass  fnlwgt   education  education_num  \
0   50   Self-emp-not-inc   83311   Bachelors             13   
1   38            Private  215646     HS-grad              9   
2   53            Private  234721        11th              7   
3   28            Private  338409   Bachelors             13   
4   37            Private  284582     Masters             14   

        marital_status          occupation    relationship    race      sex  \
0   Married-civ-spouse     Exec-managerial         Husband   White     Male   
1             Divorced   Handlers-cleaners   Not-in-family   White     Male   
2   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
3   Married-civ-spouse      Prof-specialty            Wife   Black   Female   
4   Married-civ-spouse     Exec-managerial            Wife   White   Female   

   capital_gain  capital_loss  hours_per_week  native_country high_income  
0             0             0              13   United-States   

In [3]:
#Converting categorical variables in income to codes using Categorical.from_array

col = pandas.Categorical.from_array(income['workclass'])
income['workclass'] = col.codes
print(income['workclass'].head(5))
for name in ["education", "marital_status", "occupation", "relationship", "race", "sex", "native_country", "high_income"]:
    col = pandas.Categorical.from_array(income[name])
    income[name] = col.codes

0    6
1    4
2    4
3    4
4    4
Name: workclass, dtype: int8


In [4]:
#Splitting dataset based on whether workclass is 'private' or not

private_incomes = income[income['workclass']==4]
public_incomes = income[income['workclass']!=4]
print(private_incomes.head(2))
print(public_incomes.head(2))

   age  workclass  fnlwgt  education  education_num  marital_status  \
1   38          4  215646         11              9               0   
2   53          4  234721          1              7               2   

   occupation  relationship  race  sex  capital_gain  capital_loss  \
1           6             1     4    1             0             0   
2           6             0     2    1             0             0   

   hours_per_week  native_country  high_income  
1              40              39            0  
2              40              39            0  
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   50          6   83311          9             13               2   
6   52          6  209642         11              9               2   

   occupation  relationship  race  sex  capital_gain  capital_loss  \
0           4             0     4    1             0             0   
6           4             0     4    1             0             0   

   

In [5]:
# Computing entropy

import math
values = income['high_income'].value_counts()
print(values)

prob_0 = income[income['high_income']==0].shape[0] / income.shape[0]
prob_1 = income[income['high_income']==1].shape[0] / income.shape[0]
income_entropy = -(prob_0*math.log(prob_0,2) + prob_1*(math.log(prob_1,2)))
print(income_entropy)

0    24719
1     7841
Name: high_income, dtype: int64
0.7963962067582189


In [6]:
# Calculating information gain(IG)

def calc_entropy(column):
    counts = np.bincount(column)
    probabilities = counts / len(column)
    entropy = 0
    for prob in probabilities:
        if prob > 0:
            entropy = entropy + prob * math.log(prob,2)
    return -entropy

median = np.median(income['age'])
print(median)
left_split = income[income['age'] <= median]
right_split = income[income['age'] > median]

income_entropy = calc_entropy(income['high_income'])
print(income_entropy)
age_information_gain = income_entropy - ((left_split.shape[0] / income.shape[0]) * calc_entropy(left_split["high_income"]) + ((right_split.shape[0] / income.shape[0]) * calc_entropy(right_split["high_income"])))
print(age_information_gain)

37.0
0.796396206758
0.0470370853414


From the values above, we can see that the 'age' column is not a great variable to 'split' on since we get an information gain of only 0.04 i.e only 0.04 bits of information.

In [7]:
def calc_information_gain(data, split_name, target_name):
    original_entropy = calc_entropy(data[target_name])
    
    column = data[split_name]
    median = column.median()
    
    left_split = data[column <= median]
    right_split = data[column > median]
    
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0]) 
        to_subtract += prob * calc_entropy(subset[target_name])
    
    return original_entropy - to_subtract

print(calc_information_gain(income, "age", "high_income"))

columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]
information_gains = []
for col in columns:
    information_gain = calc_information_gain(income, col, "high_income")
    information_gains.append(information_gain)

highest_gain_index = information_gains.index(max(information_gains))
highest_gain = columns[highest_gain_index]
print(highest_gain)
print(information_gains[highest_gain_index])

0.0470370853414
marital_status
0.11142081635


From the above cell of code, we can see that the most information gain is from the 'marital_status' column with 0.11 bits. So, we can split the data on that variable.

In [8]:
# Creating a function to automate the process i.e calculate the column which provides the maximum gain for a particular target variable.

def find_best_column(data, target_name, columns):
    information_gains = []
    for col in columns:
        information_gain = calc_information_gain(income, col, 'high_income')
        information_gains.append(information_gain)
    highest_gain_index = information_gains.index(max(information_gains))
    highest_gain = columns[highest_gain_index]
    return highest_gain

columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]
income_split = find_best_column(income, 'high_income', columns)

In [25]:
# Implementing ID3 recursively and using a dictionary to store the entire tree
import sys
sys.setrecursionlimit(4500) #To avoid the max recursion limit reached error
depth = sys.getrecursionlimit()
print(depth)
data = pandas.DataFrame([
    [0,20,0],
    [1,35,2],
    [1,55,1]
    ])
data.columns = ["high_income", "age", "marital_status"]

tree = {}
nodes = [] #Store the number of nodes

def id3(data, target, columns, tree):
    unique_targets = pandas.unique(data[target])
    
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]

    if len(unique_targets) == 1:
        if 0 in unique_targets:
            tree['label'] = 0
        elif 1 in unique_targets:
            tree['label'] = 1
        return
    
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()
    
    tree['column'] = best_column
    tree['median'] = column_median
    
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]
    split_dict = [["left", left_split], ["right", right_split]]
    
    for name, split in split_dict:
        tree[name] = {}
        id3(split, target, columns, tree[name])

id3(data, "high_income", ["age", "marital_status"], tree)
print(tree)

4500
{'column': 'marital_status', 'right': {'number': 5, 'label': 1}, 'number': 1, 'left': {'column': 'marital_status', 'right': {'number': 4, 'label': 1}, 'number': 2, 'left': {'number': 3, 'label': 0}, 'median': 0.5}, 'median': 1.0}


In [26]:
#Modifying the tree to present it in a better format

def print_with_depth(string, depth):
    prefix = "    " * depth
    print("{0}{1}".format(prefix, string))

def print_node(tree, depth):
    if 'label' in tree:
        print_with_depth("Leaf: Label {0}".format(tree["label"]), depth)
        return
    
    print_with_depth("{0} > {1}".format(tree["column"], tree["median"]), depth)
    
    branches = [tree["left"], tree["right"]]
    for branch in branches:
        print_node(branch, depth+1)
        
print_node(tree, 0)

marital_status > 1.0
    marital_status > 0.5
        Leaf: Label 0
        Leaf: Label 1
    Leaf: Label 1


In [27]:
# Predicting values from the decision tree

def predict(tree, row):
    if 'label' in tree:
        return tree['label']
    
    column = tree['column']
    median = tree['median']
    
    if row[column] <= median:
        return predict(tree['left'],row)
    else:
        return predict(tree['right'],row)
    
print(predict(tree, data.iloc[0]))

0


From the above output you can see that we have predicted the value for a single row. Next step is to try to make multiple predictions at the same time.

In [28]:
# Making multiple predictions at the same time

new_data = pandas.DataFrame([
    [40,0],
    [20,2],
    [80,1],
    [15,1],
    [27,2],
    [38,1]
    ])
# Assign column names to the data.
new_data.columns = ["age", "marital_status"]

def batch_predict(tree, df):
    return df.apply(lambda x: predict(tree,x), axis=1)

predictions = batch_predict(tree, new_data)
print(predictions)

0    0
1    1
2    1
3    1
4    1
5    1
dtype: int64
