In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

income = pandas.read_csv('income.csv',index_col=False)
columns = [['age','workclass','fnlwgt','education','education_num','marital_status','occupation','relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country','high_income']]
income.columns = columns
print(income.head(5))

   age          workclass  fnlwgt   education  education_num  \
0   50   Self-emp-not-inc   83311   Bachelors             13   
1   38            Private  215646     HS-grad              9   
2   53            Private  234721        11th              7   
3   28            Private  338409   Bachelors             13   
4   37            Private  284582     Masters             14   

        marital_status          occupation    relationship    race      sex  \
0   Married-civ-spouse     Exec-managerial         Husband   White     Male   
1             Divorced   Handlers-cleaners   Not-in-family   White     Male   
2   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
3   Married-civ-spouse      Prof-specialty            Wife   Black   Female   
4   Married-civ-spouse     Exec-managerial            Wife   White   Female   

   capital_gain  capital_loss  hours_per_week  native_country high_income  
0             0             0              13   United-States   

In [2]:
#Converting categorical variables in income to codes using Categorical.from_array

col = pandas.Categorical.from_array(income['workclass'])
income['workclass'] = col.codes
print(income['workclass'].head(5))
for name in ["education", "marital_status", "occupation", "relationship", "race", "sex", "native_country", "high_income"]:
    col = pandas.Categorical.from_array(income[name])
    income[name] = col.codes

0    6
1    4
2    4
3    4
4    4
Name: workclass, dtype: int8


In [3]:
# Working with Ensembles.
#First, we create 2 different models with varying depth controlling parameters and calculate their roc error values.

import math
np.random.seed(1)
income = income.reindex(np.random.permutation(income.index))

train_max_row = math.floor(income.shape[0] * .8)

train = income[:train_max_row]
test = income[train_max_row:]

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score

columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=75)
clf.fit(train[columns], train["high_income"])

clf2 = DecisionTreeClassifier(random_state=1, max_depth=6)
clf2.fit(train[columns], train["high_income"])

predictions = clf.predict(test[columns])
error = roc_auc_score(predictions, test['high_income'])
print(error)

predictions = clf2.predict(test[columns])
error = roc_auc_score(predictions, test['high_income'])
print(error)

0.788010055233
0.773232669946


From the above values, we can see the roc error value for the 2 models with some minor variations in the depth controlling parameters.

In [4]:
#Combining predictions using predict_proba

predictions = clf.predict_proba(test[columns])[:,1]
predictions2 = clf2.predict_proba(test[columns])[:,1]

mean = (predictions + predictions2)/2
mean = np.round(mean)

error = roc_auc_score(mean, test['high_income'])
print(error)

0.791496726317


From the above value we can see that the combined predictions for the models is better than the individual scores. Hence ensembling will help us in achieving more accurate predictions.

In [5]:
# Introducing Bagging as a technique to introduce variation in the DTs to gain better AUC score.

tree_count = 10 #Bagging for 10 trees

bag_proportion = 0.6 #Each bag will have 60% of the number of original rows

predictions = []
for i in range(tree_count):
        bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
        clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=75)
        clf.fit(bag[columns], bag["high_income"])
        predictions.append(clf.predict_proba(test[columns])[:,1])

mean = sum(predictions) / 10
mean = np.round(mean)

error = roc_auc_score(mean, test['high_income'])
print(error)


0.795120093815


Here we can see that by introducing variations by Bagging in 10 DTs instead of 1, we have managed to improve the AUC score over the single DT.

In [9]:
# Selecting Random Features i.e selecting a random sample of features from the data, computing the Information Gain for each feature and then selecting the one with the highest Gain.

data = pandas.DataFrame([
    [0,4,20,0],
    [0,4,60,2],
    [0,5,40,1],
    [1,4,25,1],
    [1,5,35,2],
    [1,5,55,1]
    ])
data.columns = ["high_income", "employment", "age", "marital_status"]

def calc_information_gain(data, split_name, target_name):
    original_entropy = calc_entropy(data[target_name])
    
    column = data[split_name]
    median = column.median()
    
    left_split = data[column <= median]
    right_split = data[column > median]
    
    to_subtract = 0
    for subset in [left_split, right_split]:
        prob = (subset.shape[0] / data.shape[0]) 
        to_subtract += prob * calc_entropy(subset[target_name])
    
    return original_entropy - to_subtract


def calc_entropy(column):
    counts = np.bincount(column)
    probabilities = counts / len(column)
    entropy = 0
    for prob in probabilities:
        if prob > 0:
            entropy = entropy + prob * math.log(prob,2)
    return -entropy


# Set a random seed to make results reproducible.
np.random.seed(1)

# The dictionary to store our tree.
tree = {}
nodes = []

# The function to find the column to split on.
def find_best_column(data, target_name, columns):
    information_gains = []
    
    # Insert your code here.
    
    for col in columns:
        information_gain = calc_information_gain(data, col, "high_income")
        information_gains.append(information_gain)

    # Find the name of the column with the highest gain.
    highest_gain_index = information_gains.index(max(information_gains))
    highest_gain = columns[highest_gain_index]
    return highest_gain

# The function to construct an id3 decision tree.
def id3(data, target, columns, tree):
    unique_targets = pandas.unique(data[target])
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]

    if len(unique_targets) == 1:
        if 0 in unique_targets:
            tree["label"] = 0
        elif 1 in unique_targets:
            tree["label"] = 1
        return
    
    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()
    
    tree["column"] = best_column
    tree["median"] = column_median
    
    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]
    split_dict = [["left", left_split], ["right", right_split]]
    
    for name, split in split_dict:
        tree[name] = {}
        id3(split, target, columns, tree[name])


# Run the id3 algorithm on our dataset and print the resulting tree.
id3(data, "high_income", ["employment", "age", "marital_status"], tree)
print(tree)
def find_best_column(data, target_name, columns):
    information_gains = []
    
    # Select two columns randomly.
    cols = np.random.choice(columns, 2)
    
    for col in cols:
        information_gain = calc_information_gain(data, col, "high_income")
        information_gains.append(information_gain)

    highest_gain_index = information_gains.index(max(information_gains))
    
    # Get the highest gain by indexing cols.
    highest_gain = cols[highest_gain_index]
    
    return highest_gain

id3(data, "high_income", ["employment", "age", "marital_status"], tree)
print(tree)

{'column': 'employment', 'median': 4.5, 'left': {'column': 'age', 'median': 25.0, 'left': {'column': 'age', 'median': 22.5, 'left': {'label': 0, 'number': 4}, 'number': 3, 'right': {'label': 1, 'number': 5}}, 'number': 2, 'right': {'label': 0, 'number': 6}}, 'number': 1, 'right': {'column': 'age', 'median': 40.0, 'left': {'column': 'age', 'median': 37.5, 'left': {'label': 1, 'number': 9}, 'number': 8, 'right': {'label': 0, 'number': 10}}, 'number': 7, 'right': {'label': 1, 'number': 11}}}
{'column': 'age', 'median': 37.5, 'left': {'column': 'employment', 'median': 4.0, 'left': {'column': 'age', 'median': 22.5, 'left': {'label': 0, 'number': 15}, 'number': 14, 'right': {'label': 1, 'number': 16}}, 'number': 13, 'right': {'label': 1, 'number': 17}}, 'number': 12, 'right': {'column': 'age', 'median': 55.0, 'left': {'column': 'age', 'median': 47.5, 'left': {'label': 0, 'number': 20}, 'number': 19, 'right': {'label': 1, 'number': 21}}, 'number': 18, 'right': {'label': 0, 'number': 22}}}


In [11]:
# Using the Scikit-Learn package to do the same.

tree_count = 10

# Each "bag" will have 70% of the number of original rows.
bag_proportion = .7

predictions = []
for i in range(tree_count):
    # We select 80% of the rows from train, sampling with replacement.
    # We set a random state to ensure we'll be able to replicate our results.
    # We set it to i instead of a fixed value so we don't get the same sample every time.
    bag = train.sample(frac=bag_proportion, replace=True, random_state=i)
    
    # Fit a decision tree model to the "bag".
    clf = DecisionTreeClassifier(random_state=1, min_samples_leaf=75, splitter='random', max_features='auto')
    clf.fit(bag[columns], bag["high_income"])
    
    # Using the model, make predictions on the test data.
    predictions.append(clf.predict_proba(test[columns])[:,1])

combined = np.sum(predictions, axis=0) / 10
rounded = np.round(combined)

print(roc_auc_score(rounded, test["high_income"]))

0.792676621435


In [12]:
# Using RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10, random_state=1, min_samples_leaf=75)
clf.fit(train[columns], train['high_income'])

predictions = clf.predict(test[columns])

error = roc_auc_score(predictions, test['high_income'])
print(error)

0.791664009288
