## Overview of the dataset

In [3]:
import pandas

# index_col=False tells pandas not to use the first column (age) as the row index
income = pandas.read_csv(
    "income.csv",
    index_col=False,
)
# Preview the first five rows
income.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## Converting categorical variables

- Convert the rest of the categorical columns in income (education, marital_status, occupation, relationship, race, sex, native_country, and high_income) to numeric columns.


In [4]:
# Convert a single column from text categories to numbers.
# pandas.Categorical maps each distinct value to a category; .codes holds the
# integer code of the category for every row.
# NOTE: the text column is overwritten in place with its codes.
col = pandas.Categorical(income["workclass"])
income["workclass"] = col.codes
print(income["workclass"].head(5))

0    7
1    6
2    4
3    4
4    4
Name: workclass, dtype: int8


In [5]:
# Apply the same text-to-code conversion to every remaining categorical column.
# Each column is overwritten in place with its integer category codes.
for i in ["education", "marital_status", "occupation", "relationship","race","sex", "native_country","high_income"]:
    col = pandas.Categorical(income[i])
    income[i] = col.codes
    

In [6]:
# Show a bounded preview of the encoded frame instead of dumping every row
income.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
5,37,4,284582,12,14,2,4,5,4,0,0,0,40,39,0
6,49,4,160187,6,5,3,8,1,2,0,0,0,16,23,0
7,52,6,209642,11,9,2,4,0,4,1,0,0,45,39,1
8,31,4,45781,12,14,4,10,1,4,0,14084,0,50,39,1
9,42,4,159449,9,13,2,4,0,4,1,5178,0,40,39,1


## Creating splits

- Split income into two parts based on the value of the workclass column.

    - private_incomes should contain all rows where workclass is 4.
    - public_incomes should contain all rows where workclass is not 4.

In [7]:
# After the categorical conversion, workclass code 4 is the "Private" category
is_private = income["workclass"] == 4
private_incomes = income[is_private]
public_incomes = income[~is_private]

In [8]:
# Show a bounded preview of the non-private rows instead of dumping every row
public_incomes.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,high_income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
7,52,6,209642,11,9,2,4,0,4,1,0,0,45,39,1
11,30,7,141297,9,13,2,10,0,1,1,0,0,40,19,1
16,25,6,176756,11,9,4,5,3,4,1,0,0,35,39,0
19,43,6,292175,12,14,0,4,4,4,0,0,0,45,39,1
22,35,1,76845,6,5,2,5,0,2,1,0,0,40,39,0
25,56,2,216851,9,13,2,13,0,4,1,0,0,40,39,1
27,54,0,180211,15,10,2,0,0,1,1,0,0,60,35,1
30,23,2,190709,7,12,4,11,1,4,1,0,0,52,39,0


## Overview of Data Set Entropy

Compute the entropy of the high_income column in the income dataframe, and assign the result to income_entropy.

In [9]:
import math
# Worked example: entropy of a five-row column made of two 0s and three 1s
# (the same values are passed to calc_entropy() later as [1,1,0,0,1]).
# Passing in 2 as the second parameter to math.log will take a base 2 log
entropy = -(2/5 * math.log(2/5, 2) + 3/5 * math.log(3/5, 2))
print(entropy)

0.9709505944546686


In [10]:
# Share of rows in each high_income class (the column was encoded to 0/1 above)
prob_0 = income[income["high_income"] == 0].shape[0] / income.shape[0]
prob_1 = income[income["high_income"] == 1].shape[0] / income.shape[0]
# Two-class entropy: -(p0 * log2(p0) + p1 * log2(p1))
# NOTE(review): math.log raises ValueError if either share is 0 (a class with no rows)
income_entropy = -(prob_0 * math.log(prob_0, 2) + prob_1 * math.log(prob_1, 2))

In [11]:
# Show the entropy of the high_income column computed in the previous cell
print(income_entropy)

0.7963839552022132


## Information gain

Compute the information gain for splitting on the age column of income.

- First, compute the median of age.
- Then, assign anything less than or equal to the median to the left branch, and anything greater than the median to the right branch.
- Compute the information gain and assign it to age_information_gain.


In [12]:
import numpy as np

def calc_entropy(column):
    """
    Calculate the base-2 entropy of the labels in a pandas series, list, or numpy array.

    Labels are counted with np.unique, so any values numpy can sort are accepted:
    non-negative integers, negative codes (such as the -1 pandas uses for a
    missing category), or strings. np.bincount would reject the latter two.

    Returns 0 for an empty column.
    """
    total = len(column)
    # Nothing to measure; also avoids dividing by zero below
    if total == 0:
        return 0

    # Count how often each distinct label occurs (labels come back sorted)
    _, counts = np.unique(np.asarray(column), return_counts=True)
    # Divide by the total column length to get a probability per label
    probabilities = counts / total

    # Every label returned by np.unique occurs at least once, so each
    # probability is > 0 and the logarithm is always defined
    entropy = 0
    for prob in probabilities:
        entropy += prob * math.log(prob, 2)

    return -entropy

# Verify that our function matches our answer from earlier
entropy = calc_entropy([1,1,0,0,1])
print(entropy)

# Information gain of a split that sends four of the five rows ([1,1,0,0]) to one
# branch and the remaining row ([1]) to the other. Each branch's entropy is
# weighted by its share of the rows (.8 and .2) and subtracted from the total.
information_gain = entropy - ((.8 * calc_entropy([1,1,0,0])) + (.2 * calc_entropy([1])))
print(information_gain)
# The calc_entropy() function accepts either a list or a series.

0.9709505944546686
0.17095059445466854


In [13]:
# Entropy of the target before any split
income_entropy = calc_entropy(income["high_income"])

# Split point: the median age. Rows at or below it go left, the rest go right.
median_age = income["age"].median()
left_split = income[income["age"] <= median_age]
right_split = income[income["age"] > median_age]

# Each branch's entropy is weighted by its share of the rows
left_weight = left_split.shape[0] / income.shape[0]
right_weight = right_split.shape[0] / income.shape[0]
age_information_gain = income_entropy - (
    left_weight * calc_entropy(left_split["high_income"])
    + right_weight * calc_entropy(right_split["high_income"])
)

In [14]:
# Display the information gain of the median split on age
age_information_gain

0.047028661304691965

## Finding the best split

Create a list called information_gains.

- It should contain, in order, the information gain from splitting on these columns: age, workclass, education_num, marital_status, occupation, relationship, race, sex, hours_per_week, native_country.

Find the highest value in the information_gains list, and assign the corresponding column name to highest_gain.

In [15]:
# Candidate columns, in the order their gains are stored in information_gains
lista = ["age","workclass","education_num","marital_status","occupation","relationship","race","sex","hours_per_week","native_country"]
information_gains = []
income_entropy = calc_entropy(income["high_income"])

# The loop uses its own names (split_column, split_median, weighted_entropy) so it
# does not overwrite median_age / age_information_gain, which hold the age-only
# results from the earlier cell.
for split_column in lista:
    split_median = income[split_column].median()
    left_split = income[income[split_column] <= split_median]
    right_split = income[income[split_column] > split_median]
    # Entropy of each branch, weighted by the share of rows it receives
    weighted_entropy = (
        (left_split.shape[0] / income.shape[0]) * calc_entropy(left_split["high_income"])
        + (right_split.shape[0] / income.shape[0]) * calc_entropy(right_split["high_income"])
    )
    information_gains.append(income_entropy - weighted_entropy)



In [16]:
# Gains are listed in the same order as the column names in lista
print(information_gains)

[0.047028661304691965, 0.006810984054396618, 0.06501298413277423, 0.1114272573715438, 0.0015822303843424645, 0.04736241665026941, 0.0, 0.0, 0.04062246867123487, 0.00013457344495848567]


In [17]:
# Despite its name, max_value holds the position of the largest gain in the
# list, not the gain itself. list.index returns the first matching position.
max_value=information_gains.index(max(information_gains))
max_value

3

In [18]:
# Map the winning position back to its column name (lista and information_gains share an order)
highest_gain=lista[max_value]
highest_gain

'marital_status'

In [19]:
# NOTE(review): leftover exploration. This references the list's index method
# without calling it, so the cell only displays the method object.
information_gains.index

<function list.index>

## Solution

In [20]:
def calc_information_gain(data, split_name, target_name):
    """
    Information gain from splitting `data` at the median of one column.

    Rows whose `split_name` value is at or below the column median form one
    branch and the remaining rows form the other. The result is the entropy
    of `target_name` over all rows minus the row-share-weighted entropies of
    the two branches.
    """
    total_rows = data.shape[0]

    # Threshold for the split: the median of the chosen column
    split_values = data[split_name]
    threshold = split_values.median()

    branches = (
        data[split_values <= threshold],
        data[split_values > threshold],
    )

    # Accumulate each branch's entropy, weighted by its share of the rows
    weighted_entropy = 0
    for branch in branches:
        weight = branch.shape[0] / total_rows
        weighted_entropy += weight * calc_entropy(branch[target_name])

    return calc_entropy(data[target_name]) - weighted_entropy

# Verify that our answer is the same as on the last screen
print(calc_information_gain(income, "age", "high_income"))

# Candidate columns; information_gains is filled in this same order
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]
information_gains = []
# Loop through and compute information gains
for col in columns:
    information_gain = calc_information_gain(income, col, "high_income")
    information_gains.append(information_gain)

# Find the name of the column with the highest gain
# (list.index returns the first position of the maximum, so ties go to the earlier column)
highest_gain_index = information_gains.index(max(information_gains))
highest_gain = columns[highest_gain_index]
highest_gain


0.047028661304691965


'marital_status'

##  Determining the Column to Split On

- Write a function named find_best_column() that returns the name of a column to split the data on. We've started to define this function for you.

- Use find_best_column() to find the best column on which to split income.

    - The target is the high_income column, and the potential columns to split with are in the list columns below.
    - Assign the result to income_split.

In [21]:
# Candidate columns to split on
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

def find_best_column(data, target_name, columns):
    """
    Return the name of the column whose median split yields the highest information gain.

    data        -- DataFrame holding the candidate columns and the target column
    target_name -- name of the target column the gain is measured against
    columns     -- list of candidate column names

    When several columns tie for the highest gain, the one that appears first in
    `columns` is returned. An empty `columns` list raises ValueError (from max()).
    """
    # Measure the gain against the caller's target_name, not a hard-coded column
    information_gains = [
        calc_information_gain(data, col, target_name) for col in columns
    ]

    # list.index returns the first position of the maximum
    highest_gain_index = information_gains.index(max(information_gains))
    return columns[highest_gain_index]

income_split = find_best_column(income, "high_income", columns)

## Creating a Simple Recursive Algorithm

def id3(data, target, columns)

    1 Create a node for the tree
    2 If all values of the target attribute are 1, add 1 to counter_1
    3 If all values of the target attribute are 0, add 1 to counter_0
    4 Using information gain, find A, the column that splits the data best
    5 Find the median value in column A
    6 Split A into values below or equal to the median (0), and values above the median (1)
    7 For each possible value (0 or 1), vi, of A,
    8    Add a new tree branch below Root that corresponds to rows of data where A = vi
    9    Let Examples(vi) be the subset of examples that have the value vi for A
   10    Below this new branch, add the subtree id3(data[A==vi], target, columns)
   11 Return Root

Read the id3() function below and fill in the lines that say "Insert code here...".

- The function should append 1 to label_1s if the node should be a leaf, and only has 1s for high_income.
- It should append 0 to label_0s if the node should be a leaf, and only has 0s for high_income.


In [22]:
# We'll use lists to store our labels for nodes (when we find them)
# A list can be mutated in place (append) from inside the recursive function
# without a `global` declaration, whereas rebinding an integer counter
# (e.g. counter += 1) inside the function would need one.
# Look at the python missions on scoping for more information on this topic
label_1s = []
label_0s = []

def id3(data, target, columns):
    """
    Recursively split `data` and tally the pure leaves that are reached.

    A node is a leaf when its `target` column holds a single distinct value.
    A leaf of 0s appends 0 to the module-level label_0s list; a leaf of 1s
    appends 1 to label_1s. Otherwise the rows are split at the median of the
    column chosen by find_best_column() and both halves are processed in turn.
    """
    distinct_targets = pandas.unique(data[target])

    # Base case: returning here is what stops the recursion
    if len(distinct_targets) == 1:
        if 0 in distinct_targets:
            label_0s.append(0)
        elif 1 in distinct_targets:
            label_1s.append(1)
        return

    # Pick the most informative column and split at its median
    split_column = find_best_column(data, target, columns)
    threshold = data[split_column].median()

    lower_rows = data[data[split_column] <= threshold]
    upper_rows = data[data[split_column] > threshold]

    # Process the left branch first, then the right
    id3(lower_rows, target, columns)
    id3(upper_rows, target, columns)
    
# Example data set from the last screen; each row is
# [high_income, age, marital_status]
data = pandas.DataFrame(
    [
        [0, 20, 0],
        [0, 60, 2],
        [0, 40, 1],
        [1, 25, 1],
        [1, 35, 2],
        [1, 55, 1],
    ],
    columns=["high_income", "age", "marital_status"],
)

# Walk the example data; pure leaves are recorded in label_0s / label_1s.
# NOTE: the lists are appended to, so re-running this cell adds to earlier results.
id3(data, "high_income", ["age", "marital_status"])


In [23]:
# One entry per pure all-0 leaf that id3() reached
label_0s

[0, 0, 0]

In [24]:
# NOTE: this rebinds `data` to a frame whose high_income is 1 in every row,
# so any later cell that reads `data` sees this version.
target = "high_income"
data = pandas.DataFrame(
    [
        [1, 20, 0],
        [1, 60, 2],
        [1, 40, 1],
        [1, 25, 1],
        [1, 35, 2],
        [1, 55, 1],
    ],
    columns=["high_income", "age", "marital_status"],
)

# Distinct values present in the target column
unique_targets = pandas.unique(data[target])

In [25]:
# A length of 1 means the target column holds a single distinct value (the leaf condition in id3)
len(unique_targets)

1

In [26]:
# Display the rebound example frame (six rows, high_income is 1 throughout)
data

Unnamed: 0,high_income,age,marital_status
0,1,20,0
1,1,60,2
2,1,40,1
3,1,25,1
4,1,35,2
5,1,55,1


## Storing the tree
Fill in the sections labelled "Insert code here..." in the id3() function.

- The first section should assign the correct label to the tree dictionary.
    - You can do this by setting the label key equal to the correct label.

- The second section should assign the column and median keys to the tree dictionary.

    - The values should be equal to best_column and column_median.

Finally, call the id3 function with the correct inputs -- id3(data, "high_income", ["age", "marital_status"], tree).

In [27]:
# Create a dictionary to hold the tree
# It is defined outside the function so it can be inspected after id3() fills it in
tree = {}

# This list will let us number the nodes
# id3() appends to it in place, which works from inside the function without
# a `global` declaration
nodes = []

def id3(data, target, columns, tree):
    """
    Recursively build a decision tree for `data`, storing it in the `tree` dict.

    Every node receives a "number" key taken from the module-level `nodes` list.
    A leaf additionally receives a "label" key. An internal node receives
    "column" and "median" (the split) plus "left" and "right", each a nested
    node dict built from the rows at or below / above the median.

    A node becomes a leaf when its target column holds a single distinct value,
    or when the median split would leave one side empty. In the second case the
    rows cannot be separated any further, so the most frequent target value is
    used as the label; this also keeps the recursion from running forever.
    """
    unique_targets = pandas.unique(data[target])

    # Assign the number key to the node dictionary
    nodes.append(len(nodes) + 1)
    tree["number"] = nodes[-1]

    # Pure node: label it and stop
    if len(unique_targets) == 1:
        if 0 in unique_targets:
            tree["label"] = 0
        elif 1 in unique_targets:
            tree["label"] = 1
        return

    best_column = find_best_column(data, target, columns)
    column_median = data[best_column].median()

    left_split = data[data[best_column] <= column_median]
    right_split = data[data[best_column] > column_median]

    # Degenerate split: every row landed on one side, so recursing on that side
    # would repeat this exact call indefinitely. Fall back to a majority leaf.
    if left_split.empty or right_split.empty:
        tree["label"] = data[target].value_counts().idxmax()
        return

    tree["column"] = best_column
    tree["median"] = column_median

    for name, split in [["left", left_split], ["right", right_split]]:
        tree[name] = {}
        id3(split, target, columns, tree[name])

# Build the tree for the current `data`; id3() fills the module-level `tree` dict in place
id3(data, "high_income", ["age", "marital_status"], tree)

In [28]:
# `data` was rebound above to rows whose high_income is always 1, so the tree is a
# single leaf: its two values are the node number (1) and the label (1)
tree.values()

dict_values([1, 1])

## Printing Labels for a More Attractive Tree

Fill in the gaps in the print_node() function that say "Insert code here...".

- Your code should iterate through both branches of the branches list (in order), and recursively call print_node().
    - Don't forget to increment depth when you call print_node.
    
Call print_node(), and pass in tree and depth 0.

In [29]:
def print_with_depth(string, depth):
    """Print `string` indented by four spaces for every level of `depth`."""
    indent = depth * "    "
    print(indent + "{0}".format(string))
    
    
def print_node(tree, depth):
    """
    Print the tree rooted at `tree`, indenting each level by `depth`.

    A node with a "label" key is printed as a leaf. Any other node is printed
    as "<column> > <median>", followed by its left and then its right branch,
    one level deeper.
    """
    is_leaf = "label" in tree
    if is_leaf:
        # Leaves end the recursion
        print_with_depth("Leaf: Label {0}".format(tree["label"]), depth)
        return

    # Describe the split made at this node
    print_with_depth("{0} > {1}".format(tree["column"], tree["median"]), depth)

    # Look up both branches before descending into either of them
    left_branch, right_branch = tree["left"], tree["right"]
    print_node(left_branch, depth + 1)
    print_node(right_branch, depth + 1)

# Print the whole tree, starting at the root with no indentation
print_node(tree, 0)

Leaf: Label 1


## Making Predictions automatically

Fill in the gaps in the predict() function that say "Insert code here...".

- The code should check whether row[column] is less than or equal to median, and return the appropriate result for each side of the tree.

- Print the result of predicting the first row of the data with predict(tree, data.iloc[0]).

In [30]:
def predict(tree, row):
    """
    Predict the label for one row by walking the tree down to a leaf.

    tree -- node dict: a leaf has a "label" key; an internal node has
            "column", "median", "left" and "right" keys
    row  -- anything indexable by column name (e.g. a pandas Series or a dict)

    Returns the "label" value of the leaf the row ends up in.
    """
    if "label" in tree:
        return tree["label"]

    column = tree["column"]
    median = tree["median"]

    # Recurse into the matching branch. Returning the branch itself would hand
    # back a subtree dict rather than a label whenever the tree is deeper than
    # a single leaf.
    if row[column] <= median:
        return predict(tree["left"], row)
    else:
        return predict(tree["right"], row)

# Print the prediction for the first row in our data
# (`tree` and `data` are the module-level objects built in the cells above)
print(predict(tree, data.iloc[0]))

1


In [31]:
# The tree is still the single leaf built earlier: node number 1, label 1
tree.values()

dict_values([1, 1])

## Making Multiple Predictions

Create a function named batch_predict() that takes two parameters, tree and df.

- It should use the apply() method to apply the predict() function across each row of df.
    - You can use lambda functions to pass tree and row into predict.

Call batch_predict() with new_data as the parameter df, and assign the result to predictions.

In [32]:
# Unlabelled rows to predict on; each row is [age, marital_status]
new_data = pandas.DataFrame(
    [
        [40, 0],
        [20, 2],
        [80, 1],
        [15, 1],
        [27, 2],
        [38, 1],
    ],
    columns=["age", "marital_status"],
)

def batch_predict(tree, df):
    """Apply predict() with `tree` to every row of `df`; returns what df.apply(..., axis=1) produces."""
    def predict_row(row):
        return predict(tree, row)

    # axis=1 hands each row (not each column) to predict_row
    return df.apply(predict_row, axis=1)

predictions = batch_predict(tree, new_data)


In [33]:
# One prediction per row of new_data
predictions

0    1
1    1
2    1
3    1
4    1
5    1
dtype: int64

## Making predictions automatically

Fill in the gaps in the predict() function that say "Insert code here...".

- The code should check whether row[column] is less than or equal to median, and return the appropriate result for each side of the tree.

- Print the result of predicting the first row of the data with predict(tree, data.iloc[0]).


In [34]:
def predict(tree, row):
    """
    Predict the label for one row by walking the tree down to a leaf.

    tree -- node dict: a leaf has a "label" key; an internal node has
            "column", "median", "left" and "right" keys
    row  -- anything indexable by column name (e.g. a pandas Series or a dict)

    Returns the "label" value of the leaf the row ends up in.
    """
    if "label" in tree:
        return tree["label"]

    column = tree["column"]
    median = tree["median"]

    # Recurse into the matching branch. Returning the branch itself would hand
    # back a subtree dict rather than a label whenever the tree is deeper than
    # a single leaf.
    if row[column] <= median:
        return predict(tree["left"], row)
    else:
        return predict(tree["right"], row)
    
# Print the prediction for the first row in our data
print(predict(tree, data.iloc[0]))

1


## Applying decision trees

Fit clf to the income data.

- Pass in income[columns] so that we only use the named columns as predictors.
- The target is the high_income column.

In [35]:
from sklearn.tree import DecisionTreeClassifier

# A list of columns to train with
# We've already converted all columns to numeric
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

# Instantiate the classifier
# Set random_state to 1 to make sure the results are consistent
clf = DecisionTreeClassifier(random_state=1)

# We've already loaded the variable "income," which contains all of the income data
# NOTE: this fits on every row of income; the train/test split only happens in a later cell
clf.fit(income[columns],income["high_income"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=1,
            splitter='best')

## Splitting the Data into Train and Test 

All of the rows in income with a position up to train_max_row (but not including it) will be part of the training set.

- Make a new dataframe called train containing all of these rows.
- Make a dataframe called test containing all of the rows with a position greater than or equal to train_max_row.


In [36]:
import numpy
import math
import pandas  as pd

# Set a random seed so the shuffle is the same every time
numpy.random.seed(1)

# Shuffle the rows:
# numpy.random.permutation returns the index labels in random order, and
# reindexing the dataframe with them puts the rows into that order.
# NOTE: income is rebound to its shuffled version, so running this cell a second
# time shuffles the already-shuffled rows and yields a different order.
income = income.reindex(numpy.random.permutation(income.index))

# The first 80% of the shuffled rows (by position) train the model; the rest test it
train_max_row = math.floor(income.shape[0] * .8)
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]

## Evaluating Error With AUC

- Compute the AUC between predictions and the high_income column of test, and assign the result to error.
- Use the print function to display error.


In [37]:
from sklearn.metrics import roc_auc_score

# Refit a fresh classifier on the training rows only
clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[columns], train["high_income"])

# Predict on the held-out rows
predictions = clf.predict(test[columns])

# NOTE: despite its name, `error` holds the AUC score, where higher is better.
# NOTE(review): the AUC is computed from clf.predict() output rather than from
# predicted probabilities; confirm this is intended.
error=roc_auc_score(test["high_income"],predictions)
print(error)

0.6934656324746192


## Computing Error on the Training Set

- Print out the AUC score between predictions and the high_income column of train.

In [38]:
# Score the same fitted classifier on the rows it was trained on.
# NOTE: `predictions` and `error` are reused here and now refer to the training set;
# `error` still holds an AUC score.
predictions = clf.predict(train[columns])
error=roc_auc_score(train["high_income"],predictions)
print(error)

0.9471244501437455


## Reducing Overfitting With a Shallower Tree

- Set min_samples_split to 13 when creating the DecisionTreeClassifier.
- Make predictions on the training set, compute the AUC, and assign it to train_auc.
- Make predictions on the test set, compute the AUC, and assign it to test_auc.


In [39]:
# Decision trees model from the last screen
# Same classifier as before, but with min_samples_split raised to 13
clf = DecisionTreeClassifier(random_state=1,min_samples_split=13)
clf.fit(train[columns], train["high_income"])
# AUC on the training rows (printed first)
predictions1=clf.predict(train[columns])
train_auc=roc_auc_score(train["high_income"],predictions1)
print(train_auc)
# AUC on the held-out rows (printed second)
predictions2=clf.predict(test[columns])
test_auc=roc_auc_score(test["high_income"],predictions2)
print(test_auc)


0.8421431849275413
0.6995617145150872


## Tweaking Parameters to Adjust AUC

- Set max_depth to 7 and min_samples_split to 13 when creating the DecisionTreeClassifier.
- Make predictions on the training set, compute the AUC, and assign it to train_auc.
- Make predictions on the test set, compute the AUC, and assign it to test_auc.

In [40]:
# The first decision trees model we trained and tested
clf = DecisionTreeClassifier(random_state=1,max_depth=7, min_samples_split=13)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test["high_income"], predictions)

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train["high_income"], train_predictions)

print(test_auc)
print(train_auc)

0.7436344996725136
0.748037708309209


## Tweaking Tree Depth to Adjust AUC

- Set max_depth to 2 and min_samples_split to 100 when creating the DecisionTreeClassifier.
- Make predictions on the training set, compute the AUC, and assign it to train_auc.
- Make predictions on the test set, compute the AUC, and assign it to test_auc.


In [41]:
# The first decision tree model we trained and tested
clf=DecisionTreeClassifier(random_state=1,max_depth=2,min_samples_split=100)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test["high_income"], predictions)

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train["high_income"], train_predictions)

print(test_auc)
print(train_auc)

0.6553138481876499
0.6624508042161483


## Exploring Decision Tree Variance

- Fit the classifier to the training data.

- Make predictions on the training set, compute the AUC, and assign it to train_auc.

- Make predictions on the test set, compute the AUC, and assign it to test_auc.



In [42]:
# Reset the seed so the generated noise column is the same on every run
numpy.random.seed(1)

# Generate a column of random integers; randint(4) draws from 0 to 3 (4 is excluded)
# NOTE: this adds the column to income in place
income["noise"] = numpy.random.randint(4, size=income.shape[0])

# Adjust "columns" to include the noise column
columns = ["noise", "age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

# Make new train and test sets
# (income is sliced again so that both sets contain the new noise column)
train_max_row = math.floor(income.shape[0] * .8)
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]

# Initialize the classifier
clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(test["high_income"], predictions)

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train["high_income"], train_predictions)
# Printed in this order: train AUC first, then test AUC
print(train_auc)
print(test_auc)


0.9750761614350801
0.6914060013941348
