In [1]:
import pandas
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

income = pandas.read_csv('income.csv',index_col=False)
columns = [['age','workclass','fnlwgt','education','education_num','marital_status','occupation','relationship','race','sex','capital_gain','capital_loss','hours_per_week','native_country','high_income']]
income.columns = columns
print(income.head(5))

   age          workclass  fnlwgt   education  education_num  \
0   50   Self-emp-not-inc   83311   Bachelors             13   
1   38            Private  215646     HS-grad              9   
2   53            Private  234721        11th              7   
3   28            Private  338409   Bachelors             13   
4   37            Private  284582     Masters             14   

        marital_status          occupation    relationship    race      sex  \
0   Married-civ-spouse     Exec-managerial         Husband   White     Male   
1             Divorced   Handlers-cleaners   Not-in-family   White     Male   
2   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
3   Married-civ-spouse      Prof-specialty            Wife   Black   Female   
4   Married-civ-spouse     Exec-managerial            Wife   White   Female   

   capital_gain  capital_loss  hours_per_week  native_country high_income  
0             0             0              13   United-States   

In [2]:
#Converting categorical variables in income to codes using Categorical.from_array

col = pandas.Categorical.from_array(income['workclass'])
income['workclass'] = col.codes
print(income['workclass'].head(5))
for name in ["education", "marital_status", "occupation", "relationship", "race", "sex", "native_country", "high_income"]:
    col = pandas.Categorical.from_array(income[name])
    income[name] = col.codes

0    6
1    4
2    4
3    4
4    4
Name: workclass, dtype: int8


In [3]:
#Using sklearn.tree package to fit the decision tree. Here we are using the DecisionTreeClassifier class.

from sklearn.tree import DecisionTreeClassifier
columns = ["age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

#Instantiating the classifier
clf = DecisionTreeClassifier(random_state = 1)
clf.fit(income[columns], income['high_income']) #Fit() method to fit the Decision tree to our data

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=1, splitter='best')

In [6]:
#Splitting our data into TEST and TRAIN data sets. Here we use 80% of our data as Train and remaning as Test

import math
np.random.seed(1)
income = income.reindex(np.random.permutation(income.index))

train_max_row = math.floor(income.shape[0] * .8)

train = income[:train_max_row]
test = income[train_max_row:]

In [7]:
#Using AUC to evaluate error. Note: Higher the AUC, more accurate the predictions. AUC ranges from 0 to 1.

from sklearn.metrics import roc_auc_score

clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[columns], train["high_income"])

predictions = clf.predict(test[columns])
error = roc_auc_score(predictions, test['high_income'])
print(error)

0.702929759027


We can see that the error is 0.7 on the test set. Next, we check the AUC score for the fit on the training set to see if we are overfitting the data.

In [9]:
predictions = clf.predict(train[columns])
error = roc_auc_score(predictions, train['high_income'])
print(error)

0.97191113789


We observe from the above value that we are infact overfitting the model since we are getting a AUC score of 0.97 for the training set.
Note: Trees overfit when they have too much depth i.e more depth the tree has, the worse it performs on new data.

In [10]:
#Improving model by restricting the depth of the tree while building it.

clf = DecisionTreeClassifier(random_state=1, min_samples_split = 5)
clf.fit(train[columns], train['high_income'])
predictions = clf.predict(train[columns])
train_auc = roc_auc_score(predictions, train['high_income'])
print(train_auc)

predictions = clf.predict(test[columns])
test_auc = roc_auc_score(predictions, test['high_income'])
print(test_auc)

0.93200392174
0.716606685283


By tweaking the min_samples_fit parameter, we managed to improve our test auc score from 0.703 to 0.716 and managed to reduce train auc score from 0.97 to 0.93 thereby reducing the overfit by some amount. Continuing to tweak the model parameters further.

In [11]:
#Tweaking the max_depth and min_samples_split properties.

clf = DecisionTreeClassifier(random_state=1, max_depth=4, min_samples_split=25)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(predictions, test["high_income"])

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train_predictions, train["high_income"])

print(test_auc)
print(train_auc)

0.802536734965
0.793488947036


The test auc score is now 0.80 which means that we have improved it much further. Also, the train auc score is now 0.79 which means that we are not overfitting anymore.

In [12]:
#Tweaking the parameters more aggressively. Trying to understand "Underfitting".

clf = DecisionTreeClassifier(random_state=1, max_depth=2, min_samples_split=100)
clf.fit(train[columns], train["high_income"])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(predictions, test["high_income"])

train_predictions = clf.predict(train[columns])
train_auc = roc_auc_score(train_predictions, train["high_income"])

print(test_auc)
print(train_auc)

0.78017266921
0.773555652556


We can see that our test auc score went down from 0.8 to 0.78. This is because we are now "underfitting". It means that our model is "too simple" to actually explain the relations between the variables.

In [14]:
#Exploring Decision Tree Variance

np.random.seed(1)
#Introducing random noise
income["noise"] = np.random.randint(4, size=income.shape[0])

columns = ["noise", "age", "workclass", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "hours_per_week", "native_country"]

train_max_row = math.floor(income.shape[0] * .8)
train = income.iloc[:train_max_row]
test = income.iloc[train_max_row:]

clf = DecisionTreeClassifier(random_state=1)
clf.fit(train[columns], train['high_income'])
predictions = clf.predict(test[columns])
test_auc = roc_auc_score(predictions, test['high_income'])
print(test_auc)

predictions = clf.predict(train[columns])
train_auc = roc_auc_score(predictions, train['high_income'])
print(train_auc)

0.698470862598
0.990088832083


From the above code cell we can see that when we introduce random noise, it caused significant overfitting in the model i.e the auc score for the train data went up from 0.77 to 0.99.