In [1]:
#!/usr/bin/env python

'''
GA Data Science Q2 2016

Code walk-through 10: Decision trees and random forests

* Decision trees
* Random forests
'''

import numpy as np
import pandas as pd

from sklearn import cross_validation as cv, tree, ensemble

In [2]:
# Read in the Wine Quality datasets
reds = pd.read_csv('../../Data/winequality_red.csv', sep=';')
whites = pd.read_csv('../../Data/winequality_white.csv', sep=';')

In [3]:
# Add a new indicator variable for the type of wine to predict the classes
reds['red'] = 1
whites['red'] = 0

# Merge the two datasets, axis zero means it will concatenate by row
wines = pd.concat([reds, whites], axis=0)

# Because we are trying to binarise, this is why taking wwine quality we ask the question quality >= 8
# Define a new indicator variable for ‘excellent’ wines (quality score ≥ 8)
wines['excellent'] = wines.quality >= 8

# Prepare the data for use in scikit-learn
X = wines.drop(['quality', 'excellent'], axis=1)
y = wines.excellent.astype('int') # turns true/false into zero and ones

In [8]:
'''
Decision trees
'''

# Train a decision tree
tree1 = tree.DecisionTreeClassifier()
tree1.fit(X, y)

# Export the tree for plotting
tree.export_graphviz(tree1, 'tree1.dot', feature_names=X.columns)

In [11]:
dot -Tpng tree1.dot -o tree1.png

SyntaxError: invalid syntax (<ipython-input-11-18de1f249706>, line 1)

In [None]:
# If you have Graphviz (http://www.graphviz.org) installed, run:
#     dot -Tpng tree1.dot -o tree1.png
# Alternatively, use WebGraphviz at http://www.webgraphviz.com/

In [12]:
# Define stratified folds for cross-validation
kf = cv.StratifiedKFold(y, n_folds=10, shuffle=True)

# Compute average AUC across folds
aucs = cv.cross_val_score(tree.DecisionTreeClassifier(),\
                          X, y, scoring='roc_auc', cv=kf)
np.mean(aucs)

0.72930926282319475

In [13]:
# Train a decision tree by limiting:
# * the maximum number of questions (depth)
# * the minimum number of samples in each leaf
tree2 = tree.DecisionTreeClassifier(max_depth=2, min_samples_leaf=50)
tree2.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=50,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [14]:
# Export the tree for plotting
# From seeing tree, value = non excellent is always hgher than excellent hence model predicts non excellent all the time
tree.export_graphviz(tree2, 'tree2.dot', feature_names=X.columns)

In [17]:
# Investigate importances of predictors (the higher, the more important) Indicates how often the variable comes up in the tree
# So we can see alcohol is the most influential
tree2.feature_importances_

array([ 0.        ,  0.08035384,  0.        ,  0.        ,  0.        ,
        0.19338648,  0.        ,  0.        ,  0.        ,  0.        ,
        0.72625968,  0.        ])

In [18]:
list(zip(X.columns, tree2.feature_importances_))

[('fixed acidity', 0.0),
 ('volatile acidity', 0.080353840900456636),
 ('citric acid', 0.0),
 ('residual sugar', 0.0),
 ('chlorides', 0.0),
 ('free sulfur dioxide', 0.19338647782765769),
 ('total sulfur dioxide', 0.0),
 ('density', 0.0),
 ('pH', 0.0),
 ('sulphates', 0.0),
 ('alcohol', 0.72625968127188567),
 ('red', 0.0)]

In [None]:
#

In [19]:
'''
Random forests
'''

# Train a random forest with 20 decision trees, number of estimators is number of trees you want in the forest
rf1 = ensemble.RandomForestClassifier(n_estimators=20)
rf1.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
# Investigate importances of predictors (the higher, the more important)
# how often it comes up in the 20 trees
rf1.feature_importances_

array([ 0.07532454,  0.10364619,  0.06905918,  0.09977061,  0.0821377 ,
        0.09514271,  0.09613898,  0.09703323,  0.08646126,  0.09055142,
        0.10403389,  0.0007003 ])

In [21]:
# Evaluate performance through cross-validation
aucs = cv.cross_val_score(ensemble.RandomForestClassifier(n_estimators=20),\
                          X, y, scoring='roc_auc', cv=kf)
np.mean(aucs)

0.86199433234649947

In [22]:
# What happens when we increase the number of trees?
for n_trees in [2, 5, 10, 20, 50, 100]:
    aucs = cv.cross_val_score(
        ensemble.RandomForestClassifier(n_estimators=n_trees), X, y,\
        scoring='roc_auc', cv=kf)
    print('{:>3} trees: mean AUC {:.2%}'.format(n_trees, np.mean(aucs)))

  2 trees: mean AUC 72.17%
  5 trees: mean AUC 79.31%
 10 trees: mean AUC 82.11%
 20 trees: mean AUC 85.20%
 50 trees: mean AUC 89.02%
100 trees: mean AUC 90.11%
