In [40]:
#!/usr/bin/env python

'''
GA Data Science Q2 2016

In-class exercise 6: Decision trees and random forests
'''

import numpy as np
import pandas as pd

from sklearn import cross_validation as cv, tree, ensemble, grid_search

In [3]:
# Read in the Wine Quality datasets
reds = pd.read_csv('../../Data/winequality_red.csv', sep=';')
whites = pd.read_csv('../../Data/winequality_white.csv', sep=';')

# Add a new indicator variable for the type of wine
reds['red'] = 1
whites['red'] = 0

# Merge the two datasets
wines = pd.concat([reds, whites], axis=0)

# Prepare the data for use in scikit-learn
X = wines.drop('quality', axis=1)
y = wines.quality

In [None]:
# Add a new indicator variable for the type of wine
reds['red'] = 1
whites['red'] = 0

# Merge the two datasets
wines = pd.concat([reds, whites], axis=0)

# Prepare the data for use in scikit-learn
X = wines.drop('quality', axis=1)
y = wines.quality

In [7]:
# Train a decision tree by limiting the depth to 3, and the minimum number of
# samples per leaf to 50
#?tree.DecisionTreeRegressor
tree_model1 = tree.DecisionTreeRegressor(max_depth=3, min_samples_leaf=50)

In [8]:
tree_model1.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=50, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [11]:
# Export the tree for plotting
# ?tree.export_graphviz
tree.export_graphviz(tree_model1, 'tree_model1.dot', feature_names=X.columns)

In [14]:
# Define folds for cross-validation
kf = cv.StratifiedKFold(y, n_folds=5, shuffle=True)

In [25]:
# Compute average MSE across folds
mses = cv.cross_val_score(tree.DecisionTreeRegressor(max_depth=3, min_samples_leaf=50), X, y, scoring='mean_squared_error', cv=kf)

In [32]:
np.mean(-mses)

0.56562282509270223

In [27]:
# Train a random forest with 20 decision trees
tree_model2 = tree.DecisionTreeRegressor(max_depth=20)
tree_model2.fit(X, y)

DecisionTreeRegressor(criterion='mse', max_depth=20, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [36]:
# Investigate importances of predictors
list(zip(X.columns, tree_model2.feature_importances_))

[('fixed acidity', 0.058134382543316966),
 ('volatile acidity', 0.12897943267631351),
 ('citric acid', 0.060383019329787749),
 ('residual sugar', 0.065243126703735332),
 ('chlorides', 0.077587730591648557),
 ('free sulfur dioxide', 0.088648008456396879),
 ('total sulfur dioxide', 0.059052515364875428),
 ('density', 0.055496417348183238),
 ('pH', 0.072896383741247864),
 ('sulphates', 0.07888342677771909),
 ('alcohol', 0.25364283369264423),
 ('red', 0.0010527227741311278)]

In [31]:
# Evaluate performance through cross-validation
mses2 = cv.cross_val_score(tree.DecisionTreeRegressor(max_depth=20), X, y, scoring='mean_squared_error', cv=kf)
np.mean(-mses2)

0.6836666253378918

In [34]:
# What happens when you increase the number of trees to 50?
mses3 = cv.cross_val_score(tree.DecisionTreeRegressor(max_depth=50), X, y, scoring='mean_squared_error', cv=kf)
np.mean(-mses3)

0.67781791755841558

In [41]:
gs = grid_search.GridSearchCV(estimator=tree.DecisionTreeRegressor(),\
                              param_grid={'max_depth': np.arange(2, 100)},\
                              scoring='mean_squared_error',\
                              cv=kf)

In [43]:
gs.fit(X, y)

GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[5 5 ..., 7 6], n_folds=5, shuffle=True, random_state=None),
       error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': array([ 2,  3, ..., 98, 99])},
       pre_dispatch='2*n_jobs', refit=True, scoring='mean_squared_error',
       verbose=0)

In [44]:
gs.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=6, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [45]:
gs.best_score_

-0.53994722696025499