## The full picture

With cross-validation we can show you the full picture of model building (after you have done the hard work of data munging). The magic that cross-validation unlocks is twofold:

1. It allows you to train on more of your data, which gives you both better performance and a more accurate estimate of that performance.
2. It actually simplifies the process. You no longer need to keep three sets of data (train, validation and test); in your mental model you can get by with just two (train and test).

Let's get started:

In [1]:
import pandas as pd
import numpy as np

# we still need to do the preprocessing
def billionaire_preprocess(path='../data/billionaires.csv'):
    """Load the billionaires CSV and return a dummy-encoded feature frame.

    Steps, in order:

    1. Drop the columns this walkthrough does not model.
    2. Replace the values that are treated as "missing" (-1 in ``age``,
       0 in ``founded`` and ``gdp``) with NaN.
    3. One-hot encode every column whose dtype is not ``float64``. Each
       encoded column also gets an explicit NaN indicator
       (``dummy_na=True``) and loses its first level (``drop_first=True``).

    The target column is not removed here; callers split it off themselves.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the path the notebook has
        always used, so existing zero-argument calls are unaffected.

    Returns
    -------
    pandas.DataFrame
        The remaining float64 columns plus the generated dummy columns.
    """
    data = pd.read_csv(path)

    unused_columns = [
        'was founder',
        'inherited',
        'from emerging',
        'company.name',
        'name',
        'country code',
        'citizenship',
        'rank',
        'relationship',
        'sector',
    ]
    data = data.drop(unused_columns, axis=1)

    missing_markers = (('age', -1), ('founded', 0), ('gdp', 0))
    for column, marker in missing_markers:
        # Assign the result back instead of calling replace(inplace=True) on
        # ``data.<column>``: the in-place form acts on an intermediate Series
        # and is not guaranteed to write through to ``data``.
        # ``np.nan`` is used because the ``np.NaN`` alias no longer exists in
        # NumPy 2.0.
        data[column] = data[column].replace(marker, np.nan)

    # Must run after the replacement above: an integer column that received
    # NaNs has become float64 and is therefore no longer encoded.
    # get_dummies expects column labels, so pass the labels rather than the
    # selected sub-frame itself.
    columns_to_encode = list(data.select_dtypes(exclude=['float64']).columns)

    return pd.get_dummies(
        data,
        dummy_na=True,
        columns=columns_to_encode,
        drop_first=True,
    )

In [2]:
from sklearn.model_selection import train_test_split

# now we get the data
data = billionaire_preprocess()

# we parse out the target: pop() returns the column and removes it from the
# feature frame in a single step
y = data.pop('worth in billions')

# we make our test set (random_state pins the split so reruns are comparable)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=1)

# and we no longer make a validation set!

# format() keeps this line valid on both Python 2 and Python 3 and prints
# the two shapes separated by a space, exactly like the old print statement
print('{0} {1}'.format(X_train.shape, X_test.shape))

(2091, 70) (523, 70)


In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest

# We still do feature engineering
def billionaire_feature_eng(X, y, quantitative_pipeline, aggregated_pipeline, training=False):
    """Apply the two-stage feature engineering to a feature frame.

    The ``float64`` columns of ``X`` are sent through
    ``quantitative_pipeline``. Its output is placed column-wise in front of
    the remaining (non-``float64``) columns, and the combined matrix is
    then sent through ``aggregated_pipeline``.

    Parameters
    ----------
    X : frame-like
        Must offer ``copy()`` and ``select_dtypes()``; the caller's object
        is copied, not modified.
    y : target values
        Only forwarded to ``aggregated_pipeline.fit_transform`` when
        ``training`` is true, and returned unchanged.
    quantitative_pipeline, aggregated_pipeline : transformer-like
        Objects exposing ``fit_transform`` and ``transform``.
    training : bool, optional
        When true both pipelines are fitted on this data; otherwise they
        are only applied and must already have been fitted.

    Returns
    -------
    tuple
        ``(transformed_X, y)``.
    """
    frame = X.copy()
    other_columns = frame.select_dtypes(exclude=['float64'])
    float_columns = frame.select_dtypes(include=['float64'])

    if training:
        # fitting is reserved for the training data so that nothing learned
        # from other data ends up inside the pipelines
        numeric_part = quantitative_pipeline.fit_transform(float_columns)
        combined = np.concatenate([numeric_part, other_columns], axis=1)
        return aggregated_pipeline.fit_transform(combined, y), y

    numeric_part = quantitative_pipeline.transform(float_columns)
    combined = np.concatenate([numeric_part, other_columns], axis=1)
    return aggregated_pipeline.transform(combined), y

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_regression

# The preprocessing is split into two independent pipelines so that steps
# can be added to one stage without touching the other.

# stage 1, float64 columns only: fill missing values with the column median
quantitative_pipeline = Pipeline(steps=[('imputer', Imputer(strategy='median'))])

# stage 2, all columns: drop features whose variance is zero
aggregated_pipeline = Pipeline(steps=[('var_threshold', VarianceThreshold(threshold=0.0))])

# training=True fits both pipelines on the training split
X_train, y_train = billionaire_feature_eng(
    X=X_train,
    y=y_train,
    quantitative_pipeline=quantitative_pipeline,
    aggregated_pipeline=aggregated_pipeline,
    training=True,
)

Our next step will be to define the model that we are looking at:

In [5]:
from sklearn.tree import DecisionTreeRegressor

# Unfitted regression tree with the library's default hyper-parameters; it is
# the estimator handed to GridSearchCV further down.
reg = DecisionTreeRegressor()

Then we determine which parameters we would like to search over:

In [7]:
# Hyper-parameter grid for the tree: every max_depth is combined with every
# min_samples_leaf (9 x 4 = 36 candidates).
params = dict(
    max_depth=range(2, 20, 2),         # 2, 4, ..., 18
    min_samples_leaf=range(5, 25, 5),  # 5, 10, 15, 20
)

Finally, we use `GridSearchCV`, which tries every combination of these parameters and uses cross-validation to measure how well each one performs:

In [15]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `params`. The scorer is the negated mean absolute
# error, so a larger (closer to zero) score is better; cv is left at the
# library default.
gs = GridSearchCV(
    estimator=reg,
    param_grid=params,
    scoring='neg_mean_absolute_error',
)

In [16]:
# Run the search on the training split only; X_test / y_test are not used
# here. With refit=True (see the repr printed below) the best parameter
# combination is afterwards refit on everything passed to fit().
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18], 'min_samples_leaf': [5, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_absolute_error', verbose=0)

We get a lot of goodies. We can see the best score and estimator:

In [18]:
# Mean cross-validated score of the best parameter combination. The scorer is
# neg_mean_absolute_error, so this is a negated MAE: closer to zero is better.
gs.best_score_

-2.2413675289858621

In [19]:
# The estimator configured with the winning parameters; its repr shows which
# max_depth / min_samples_leaf were chosen.
gs.best_estimator_

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

We can also use the grid search object itself as if it were that best estimator:

In [21]:
# predict() on the search object is forwarded to the refit best estimator;
# the first five training rows are used here purely as a demonstration.
gs.predict(X_train[:5])

array([ 2.86509622,  2.86509622,  2.86509622,  9.35714286,  4.31932021])

As a last treat, we can combine all of these steps into a single pipeline and then do grid search over a variety of parameters on the different steps:

In [22]:
from sklearn.model_selection import train_test_split

# now we get the data again, starting from the untransformed frame because
# this time the preprocessing happens inside the pipeline
data = billionaire_preprocess()

# we parse out the target: pop() returns the column and removes it from the
# feature frame in a single step
y = data.pop('worth in billions')

# we make our test set (same random_state as before, hence the same split)
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=1)

# and we no longer make a validation set!

# format() keeps this line valid on both Python 2 and Python 3 and prints
# the two shapes separated by a space, exactly like the old print statement
print('{0} {1}'.format(X_train.shape, X_test.shape))

(2091, 70) (523, 70)


In [23]:
# One pipeline from raw features to prediction. The step names on the left
# are what the parameter grid refers to when it addresses a step.
full_pipeline = Pipeline(steps=[
    # fill missing values with the column median
    ('imputer', Imputer(strategy='median')),
    # drop features whose variance is zero
    ('var_threshold', VarianceThreshold(threshold=0.0)),
    # the model itself, with default hyper-parameters
    ('tree', DecisionTreeRegressor()),
])

In [25]:
# Parameter grid for the whole pipeline. Every key has the form
# <step name>__<parameter name> (notice the double underscore), which is how
# a parameter of one particular pipeline step is addressed.
# 2 x 3 x 9 x 4 = 216 candidate combinations.
params = dict(
    imputer__strategy=['median', 'mean'],
    var_threshold__threshold=[0.0, 0.1, 0.2],
    tree__max_depth=range(2, 20, 2),         # 2, 4, ..., 18
    tree__min_samples_leaf=range(5, 25, 5),  # 5, 10, 15, 20
)

In [27]:
# The estimator being searched is now the whole pipeline, so the imputer and
# the variance filter are tuned together with the tree.
gs = GridSearchCV(
    estimator=full_pipeline,
    param_grid=params,
    scoring='neg_mean_absolute_error',
)

# X_train is the raw (un-imputed) training split; the pipeline does the rest
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('var_threshold', VarianceThreshold(threshold=0.0)), ('tree', DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'imputer__strategy': ['median', 'mean'], 'tree__max_depth': [2, 4, 6, 8, 10, 12, 14, 16, 18], 'var_threshold__threshold': [0.0, 0.1, 0.2], 'tree__min_samples_leaf': [5, 10, 15, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_absolute_error', verbose=0)

In [28]:
# Single-argument print(...) produces the same output on Python 2 and
# Python 3, whereas the bare print statement is a syntax error on Python 3.
print(gs.best_score_)      # negated MAE of the best combination
print(gs.best_estimator_)  # the pipeline configured with the winning parameters

-2.24136752899
Pipeline(steps=[('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('var_threshold', VarianceThreshold(threshold=0.0)), ('tree', DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=10, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'))])
