## Tune parameters
* tuning will be mostly manual since the goal is to reduce variance whereas grid search is better at reducing bias

In [1]:
from BorderModel import IncrementalModel
from BorderQuery import select_features, select_mungedata, select_mungedata_simple
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import pprint
from multiprocessing import Pool

## Run without bootstrap

In [13]:
df_train = select_mungedata(2, 1, '2011-1-1', '2015-1-1')
X_test = select_features('2015-1-1', '2016-1-1')
actuals = select_mungedata_simple(2, 1, '2015-1-1', '2016-1-1')
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96)
grid = GridSearchCV(model, {})

In [9]:
im = IncrementalModel(df_train, grid, categoricals=['event'])
yhat = im.predict(X_test)   
im.score(actuals.waittime)

Weights:  1.5214329278 0.856626541498


{'baseline': 0.14722281598038744,
 'ensemble': 0.20579506455332897,
 'model': 0.17568258802660763}

### Best score yet

In [12]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(sorted(zip(im.X.columns, grid.best_estimator_.feature_importances_), key=lambda x: x[1])[::-1])

[   ('minofday', 0.5597980237062623),
    ('dayofweek', 0.068425273544121964),
    ('avg_delta_8', 0.016347266831905252),
    ('year', 0.012662088421638443),
    ('avg_delta_7', 0.012437631596491458),
    ('avg_delta_9', 0.012159041499010296),
    ('avg_delta_10', 0.011239082430415992),
    ('avg_delta_6', 0.010396537075075568),
    ('avg_delta_5', 0.0094040483700228092),
    ('avg_delta_12', 0.0092429763264878211),
    ('event_goodfriday', 0.008695854856605378),
    ('week', 0.0083830799207332621),
    ('month', 0.0083438727271462881),
    ('avg_delta_11', 0.0082409184101611983),
    ('avg_delta_3', 0.0077768926076066504),
    ('avg_delta_4', 0.0075257259207146794),
    ('avg_delta_1', 0.0074608112378034821),
    ('avg_delta_2', 0.006891720736368223),
    ('temp_min_m2', 0.0057424079245158369),
    ('temp_min_m1', 0.0054513268216910306),
    ('temp_mean_m2', 0.0054333653260035626),
    ('temp_min', 0.0053868962227209124),
    ('precip_m2', 0.0052397188168027721),
    ('wind_max', 0.00

## Max_depth

In [33]:
def gridsearch(params, df_train):
    model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96)
    grid = GridSearchCV(model, params)
    im = IncrementalModel(df_train, grid, categoricals=['event'])
    im.predict(X_test)   
    return im.score(actuals.waittime)

In [85]:
param_options = [{'max_depth': [4]}, {'max_depth': [8]}, {'max_depth': [12]}, {'max_depth': [16]}]
scores = [gridsearch(params, df_train) for params in param_options]

Weights:  1.06605783159 0.937797515236
Weights:  1.04761277584 0.420659696526
Weights:  2.27023053585 0.380151814215
Weights:  1.41047883544 0.386761212217


In [86]:
scores

[{'baseline': 0.1059067680212642,
  'ensemble': 0.24214359724373546,
  'model': 0.12239016633762556},
 {'baseline': 0.1059067680212642,
  'ensemble': 0.15992696135250284,
  'model': 0.16896685024076274},
 {'baseline': 0.1059067680212642,
  'ensemble': 0.13552465911312039,
  'model': 0.19488460987365941},
 {'baseline': 0.1059067680212642,
  'ensemble': 0.15620359500663916,
  'model': 0.19107758644972106}]

### Larger max_depth has better model score, but lower ensemble score.  Very high ensemble score at max_depth = 4.

Ensembling low and high max depth could be a good approach.

## More max_depth options

For ipyparallel, verify correct CWD.

In [28]:
print(dview.apply_sync(os.getcwd))
dview.map(os.chdir, ['/home/ubuntu/BorderCrossing/code'] * 32)

<AsyncMapResult: finished>

In [50]:
import time
prlist = []
    
param_options = [{'max_depth': [20]}, {'max_depth': [30]}, {'max_depth': [45]}, {'max_depth': [65]}]
for params in param_options:
    prlist.append(dview.apply_async(gridsearch, params, df_train))
    time.sleep(60)

In [56]:
scores = [gridsearch(params, df_train) for params in param_options]

Weights:  1.37379626872 0.587324913794
Weights:  1.32629432603 0.71249434172
Weights:  1.29778419667 0.712840200875
Weights:  1.57367997757 0.852148627977


In [59]:
scores

[{'baseline': 0.14722281598038744,
  'ensemble': 0.19790960529150181,
  'model': 0.18774701657476178},
 {'baseline': 0.14722281598038744,
  'ensemble': 0.20594179798250389,
  'model': 0.17791530038822811},
 {'baseline': 0.14722281598038744,
  'ensemble': 0.20722834390834188,
  'model': 0.17773185033411842},
 {'baseline': 0.14722281598038744,
  'ensemble': 0.20600029586278412,
  'model': 0.17810939086161515}]

### Max_depth >=30 is same as no limit

## Shift to full 2015 data, starting at 2013

## Min_samples_leaf

In [63]:
df_train = select_mungedata(3, 1, '2013-1-1', '2015-1-1')
X_test = select_features('2015-1-1', '2016-1-1')
actuals = select_mungedata_simple(3, 1, '2015-1-1', '2016-1-1')
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96)
grid = GridSearchCV(model, {'min_samples_leaf': [1, 4, 12, 18]})
im = IncrementalModel(df_train, grid, categoricals=['event'])

In [69]:
grid.best_score_

0.62150019909963328

In [70]:
grid.grid_scores_

[mean: 0.62150, std: 0.05165, params: {'min_samples_leaf': 1},
 mean: 0.61943, std: 0.04566, params: {'min_samples_leaf': 4},
 mean: 0.61934, std: 0.04036, params: {'min_samples_leaf': 12},
 mean: 0.61758, std: 0.03887, params: {'min_samples_leaf': 18}]

In [80]:
param_options = [{'min_samples_leaf': [1]}, {'min_samples_leaf': [4]}, {'min_samples_leaf': [12]}, {'min_samples_leaf': [18]}]
scores = [gridsearch(params, df_train) for params in param_options]

Weights:  1.28911294942 0.62497370908
Weights:  1.87056358016 0.761079605562
Weights:  1.62121068896 0.682707985097
Weights:  1.30001794396 0.707403054271


In [81]:
scores

[{'baseline': 0.1059067680212642,
  'ensemble': 0.17383996754691822,
  'model': 0.15840259749415553},
 {'baseline': 0.1059067680212642,
  'ensemble': 0.16109721593270654,
  'model': 0.16617637024584497},
 {'baseline': 0.1059067680212642,
  'ensemble': 0.15143348371504783,
  'model': 0.15833155867120163},
 {'baseline': 0.1059067680212642,
  'ensemble': 0.14788635437183084,
  'model': 0.14051070808843458}]

### default of 1 is best