In [1]:
from BorderQuery import select_mungedata, insert_predictions, select_features
from BorderModel import BorderData, handle_categoricals, model_years
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV
import pandas as pd

## Sanity test model

In [2]:
df = select_mungedata(2, 1, '2008-1-1')

In [3]:
data = BorderData(df, categoricals=['event'])

model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96, bootstrap=True, oob_score=True)
params = {}
grid = GridSearchCV(model, params, cv=data.cv_train)
grid.fit(data.X_train, data.y_train)

data.predict(grid)
data.predict_ensemble()
data.print_metrics(grid)

OOB:  0.9112814356
Best score:  0.383691991105
** MSE for last cv fold **
Baseline :  96.7590735774
Model    :  106.037146373
Ensemble :  91.9049776083
Weights  :  (0.87546482817935323, 1.131480714387938)
** R^2 for last cv fold **
Baseline :  0.14722281598
Model    :  0.0654513758574
Ensemble :  0.190003943769
** Explained variance for last cv fold **
Baseline :  0.304333505904
Model    :  0.282734183054
Ensemble :  0.344959275285


## Predict on full training data

In [4]:
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96, bootstrap=True, oob_score=True)
params = {}
grid = GridSearchCV(model, params, cv=data.cv_train)
grid.fit(data.X, data.y)

GridSearchCV(cv=array([[ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 7...20, 108021, 108022, 108023, 108024, 108025, 108026, 108027, 108028, 108029, 108030]]], dtype=object),
       error_score='raise',
       estimator=ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=96, n_jobs=-1, oob_score=True, random_state=None,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [5]:
features = select_features('2015-12-1', '2018-1-1')

In [6]:
fmatrix = handle_categoricals(features, ['event'])

In [7]:
yhat = grid.best_estimator_.predict(fmatrix)

In [8]:
# insert_predictions('v1.0', 2, 1, fmatrix.index, yhat)

IntegrityError: duplicate key value violates unique constraint "predictions_pkey"
DETAIL:  Key (model_version, munger_id, crossing_id, date)=(v1.0, 2, 1, 2015-12-01 00:00:00) already exists.


## Compare predictions by year

In [9]:
from BorderModel import model_years
model_years(df, model, 2011, 2015, categoricals=['event'])

Training...  2011
minimize unexplained ValueError.  Returning default weights
Baseline :  0.395701509877
Model    :  -0.214433409461
Ensemble :  0.387978185879
Training...  2012
Baseline :  0.609866621201
Model    :  0.583184237351
Ensemble :  0.638245414066
Training...  2013
Baseline :  0.61359183581
Model    :  0.658085666965
Ensemble :  0.673428683493
Training...  2014
Baseline :  0.569483794384
Model    :  0.617108317697
Ensemble :  0.635685547456
Training...  2015
Baseline :  0.14722281598
Model    :  0.0713568240205
Ensemble :  0.192092858751


{2011: (<BorderModel.BorderData at 0x7f12058f50d0>,
  GridSearchCV(cv=array([[ [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 7...40933, 40934, 40935, 40936, 40937, 40938, 40939, 40940, 40941, 40942, 40943, 40944]]], dtype=object),
         error_score='raise',
         estimator=ExtraTreesRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=96, n_jobs=-1, oob_score=True, random_state=None,
            verbose=0, warm_start=False),
         fit_params={}, iid=True, n_jobs=1, param_grid={},
         pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)),
 2012: (<BorderModel.Bor