In [55]:
from BorderModel import BorderData
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from dbhelper import pd_query
import datetime
from sklearn.metrics import r2_score

In [17]:
# Load the wait-time series for crossing_id = 1 together with calendar and
# holiday columns, ordered by c.date.
#   * `metric` is exposed under the name `waittime`.
#   * Rows are kept only when munger_id = 2, is_waittime = true and the
#     `minute` column is 0 or 30.
#   * datefeatures is inner-joined on the full `date` value; publicholiday is
#     LEFT-joined on c.date cast to a calendar date, so rows without a
#     matching publicholiday record are kept and any columns taken from that
#     table are NULL for them.
# NOTE(review): the selected columns are unqualified, so which table supplies
# each one is not visible from this cell -- presumably the calendar fields come
# from datefeatures and the holiday flags from publicholiday; confirm against
# the schema.
query = '''
        select 
            c.date,
            metric as waittime,
            year,
            month,
            dayofmonth,
            week,
            dayofweek,
            minofday,
            labor,
            newyears,
            us_mlk,
            us_washington,
            us_memorial,
            us_independence,
            us_columbus,
            us_veterans,
            us_thanksgiving,
            xmas,
            ca_goodfriday,
            ca_victoria,
            ca_canada,
            ca_civic,
            ca_thanksgiving,
            mothers,
            ca_family
        from mungedata c
        join datefeatures d on c.date = d.date
        left join publicholiday h on c.date::timestamp::date = h.date
        where
            crossing_id = 1
            and munger_id = 2
            and (minute = 0 or minute = 30)
            and is_waittime = true
        order by c.date;
        '''

# pd_query is a project helper from dbhelper; presumably it runs the SQL and
# returns a pandas DataFrame (later cells call df.copy() and read df.date).
df = pd_query(query)

In [18]:
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor

## For each year, train a model on all data up to the end of that year, then report the scores for each

In [52]:
def run_model(df, model):
    """Fit `model` on the training split that BorderData derives from `df`.

    Parameters
    ----------
    df : frame handed straight to ``BorderData``.
    model : estimator handed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(border, estimator)`` where ``border`` is the ``BorderData`` wrapper
        built from `df` and ``estimator`` is the search's ``best_estimator_``.
    """
    border = BorderData(df)

    # The parameter grid is deliberately empty, so no hyperparameters are
    # varied; the search is used only to fit with the folds in border.cv_train.
    # NOTE(review): presumably this yields a single candidate using the
    # hyperparameters already set on `model` -- confirm for the installed
    # scikit-learn version.
    search = GridSearchCV(model, {}, cv=border.cv_train)
    search.fit(border.X_train, border.y_train)

    return (border, search.best_estimator_)

In [53]:
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=100, bootstrap=True, oob_score=True)

trained = {}
for year in range(2010, 2016):
    dfin = df.copy()[df.date < datetime.date(year + 1, 1, 1)]
    print "Training... ", year
    trained[year] = run_model(dfin, model)

Training...  2010
Training...  2011
Training...  2012
Training...  2013
Training...  2014
Training...  2015


In [56]:
for year, val in trained.iteritems():
    print year
    (data, mdl) = val
    data.predict(mdl)
    print "Baseline : ", r2_score(data.y_test, data.baseline)
    print 'Model    : ', r2_score(data.y_test, data.yhat)

2010
Baseline :  0.285357880989
Model    :  0.348581252032
2011
Baseline :  0.395701509877
Model    :  -0.428069154036
2012
Baseline :  0.609866621201
Model    :  0.455701536847
2013
Baseline :  0.61359183581
Model    :  0.633534760378
2014
Baseline :  0.569483794384
Model    :  0.55912148217
2015
Baseline :  0.14722281598
Model    :  0.0499787728829


## The quality of the estimate varies by year, suggesting that the nature of the crossing has changed over time

In [57]:
# Same extraction as the earlier crossing_id = 1 query, but for
# crossing_id = 5. Running this cell rebinds both `query` and `df`, so the
# crossing 1 frame is no longer available afterwards.
#   * `metric` is exposed under the name `waittime`.
#   * Rows are kept only when munger_id = 2, is_waittime = true and the
#     `minute` column is 0 or 30.
#   * datefeatures is inner-joined on the full `date` value; publicholiday is
#     LEFT-joined on c.date cast to a calendar date, so rows without a
#     matching publicholiday record are kept and any columns taken from that
#     table are NULL for them.
# NOTE(review): the selected columns are unqualified, so which table supplies
# each one is not visible from this cell -- confirm against the schema.
query = '''
        select 
            c.date,
            metric as waittime,
            year,
            month,
            dayofmonth,
            week,
            dayofweek,
            minofday,
            labor,
            newyears,
            us_mlk,
            us_washington,
            us_memorial,
            us_independence,
            us_columbus,
            us_veterans,
            us_thanksgiving,
            xmas,
            ca_goodfriday,
            ca_victoria,
            ca_canada,
            ca_civic,
            ca_thanksgiving,
            mothers,
            ca_family
        from mungedata c
        join datefeatures d on c.date = d.date
        left join publicholiday h on c.date::timestamp::date = h.date
        where
            crossing_id = 5
            and munger_id = 2
            and (minute = 0 or minute = 30)
            and is_waittime = true
        order by c.date;
        '''

# pd_query is a project helper from dbhelper; presumably it runs the SQL and
# returns a pandas DataFrame -- TODO confirm.
df = pd_query(query)

In [63]:
from BorderModel import model_years

# A fixed random_state makes the fitted forest, and the scores derived from it,
# reproducible from one notebook run to the next.
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=100, bootstrap=True,
                            oob_score=True, random_state=42)

# model_years is the packaged per-year train-and-score helper; the last two
# arguments are presumably the first and last year to model -- confirm in
# BorderModel.
# NOTE(review): the recorded run of this cell printed the 2010 scores and then
# stopped with "NameError: global name 'mdl' is not defined". Nothing in this
# cell references `mdl`, so the failure presumably lies inside
# BorderModel.model_years -- TODO fix it there and re-run.
trained = model_years(df, model, 2010, 2015)

Training...  2010
Baseline :  0.504481017297
Model    :  0.519615176907


NameError: global name 'mdl' is not defined