In [2]:
from BorderModel import BorderData, clean_df_subset
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
from dbhelper import pd_query
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV

In [3]:
query = '''
        select 
            m.date,
            metric as waittime,
            year,
            month,
            week,
            dayofweek,
            minofday,
            w.temp_max,
            w.temp_mean,
            w.temp_min,
            w.viz_max,
            w.wind_max,
            w.precip,
            w.rain,
            w.snow,
            w.fog,
            w.thunderstorm,
            wp1.temp_max as temp_max_p1,
            wp1.temp_mean as temp_mean_p1,
            wp1.temp_min as temp_min_p1,
            wp1.precip as precip_p1,
            wp1.rain as rain_p1,
            wp1.snow as snow_p1,
            wp1.thunderstorm as thunderstorm_p1,
            wp2.temp_max as temp_max_p2,
            wp2.temp_mean as temp_mean_p2,
            wp2.temp_min as temp_min_p2,
            wp2.precip as precip_p2,
            wp2.rain as rain_p2,
            wp2.snow as snow_p2,
            wp2.thunderstorm as thunderstorm_p2,
            wp3.temp_max as temp_max_p3,
            wp3.temp_mean as temp_mean_p3,
            wp3.temp_min as temp_min_p3,
            wp3.precip as precip_p3,
            wp3.rain as rain_p3,
            wp3.snow as snow_p3,
            wp3.thunderstorm as thunderstorm_p3,
            wm1.temp_max as temp_max_m1,
            wm1.temp_mean as temp_mean_m1,
            wm1.temp_min as temp_min_m1,
            wm1.precip as precip_m1,
            wm1.rain as rain_m1,
            wm1.snow as snow_m1,
            wm1.thunderstorm as thunderstorm_m1,
            wm2.temp_max as temp_max_m2,
            wm2.temp_mean as temp_mean_m2,
            wm2.temp_min as temp_min_m2,
            wm2.precip as precip_m2,
            wm2.rain as rain_m2,
            wm2.snow as snow_m2,
            wm2.thunderstorm as thunderstorm_m2,
            s.event,
            s_lead1.event as event_lead1,
            s_lag1.event as event_lag1,
            s_lead2.event as event_lead2,
            s_lag2.event as event_lag2,
            s_lead3.event as event_lead3,
            s_lag3.event as event_lag3,
            s_lead4.event as event_lead4,
            s_lag4.event as event_lag4,
            1 as sea,
            1 as sea_lag1,
            1 as sea_lead1,
            1 as sea_lag2,
            1 as sea_lead2,
            1 as sea_lag3,
            1 as sea_lead3,
            1 as van,
            1 as van_lag1,
            1 as van_lead1,
            1 as van_lag2,
            1 as van_lead2,
            1 as van_lag3,
            1 as van_lead3
        from mungedata m
        join datefeatures d on m.date = d.date
        left join publicholiday h on m.date::timestamp::date = h.date
        left join weather w on m.date::timestamp::date = w.date
        left join weather wp1 on m.date::timestamp::date = wp1.date - interval '1 day'
        left join weather wp2 on m.date::timestamp::date = wp2.date - interval '2 day'
        left join weather wp3 on m.date::timestamp::date = wp3.date - interval '3 day'
        left join weather wm1 on m.date::timestamp::date = wm1.date + interval '1 day'
        left join weather wm2 on m.date::timestamp::date = wm2.date + interval '2 day'
        left join specialdates s on m.date::timestamp::date = s.date
        left join specialdates s_lead1 on m.date::timestamp::date = s_lead1.date - interval '1 day'
        left join specialdates s_lag1 on m.date::timestamp::date = s_lag1.date + interval '1 day'
        left join specialdates s_lead2 on m.date::timestamp::date = s_lead2.date - interval '2 day'
        left join specialdates s_lag2 on m.date::timestamp::date = s_lag2.date + interval '2 day'
        left join specialdates s_lead3 on m.date::timestamp::date = s_lead3.date - interval '3 day'
        left join specialdates s_lag3 on m.date::timestamp::date = s_lag3.date + interval '3 day'
        left join specialdates s_lead4 on m.date::timestamp::date = s_lead4.date - interval '4 day'
        left join specialdates s_lag4 on m.date::timestamp::date = s_lag4.date + interval '4 day'
        left join schoolcalendar sea on m.date::timestamp::date = sea.date_out and sea.district='seattle'
        left join schoolcalendar sea_lag1 on m.date::timestamp::date = sea_lag1.date_out + interval '1 day' and sea_lag1.district='seattle'
        left join schoolcalendar sea_lead1 on m.date::timestamp::date = sea_lead1.date_out - interval '1 day' and sea_lead1.district='seattle'
        left join schoolcalendar sea_lag2 on m.date::timestamp::date = sea_lag2.date_out + interval '2 day' and sea_lag2.district='seattle'
        left join schoolcalendar sea_lead2 on m.date::timestamp::date = sea_lead2.date_out - interval '2 day' and sea_lead2.district='seattle'
        left join schoolcalendar sea_lag3 on m.date::timestamp::date = sea_lag3.date_out + interval '3 day' and sea_lag3.district='seattle'
        left join schoolcalendar sea_lead3 on m.date::timestamp::date = sea_lead3.date_out - interval '3 day' and sea_lead3.district='seattle'
        left join schoolcalendar van on m.date::timestamp::date = van.date_out and van.district='vancouver'
        left join schoolcalendar van_lag1 on m.date::timestamp::date = van_lag1.date_out + interval '1 day' and van_lag1.district='vancouver'
        left join schoolcalendar van_lead1 on m.date::timestamp::date = van_lead1.date_out - interval '1 day' and van_lead1.district='vancouver'
        left join schoolcalendar van_lag2 on m.date::timestamp::date = van_lag2.date_out + interval '2 day' and van_lag2.district='vancouver'
        left join schoolcalendar van_lead2 on m.date::timestamp::date = van_lead2.date_out - interval '2 day' and van_lead2.district='vancouver'
        left join schoolcalendar van_lag3 on m.date::timestamp::date = van_lag3.date_out + interval '3 day' and van_lag3.district='vancouver'
        left join schoolcalendar van_lead3 on m.date::timestamp::date = van_lead3.date_out - interval '3 day' and van_lead3.district='vancouver'
        where
            crossing_id = 1
            and m.date >= '2008-1-1'
            and munger_id = 2
            and (minute = 0 or minute = 30)
            and is_waittime = true
        order by m.date;
        '''

df1 = pd_query(query)

In [4]:
from BorderModel import BorderData
data = BorderData(df1, categoricals=['event'])

model = ExtraTreesRegressor(n_jobs=-1, n_estimators=16, bootstrap=True, oob_score=True)
params = {}
grid = GridSearchCV(model, params, cv=data.cv_train)
grid.fit(data.X_train, data.y_train)

data.predict(grid)
data.print_metrics(grid)

OOB:  0.876861392718
Best score:  0.345997373906
** MSE for last cv fold **
Baseline :  96.7590735774
Model    :  111.724986543
** R^2 for last cv fold **
Baseline :  0.14722281598
Model    :  0.0153221203357
** Explained variance for last cv fold **
Baseline :  0.304333505904
Model    :  0.237370408629


  warn("Some inputs do not have OOB scores. "


In [5]:
data.predict_ensemble()
data.print_metrics(grid)

OOB:  0.876861392718
Best score:  0.345997373906
** MSE for last cv fold **
Baseline :  96.7590735774
Model    :  111.724986543
Ensemble :  92.2236858923
Weights  :  (0.84536964208989807, 1.4528220991143854)
** R^2 for last cv fold **
Baseline :  0.14722281598
Model    :  0.0153221203357
Ensemble :  0.187195037659
** Explained variance for last cv fold **
Baseline :  0.304333505904
Model    :  0.237370408629
Ensemble :  0.336067096127


### Testing an observation weighting function

In [120]:
from BorderModel import BorderData, harmonic_mean
# harmonic_mean((data.yhat, data.baseline), (predict_weight, 1))
data.calculate_weights(grid.best_estimator_)

array([3.607367969935602, 3.607367969935602, 3.607367969935602, ..., 1.0,
       1.0, 1.0], dtype=object)

## Basic weighting with optimization of scalara weights between baseline and predictions

In [142]:
from BorderModel import BorderData
data.predict_ensemble()

In [143]:
data.print_metrics(grid)

OOB:  0.876861392718
Best score:  0.999963564428
** MSE for last cv fold **
Baseline :  96.7590735774
Model    :  111.724986543
Ensemble :  92.2236858923
Weights  :  (0.73559210964333466, 1.2641629095661624)
** R^2 for last cv fold **
Baseline :  0.14722281598
Model    :  0.0153221203357
Ensemble :  0.187195037659
** Explained variance for last cv fold **
Baseline :  0.304333505904
Model    :  0.237370408629
Ensemble :  0.336067093841


## With feature importance weighting
* Not as good as simple weighting
* Also tried without success
    * constant weight for all event features: .15
    * constant weight above feature importance threshold for all event features: .15
    * feature importance weighting with spreading: .186
    * event-only weighting with spreading: .186

In [157]:
from BorderModel import BorderData
data.predict_ensemble(grid.best_estimator_)

In [158]:
data.print_metrics(grid)

OOB:  0.876861392718
Best score:  0.999963564428
** MSE for last cv fold **
Baseline :  96.7590735774
Model    :  111.724986543
Ensemble :  92.8744020544
Weights  :  (1.0408831853579976, 0.95910732444429858)
** R^2 for last cv fold **
Baseline :  0.14722281598
Model    :  0.0153221203357
Ensemble :  0.181460010692
** Explained variance for last cv fold **
Baseline :  0.304333505904
Model    :  0.237370408629
Ensemble :  0.336018723268


In [177]:
from BorderModel import BorderData
data.predict_ensemble(grid.best_estimator_)
data.print_metrics(grid)

OOB:  0.876861392718
Best score:  0.999963564428
** MSE for last cv fold **
Baseline :  96.7590735774
Model    :  111.724986543
Ensemble :  92.3358096757
Weights  :  (0.70341412841683049, 1.6074567465655702)
** R^2 for last cv fold **
Baseline :  0.14722281598
Model    :  0.0153221203357
Ensemble :  0.186206845021
** Explained variance for last cv fold **
Baseline :  0.304333505904
Model    :  0.237370408629
Ensemble :  0.334068242432
