In [1]:
from BorderModel import IncrementalModel, run_Incremental, sort_importances, print_importances
from BorderQuery import select_features, select_mungedata_simple, select_mungedata
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import pprint
import itertools
import pdb
import random
import pandas as pd
import numpy as np

### Initialize for parallel operations

In [2]:
import os
from ipyparallel import Client
rc = Client()
dview = rc[:]

# set proper working directory on all clients
cwd = os.getcwd()
dview.map(os.chdir, [cwd] * 40)
# print(dview.apply_sync(os.getcwd))

with dview.sync_imports():
    import datetime
    from BorderModel import IncrementalModel, run_Incremental
    from BorderQuery import select_features, select_mungedata_simple, select_mungedata
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.grid_search import GridSearchCV

importing datetime on engine(s)
importing IncrementalModel,run_Incremental from BorderModel on engine(s)
importing select_features,select_mungedata_simple,select_mungedata from BorderQuery on engine(s)
importing ExtraTreesRegressor from sklearn.ensemble on engine(s)
importing GridSearchCV from sklearn.grid_search on engine(s)


In [3]:
from BorderModel import IncrementalModel, run_Incremental
with dview.sync_imports():
    from BorderModel import IncrementalModel, run_Incremental
import random

def create_train_test(year, train_length=2):
    '''
    IN 
        years: list of years to predict
        train_length: number of years to train
    '''
    train_start = datetime.date(year - train_length, 1, 1).strftime('%Y-%m-%d')
    train_end = datetime.date(year, 1, 1).strftime('%Y-%m-%d')
    test_start = datetime.date(year, 1, 1).strftime('%Y-%m-%d')
    test_end = datetime.date(year + 1, 1, 1).strftime('%Y-%m-%d')
    return train_start, train_end, test_start, test_end
    
def compare_years_parallel(model, xing, munger_id, years):
    prlist = {}
    for year in years:
        cpu = random.randint(0, 31)
        train_start, train_end, test_start, test_end = create_train_test(year, 2)

        prlist[year] = rc[cpu].apply_async(run_Incremental, model, munger_id, xing,  
                                           train_start, train_end, 
                                           test_start, test_end)
        
    return prlist

importing IncrementalModel,run_Incremental from BorderModel on engine(s)


In [4]:
def model_plot(model, start, end):
    plt.figure(figsize=(16,4))
    baseline = model.baseline()
    ensemble = model.ensemble()
    actuals = model.actual
    yhat = model.y_predict
    
    plt.plot(actuals[(actuals.index.date>=start) & (actuals.index.date<end)], label='actuals')
    plt.plot(baseline[(baseline.index.date>=start) & (baseline.index.date<end)], label='baseline')
    plt.plot(yhat[(yhat.index.date>=start) & (yhat.index.date<end)], label='predictions')
    plt.plot(ensemble[(ensemble.index.date>=start) & (ensemble.index.date<end)], label='ensemble')
    plt.legend();
    
def imp_df(xid, model_years):
    impdf = pd.DataFrame()
    for year, model in model_years.items():
        imp = sort_importances(model.model.best_estimator_, model.X.columns)
        df = pd.DataFrame(np.array(imp)[:,1], np.array(imp)[:,0]).T
        df['xid'] = xid
        df['yr'] = int(year)
        df = df.set_index(['xid', 'yr'])
        impdf = pd.concat([impdf, df])
    return impdf

## Pacific Crossing South

In [5]:
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96)
pr5 = compare_years_parallel(model, 5, 3, range(2011, 2016))

## Pacific Crossing North

In [8]:
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96)
pr6 = compare_years_parallel(model, 6, 4, range(2013, 2016))

## Peace Arch South

In [11]:
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96)
pr1 = compare_years_parallel(model, 1, 3, range(2011, 2016))

## Peace Arch North

In [14]:
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96)
pr2 = compare_years_parallel(model, 2, 4, range(2013, 2016))

## Pull data from ipyparallel

In [27]:
model5 = {}
for year in range(2011, 2016):
    if pr5[year].ready():
        model5[year] = pr5[year].get(1)
        print model5[year].score()
    else:
        print year, "not ready"

imp5 = imp_df(5, model5)

{'model': 0.48958917535850477, 'ensemble': 0.54896185652642382, 'baseline': 0.50830951929768831}
{'model': 0.61871601195780435, 'ensemble': 0.64389271202539922, 'baseline': 0.6264414468772701}
{'model': 0.6149569340263179, 'ensemble': 0.60070780876856067, 'baseline': 0.57215536973016168}
{'model': 0.58409583067628756, 'ensemble': 0.59754163202736144, 'baseline': 0.51290499913254095}
{'model': 0.31580220049919683, 'ensemble': 0.31825498281260534, 'baseline': 0.23074703434185284}


In [28]:
model6 = {}
for year in range(2013, 2016):
    if pr6[year].ready():
        model6[year] = pr6[year].get(1)
        print model6[year].score()
    else:
        print year, "not ready"

imp6 = imp_df(6, model6)

{'model': 0.25578195288947747, 'ensemble': 0.27921287749023405, 'baseline': 0.26915343237860456}
{'model': 0.30362933058163388, 'ensemble': 0.31377945077274494, 'baseline': 0.29165671141312366}
{'model': -0.18660426439792599, 'ensemble': -0.057209523821120545, 'baseline': -0.32952975930038231}


In [29]:
model1 = {}
for year in range(2011, 2016):
    if pr1[year].ready():
        model1[year] = pr1[year].get(1)
        print model1[year].score()
    else:
        print year, "not ready"

imp1 = imp_df(1, model1)

{'model': 0.095401329333742346, 'ensemble': 0.38201539345186519, 'baseline': 0.39385165054016535}
{'model': 0.54165496369075283, 'ensemble': 0.61637917752010263, 'baseline': 0.60900901758379433}
{'model': 0.63713483540825555, 'ensemble': 0.63730595896832531, 'baseline': 0.61360307915900703}
{'model': 0.60176018976317258, 'ensemble': 0.61323120005334042, 'baseline': 0.56996406060235849}
{'model': 0.16901122012355674, 'ensemble': 0.19370129681513437, 'baseline': 0.1059067680212642}


In [30]:
model2 = {}
for year in range(2013, 2016):
    if pr2[year].ready():
        model2[year] = pr2[year].get(1)
        print model2[year].score()
    else:
        print year, "not ready"

imp2 = imp_df(2, model2)

{'model': 0.38072990455468447, 'ensemble': 0.3742112122874367, 'baseline': 0.33042393202063758}
{'model': 0.41908958416855646, 'ensemble': 0.39825661139479807, 'baseline': 0.33485359571062911}
{'model': 0.057805075128210759, 'ensemble': 0.11814231681136333, 'baseline': -0.1272957779194126}


# Combine and compare feature importances

In [31]:
impall = pd.concat([imp1, imp2, imp5, imp6]).astype(float)

In [32]:
impall

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_delta_1,avg_delta_10,avg_delta_11,avg_delta_12,avg_delta_2,avg_delta_3,avg_delta_4,avg_delta_5,avg_delta_6,avg_delta_7,...,thunderstorm,thunderstorm_m1,thunderstorm_m2,thunderstorm_p1,thunderstorm_p2,thunderstorm_p3,viz_max,week,wind_max,year
xid,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2011,0.011109,0.020826,0.020998,0.015044,0.011562,0.007551,0.008746,0.007443,0.01153,0.011446,...,0.000444,0.000397,0.002692,0.00025,0.001738,0.000666,0.000289,0.013527,0.004514,0.009892
1,2012,0.019546,0.016995,0.017777,0.013063,0.015657,0.013009,0.013337,0.009666,0.009864,0.009152,...,0.000198,0.000544,0.00389,0.000408,0.002996,0.000221,0.000213,0.007862,0.007751,0.011542
1,2013,0.007227,0.010796,0.0111,0.00818,0.008258,0.00796,0.007828,0.008989,0.011856,0.012195,...,0.000167,0.000191,0.000201,0.000262,0.000123,0.000328,0.000118,0.007417,0.00609,0.011223
1,2014,0.006238,0.008563,0.006618,0.006031,0.0062,0.008652,0.005497,0.008694,0.007071,0.01004,...,0.00026,0.000271,0.000218,0.00038,0.000232,0.00034,0.000166,0.010045,0.004931,0.00579
1,2015,0.00587,0.00968,0.008192,0.008365,0.006558,0.006808,0.007613,0.009785,0.010431,0.01173,...,3.3e-05,4.5e-05,9.3e-05,0.000129,0.000241,0.000162,0.000115,0.008228,0.004797,0.012439
2,2013,0.006393,0.012452,0.011961,0.007461,0.009476,0.008314,0.008504,0.011292,0.016064,0.00708,...,0.001216,0.00063,0.001096,0.000304,0.000349,0.000777,7e-06,0.007911,0.005155,0.025892
2,2014,0.008023,0.008982,0.008528,0.009156,0.008199,0.009678,0.007752,0.009637,0.008372,0.007639,...,0.000746,0.000554,0.001438,0.000359,0.000585,0.000818,0.00134,0.009504,0.005862,0.006519
2,2015,0.007327,0.019284,0.013267,0.014987,0.009397,0.00956,0.008655,0.009439,0.008163,0.009228,...,3.1e-05,0.000146,0.000752,0.000138,0.000235,6.7e-05,0.000586,0.009632,0.006144,0.00941
5,2011,0.007403,0.021429,0.022008,0.016629,0.009753,0.008702,0.007128,0.006941,0.010194,0.013007,...,0.000474,0.000392,0.001515,0.000366,0.004152,0.00077,0.000535,0.012971,0.004463,0.008327
5,2012,0.006732,0.019688,0.020746,0.016397,0.011856,0.006609,0.007425,0.007333,0.010163,0.009199,...,0.00021,0.000327,0.001559,0.000536,0.002,0.000174,7.9e-05,0.008098,0.006144,0.011663


In [33]:
avgdelta_cols = [col for col in impall.columns.values if 'avg_delta' in col]

In [34]:
impall['trend'] = impall[avgdelta_cols].sum(1)

In [35]:
event_cols = [col for col in impall.columns.values if 'event' in col]

In [36]:
impall['event'] = impall[event_cols].sum(1)

In [37]:
impall.trend

xid  yr  
1    2011    0.152994
     2012    0.167913
     2013    0.118131
     2014    0.104258
     2015    0.112178
2    2013    0.124881
     2014    0.106084
     2015    0.137265
5    2011    0.157391
     2012    0.145725
     2013    0.115833
     2014    0.118675
     2015    0.116591
6    2013    0.105864
     2014    0.125619
     2015    0.156214
Name: trend, dtype: float64

In [38]:
impall.event

xid  yr  
1    2011    0.088450
     2012    0.110506
     2013    0.085428
     2014    0.060670
     2015    0.064995
2    2013    0.125049
     2014    0.110442
     2015    0.100685
5    2011    0.073443
     2012    0.085473
     2013    0.082216
     2014    0.059537
     2015    0.064070
6    2013    0.124568
     2014    0.098825
     2015    0.082775
Name: event, dtype: float64

In [39]:
impall.minofday

xid  yr  
1    2011    0.490167
     2012    0.476422
     2013    0.564104
     2014    0.602427
     2015    0.571069
2    2013    0.467985
     2014    0.503373
     2015    0.483884
5    2011    0.520479
     2012    0.539200
     2013    0.573896
     2014    0.578695
     2015    0.566321
6    2013    0.441133
     2014    0.494110
     2015    0.486717
Name: minofday, dtype: float64

In [40]:
impall['dayofweek']

xid  yr  
1    2011    0.059460
     2012    0.036552
     2013    0.054372
     2014    0.073713
     2015    0.078164
2    2013    0.076669
     2014    0.036874
     2015    0.042893
5    2011    0.064273
     2012    0.048751
     2013    0.062798
     2014    0.074770
     2015    0.070818
6    2013    0.082915
     2014    0.023244
     2015    0.029960
Name: dayofweek, dtype: float64

In [41]:
impall[['month', 'week', 'year']].sum(1)

xid  yr  
1    2011    0.036578
     2012    0.027110
     2013    0.026587
     2014    0.025584
     2015    0.029362
2    2013    0.040025
     2014    0.023285
     2015    0.027593
5    2011    0.034807
     2012    0.027338
     2013    0.021490
     2014    0.028723
     2015    0.042639
6    2013    0.072326
     2014    0.026997
     2015    0.028013
dtype: float64

In [42]:
weather_cols = [col for col in impall.columns.values if 'rain' in col or 'precip' in col or 'thund' in col or 
              'snow' in col or 'fog' in col or 'temp' in col or 'viz' in col or 'wind' in col]

In [43]:
impall['weather'] = impall[weather_cols].sum(1)

In [44]:
north = impall.loc[[2, 6], :]

south = impall.loc[([1, 5], [2013, 2014, 2015]),:]

In [45]:
north.trend.mean()

0.12598766953257

In [46]:
south.trend.mean()

0.11427755265311666

In [47]:
north.event.mean()

0.10705726994105376

In [48]:
south.event.mean()

0.06948592394546943

In [49]:
north.minofday.mean()

0.47953359679616664

In [50]:
south.minofday.mean()

0.576085296976

In [51]:
north[['dayofweek', 'month', 'week', 'year']].sum(1).mean()

0.08513212207225167

In [52]:
south[['dayofweek', 'month', 'week', 'year']].sum(1).mean()

0.09817016852251832

In [53]:
south.weather.mean()

0.14198105790298435

In [54]:
north.weather.mean()

0.20228934165789078

In [55]:
north[weather_cols].mean(0)

fog                0.002856
precip             0.004889
precip_m1          0.004797
precip_m2          0.005244
precip_p1          0.006598
precip_p2          0.005910
precip_p3          0.005253
rain               0.004071
rain_m1            0.005037
rain_m2            0.004185
rain_p1            0.004212
rain_p2            0.004384
rain_p3            0.003976
snow               0.001019
snow_m1            0.001499
snow_m2            0.000484
snow_p1            0.000521
snow_p2            0.001281
snow_p3            0.000570
temp_max           0.008422
temp_max_m1        0.007112
temp_max_m2        0.006562
temp_max_p1        0.007239
temp_max_p2        0.007102
temp_max_p3        0.008776
temp_mean          0.006576
temp_mean_m1       0.006303
temp_mean_m2       0.006680
temp_mean_p1       0.007197
temp_mean_p2       0.006291
temp_mean_p3       0.007159
temp_min           0.006494
temp_min_m1        0.006661
temp_min_m2        0.007581
temp_min_p1        0.006933
temp_min_p2        0

In [56]:
north[weather_cols].mean(0)

fog                0.002856
precip             0.004889
precip_m1          0.004797
precip_m2          0.005244
precip_p1          0.006598
precip_p2          0.005910
precip_p3          0.005253
rain               0.004071
rain_m1            0.005037
rain_m2            0.004185
rain_p1            0.004212
rain_p2            0.004384
rain_p3            0.003976
snow               0.001019
snow_m1            0.001499
snow_m2            0.000484
snow_p1            0.000521
snow_p2            0.001281
snow_p3            0.000570
temp_max           0.008422
temp_max_m1        0.007112
temp_max_m2        0.006562
temp_max_p1        0.007239
temp_max_p2        0.007102
temp_max_p3        0.008776
temp_mean          0.006576
temp_mean_m1       0.006303
temp_mean_m2       0.006680
temp_mean_p1       0.007197
temp_mean_p2       0.006291
temp_mean_p3       0.007159
temp_min           0.006494
temp_min_m1        0.006661
temp_min_m2        0.007581
temp_min_p1        0.006933
temp_min_p2        0

# Top holidays

In [57]:
north[event_cols].mean().sort_values()

event_lead4_mlk                0.000024
event_lag1_president           0.000026
event_lag2_mlk                 0.000028
event_lead1_mlk                0.000028
event_lag3_president           0.000030
event_lag4_newyears            0.000030
event_lag3_thanksgiving        0.000034
event_lead3_mothers            0.000039
event_lead4_veterans           0.000041
event_newyears                 0.000041
event_lag1_mlk                 0.000042
event_lag3_newyears            0.000043
event_mlk                      0.000044
event_lag1_mothers             0.000051
event_halloween                0.000055
event_lead2_halloween          0.000059
event_lag3_labor               0.000059
event_lag2_mothers             0.000060
event_lag2_labor               0.000068
event_lead3_halloween          0.000070
event_lag4_president           0.000070
event_lag4_veterans            0.000074
event_lead2_mlk                0.000074
event_lag4_independence        0.000075
event_lead4_goodfriday         0.000076


In [58]:
south[event_cols].mean().sort_values()

event_lag3_mlk                 0.000019
event_lead1_mlk                0.000022
event_lag2_mlk                 0.000026
event_lag4_newyears            0.000029
event_lag4_mlk                 0.000029
event_lead2_thanksgiving       0.000035
event_lag2_president           0.000042
event_lead2_halloween          0.000046
event_lead3_thanksgiving       0.000051
event_lag3_labor               0.000053
event_lag1_mlk                 0.000054
event_victoria                 0.000058
event_lag4_labor               0.000062
event_lead4_thanksgiving       0.000062
event_lead4_mlk                0.000064
event_lag4_thanksgiving        0.000067
event_lag2_halloween           0.000070
event_lag3_goodfriday          0.000072
event_lag2_newyears            0.000073
event_lag1_president           0.000073
event_lag3_newyears            0.000075
event_lag2_labor               0.000080
event_lag4_veterans            0.000082
event_lead2_mlk                0.000087
event_lag4_halloween           0.000087


## Feature importances are low for infrequent events.  Analyze effect of scaling frequency of event on weather.

In [70]:
dfsouth = pd.DataFrame()
for xing in [1, 5]:
    dfsouth = pd.concat([dfsouth, select_mungedata(3, xing, '2013-1-1', '2016-1-1')])

In [75]:
dfnorth = pd.DataFrame()
for xing in [2, 6]:
    dfnorth = pd.concat([dfnorth, select_mungedata(4, xing, '2013-1-1', '2016-1-1')])

In [81]:
dfnorth[dfnorth > 0].count(0)[weather_cols]

fog                 14640
precip              51903
precip_m1           51952
precip_m2           51971
precip_p1           51910
precip_p2           51960
precip_p3           51986
rain                47091
rain_m1             47182
rain_m2             47200
rain_p1             47177
rain_p2             47400
rain_p3             47265
snow                 1177
snow_m1              1343
snow_m2              1331
snow_p1              1206
snow_p2              1226
snow_p3              1218
temp_max           102824
temp_max_m1        102824
temp_max_m2        102824
temp_max_p1        102824
temp_max_p2        102824
temp_max_p3        102824
temp_mean          102824
temp_mean_m1       102824
temp_mean_m2       102824
temp_mean_p1       102824
temp_mean_p2       102824
temp_mean_p3       102824
temp_min           102824
temp_min_m1        102824
temp_min_m2        102824
temp_min_p1        102824
temp_min_p2        102824
temp_min_p3        102824
thunderstorm          563
thunderstorm

In [83]:
(north[weather_cols].mean(0) / dfnorth[dfnorth > 0].count(0)[weather_cols] * 102824).sort_values(ascending=False)

thunderstorm_m2    0.170662
thunderstorm       0.120159
snow_m1            0.114748
snow_p2            0.107416
thunderstorm_p3    0.090947
snow               0.089063
thunderstorm_m1    0.076944
thunderstorm_p1    0.074148
thunderstorm_p2    0.065419
snow_p3            0.048123
snow_p1            0.044398
snow_m2            0.037406
fog                0.020061
precip_p1          0.013069
precip_p2          0.011696
rain_m1            0.010976
precip_p3          0.010389
precip_m2          0.010374
precip             0.009686
rain_p2            0.009510
precip_m1          0.009494
rain_p1            0.009180
rain_m2            0.009116
rain               0.008888
temp_max_p3        0.008776
rain_p3            0.008650
temp_max           0.008422
temp_min_m2        0.007581
temp_max_p1        0.007239
temp_mean_p1       0.007197
temp_mean_p3       0.007159
temp_max_m1        0.007112
temp_max_p2        0.007102
temp_min_p1        0.006933
temp_mean_m2       0.006680
temp_min_m1        0

In [84]:
(south[weather_cols].mean(0) / dfsouth[dfsouth > 0].count(0)[weather_cols] * 102824).sort_values(ascending=False)

snow_m2            0.075491
snow               0.072897
snow_p3            0.068718
snow_p2            0.067191
snow_p1            0.065161
thunderstorm_p3    0.061710
snow_m1            0.060020
thunderstorm_p1    0.041854
thunderstorm_p2    0.033789
thunderstorm_m2    0.031777
thunderstorm       0.029545
thunderstorm_m1    0.025242
fog                0.021427
precip_m2          0.009150
precip_p2          0.008782
rain_p3            0.008222
precip_p3          0.008127
rain_m1            0.008108
precip_p1          0.008075
rain               0.008073
precip             0.007621
precip_m1          0.007608
rain_p1            0.007363
rain_p2            0.007111
rain_m2            0.007073
temp_min_m2        0.005648
temp_mean_m2       0.005492
temp_min           0.005361
temp_min_m1        0.005240
wind_max           0.005162
temp_min_p1        0.004713
temp_max_m2        0.004579
temp_max_p3        0.004530
temp_max_p1        0.004471
temp_max_p2        0.004470
temp_mean_p3       0