In [3]:
from BorderModel import IncrementalModel, run_Incremental, sort_importances, print_importances
from BorderQuery import select_features, select_mungedata_simple, select_mungedata
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import pprint
import itertools
import pdb
import random
import pandas as pd
import numpy as np

### Initialize for parallel operations

In [374]:
import os
from ipyparallel import Client
rc = Client()
dview = rc[:]

# set proper working directory on all clients
cwd = os.getcwd()
# apply_sync runs the call once on every engine of the view and blocks until
# all of them are done, so no engine count has to be hard-coded here.
dview.apply_sync(os.chdir, cwd)
# print(dview.apply_sync(os.getcwd))

# sync_imports performs each import locally and on every engine of the view
# NOTE(review): BorderModel / BorderQuery are presumably found via the working
# directory set above -- confirm
with dview.sync_imports():
    import datetime
    from BorderModel import IncrementalModel, run_Incremental
    from BorderQuery import select_features, select_mungedata_simple, select_mungedata
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.grid_search import GridSearchCV

importing datetime on engine(s)
importing IncrementalModel,run_Incremental from BorderModel on engine(s)
importing select_features,select_mungedata_simple,select_mungedata from BorderQuery on engine(s)
importing ExtraTreesRegressor from sklearn.ensemble on engine(s)
importing GridSearchCV from sklearn.grid_search on engine(s)


In [375]:
from BorderModel import IncrementalModel, run_Incremental
with dview.sync_imports():
    from BorderModel import IncrementalModel, run_Incremental
import random

def create_train_test(year, train_length=2):
    '''
    Build the train / test date boundaries around a single test year.

    IN
        year: the year to predict (test period)
        train_length: number of years of training data preceding `year`
    OUT
        (train_start, train_end, test_start, test_end) as 'YYYY-MM-DD' strings;
        every boundary is January 1st and train_end equals test_start
    '''
    def jan_first(yr):
        return datetime.date(yr, 1, 1).strftime('%Y-%m-%d')

    boundary_years = (year - train_length, year, year, year + 1)
    return tuple(jan_first(yr) for yr in boundary_years)
    
def compare_years_parallel(model, xing, munger_id, years):
    '''
    Submit one asynchronous run_Incremental job per year to the cluster.

    IN
        model: estimator handed through to run_Incremental
        xing: crossing id handed through to run_Incremental
        munger_id: munger id handed through to run_Incremental
        years: iterable of years to predict; each year becomes one job
    OUT
        dict mapping year -> async result of the submitted job
    '''
    prlist = {}
    for year in years:
        # Choose among the engines that are actually connected instead of
        # assuming a fixed cluster size.
        engine_id = random.choice(rc.ids)
        train_start, train_end, test_start, test_end = create_train_test(year, 2)

        # Note the argument order expected here: munger_id comes before xing
        prlist[year] = rc[engine_id].apply_async(run_Incremental, model, munger_id, xing,
                                                 train_start, train_end,
                                                 test_start, test_end)

    return prlist

importing IncrementalModel,run_Incremental from BorderModel on engine(s)


In [376]:
def model_plot(model, start, end):
    '''
    Plot actuals, baseline, predictions and ensemble for dates in [start, end).

    IN
        model: object providing baseline(), ensemble(), actual and y_predict
        start: first date to include (compared against index.date)
        end: first date to exclude (compared against index.date)

    NOTE(review): assumes every series has a datetime-like index and that
    start / end are datetime.date values -- confirm against callers
    '''
    plt.figure(figsize=(16,4))
    baseline = model.baseline()
    ensemble = model.ensemble()
    curves = [
        (model.actual, 'actuals'),
        (baseline, 'baseline'),
        (model.y_predict, 'predictions'),
        (ensemble, 'ensemble'),
    ]

    for series, label in curves:
        dates = series.index.date
        in_window = (dates >= start) & (dates < end)
        plt.plot(series[in_window], label=label)
    plt.legend()
    
def imp_df(xid, model_years):
    '''
    Collect the feature importances of the yearly models of one crossing.

    IN
        xid: crossing id, stored in the 'xid' index level
        model_years: dict mapping year -> model object exposing
            model.best_estimator_ and X.columns
    OUT
        DataFrame indexed by (xid, yr) with one row per year; the columns are
        the labels found in the first column of sort_importances' result.
        An empty DataFrame is returned when model_years is empty.

    NOTE(review): assumes sort_importances returns (feature name, importance)
    pairs -- confirm in BorderModel
    '''
    frames = []
    for year, model in model_years.items():
        # Convert once; column 0 holds the labels, column 1 the values
        imp = np.array(sort_importances(model.model.best_estimator_, model.X.columns))
        df = pd.DataFrame(imp[:, 1], imp[:, 0]).T
        df['xid'] = xid
        df['yr'] = int(year)
        frames.append(df.set_index(['xid', 'yr']))

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame every year
    return pd.concat(frames)

## Pacific Crossing South

In [377]:
# Crossing 5 with munger 3: one asynchronous job per test year 2011-2015
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
pr5 = compare_years_parallel(model, xing=5, munger_id=3, years=range(2011, 2016))

In [421]:
# Collect the yearly results that have finished; report the ones still running.
model5 = {}
for year in range(2011, 2016):
    result = pr5[year]
    if result.ready():
        # ready() is already True, so the 1-second timeout is only a safeguard
        model5[year] = result.get(1)
        print(model5[year].score())
    else:
        print("%s not ready" % year)

{'model': 0.49288736279227985, 'ensemble': 0.54696587195236956, 'baseline': 0.50830951929768831}
{'model': 0.61514885220285942, 'ensemble': 0.64429596179219595, 'baseline': 0.6264414468772701}
{'model': 0.61504775414227275, 'ensemble': 0.60102454050577681, 'baseline': 0.57215536973016168}
{'model': 0.57792935460423078, 'ensemble': 0.59804534045376045, 'baseline': 0.51290499913254095}
{'model': 0.32134963611719236, 'ensemble': 0.32372928929853795, 'baseline': 0.23074703434185284}


In [422]:
imp5 = imp_df(5, model5)

## Pacific Crossing North

In [380]:
# Crossing 6 with munger 4: one asynchronous job per test year 2013-2015
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
pr6 = compare_years_parallel(model, xing=6, munger_id=4, years=range(2013, 2016))

In [414]:
# Collect the yearly results that have finished; report the ones still running.
model6 = {}
for year in range(2013, 2016):
    result = pr6[year]
    if result.ready():
        # ready() is already True, so the 1-second timeout is only a safeguard
        model6[year] = result.get(1)
        print(model6[year].score())
    else:
        print("%s not ready" % year)

{'model': 0.25320951299174665, 'ensemble': 0.27696545876290857, 'baseline': 0.26915343237860456}
{'model': 0.30003813869069584, 'ensemble': 0.31299321758551935, 'baseline': 0.29165671141312366}
{'model': -0.18949947619078555, 'ensemble': -0.058769421901040619, 'baseline': -0.32952975930038231}


In [415]:
imp6 = imp_df(6, model6)

## Peace Arch South

In [383]:
# Crossing 1 with munger 3: one asynchronous job per test year 2011-2015
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
pr1 = compare_years_parallel(model, xing=1, munger_id=3, years=range(2011, 2016))

In [423]:
# Collect the yearly results that have finished; report the ones still running.
model1 = {}
for year in range(2011, 2016):
    result = pr1[year]
    if result.ready():
        # ready() is already True, so the 1-second timeout is only a safeguard
        model1[year] = result.get(1)
        print(model1[year].score())
    else:
        print("%s not ready" % year)

{'model': 0.11119174659686448, 'ensemble': 0.39072582624651886, 'baseline': 0.39385165054016535}
{'model': 0.55672878925324998, 'ensemble': 0.62167070256277257, 'baseline': 0.60900901758379433}
{'model': 0.63829958139168153, 'ensemble': 0.6380169391467585, 'baseline': 0.61360307915900703}
{'model': 0.60094886476415943, 'ensemble': 0.61233493221941937, 'baseline': 0.56996406060235849}
{'model': 0.15344579612365894, 'ensemble': 0.18745585639513052, 'baseline': 0.1059067680212642}


In [424]:
imp1 = imp_df(1, model1)

## Peace Arch North

In [386]:
# Crossing 2 with munger 4: one asynchronous job per test year 2013-2015
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
pr2 = compare_years_parallel(model, xing=2, munger_id=4, years=range(2013, 2016))

In [417]:
# Collect the yearly results that have finished; report the ones still running.
model2 = {}
for year in range(2013, 2016):
    result = pr2[year]
    if result.ready():
        # ready() is already True, so the 1-second timeout is only a safeguard
        model2[year] = result.get(1)
        print(model2[year].score())
    else:
        print("%s not ready" % year)

{'model': 0.38463435093991338, 'ensemble': 0.37393983097866912, 'baseline': 0.33042393202063758}
{'model': 0.41927825400267738, 'ensemble': 0.39841584748602588, 'baseline': 0.33485359571062911}
{'model': 0.052299021629152675, 'ensemble': 0.11929093903203281, 'baseline': -0.1272957779194126}


In [418]:
imp2 = imp_df(2, model2)

# Combine and compare feature importances

In [425]:
# Stack the per-crossing importance tables (crossings 1, 2, 5 and 6) into one
# frame and cast every value to float.
# NOTE(review): the cast suggests imp_df yields non-numeric (string) values -- confirm
impall = pd.concat([imp1, imp2, imp5, imp6]).astype(float)

In [426]:
impall

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_delta_1,avg_delta_10,avg_delta_11,avg_delta_12,avg_delta_2,avg_delta_3,avg_delta_4,avg_delta_5,avg_delta_6,avg_delta_7,...,thunderstorm,thunderstorm_m1,thunderstorm_m2,thunderstorm_p1,thunderstorm_p2,thunderstorm_p3,viz_max,week,wind_max,year
xid,yr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,2011,0.013121,0.016608,0.021084,0.015362,0.011197,0.007649,0.008366,0.00777,0.011153,0.012862,...,0.000492,0.000492,0.002453,0.000253,0.001912,0.000659,0.000292,0.014181,0.004449,0.009262
1,2012,0.018042,0.015961,0.017665,0.012493,0.015391,0.011811,0.012284,0.00854,0.013629,0.008576,...,0.000176,0.000528,0.003691,0.000345,0.002806,0.00021,0.00023,0.008491,0.007686,0.011646
1,2013,0.007592,0.012019,0.010276,0.008327,0.008164,0.007603,0.00765,0.008643,0.01217,0.012265,...,0.000172,0.00023,0.000183,0.000285,0.000136,0.000367,0.000134,0.007781,0.005972,0.01124
1,2014,0.006335,0.008152,0.006308,0.006134,0.006111,0.008624,0.005927,0.008336,0.007867,0.009068,...,0.000244,0.000266,0.000177,0.000345,0.00023,0.000322,0.000168,0.010394,0.004886,0.005906
1,2015,0.00639,0.009285,0.008932,0.008602,0.0062,0.0066,0.007435,0.009397,0.011109,0.010393,...,3.6e-05,3.5e-05,0.000105,8e-05,0.000249,0.000183,0.00014,0.007942,0.004577,0.011838
2,2013,0.006363,0.012337,0.012991,0.007218,0.009292,0.008164,0.007991,0.011642,0.017061,0.006499,...,0.001363,0.000813,0.001179,0.000272,0.000345,0.000828,5e-06,0.007746,0.005611,0.026964
2,2014,0.00809,0.010241,0.00839,0.008563,0.008605,0.009413,0.008213,0.009799,0.00808,0.007831,...,0.00082,0.000381,0.001784,0.000365,0.000534,0.000826,0.001254,0.010038,0.005899,0.006546
2,2015,0.006442,0.019309,0.016008,0.015609,0.010782,0.008802,0.008314,0.009657,0.008034,0.009225,...,4.1e-05,0.00012,0.000713,0.000169,0.000216,5.4e-05,0.000681,0.008968,0.006581,0.009431
5,2011,0.006549,0.021182,0.020769,0.019018,0.009796,0.007586,0.006669,0.005445,0.010395,0.012446,...,0.000433,0.000369,0.001622,0.000389,0.003817,0.000912,0.000471,0.014751,0.004644,0.008399
5,2012,0.007262,0.021025,0.021888,0.015868,0.011948,0.006899,0.007534,0.007442,0.011422,0.009843,...,0.000166,0.000346,0.001434,0.000505,0.002063,0.000208,9e-05,0.007859,0.006201,0.011429


In [427]:
avgdelta_cols = [col for col in impall.columns.values if 'avg_delta' in col]

In [428]:
impall['trend'] = impall[avgdelta_cols].sum(1)

In [429]:
# Skip the aggregate 'event' column itself: it is added to impall further down,
# and matching it on a re-run would fold the total back into its own sum.
event_cols = [col for col in impall.columns.values if 'event' in col and col != 'event']

In [430]:
impall['event'] = impall[event_cols].sum(1)

In [431]:
impall.trend

xid  yr  
1    2011    0.148575
     2012    0.163130
     2013    0.118740
     2014    0.104980
     2015    0.113206
2    2013    0.122681
     2014    0.108005
     2015    0.140273
5    2011    0.153388
     2012    0.147843
     2013    0.115763
     2014    0.116725
     2015    0.116714
6    2013    0.105261
     2014    0.122626
     2015    0.154850
Name: trend, dtype: float64

In [432]:
impall.event

xid  yr  
1    2011    0.087045
     2012    0.109531
     2013    0.084037
     2014    0.062497
     2015    0.063972
2    2013    0.127130
     2014    0.108524
     2015    0.103970
5    2011    0.075208
     2012    0.084219
     2013    0.081283
     2014    0.058345
     2015    0.064630
6    2013    0.119153
     2014    0.100158
     2015    0.081439
Name: event, dtype: float64

In [433]:
impall.minofday

xid  yr  
1    2011    0.496642
     2012    0.477470
     2013    0.564629
     2014    0.594715
     2015    0.573292
2    2013    0.463182
     2014    0.503950
     2015    0.475748
5    2011    0.520597
     2012    0.537706
     2013    0.575292
     2014    0.582611
     2015    0.564925
6    2013    0.457621
     2014    0.489205
     2015    0.486568
Name: minofday, dtype: float64

In [434]:
impall['dayofweek']

xid  yr  
1    2011    0.057748
     2012    0.036987
     2013    0.054404
     2014    0.078794
     2015    0.077533
2    2013    0.077430
     2014    0.036541
     2015    0.043235
5    2011    0.065621
     2012    0.049718
     2013    0.062142
     2014    0.073814
     2015    0.070466
6    2013    0.077635
     2014    0.023863
     2015    0.029494
Name: dayofweek, dtype: float64

In [435]:
impall[['month', 'week', 'year']].sum(1)

xid  yr  
1    2011    0.036850
     2012    0.027764
     2013    0.026774
     2014    0.026073
     2015    0.028110
2    2013    0.042058
     2014    0.023749
     2015    0.027341
5    2011    0.035767
     2012    0.026596
     2013    0.021857
     2014    0.028429
     2015    0.042072
6    2013    0.068632
     2014    0.026551
     2015    0.029055
dtype: float64

In [436]:
# Every column whose name contains one of the weather-related substrings
weather_cols = [
    col for col in impall.columns.values
    if any(token in col
           for token in ('rain', 'precip', 'thund', 'snow', 'fog', 'temp', 'viz', 'wind'))
]

In [437]:
impall['weather'] = impall[weather_cols].sum(1)

In [438]:
# Northbound crossings (xid 2 and 6), all modelled years
north = impall.loc[[2, 6], :]

# Southbound crossings (xid 1 and 5), restricted to 2013-2015 -- the only years
# the northbound models were run for -- so both groups cover the same years
south = impall.loc[([1, 5], [2013, 2014, 2015]),:]

In [439]:
north.trend.mean()

0.12561597686278833

In [440]:
south.trend.mean()

0.11435465755884167

In [441]:
north.event.mean()

0.10672913423305692

In [442]:
south.event.mean()

0.06912731099871415

In [443]:
north.minofday.mean()

0.47937890121550003

In [444]:
south.minofday.mean()

0.5759106940055

In [459]:
north[['dayofweek', 'month', 'week', 'year']].sum(1).mean()

0.08426374535455833

In [460]:
south[['dayofweek', 'month', 'week', 'year']].sum(1).mean()

0.09841144330153667

In [451]:
south.weather.mean()

0.14219589413535083

In [450]:
north.weather.mean()

0.20401224233424628

In [454]:
north[weather_cols].mean(0)

fog                0.002962
precip             0.004874
precip_m1          0.004786
precip_m2          0.005223
precip_p1          0.006596
precip_p2          0.005898
precip_p3          0.005238
rain               0.003960
rain_m1            0.005195
rain_m2            0.004241
rain_p1            0.004315
rain_p2            0.004365
rain_p3            0.003978
snow               0.000988
snow_m1            0.001628
snow_m2            0.000516
snow_p1            0.000531
snow_p2            0.001461
snow_p3            0.000548
temp_max           0.008723
temp_max_m1        0.006922
temp_max_m2        0.006791
temp_max_p1        0.007327
temp_max_p2        0.007113
temp_max_p3        0.008819
temp_mean          0.006856
temp_mean_m1       0.006048
temp_mean_m2       0.006958
temp_mean_p1       0.006813
temp_mean_p2       0.006007
temp_mean_p3       0.006948
temp_min           0.006741
temp_min_m1        0.006587
temp_min_m2        0.007470
temp_min_p1        0.007432
temp_min_p2        0

In [455]:
# NOTE(review): identical to the previous cell; presumably south[weather_cols]
# was intended for the north/south comparison -- confirm before changing
north[weather_cols].mean(0)

fog                0.002962
precip             0.004874
precip_m1          0.004786
precip_m2          0.005223
precip_p1          0.006596
precip_p2          0.005898
precip_p3          0.005238
rain               0.003960
rain_m1            0.005195
rain_m2            0.004241
rain_p1            0.004315
rain_p2            0.004365
rain_p3            0.003978
snow               0.000988
snow_m1            0.001628
snow_m2            0.000516
snow_p1            0.000531
snow_p2            0.001461
snow_p3            0.000548
temp_max           0.008723
temp_max_m1        0.006922
temp_max_m2        0.006791
temp_max_p1        0.007327
temp_max_p2        0.007113
temp_max_p3        0.008819
temp_mean          0.006856
temp_mean_m1       0.006048
temp_mean_m2       0.006958
temp_mean_p1       0.006813
temp_mean_p2       0.006007
temp_mean_p3       0.006948
temp_min           0.006741
temp_min_m1        0.006587
temp_min_m2        0.007470
temp_min_p1        0.007432
temp_min_p2        0

# Top holidays

In [457]:
# sort_values() sorts ascending, so the least important event features are listed first
north[event_cols].mean().sort_values()

event_lag1_president           0.000024
event_lag2_mlk                 0.000026
event_lag3_thanksgiving        0.000028
event_lag4_newyears            0.000028
event_lead4_mlk                0.000028
event_lag3_president           0.000029
event_lead1_mlk                0.000033
event_newyears                 0.000034
event_lag3_newyears            0.000036
event_lag1_mlk                 0.000041
event_mlk                      0.000041
event_lead4_veterans           0.000041
event_lag1_mothers             0.000042
event_lead3_mothers            0.000046
event_lead2_halloween          0.000053
event_lag2_labor               0.000058
event_lag2_mothers             0.000060
event_lag3_labor               0.000061
event_halloween                0.000062
event_lag4_president           0.000065
event_lead2_mlk                0.000068
event_lead3_halloween          0.000069
event_lag4_veterans            0.000073
event_lead2_thanksgiving       0.000074
event_lag4_independence        0.000076


In [458]:
# sort_values() sorts ascending, so the least important event features are listed first
south[event_cols].mean().sort_values()

event_lead1_mlk                0.000020
event_lag3_mlk                 0.000021
event_lag2_mlk                 0.000023
event_lag4_mlk                 0.000028
event_lag4_newyears            0.000031
event_lag2_president           0.000037
event_lead2_thanksgiving       0.000039
event_lead2_halloween          0.000049
event_lead3_thanksgiving       0.000052
event_lag3_labor               0.000054
event_lag4_labor               0.000061
event_lag1_mlk                 0.000062
event_lag3_goodfriday          0.000063
event_victoria                 0.000064
event_lead4_thanksgiving       0.000064
event_lead4_mlk                0.000066
event_lag4_thanksgiving        0.000070
event_lag2_halloween           0.000075
event_lag4_veterans            0.000077
event_lag2_newyears            0.000080
event_lag4_halloween           0.000080
event_lag3_newyears            0.000081
event_lag2_labor               0.000083
event_lead2_mlk                0.000089
event_lag3_veterans            0.000093


## Feature importances are low for infrequent events.  Analyze effect of scaling frequency of event.

In [None]:
# Stack the munged data of the two southbound crossings (xid 1 and 5).
# NOTE(review): the leading 3 is presumably the munger id used for the
# southbound models above -- confirm against select_mungedata's signature
dfsouth = pd.DataFrame()
for xing in [1, 5]:
    # append() returns a new frame rather than modifying dfsouth in place,
    # so the result has to be assigned back or dfsouth stays empty
    dfsouth = dfsouth.append(select_mungedata(3, xing, '2013-1-1', '2016-1-1'))