In [28]:
%load_ext autoreload
%autoreload 2

In [59]:
from BorderModel import run_Incremental, harmonic_mean #, print_importances
from BorderQuery import select_features, select_mungedata_simple, select_mungedata, insert_predictions
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
from dbhelper import PgDB
import pprint
import random

In [2]:
import os
from ipyparallel import Client
rc = Client()
dview = rc[:]

# set proper working directory on all clients
# apply_sync runs os.chdir once on every engine in the view and waits for
# completion, so it does not depend on how many engines are connected
cwd = os.getcwd()
dview.apply_sync(os.chdir, cwd)
# print(dview.apply_sync(os.getcwd))

with dview.sync_imports():
    import datetime
    from BorderModel import run_Incremental
    from BorderQuery import select_features, select_mungedata_simple, select_mungedata
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.grid_search import GridSearchCV

importing datetime on engine(s)
importing run_Incremental from BorderModel on engine(s)
importing select_features,select_mungedata_simple,select_mungedata from BorderQuery on engine(s)
importing ExtraTreesRegressor from sklearn.ensemble on engine(s)
importing GridSearchCV from sklearn.grid_search on engine(s)


In [15]:
from BorderModel import IncrementalModel, run_Incremental
with dview.sync_imports():
    from BorderModel import IncrementalModel, run_Incremental
import random

def create_train_test(year, train_length=2):
    '''
    Build the date boundaries for training on the years leading up to
    `year` and testing on `year` itself.

    IN
        year: year to predict
        train_length: number of years to train
    OUT
        (train_start, train_end, test_start, test_end) as 'YYYY-MM-DD'
        strings, each January 1st; train_end and test_start are the same date
    '''
    def jan_first(y):
        # January 1st of year y, formatted as YYYY-MM-DD
        return datetime.date(y, 1, 1).strftime('%Y-%m-%d')

    train_start = jan_first(year - train_length)
    split = jan_first(year)
    return train_start, split, split, jan_first(year + 1)
    
def compare_years_parallel(model, xing, munger_id, years, train_length=2):
    '''
    Submit one asynchronous run_Incremental job per year to the cluster.

    IN
        model: estimator passed through to run_Incremental
        xing: crossing id passed through to run_Incremental
        munger_id: munger id passed through to run_Incremental
        years: iterable of years to predict
        train_length: number of years of training data before each year
    OUT
        dict mapping year -> async result returned by apply_async
    '''
    # Choose among the engines that are actually connected instead of
    # assuming ids 0..31 exist
    engine_ids = rc.ids
    prlist = {}
    for year in years:
        cpu = random.choice(engine_ids)
        train_start, train_end, test_start, test_end = create_train_test(year, train_length)

        prlist[year] = rc[cpu].apply_async(run_Incremental, model, munger_id, xing,
                                           train_start, train_end,
                                           test_start, test_end)

    return prlist

importing IncrementalModel,run_Incremental from BorderModel on engine(s)


# Run predictions on both crossings for 2014 & 2015 to compare r2 scores

## Peace Arch

In [16]:
# Submit one job per test year (2014, 2015) for crossing 2
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
pr2 = compare_years_parallel(model, xing=2, munger_id=4, years=range(2014, 2016))

In [20]:
model2 = {}
for year in range(2014, 2016):
    print year, pr2[year].ready()
    if pr2[year].ready():
        model2[year] = pr2[year].get(1)
        print model2[year].score()

2014 True
{'model': 0.41489579865598514, 'ensemble': 0.39776025849572016, 'baseline': 0.3349124462672769}
2015 True
{'model': 0.062822942108717306, 'ensemble': 0.11726178803359477, 'baseline': -0.12817610437840088}


## Pacific Highway

In [17]:
# Submit one job per test year (2014, 2015) for crossing 6
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
pr6 = compare_years_parallel(model, xing=6, munger_id=4, years=range(2014, 2016))

In [22]:
model6 = {}
for year in range(2014, 2016):
    print year, pr6[year].ready()
    if pr6[year].ready():
        model6[year] = pr6[year].get(1)
        print model6[year].score()

2014 True
{'model': 0.29951918477103701, 'ensemble': 0.31087256287944842, 'baseline': 0.28888403182573141}
2015 True
{'model': -0.18565959089138473, 'ensemble': -0.061926468700798765, 'baseline': -0.33484653403631937}


# Investigate feature importances

In [24]:
# Submit a 2015 run for each crossing, with bootstrap and out-of-bag scoring
# enabled on the estimator
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96, bootstrap=True, oob_score=True)
prlist = {}
# The train/test window is the same for both crossings, so compute it once
train_start, train_end, test_start, test_end = create_train_test(2015, 2)
for xing in [2,6]:
    # Choose among the engines that are actually connected instead of
    # assuming ids 0..31 exist
    cpu = random.choice(rc.ids)

    prlist[xing] = rc[cpu].apply_async(run_Incremental, model, 4, xing,
                                       train_start, train_end,
                                       test_start, test_end)

In [31]:
im = {}
for xing in [2,6]:
    print xing, prlist[xing].ready()
    if prlist[xing].ready():
        im[xing] = prlist[xing].get(1)
        print im[xing].score()

2 True
{'model': 0.073487572044148353, 'ensemble': 0.12332765337685814, 'baseline': -0.12817610437840088}
6 True
{'model': -0.18050459559560439, 'ensemble': -0.065337907083191871, 'baseline': -0.33484653403631937}


In [40]:
pp = pprint.PrettyPrinter(indent=4)
for xing in [2,6]:
    print xing
    pp.pprint(sorted(zip(im[xing].X.columns, im[xing].model.best_estimator_.feature_importances_), key=lambda x: x[1])[::-1])

2
[   ('minofday', 0.46306504777679175),
    ('dayofweek', 0.043756277617258331),
    ('avg_delta_10', 0.016482864140242609),
    ('avg_delta_11', 0.016256667530719086),
    ('avg_delta_12', 0.014424507863656649),
    ('avg_delta_9', 0.014126374801040521),
    ('avg_delta_8', 0.013000273061726875),
    ('event_lead2_labor', 0.012314665943872101),
    ('avg_delta_7', 0.01129038330944244),
    ('avg_delta_2', 0.010764268478988837),
    ('avg_delta_3', 0.010239391658428848),
    ('year', 0.0096900997719012085),
    ('event_victoria', 0.0093669841333706115),
    ('event_lead2_president', 0.0093059956641210941),
    ('week', 0.0092887981729932127),
    ('temp_min_m1', 0.0092626928146924813),
    ('avg_delta_5', 0.0089886639914987954),
    ('month', 0.008816298076970739),
    ('precip_p1', 0.0086315581510708825),
    ('avg_delta_4', 0.0084653878384657565),
    ('temp_mean_p1', 0.0082858570489206199),
    ('avg_delta_6', 0.0082686315055852475),
    ('temp_mean_p3', 0.0082495955574493057),
   

# Redo predictions after extending imputing to 2011

## Peace Arch

In [41]:
# Submit one job per test year (2013-2015) for crossing 2
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
pr2 = compare_years_parallel(model, xing=2, munger_id=4, years=range(2013, 2016))

In [46]:
model2 = {}
for year in range(2013, 2016):
    print year, pr2[year].ready()
    if pr2[year].ready():
        model2[year] = pr2[year].get(1)
        print model2[year].score()

2013 True
{'model': 0.39274242817015859, 'ensemble': 0.3779144914504825, 'baseline': 0.33042393202063758}
2014 True
{'model': 0.41614044553351881, 'ensemble': 0.39739431840013084, 'baseline': 0.33485359571062911}
2015 True
{'model': 0.068490772561140112, 'ensemble': 0.1227213287558151, 'baseline': -0.1272957779194126}


## Pacific Highway

In [48]:
# Submit one job per test year (2013-2015) for crossing 6
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
pr6 = compare_years_parallel(model, xing=6, munger_id=4, years=range(2013, 2016))

In [51]:
model6 = {}
for year in range(2013, 2016):
    print year, pr6[year].ready()
    if pr6[year].ready():
        model6[year] = pr6[year].get(1)
        print model6[year].score()

2013 True
{'model': 0.25378230717971639, 'ensemble': 0.27673036933991857, 'baseline': 0.26915343237860456}
2014 True
{'model': 0.30628301888427711, 'ensemble': 0.31576159621659483, 'baseline': 0.29165671141312366}
2015 True
{'model': -0.1959661150021903, 'ensemble': -0.064466474321794198, 'baseline': -0.32952975930038231}


# Predict 2016

In [64]:
# Submit one run per crossing; the four date strings are passed in the same
# positions as train_start, train_end, test_start, test_end elsewhere in
# this notebook
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96)
pr = {}
for xing in [2, 6]:
    # Choose among the engines that are actually connected instead of
    # assuming ids 0..31 exist
    cpu = random.choice(rc.ids)
    pr[xing] = rc[cpu].apply_async(run_Incremental, model, 4, xing, '2013-1-1', '2016-1-1', '2016-1-1', '2018-1-1')

In [66]:
im = {}
for xing in [2, 6]:
    print xing, pr[xing].ready()
    if pr[xing].ready():
        im[xing] = pr[xing].get(1)
        ensemble = harmonic_mean((im[xing].y_predict, im[xing].baseline()), (1, 1))
        insert_predictions('v2.1', 4, xing, ensemble1.index, ensemble1.values)

2 False
6 False
