In [1]:
from BorderModel import IncrementalModel
from BorderQuery import select_features, select_mungedata_simple, select_mungedata
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import pprint
import itertools
import pdb
import random

In [2]:
%load_ext autoreload
%autoreload 2

### Initialize for parallel operations

In [3]:
import os
from ipyparallel import Client
# Connect to the running ipyparallel cluster and take a direct view of all engines.
rc = Client()
dview = rc[:]

# Set the working directory on every engine to match this notebook's.
# apply_sync runs the call once on each engine in the view regardless of how
# many engines are registered (no hard-coded engine count) and blocks until
# all of them have changed directory.
# NOTE(review): presumably needed so the project-local modules imported below
# resolve on the engines -- confirm.
cwd = os.getcwd()
dview.apply_sync(os.chdir, cwd)
# print(dview.apply_sync(os.getcwd))

with dview.sync_imports():
    import datetime
    from BorderModel import IncrementalModel
    from BorderQuery import select_features, select_mungedata_simple, select_mungedata
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.grid_search import GridSearchCV

importing datetime on engine(s)
importing IncrementalModel from BorderModel on engine(s)
importing select_features,select_mungedata_simple,select_mungedata from BorderQuery on engine(s)
importing ExtraTreesRegressor from sklearn.ensemble on engine(s)
importing GridSearchCV from sklearn.grid_search on engine(s)


In [4]:
from BorderModel import IncrementalModel
with dview.sync_imports():
    from BorderModel import IncrementalModel

def model_xing(model, xing, munger_id, train_start, train_end, test_start, test_end):
    '''
    Build an IncrementalModel for one crossing and predict over the test window.

    IN
        model: estimator, wrapped in GridSearchCV with an empty parameter grid
        xing: crossing id passed to the munged-data queries
        munger_id: munger id passed to the munged-data queries
        train_start, train_end: bounds passed to the training-data query
        test_start, test_end: bounds passed to the feature and actuals queries
    OUT
        the IncrementalModel, after set_actual() and predict() have been called
    '''
    training_data = select_mungedata(munger_id, xing, train_start, train_end)
    test_features = select_features(test_start, test_end)
    observed = select_mungedata_simple(munger_id, xing, test_start, test_end)

    # Empty grid: no hyper-parameters are searched, the estimator is used as configured.
    searcher = GridSearchCV(model, {})

    incremental = IncrementalModel(training_data, searcher, categoricals=['event'])
    # Register the observed wait times before predicting.
    incremental.set_actual(observed.waittime)
    incremental.predict(test_features)

    return incremental

def create_train_test(year, train_length=2):
    '''
    Build the training and test date windows for one prediction year.

    IN
        year: the year to predict (int)
        train_length: number of years of training data preceding `year`
    OUT
        (train_start, train_end, test_start, test_end) as 'YYYY-MM-DD' strings;
        training runs from Jan 1 of (year - train_length) to Jan 1 of year,
        testing from Jan 1 of year to Jan 1 of (year + 1)
    '''
    date_format = '%Y-%m-%d'
    boundary_years = (year - train_length, year, year + 1)
    train_start, split, test_end = [
        datetime.date(y, 1, 1).strftime(date_format) for y in boundary_years
    ]
    # The training window ends exactly where the test window begins.
    return train_start, split, split, test_end
    
def compare_years_parallel(model, xing, munger_id, years):
    '''
    Submit one model_xing job per year to the ipyparallel engines.

    IN
        model: estimator passed through to model_xing
        xing: crossing id passed through to model_xing
        munger_id: munger id passed through to model_xing
        years: iterable of years to predict; each is trained on the 2 preceding years
    OUT
        dict mapping year -> AsyncResult of the submitted model_xing call
    '''
    # Use the engines that are actually registered rather than assuming 32,
    # and assign them round-robin so jobs are spread evenly instead of
    # possibly stacking several on one randomly chosen engine.
    engine_ids = rc.ids
    if not engine_ids:
        raise RuntimeError('no ipyparallel engines are registered')

    pending = {}
    for position, year in enumerate(years):
        engine = engine_ids[position % len(engine_ids)]
        train_start, train_end, test_start, test_end = create_train_test(year, 2)

        pending[year] = rc[engine].apply_async(model_xing, model, xing, munger_id,
                                               train_start, train_end,
                                               test_start, test_end)

    return pending

importing IncrementalModel from BorderModel on engine(s)


## Pacific Crossing

### Tuned for 2015

In [88]:
# Tuned extra-trees regressor: sqrt feature sampling, depth capped at 20.
model = ExtraTreesRegressor(
    n_estimators=96,
    max_features='sqrt',
    max_depth=20,
    n_jobs=-1
)
# Crossing 5 (Pacific Crossing, per the section heading), munger 3, one job per year 2011-2015.
pr5_tuned = compare_years_parallel(model, xing=5, munger_id=3, years=range(2011, 2016))

In [96]:
# Collect each year's result; get() waits at most 1 second per job and
# raises if that job has not finished.
model5_tuned = dict()
for year in range(2011, 2016):
    model5_tuned[year] = pr5_tuned[year].get(timeout=1)

In [97]:
# Show the score dict (model / ensemble / baseline) for each collected year.
for year in range(2011, 2016):
    print(year)
    print(model5_tuned[year].score())

2011
{'model': 0.42965509214215614, 'ensemble': 0.52877829027700607, 'baseline': 0.50830951929768831}
2012
{'model': 0.47966607705882247, 'ensemble': 0.59494750831210763, 'baseline': 0.6264414468772701}
2013
{'model': 0.39322658042436331, 'ensemble': 0.50464539760127458, 'baseline': 0.57215536973016168}
2014
{'model': 0.51736679457973112, 'ensemble': 0.59632476018810343, 'baseline': 0.51290499913254095}
2015
{'model': 0.32035822603068875, 'ensemble': 0.4092626339974389, 'baseline': 0.23074703434185284}


### Basic model

In [74]:
# Basic extra-trees regressor: only the tree count is set.
model = ExtraTreesRegressor(
    n_estimators=96,
    n_jobs=-1
)
# Crossing 5 (Pacific Crossing, per the section heading), munger 3, one job per year 2011-2015.
pr5 = compare_years_parallel(model, xing=5, munger_id=3, years=range(2011, 2016))

In [77]:
# Non-blocking check that the 2011 job has completed.
pr5[2011].ready()

True

In [78]:
# Collect each year's result; get() waits at most 1 second per job and
# raises if that job has not finished.
model5 = dict()
for year in range(2011, 2016):
    model5[year] = pr5[year].get(timeout=1)

In [79]:
# Show the score dict (model / ensemble / baseline) for each collected year.
for year in range(2011, 2016):
    print(year)
    print(model5[year].score())

2011
{'model': 0.49369444480526425, 'ensemble': 0.54803346707984235, 'baseline': 0.50830951929768831}
2012
{'model': 0.61971364274178997, 'ensemble': 0.64556489119672844, 'baseline': 0.6264414468772701}
2013
{'model': 0.61530682975381756, 'ensemble': 0.60048741092203484, 'baseline': 0.57215536973016168}
2014
{'model': 0.58392387137995727, 'ensemble': 0.59788335146587057, 'baseline': 0.51290499913254095}
2015
{'model': 0.31463951274474444, 'ensemble': 0.31990942146797063, 'baseline': 0.23074703434185284}


## Peace Arch

### Tuned for 2015

In [92]:
# Tuned extra-trees regressor: sqrt feature sampling, depth capped at 20.
model = ExtraTreesRegressor(
    n_estimators=96,
    max_features='sqrt',
    max_depth=20,
    n_jobs=-1
)
# Crossing 1 (Peace Arch, per the section heading), munger 3, one job per year 2011-2015.
pr1_tuned = compare_years_parallel(model, xing=1, munger_id=3, years=range(2011, 2016))

In [93]:
# Non-blocking check that the 2011 job has completed.
pr1_tuned[2011].ready()

True

In [94]:
# Collect results for 2013-2015 only (jobs were submitted for 2011-2015);
# get() waits at most 1 second per job and raises if that job has not finished.
model1_tuned = dict()
for year in range(2013, 2016):
    model1_tuned[year] = pr1_tuned[year].get(timeout=1)

In [95]:
# Show the score dict (model / ensemble / baseline) for each collected year.
for year in range(2013, 2016):
    print(year)
    print(model1_tuned[year].score())

2013
{'model': 0.4009608796145997, 'ensemble': 0.5424707250162959, 'baseline': 0.61360307915900703}
2014
{'model': 0.50709359952887811, 'ensemble': 0.59402700678292808, 'baseline': 0.56996406060235849}
2015
{'model': 0.23670397287473677, 'ensemble': 0.32516340322538972, 'baseline': 0.1059067680212642}


In [102]:
# Score the 2011 run on its own; it is not part of the 2013-2015 loop above.
# get(1) waits at most 1 second for the result.
pr1_tuned[2011].get(1).score()

{'baseline': 0.39385165054016535,
 'ensemble': 0.40165956970033045,
 'model': 0.037910005936363844}

### Basic model

In [80]:
# Basic extra-trees regressor: only the tree count is set.
model = ExtraTreesRegressor(
    n_estimators=96,
    n_jobs=-1
)
# Crossing 1 (Peace Arch, per the section heading), munger 3, one job per year 2011-2015.
pr1 = compare_years_parallel(model, xing=1, munger_id=3, years=range(2011, 2016))

In [52]:
# Non-blocking check that the 2011 job has completed.
pr1[2011].ready()

True

In [81]:
# Collect results for 2013-2015 only (jobs were submitted for 2011-2015);
# get() waits at most 1 second per job and raises if that job has not finished.
model1 = dict()
for year in range(2013, 2016):
    model1[year] = pr1[year].get(timeout=1)

In [82]:
# Show the score dict (model / ensemble / baseline) for each collected year.
for year in range(2013, 2016):
    print(year)
    print(model1[year].score())

2013
{'model': 0.63768395074523787, 'ensemble': 0.63752908148900767, 'baseline': 0.61360307915900703}
2014
{'model': 0.60060376592570752, 'ensemble': 0.61296328470465489, 'baseline': 0.56996406060235849}
2015
{'model': 0.16272341843482119, 'ensemble': 0.19073867957260571, 'baseline': 0.1059067680212642}


In [103]:
# Score the 2011 run on its own; it is not part of the 2013-2015 loop above.
# get(1) waits at most 1 second for the result.
pr1[2011].get(1).score()

{'baseline': 0.39385165054016535,
 'ensemble': 0.38071787318738071,
 'model': 0.055917597505885652}

## Lynden

### Tuned for 2015

In [6]:
# Tuned extra-trees regressor: sqrt feature sampling, depth capped at 20.
model = ExtraTreesRegressor(
    n_estimators=96,
    max_features='sqrt',
    max_depth=20,
    n_jobs=-1
)
# Crossing 16 (Lynden, per the section heading), munger 3, one job per year 2013-2015.
pr16_tuned = compare_years_parallel(model, xing=16, munger_id=3, years=range(2013, 2016))

In [26]:
# Non-blocking check that the 2014 job has completed.
pr16_tuned[2014].ready()

True

In [None]:
# Collect the 2015 result only (jobs were submitted for 2013-2015);
# get() waits at most 1 second and raises if the job has not finished.
model16_tuned = dict()
for year in range(2015, 2016):
    model16_tuned[year] = pr16_tuned[year].get(timeout=1)

In [37]:
# Show the score dict (model / ensemble / baseline) for the collected year.
for year in range(2015, 2016):
    print(model16_tuned[year].score())

{'model': -0.09993010687997006, 'ensemble': 0.02692528357729429, 'baseline': -0.47527592569072796}


### Basic model

In [11]:
# Basic extra-trees regressor: only the tree count is set.
model = ExtraTreesRegressor(
    n_estimators=96,
    n_jobs=-1
)
# Crossing 16 (Lynden, per the section heading), munger 3, one job per year 2013-2015.
pr16 = compare_years_parallel(model, xing=16, munger_id=3, years=range(2013, 2016))

In [30]:
# Non-blocking check that the 2013 job has completed.
pr16[2013].ready()

True

In [None]:
# Collect the 2015 result only (jobs were submitted for 2013-2015);
# get() waits at most 1 second and raises if the job has not finished.
model16 = dict()
for year in range(2015, 2016):
    model16[year] = pr16[year].get(timeout=1)

In [35]:
# Show the score dict (model / ensemble / baseline) for the collected year.
for year in range(2015, 2016):
    print(model16[year].score())

{'model': -0.23607158941442696, 'ensemble': -0.18106521401453124, 'baseline': -0.47527592569072796}


# Conclusions

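* Basic model with ensemble scores higher than the tuned model with ensemble in most years (2011-2014 at Pacific Crossing, 2013-2014 at Peace Arch); the tuned model scores higher in 2015, the year it was tuned for, at all three crossings, and in 2011 at Peace Arch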
* Lynden is not well predicted by model