## Predict 1 week at a time for dates before 2016
1. Train model for 2 years
2. Predict 1 week via IncrementalModel
3. Repeat starting with next week

In [2]:
from BorderModel import run_Incremental, harmonic_mean
from BorderQuery import insert_predictions
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV
import datetime
import random
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import copy
import pdb
from sklearn.metrics import r2_score
import cPickle as pickle

In [3]:
def create_train_test(date_end, test_days=7, train_years=2):
    '''
    Derive the training and test date windows that end at date_end.

    IN
        date_end: datetime.date of last day to predict
        test_days: length of the test window in days
        train_years: number of years to train (each counted as 366 days)
    OUT
        (train_start, train_end, test_start, test_end); train_end and
        test_start are the same date
    '''
    # The test window is the last test_days before date_end; training ends
    # exactly where testing begins.
    boundary = date_end - datetime.timedelta(days=test_days)

    train_length = test_days + 366 * train_years
    train_start = date_end - datetime.timedelta(days=train_length)

    return train_start, boundary, boundary, date_end
    
def run_multiweek(model, munger_id, crossing_id, first, last, test_days):
    '''
    Submit one run_Incremental job per test window to the ipyparallel engines.

    IN
        model: estimator passed through to run_Incremental
        munger_id, crossing_id: ids passed through to run_Incremental
        first: datetime.date ending the first test window
        last: datetime.date; windows are generated while the end date < last
        test_days: days per test window, also the step between windows
    OUT
        dict mapping str(window end date) to the result object returned by
        apply_async for that window
    RAISES
        ValueError if test_days is not positive (the date would never advance)

    Relies on the module-level ipyparallel Client `rc`.
    '''
    if test_days <= 0:
        raise ValueError('test_days must be positive, got %r' % (test_days,))

    # Pick among the engines that are actually registered instead of assuming
    # that exactly 32 engines with ids 0..31 exist.
    engine_ids = rc.ids
    step = datetime.timedelta(test_days)

    prlist = {}
    test_date = first
    while test_date < last:
        engine = random.choice(engine_ids)
        train_start, train_end, test_start, test_end = create_train_test(
            test_date, test_days=test_days)

        prlist[str(test_date)] = rc[engine].apply_async(
            run_Incremental, model, munger_id, crossing_id,
            train_start, train_end, test_start, test_end)

        test_date += step

    return prlist

def score_df(models):
    '''
    Tabulate the scores of every trained model, one row per date.

    IN
        models: dict mapping a date string to an object whose score()
                returns a dict with 'model', 'ensemble' and 'baseline' keys
    OUT
        DataFrame with a DatetimeIndex named 'date' and the columns
        'predict', 'ensemble' and 'baseline'
    '''
    # score() is evaluated once per model and reused for all three columns
    # (the previous version called it three times per model).
    scores = {date: model.score() for date, model in models.items()}

    predict = {date: score['model'] for date, score in scores.items()}
    ensemble = {date: score['ensemble'] for date, score in scores.items()}
    baseline = {date: score['baseline'] for date, score in scores.items()}

    # Each dict becomes one row keyed by date; transpose so dates are rows.
    df = pd.DataFrame([predict, ensemble, baseline]).T
    df.columns = ['predict', 'ensemble', 'baseline']
    df.index = pd.to_datetime(df.index)
    # Name the index after the conversion so the name cannot be lost by it.
    df.index.name = 'date'
    return df

def plot_scores(df):
    '''
    Plot the baseline, predict and ensemble columns of df on one wide figure,
    with a horizontal reference line at zero and a legend.
    '''
    plt.figure(figsize=(16, 4))
    # Same drawing order as before so each series keeps its colour.
    for column in ('baseline', 'predict', 'ensemble'):
        plt.plot(df[column], label=column)
    plt.axhline(0, color='y')
    plt.legend()
    
def results_df(models):
    '''
    Combine the per-window series of every model into one DataFrame.

    IN
        models: dict whose values expose y_predict and actual (Series) and
                ensemble() and baseline() (methods returning Series)
    OUT
        DataFrame with columns 'predict', 'ensemble', 'baseline' and
        'actual'; rows follow the sorted index of the combined predictions
    '''
    def _combine(parts):
        # pd.concat rejects an empty list, so fall back to an empty Series
        if not parts:
            return pd.Series(dtype='float64')
        return pd.concat(parts).sort_index()

    # Use the models that were passed in; the previous version ignored the
    # argument and read the notebook global `trained_1` instead.
    trained_models = list(models.values())

    predict = _combine([m.y_predict for m in trained_models])
    ensemble = _combine([m.ensemble() for m in trained_models])
    baseline = _combine([m.baseline() for m in trained_models])
    actual = _combine([m.actual for m in trained_models])

    # Assigning column by column keeps the original alignment: the first
    # column fixes the index and the others are aligned to it.
    df = pd.DataFrame()
    df['predict'] = predict
    df['ensemble'] = ensemble
    df['baseline'] = baseline
    df['actual'] = actual
    return df

def print_r2(results):
    actual = results.actual.dropna()
    print 'Predict: ', r2_score(actual, results.predict[actual.index])
    print 'Ensemble: ', r2_score(actual, results.ensemble[actual.index])
    print 'Baseline: ', r2_score(actual, results.baseline[actual.index])

In [4]:
import os
from ipyparallel import Client
rc = Client()
dview = rc[:]

# Set the working directory on every engine in the view. apply_sync runs the
# call once per engine, however many are connected, and blocks until all have
# finished, instead of mapping over a hard-coded list of 32 copies of cwd.
# Presumably this is what lets the BorderModel/BorderQuery imports below
# resolve on the engines -- confirm.
cwd = os.getcwd()
dview.apply_sync(os.chdir, cwd)
# print(dview.apply_sync(os.getcwd))

# Import locally and on every engine in the view.
with dview.sync_imports():
    import datetime
    from BorderModel import run_Incremental
    from BorderQuery import select_features, select_mungedata_simple, select_mungedata
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.grid_search import GridSearchCV

importing datetime on engine(s)
importing run_Incremental from BorderModel on engine(s)
importing select_features,select_mungedata_simple,select_mungedata from BorderQuery on engine(s)
importing ExtraTreesRegressor from sklearn.ensemble on engine(s)
importing GridSearchCV from sklearn.grid_search on engine(s)


## Peace Arch South

In [5]:
# Extra-trees regressor wrapped in a grid search with an empty parameter grid.
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
grid = GridSearchCV(model, param_grid={})

# Queue one job per 7-day step for munger 3 / crossing 1, with window end
# dates from 2015-01-01 up to (not including) 2016-01-01.
pr = run_multiweek(
    grid,
    munger_id=3,
    crossing_id=1,
    first=datetime.date(2015, 1, 1),
    last=datetime.date(2016, 1, 1),
    test_days=7
)

In [None]:
trained = {}
first = datetime.date(2015, 1, 1)
last = datetime.date(2016, 1, 1)
test_days = 7
test_date = first
exclude = [datetime.date(2015, 2, 19)]
while test_date < last:
    if test_date not in exclude:
        if pr[str(test_date)].ready():
            trained[str(test_date)] = pr[str(test_date)].get(1)
            print test_date, trained[str(test_date)].score()
        else:
            print  test_date, pr[str(test_date)].ready()

    test_date += datetime.timedelta(test_days)

In [None]:
# trained_1 = copy.deepcopy(trained)


In [None]:
# Per-date model/ensemble/baseline scores for every collected model.
scores = score_df(trained)

In [None]:
# Plot the baseline, predict and ensemble score columns.
plot_scores(scores)

In [None]:
# Combine the per-window series of the collected models and save them to disk.
results = results_df(trained)
# NOTE(review): the {0}/{1} placeholders were never filled in, so the file was
# literally named 'results_{0}_{1}.pkl'. Presumably they stand for the munger
# and crossing ids used in this section (3 and 1) -- confirm before relying
# on the file name.
munger_id, crossing_id = 3, 1
results_path = '/home/ubuntu/BorderCrossing/data/results_{0}_{1}.pkl'.format(
    munger_id, crossing_id)
# Pickle files are opened in binary mode.
with open(results_path, 'wb') as f:
    pickle.dump(results, f)

In [None]:
# R^2 of each prediction column against the actuals, restricted to the rows
# labelled from 2015-1-1 through 2016-1-1.
print_r2(results['2015-1-1':'2016-1-1'])

In [65]:
# Same R^2 report as the cell above; the output below was captured from an
# earlier execution of this cell.
print_r2(results['2015-1-1':'2016-1-1']) # old, 


Predict:  0.391770485174
Ensemble:  0.441411669676
Baseline:  0.356917688188


### Note that the baseline is computed over a rolling 12 months, so it is not unexpected that the baseline improves with the week-by-week approach

In [70]:
# Single-pass comparison: the same grid, munger 3 and crossing 1, trained on
# 2013-1-1..2015-1-1 and tested on 2015-1-1..2016-1-1 in one call (argument
# order matches the run_Incremental call inside run_multiweek).
full2015 = run_Incremental(grid, 3, 1, '2013-1-1', '2015-1-1', '2015-1-1', '2016-1-1')

In [71]:
# Display the scores of the single-pass 2015 run.
full2015.score()

{'baseline': 0.1059067680212642,
 'ensemble': 0.18871318906645274,
 'model': 0.11903765655965948}

## Pacific Highway South

In [43]:
# Extra-trees regressor wrapped in a grid search with an empty parameter grid.
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
grid = GridSearchCV(model, param_grid={})

# Queue one job per 7-day step for munger 3 / crossing 5, with window end
# dates from 2015-01-01 up to (not including) 2016-01-01.
pr = run_multiweek(
    grid,
    munger_id=3,
    crossing_id=5,
    first=datetime.date(2015, 1, 1),
    last=datetime.date(2016, 1, 1),
    test_days=7
)

In [64]:
trained = {}
first = datetime.date(2015, 1, 1)
last = datetime.date(2016, 1, 1)
test_days = 7
test_date = first
exclude = [datetime.date(2015, 10, 15), datetime.date(2015, 2, 19)]
while test_date < last:
    if test_date not in exclude:
        if pr[str(test_date)].ready():
            trained[str(test_date)] = pr[str(test_date)].get(1)
            print test_date, trained[str(test_date)].score()
        else:
            print  test_date, pr[str(test_date)].ready()

    test_date += datetime.timedelta(test_days)

2015-01-01 {'model': 0.47442663738977786, 'ensemble': 0.50517844170980919, 'baseline': 0.53360635276911372}
2015-01-08 {'model': 0.39309074740893124, 'ensemble': 0.43456830253917367, 'baseline': 0.42777892104205117}
2015-01-15 {'model': -2.1356778917556496, 'ensemble': -2.2326721221699541, 'baseline': -2.5594198289083412}
2015-01-22 {'model': 0.12515715332668176, 'ensemble': -0.11274671602146569, 'baseline': -0.71456686843956074}
2015-01-29 {'model': -0.076635420776420826, 'ensemble': -0.46846544398769763, 'baseline': -1.8131967463785328}
2015-02-05 {'model': 0.54517385665862328, 'ensemble': 0.35631724193852221, 'baseline': -1.2446526859107325}
2015-02-12 {'model': 0.53979032357643542, 'ensemble': 0.64488156520051088, 'baseline': 0.56255811702560932}
2015-02-26 {'model': 0.55647392504037918, 'ensemble': 0.63871064818947076, 'baseline': 0.37507249104058504}
2015-03-05 {'model': 0.54205946570974994, 'ensemble': 0.40081046341057924, 'baseline': -0.49281888391150352}
2015-03-12 {'model': 0

In [None]:
# Keep a deep copy of the collected crossing-5 models under a separate name.
trained_5 = copy.deepcopy(trained)

In [None]:
# Display the current value of `scores`.
scores