## Predict 1 week at a time for dates before 2016
1. Train the model on the 2 years of data preceding the test week
2. Predict 1 week via IncrementalModel
3. Repeat starting with next week

In [5]:
from BorderModel import run_Incremental, harmonic_mean
from BorderQuery import insert_predictions
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV
import datetime
import random
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import copy
import pdb
from sklearn.metrics import r2_score
import cPickle as pickle

In [10]:
def create_train_test(date_end, test_days=7, train_years=2):
    '''
    Compute the boundaries of a training window and of the test window that
    directly follows it.

    IN
        date_end: datetime.date of last day to predict
        test_days: length of the test window in days
        train_years: number of years to train (each year counted as 366 days)
    OUT
        tuple (train_start, train_end, test_start, test_end); train_end and
        test_start are the same date
    '''
    lookback = datetime.timedelta(days=test_days + 366 * train_years)
    # The training window stops exactly where the test window begins.
    boundary = date_end - datetime.timedelta(days=test_days)
    return date_end - lookback, boundary, boundary, date_end
    
def run_multiweek(model, munger_id, crossing_id, first, last, test_days):
    '''
    Submit one run_Incremental job per test window to the ipyparallel engines.

    Test windows end on `first`, `first + test_days`, ... and submission stops
    once the window end reaches `last + test_days`, so the final window ends
    on or after `last`.

    IN
        model: estimator passed through to run_Incremental
        munger_id: passed through to run_Incremental
        crossing_id: passed through to run_Incremental
        first: datetime.date on which the first test window ends
        last: datetime.date that the final test window must reach
        test_days: length of each test window, and the step between windows
    OUT
        dict mapping str(test window end date) to the object returned by
        rc[engine].apply(...)

    Relies on the module-level ipyparallel client `rc`.
    '''
    step = datetime.timedelta(test_days)
    stop = last + step
    # Choose among the engines that are actually connected rather than
    # assuming exactly 32 of them (ids 0..31).
    engine_ids = rc.ids

    prlist = {}
    test_date = first
    while test_date < stop:
        engine = random.choice(engine_ids)
        train_start, train_end, test_start, test_end = create_train_test(
            test_date, test_days=test_days)

        prlist[str(test_date)] = rc[engine].apply(
            run_Incremental, model, munger_id, crossing_id,
            train_start, train_end,
            test_start, test_end)

        test_date += step

    return prlist

def score_df(models):
    '''
    Tabulate the scores of every trained model, one row per date.

    IN
        models: dict mapping a date string to an object whose score() method
                returns a mapping with 'model', 'ensemble' and 'baseline' keys
    OUT
        DataFrame indexed by date (converted with pd.to_datetime) with columns
        predict, ensemble and baseline
    '''
    # Score each model once and reuse the result for all three columns.
    scores = {date: model.score() for date, model in models.items()}

    predict = {date: score['model'] for date, score in scores.items()}
    ensemble = {date: score['ensemble'] for date, score in scores.items()}
    baseline = {date: score['baseline'] for date, score in scores.items()}

    # Each dict becomes one row keyed by date; transpose to get dates as rows.
    df = pd.DataFrame([predict, ensemble, baseline]).T
    df.columns = ['predict', 'ensemble', 'baseline']
    df.index.name = 'date'
    df.index = pd.to_datetime(df.index)
    return df

def plot_scores(df):
    '''
    Draw the baseline, predict and ensemble columns of df as three labelled
    lines on one wide figure, with a horizontal reference line at 0.

    IN
        df: object with baseline, predict and ensemble columns
    '''
    plt.figure(figsize=(16, 4))
    # Plotting order is kept as baseline, predict, ensemble.
    for column in ('baseline', 'predict', 'ensemble'):
        plt.plot(getattr(df, column), label=column)
    plt.axhline(0, color='y')
    plt.legend()
    
def results_df(trained_models):
    '''
    Stitch the per-window outputs of all trained models into one DataFrame.

    IN
        trained_models: dict whose values expose y_predict and actual
                        attributes plus ensemble() and baseline() methods
                        (each presumably yielding a pandas Series)
    OUT
        DataFrame with columns predict, ensemble, baseline and actual. The
        index is that of the combined, index-sorted predictions; the other
        columns are aligned to it.
    '''
    def _combine(pieces):
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        # pd.concat raises on an empty list, so return an empty Series then.
        if not pieces:
            return pd.Series(dtype=float)
        return pd.concat(pieces).sort_index()

    models = list(trained_models.values())
    predict = _combine([model.y_predict for model in models])
    ensemble = _combine([model.ensemble() for model in models])
    baseline = _combine([model.baseline() for model in models])
    actual = _combine([model.actual for model in models])

    # The first assigned column fixes the index; later columns align to it.
    df = pd.DataFrame()
    df['predict'] = predict
    df['ensemble'] = ensemble
    df['baseline'] = baseline
    df['actual'] = actual
    return df

def print_r2(results):
    '''
    Print the r2_score of the predict, ensemble and baseline columns against
    the actual column, restricted to rows where actual is not NaN.

    IN
        results: object with actual, predict, ensemble and baseline columns
    '''
    observed = results.actual.dropna()
    labelled_columns = (
        ('Predict: ', 'predict'),
        ('Ensemble: ', 'ensemble'),
        ('Baseline: ', 'baseline'),
    )
    for label, column in labelled_columns:
        score = r2_score(observed, getattr(results, column)[observed.index])
        # A single pre-formatted string gives the same text as
        # "print label, score" (label, one space, value).
        print('%s %s' % (label, score))
    
def get_trained(pr, exclude=None):
    '''
    Collect the result of every submitted job except the excluded dates.

    IN
        pr: dict mapping a date string to an object with a get() method
            (presumably the async results returned by run_multiweek, in
            which case get() waits for the job to finish -- TODO confirm)
        exclude: optional iterable of dates to skip; each entry is compared
                 to the keys of pr via str(entry). None means skip nothing.
    OUT
        dict mapping date string to the value returned by pr[date].get()
    '''
    # Build the exclusion set once instead of once per key.
    skipped = set() if exclude is None else {str(ex) for ex in exclude}

    # Keys are visited in sorted order, so get() is called chronologically
    # for ISO-formatted date strings.
    return {date: pr[date].get() for date in sorted(pr) if date not in skipped}


In [11]:
import os
from ipyparallel import Client
# Connect to the ipyparallel cluster; rc is also used by run_multiweek.
rc = Client()
dview = rc[:]

# set proper working directory on all clients
# apply_sync runs os.chdir once on every engine in the view and waits for
# completion, so no engine count has to be hard-coded.
cwd = os.getcwd()
dview.apply_sync(os.chdir, cwd)
# print(dview.apply_sync(os.getcwd))

# Imports inside this context are also performed on the engines.
with dview.sync_imports():
    import datetime
    from BorderModel import run_Incremental
    from BorderQuery import select_features, select_mungedata_simple, select_mungedata
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.grid_search import GridSearchCV

importing datetime on engine(s)
importing run_Incremental from BorderModel on engine(s)
importing select_features,select_mungedata_simple,select_mungedata from BorderQuery on engine(s)
importing ExtraTreesRegressor from sklearn.ensemble on engine(s)
importing GridSearchCV from sklearn.grid_search on engine(s)


## Peace Arch South

### 2015

In [12]:
# Peace Arch South, 2015: munger_id=3, crossing_id=1, one-week test windows.
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
# Wrapped in GridSearchCV with an empty parameter grid.
grid = GridSearchCV(estimator=model, param_grid={})

pr = run_multiweek(
    model=grid,
    munger_id=3,
    crossing_id=1,
    first=datetime.date(2015, 1, 1),
    last=datetime.date(2016, 1, 1),
    test_days=7,
)


In [None]:
# Gather every weekly job's result (get_trained calls .get() on each entry of pr).
trained = get_trained(pr)


In [None]:
# Combine the weekly outputs into one frame and save it.
results = results_df(trained)
# Pickle data is binary, so the file is opened in 'wb' rather than text mode.
with open('/home/ubuntu/BorderCrossing/data/results_1_2015.pkl', 'wb') as f:
    pickle.dump(results, f)

# Report R^2 for the rows between the two date labels only.
print_r2(results['2015-1-1':'2016-1-1'])

### 2014

In [None]:
# Peace Arch South, 2014: munger_id=3, crossing_id=1, one-week test windows.
first = datetime.date(2014, 1, 1)
last = datetime.date(2015, 1, 1)

model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
# Wrapped in GridSearchCV with an empty parameter grid.
grid = GridSearchCV(estimator=model, param_grid={})

pr = run_multiweek(model=grid, munger_id=3, crossing_id=1,
                   first=first, last=last, test_days=7)


In [None]:
# Dates listed here are skipped when collecting results; empty means keep all.
exclude = []
# Pass the list through so that editing it actually takes effect.
trained = get_trained(pr, exclude)


In [None]:
# Per-date scores for predict, ensemble and baseline, drawn on one figure.
scores = score_df(trained)
plot_scores(scores)

In [None]:
# Combine the weekly outputs into one frame and save it.
results = results_df(trained)
# Pickle data is binary, so the file is opened in 'wb' rather than text mode.
with open('/home/ubuntu/BorderCrossing/data/results_1_2014.pkl', 'wb') as f:
    pickle.dump(results, f)

# Report R^2 for the rows between the two date labels only.
print_r2(results['2014-1-1':'2015-1-1'])

## Pacific Highway South

### 2015

In [None]:
# Pacific Highway South, 2015: munger_id=3, crossing_id=5, one-week test windows.
first = datetime.date(2015, 1, 1)
last = datetime.date(2016, 1, 1)
model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
# Wrapped in GridSearchCV with an empty parameter grid.
grid = GridSearchCV(estimator=model, param_grid={})

pr = run_multiweek(model=grid, munger_id=3, crossing_id=5,
                   first=first, last=last, test_days=7)


In [None]:
# Gather every weekly job's result (get_trained calls .get() on each entry of pr).
trained = get_trained(pr)


In [None]:
# Per-date scores for predict, ensemble and baseline, drawn on one figure.
scores = score_df(trained)
plot_scores(scores)


In [None]:
# Combine the weekly outputs into one frame and save it.
results = results_df(trained)
# Pickle data is binary, so the file is opened in 'wb' rather than text mode.
with open('/home/ubuntu/BorderCrossing/data/results_5_2015.pkl', 'wb') as f:
    pickle.dump(results, f)

# Report R^2 for the rows between the two date labels only.
print_r2(results['2015-1-1':'2016-1-1'])

### 2014

In [None]:
# Pacific Highway South, 2014: munger_id=3, crossing_id=5, one-week test windows.
first = datetime.date(2014, 1, 1)
last = datetime.date(2015, 1, 1)

model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
# Wrapped in GridSearchCV with an empty parameter grid.
grid = GridSearchCV(estimator=model, param_grid={})

pr = run_multiweek(model=grid, munger_id=3, crossing_id=5,
                   first=first, last=last, test_days=7)


In [None]:
# Gather every weekly job's result (get_trained calls .get() on each entry of pr).
trained = get_trained(pr)


In [None]:
# Per-date scores for predict, ensemble and baseline, drawn on one figure.
scores = score_df(trained)
plot_scores(scores)

In [None]:
# Combine the weekly outputs into one frame and save it.
results = results_df(trained)
# Pickle data is binary, so the file is opened in 'wb' rather than text mode.
with open('/home/ubuntu/BorderCrossing/data/results_5_2014.pkl', 'wb') as f:
    pickle.dump(results, f)

# Report R^2 for the rows between the two date labels only.
print_r2(results['2014-1-1':'2015-1-1'])

### 2013

In [None]:
# Pacific Highway South, 2013: munger_id=3, crossing_id=5, one-week test windows.
first = datetime.date(2013, 1, 1)
last = datetime.date(2014, 1, 1)

model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
# Wrapped in GridSearchCV with an empty parameter grid.
grid = GridSearchCV(estimator=model, param_grid={})

pr = run_multiweek(model=grid, munger_id=3, crossing_id=5,
                   first=first, last=last, test_days=7)


In [None]:
# Gather every weekly job's result (get_trained calls .get() on each entry of pr).
trained = get_trained(pr)


In [None]:
# Combine the weekly outputs into one frame and save it.
results = results_df(trained)
# Pickle data is binary, so the file is opened in 'wb' rather than text mode.
with open('/home/ubuntu/BorderCrossing/data/results_5_2013.pkl', 'wb') as f:
    pickle.dump(results, f)

# Report R^2 for the rows between the two date labels only.
print_r2(results['2013-1-1':'2014-1-1'])

### 2012

In [None]:
# 2012 run with munger_id=3, crossing_id=5 and one-week test windows.
first = datetime.date(2012, 1, 1)
last = datetime.date(2013, 1, 1)

model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
# Wrapped in GridSearchCV with an empty parameter grid.
grid = GridSearchCV(estimator=model, param_grid={})

pr = run_multiweek(model=grid, munger_id=3, crossing_id=5,
                   first=first, last=last, test_days=7)


In [None]:
# Gather every weekly job's result (get_trained calls .get() on each entry of pr).
trained = get_trained(pr)


In [None]:
# Combine the weekly outputs into one frame and save it.
results = results_df(trained)
# Pickle data is binary, so the file is opened in 'wb' rather than text mode.
with open('/home/ubuntu/BorderCrossing/data/results_5_2012.pkl', 'wb') as f:
    pickle.dump(results, f)

# Report R^2 for the rows between first and last only.
print_r2(results[str(first):str(last)])

### 2011

In [None]:
# 2011 run with munger_id=3, crossing_id=5 and one-week test windows.
first = datetime.date(2011, 1, 1)
last = datetime.date(2012, 1, 1)

model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
# Wrapped in GridSearchCV with an empty parameter grid.
grid = GridSearchCV(estimator=model, param_grid={})

pr = run_multiweek(model=grid, munger_id=3, crossing_id=5,
                   first=first, last=last, test_days=7)


In [None]:
# Gather every weekly job's result (get_trained calls .get() on each entry of pr).
trained = get_trained(pr)


In [None]:
# Combine the weekly outputs into one frame and save it.
results = results_df(trained)
# Pickle data is binary, so the file is opened in 'wb' rather than text mode.
with open('/home/ubuntu/BorderCrossing/data/results_5_2011.pkl', 'wb') as f:
    pickle.dump(results, f)

# Report R^2 for the rows between first and last only.
print_r2(results[str(first):str(last)])

## Peace Arch North
### 2015

In [None]:
# Peace Arch North, 2015: munger_id=4, crossing_id=2, one-week test windows.
first = datetime.date(2015, 1, 1)
last = datetime.date(2016, 1, 1)

model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
# Wrapped in GridSearchCV with an empty parameter grid.
grid = GridSearchCV(estimator=model, param_grid={})

pr = run_multiweek(model=grid, munger_id=4, crossing_id=2,
                   first=first, last=last, test_days=7)


In [None]:
# Gather every weekly job's result (get_trained calls .get() on each entry of pr).
trained = get_trained(pr)


In [None]:
# Per-date scores for predict, ensemble and baseline, drawn on one figure.
scores = score_df(trained)
plot_scores(scores)

In [None]:
# Combine the weekly outputs into one frame and save it.
results = results_df(trained)
# Pickle data is binary, so the file is opened in 'wb' rather than text mode.
with open('/home/ubuntu/BorderCrossing/data/results_2_2015.pkl', 'wb') as f:
    pickle.dump(results, f)

# Report R^2 for the rows between the two date labels only.
print_r2(results['2015-1-1':'2016-1-1'])

### 2014

In [None]:
# Peace Arch North, 2014: munger_id=4, crossing_id=2, one-week test windows.
first = datetime.date(2014, 1, 1)
last = datetime.date(2015, 1, 1)

model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
# Wrapped in GridSearchCV with an empty parameter grid.
grid = GridSearchCV(estimator=model, param_grid={})

pr = run_multiweek(model=grid, munger_id=4, crossing_id=2,
                   first=first, last=last, test_days=7)


In [None]:
# Gather every weekly job's result (get_trained calls .get() on each entry of pr).
trained = get_trained(pr)


In [None]:
# Per-date scores for predict, ensemble and baseline, drawn on one figure.
scores = score_df(trained)
plot_scores(scores)

In [None]:
# Combine the weekly outputs into one frame and save it.
results = results_df(trained)
# Pickle data is binary, so the file is opened in 'wb' rather than text mode.
with open('/home/ubuntu/BorderCrossing/data/results_2_2014.pkl', 'wb') as f:
    pickle.dump(results, f)

# Report R^2 for the rows between the two date labels only.
print_r2(results['2014-1-1':'2015-1-1'])

## Pacific Highway North
### 2015

In [None]:
# Pacific Highway North, 2015: munger_id=4, crossing_id=6, one-week test windows.
first = datetime.date(2015, 1, 1)
last = datetime.date(2016, 1, 1)

model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
# Wrapped in GridSearchCV with an empty parameter grid.
grid = GridSearchCV(estimator=model, param_grid={})

pr = run_multiweek(model=grid, munger_id=4, crossing_id=6,
                   first=first, last=last, test_days=7)


In [None]:
# Gather every weekly job's result (get_trained calls .get() on each entry of pr).
trained = get_trained(pr)


In [None]:
# Per-date scores for predict, ensemble and baseline, drawn on one figure.
scores = score_df(trained)
plot_scores(scores)

In [None]:
# Combine the weekly outputs into one frame and save it.
results = results_df(trained)
# Pickle data is binary, so the file is opened in 'wb' rather than text mode.
with open('/home/ubuntu/BorderCrossing/data/results_6_2015.pkl', 'wb') as f:
    pickle.dump(results, f)

# Report R^2 for the rows between the two date labels only.
print_r2(results['2015-1-1':'2016-1-1'])

### 2014

In [None]:
# Pacific Highway North, 2014: munger_id=4, crossing_id=6, one-week test windows.
first = datetime.date(2014, 1, 1)
last = datetime.date(2015, 1, 1)

model = ExtraTreesRegressor(n_estimators=96, n_jobs=-1)
# Wrapped in GridSearchCV with an empty parameter grid.
grid = GridSearchCV(estimator=model, param_grid={})

pr = run_multiweek(model=grid, munger_id=4, crossing_id=6,
                   first=first, last=last, test_days=7)


In [None]:
# Gather every weekly job's result (get_trained calls .get() on each entry of pr).
trained = get_trained(pr)


In [None]:
# Per-date scores for predict, ensemble and baseline, drawn on one figure.
scores = score_df(trained)
plot_scores(scores)

In [None]:
# Combine the weekly outputs into one frame and save it.
results = results_df(trained)
# Pickle data is binary, so the file is opened in 'wb' rather than text mode.
with open('/home/ubuntu/BorderCrossing/data/results_6_2014.pkl', 'wb') as f:
    pickle.dump(results, f)

# Report R^2 for the rows between the two date labels only.
print_r2(results['2014-1-1':'2015-1-1'])