In [1]:
from BorderModel import run_Incremental, harmonic_mean
from BorderQuery import insert_predictions, select_features, select_mungedata
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV
import datetime
import random
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline
import copy
import pdb
from sklearn.metrics import r2_score
import cPickle as pickle

In [2]:
def create_train_test(date_end, test_days=7, train_years=2):
    '''
    Compute the boundaries of a training window followed by a test window.

    IN
        date_end: datetime.date of last day to predict (end of the test window)
        test_days: length of the test window in days
        train_years: number of years to train, counted as 366 days each
    OUT
        tuple (train_start, train_end, test_start, test_end); train_end and
        test_start are the same date
    '''
    test_span = datetime.timedelta(test_days)
    # Distance from date_end back to the first training day.
    full_span = datetime.timedelta(test_days + 366 * train_years)

    boundary = date_end - test_span
    return date_end - full_span, boundary, boundary, date_end
    
def run_multiweek(model, munger_id, crossing_id, first, last, test_days):
    '''
    Submit one asynchronous run_Incremental job per test window.

    Windows are generated by create_train_test for test end dates
    first, first + test_days, ... for as long as the end date is earlier
    than last + test_days, so the final window ends on or after `last`.

    IN
        model: estimator passed through to run_Incremental
        munger_id, crossing_id: passed through to run_Incremental
        first: datetime.date, end date of the first test window
        last: datetime.date that the final test window must reach
        test_days: length of each test window in days (must be positive)
    OUT
        dict mapping str(test end date) to the async result returned by
        apply_async on the chosen engine
    RAISES
        ValueError if test_days is not positive (the loop would never end)

    Relies on the module-level ipyparallel client `rc`.
    '''
    if test_days <= 0:
        raise ValueError('test_days must be positive, got %r' % (test_days,))

    step = datetime.timedelta(test_days)
    # Use whatever engines are registered instead of assuming ids 0..3.
    engine_ids = rc.ids

    prlist = {}
    test_date = first
    while test_date < last + step:
        engine = random.choice(engine_ids)
        train_start, train_end, test_start, test_end = create_train_test(test_date, test_days=test_days)

        prlist[str(test_date)] = rc[engine].apply_async(run_Incremental, model, munger_id, crossing_id,
                                                        train_start, train_end,
                                                        test_start, test_end)

        test_date += step

    return prlist

def score_df(models):
    '''
    Tabulate the scores of several trained models, one row per model.

    IN
        models: dict mapping a date string to an object whose score()
                returns a dict with keys 'model', 'ensemble' and 'baseline'
    OUT
        DataFrame with columns predict, ensemble, baseline and an index
        named 'date' holding the dict keys converted with pd.to_datetime
    '''
    # Score every model exactly once; the result feeds all three columns.
    scores = {date: model.score() for date, model in models.items()}

    predict = {date: score['model'] for date, score in scores.items()}
    ensemble = {date: score['ensemble'] for date, score in scores.items()}
    baseline = {date: score['baseline'] for date, score in scores.items()}

    df = pd.DataFrame([predict, ensemble, baseline]).T
    df.columns = ['predict', 'ensemble', 'baseline']
    df.index = pd.to_datetime(df.index)
    # Name the index after replacing it so the name cannot be lost.
    df.index.name = 'date'
    return df

def plot_scores(df):
    '''
    Draw the baseline, predict and ensemble columns of df on one wide figure.

    IN
        df: frame with columns baseline, predict and ensemble
            (for example the output of score_df)
    '''
    plt.figure(figsize=(16, 4))
    for column in ('baseline', 'predict', 'ensemble'):
        plt.plot(df[column], label=column)
    # Horizontal reference line at a score of zero.
    plt.axhline(0, color='y')
    plt.legend()
    
def results_df(trained_models):
    '''
    Combine the per-window outputs of several trained models into one frame.

    IN
        trained_models: dict whose values expose the Series attributes
                        y_predict and actual and the methods ensemble()
                        and baseline(), each returning a Series
    OUT
        DataFrame with columns predict, ensemble, baseline, actual. The
        index is that of the concatenated, sorted predictions; the other
        columns are aligned to it.
    '''
    models = list(trained_models.values())

    def _stack(pieces):
        # pd.concat raises on an empty list, so return an empty Series instead.
        if not pieces:
            return pd.Series(dtype='float64')
        return pd.concat(pieces).sort_index()

    df = pd.DataFrame()
    df['predict'] = _stack([model.y_predict for model in models])
    df['ensemble'] = _stack([model.ensemble() for model in models])
    df['baseline'] = _stack([model.baseline() for model in models])
    df['actual'] = _stack([model.actual for model in models])
    return df

def print_r2(results):
    '''
    Print the r2_score of each prediction column against the actual values.

    IN
        results: frame with columns actual, predict, ensemble and baseline;
                 only rows where actual is not null are scored
    '''
    actual = results.actual.dropna()
    labelled_columns = (('Predict: ', 'predict'),
                        ('Ensemble: ', 'ensemble'),
                        ('Baseline: ', 'baseline'))
    for label, column in labelled_columns:
        score = r2_score(actual, results[column][actual.index])
        # Label and value separated by one space, as the print statement did.
        print('%s %s' % (label, score))
    
def get_trained(pr, exclude):
    '''
    Collect the finished results from a dict of asynchronous jobs.

    IN
        pr: dict mapping a date string to an async result exposing
            ready() and get(), as returned by run_multiweek
        exclude: iterable of dates (or date strings) to skip; each entry
                 is compared to the keys of pr via str()
    OUT
        dict mapping date string to the retrieved result for every job
        that is not excluded and is ready

    Prints one line per non-excluded key: the score of a finished job, or
    'not ready'.
    '''
    # Build the exclusion set once instead of on every loop iteration.
    excluded = set(str(ex) for ex in exclude)

    trained = {}
    for date in sorted(pr.keys()):
        if date in excluded:
            continue
        if pr[date].ready():
            # get(1): presumably a 1 second timeout -- confirm against ipyparallel.
            trained[date] = pr[date].get(1)
            print('%s %s' % (date, trained[date].score()))
        else:
            print('%s %s' % (date, 'not ready'))

    return trained

In [9]:
import os
from ipyparallel import Client

# Connect to the ipyparallel cluster; `rc` is also used by run_multiweek.
rc = Client()
dview = rc[:]

# set proper working directory on all clients
# apply_sync runs os.chdir once on every engine in the view and waits for it,
# so this works for any number of engines and is done before the imports below.
cwd = os.getcwd()
dview.apply_sync(os.chdir, cwd)
# print(dview.apply_sync(os.getcwd))

# Import on the engines (and locally) the names the remote jobs need.
with dview.sync_imports():
    import datetime
    from BorderModel import run_Incremental
    from BorderQuery import select_features, select_mungedata_simple, select_mungedata
    from sklearn.ensemble import ExtraTreesRegressor
    from sklearn.grid_search import GridSearchCV

importing datetime on engine(s)
importing run_Incremental from BorderModel on engine(s)
importing select_features,select_mungedata_simple,select_mungedata from BorderQuery on engine(s)
importing ExtraTreesRegressor from sklearn.ensemble on engine(s)
importing GridSearchCV from sklearn.grid_search on engine(s)


## 12/31/15
* Fixed issue with run_multiweek while loop ending too soon.

In [28]:
# Test windows end on `first` and every 7 days after, until `last` is covered.
first = datetime.date(2015, 12, 24)
last = datetime.date(2016, 1, 1)

# Small ExtraTrees regressor wrapped in a grid search over an empty parameter grid.
model = ExtraTreesRegressor(n_estimators=4, n_jobs=-1)
grid = GridSearchCV(model, {})

pr1 = run_multiweek(grid, munger_id=3, crossing_id=1,
                    first=first, last=last, test_days=7)

In [30]:
# Dates whose jobs should be skipped, as ISO strings (same text as str(date)).
exclude = [datetime.date(2014, 11, 19).isoformat()]
trained = get_trained(pr=pr1, exclude=exclude)

2015-12-24 {'model': 0.21115655429191704, 'ensemble': 0.30286010887906656, 'baseline': 0.31504188784061993}
2015-12-31 {'model': -0.065298101692196742, 'ensemble': 0.019113083574276768, 'baseline': 0.26367513450612756}
2016-01-07 {'model': 0.3543741719660094, 'ensemble': 0.22351074600551335, 'baseline': 0.11350436614136672}


In [33]:
# Merge the per-window outputs, then show the last rows of the 12/30-12/31 slice.
results = results_df(trained)
results['2015-12-30':'2015-12-31'].tail()

Unnamed: 0_level_0,predict,ensemble,baseline,actual
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-12-31 21:30:00,2.942816,2.904701,2.867561,0.207733
2015-12-31 22:00:00,2.936545,2.853952,2.775878,1.81801
2015-12-31 22:30:00,3.25322,2.31954,1.802282,5.66753
2015-12-31 23:00:00,2.543292,1.758387,1.343697,6.57513
2015-12-31 23:30:00,3.605153,2.087365,1.468935,0.896698


## PH-N 11/9/15 - 11/10/15
* Fixed an issue in `BorderModel.ensemble` where `actual` was being used; `actual` should be optional

In [36]:
# Load munged data for 2015-11-9 .. 2015-11-11.
# NOTE(review): the first two arguments look like munger_id=4 and crossing_id=2,
# matching the run_multiweek call used for this case -- confirm against BorderQuery.
df = select_mungedata_simple(4, 2, '2015-11-9', '2015-11-11')

In [40]:
# Inspect 05:00-16:00 on 2015-11-09; the recorded output has no rows
# between 07:00 and 15:00.
df['2015-11-9 5:00':'2015-11-9 16:00']

Unnamed: 0_level_0,waittime,year,month,week,dayofweek,minofday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-11-09 05:00:00,0.701143,2015,11,46,0,300
2015-11-09 05:30:00,0.390254,2015,11,46,0,330
2015-11-09 06:00:00,0.579846,2015,11,46,0,360
2015-11-09 06:30:00,1.22691,2015,11,46,0,390
2015-11-09 07:00:00,2.30988,2015,11,46,0,420
2015-11-09 15:00:00,11.7858,2015,11,46,0,900
2015-11-09 15:30:00,7.56029,2015,11,46,0,930
2015-11-09 16:00:00,9.08812,2015,11,46,0,960


In [57]:
# Test windows end on `first` and every 7 days after, until `last` is covered.
first = datetime.date(2015, 11, 12)
last = datetime.date(2015, 11, 19)

# No GridSearchCV wrapper in this run: the bare regressor is passed on.
model = ExtraTreesRegressor(n_estimators=4, n_jobs=-1)

pr1 = run_multiweek(model, munger_id=4, crossing_id=2,
                    first=first, last=last, test_days=7)

In [60]:
# Nothing is excluded: collect every job that has finished.
exclude = list()
trained = get_trained(pr=pr1, exclude=exclude)

2015-11-12 {'model': 0.36915062407809784, 'ensemble': 0.3156948468695101, 'baseline': -1.0031052308380701}
2015-11-19 {'model': -8.2990046006153619, 'ensemble': -2.7261017110087078, 'baseline': -2.2301557302217456}


In [66]:
# Merge the per-window outputs and inspect 06:00-16:00 on 2015-11-09.
# In the recorded output, ensemble and actual are empty from 07:30 onward.
results = results_df(trained)
results['2015-11-9 6:00':'2015-11-9 16:00']

Unnamed: 0_level_0,predict,ensemble,baseline,actual
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-11-09 06:00:00,0.643688,0.915639,1.585492,0.579846
2015-11-09 06:30:00,0.350928,0.614031,2.453518,1.22691
2015-11-09 07:00:00,0.579626,1.002884,3.717504,2.30988
2015-11-09 07:30:00,1.515067,,5.402574,
2015-11-09 08:00:00,2.121043,,5.97787,
2015-11-09 08:30:00,1.26158,,5.843366,
2015-11-09 09:00:00,2.095142,,8.302568,
2015-11-09 09:30:00,3.499642,,7.44923,
2015-11-09 10:00:00,4.11359,,8.386908,
2015-11-09 10:30:00,4.33637,,8.730355,


### after fixing ensemble function

In [74]:
# Re-check the ensemble for the job keyed '2015-11-12'; keys of `trained` are
# str(test_date), the end date of each test window set in run_multiweek.
trained['2015-11-12'].ensemble()['2015-11-9 6:00':'2015-11-9 16:00']

date
2015-11-09 06:00:00     0.915639
2015-11-09 06:30:00     0.614031
2015-11-09 07:00:00     1.002884
2015-11-09 07:30:00     2.366489
2015-11-09 08:00:00     3.131116
2015-11-09 08:30:00     2.075139
2015-11-09 09:00:00     3.345941
2015-11-09 09:30:00     4.762069
2015-11-09 10:00:00     5.519828
2015-11-09 10:30:00     5.794573
2015-11-09 11:00:00     8.774862
2015-11-09 11:30:00    10.200310
2015-11-09 12:00:00     8.089211
2015-11-09 12:30:00     5.089286
2015-11-09 13:00:00     6.785583
2015-11-09 13:30:00     5.922840
2015-11-09 14:00:00     5.268232
2015-11-09 14:30:00     5.570735
2015-11-09 15:00:00     7.689786
2015-11-09 15:30:00     9.520863
2015-11-09 16:00:00     8.179584
dtype: float64

## PH-N 5/19/15 - 5/27/15
* Missing data issue is resolved from previous fixes

In [75]:
# Test windows end on `first` and every 7 days after, until `last` is covered.
first = datetime.date(2015, 5, 19)
last = datetime.date(2015, 5, 27)

# No GridSearchCV wrapper in this run: the bare regressor is passed on.
model = ExtraTreesRegressor(n_estimators=4, n_jobs=-1)

pr1 = run_multiweek(model, munger_id=4, crossing_id=2,
                    first=first, last=last, test_days=7)

In [77]:
# Nothing is excluded: collect every job that has finished.
exclude = list()
trained = get_trained(pr=pr1, exclude=exclude)

2015-05-19 {'model': 0.42798803061084623, 'ensemble': 0.40594963792016325, 'baseline': 0.31575636126945161}
2015-05-26 {'model': 0.31797645151960374, 'ensemble': 0.34409842200194751, 'baseline': 0.2784641268591268}
2015-06-02 {'model': -3.0280160511962864, 'ensemble': -0.50481227036787635, 'baseline': -0.025641648743577994}


In [84]:
# Merge the per-window outputs and inspect 12:00-16:00 on 2015-05-21.
results = results_df(trained)
results['2015-5-21 12:00':'2015-5-21 16:00']

Unnamed: 0_level_0,predict,ensemble,baseline,actual
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-05-21 12:00:00,24.568925,20.352535,17.371354,
2015-05-21 12:30:00,24.0,19.078934,15.832557,
2015-05-21 13:00:00,40.5,24.127466,17.181622,7.54938
2015-05-21 13:30:00,60.0,28.797493,18.945199,8.39954
2015-05-21 14:00:00,34.26415,23.623693,18.025896,9.74797
2015-05-21 14:30:00,25.11745,17.473693,13.396778,9.04905
2015-05-21 15:00:00,25.20455,17.240662,13.101103,6.14483
2015-05-21 15:30:00,39.2845,21.181228,14.499499,10.3765
2015-05-21 16:00:00,43.86535,23.709732,16.245235,15.6723


In [85]:
# Raw predictions of the job keyed '2015-05-26' for the same time span; keys of
# `trained` are str(test_date), the end date of each test window.
trained['2015-05-26'].y_predict['2015-5-21 12:00':'2015-5-21 16:00']

date
2015-05-21 12:00:00    24.568925
2015-05-21 12:30:00    24.000000
2015-05-21 13:00:00    40.500000
2015-05-21 13:30:00    60.000000
2015-05-21 14:00:00    34.264150
2015-05-21 14:30:00    25.117450
2015-05-21 15:00:00    25.204550
2015-05-21 15:30:00    39.284500
2015-05-21 16:00:00    43.865350
dtype: float64

## PA-S 2/1/15 - 2/18/15

* y_test 2/11/13 - 2/9/15 15:30
* X_test 2/12 - 2/18
* df 2/11/13 - 2/9/15 15:30

* Note the gap between y_test and X_test
* We want to extend df to include data points even when waittime is null
    * updated select_mungedata
    * for purposes of training, we need to keep `dropna` in `__init__`
    * created a separate y_test parameter for use in predict
* In delta(), when there is not enough data for a delta, fill with zeros

In [10]:
# Test windows end on `first` and every 7 days after, until `last` is covered.
first = datetime.date(2015, 2, 5)
last = datetime.date(2015, 2, 18)

model = ExtraTreesRegressor(n_estimators=4, n_jobs=-1)

pr1 = run_multiweek(model, munger_id=3, crossing_id=1,
                    first=first, last=last, test_days=7)

In [13]:
# Nothing is excluded: collect every job that has finished.
exclude = list()
trained = get_trained(pr=pr1, exclude=exclude)

2015-02-05 {'model': 0.036437817851286414, 'ensemble': -0.15805530677477986, 'baseline': -2.7982134283227738}
2015-02-12 {'model': 0.67594721840629057, 'ensemble': 0.68445886119748067, 'baseline': 0.52540316434315615}
2015-02-19 {'model': 0.58131213661457226, 'ensemble': 0.58933601918350975, 'baseline': 0.20219806520563821}


In [13]:
# Run one window synchronously in the notebook process (no engines involved):
# the 7-day test window that ends on 2015-02-19.
test_date = datetime.date(2015, 2, 19)
(train_start, train_end,
 test_start, test_end) = create_train_test(date_end=test_date, test_days=7)

model = ExtraTreesRegressor(n_estimators=4, n_jobs=-1)
res = run_Incremental(model, 3, 1, train_start, train_end, test_start, test_end)

In [15]:
# Display the predictions produced by the synchronous run above.
res.y_predict

date
2015-02-12 00:00:00     0.059130
2015-02-12 00:30:00     0.000000
2015-02-12 01:00:00     0.000000
2015-02-12 01:30:00     0.033376
2015-02-12 02:00:00     0.000000
2015-02-12 02:30:00     0.000000
2015-02-12 03:00:00     0.000000
2015-02-12 03:30:00     0.000000
2015-02-12 04:00:00     0.000000
2015-02-12 04:30:00     0.215311
2015-02-12 05:00:00     0.045203
2015-02-12 05:30:00     0.081235
2015-02-12 06:00:00     0.094374
2015-02-12 06:30:00     0.337812
2015-02-12 07:00:00     1.534876
2015-02-12 07:30:00     2.587473
2015-02-12 08:00:00     3.393536
2015-02-12 08:30:00     2.080125
2015-02-12 09:00:00     1.982795
2015-02-12 09:30:00     2.121957
2015-02-12 10:00:00     2.227395
2015-02-12 10:30:00     6.423100
2015-02-12 11:00:00    10.165827
2015-02-12 11:30:00    10.636390
2015-02-12 12:00:00    11.111530
2015-02-12 12:30:00    10.022265
2015-02-12 13:00:00     7.326965
2015-02-12 13:30:00     7.042400
2015-02-12 14:00:00     9.965803
2015-02-12 14:30:00     9.110496
     