In [4]:
from dbhelper import pd_query
import pandas as pd
import datetime as dt

In [5]:
from BorderModel import IncrementalModel
from BorderQuery import select_features_simple, select_mungedata_simple
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import r2_score
import pprint

In [6]:
query = '''
        select 
            date,
            extract(dow from date) as dow,
            locale,
            event
        from specialdates
        '''
df = pd_query(query)
df.date = pd.to_datetime(df.date)
df = df.set_index('date')

## Add feature for days off on Mondays and Fridays plus lead/lag

1. Identify Monday or Friday specialdates
2. Filter out holidays which are not typically days off
3. Create lag/lead effects

Expectation is that this handles cases where there is not enough data for model to recognize moving holidays 

In [7]:
# 1 = Monday
# 5 = Friday
df[df.dow==1].event.unique()

array(['newyears', 'mlk', 'memorial', 'labor', 'independence', 'veterans',
       'xmas', 'victoria', 'civic', 'ca_thanksgiving', 'canada',
       'president', 'halloween'], dtype=object)

In [8]:
df[df.dow==5].event.unique()

array(['independence', 'xmas', 'newyears', 'veterans', 'goodfriday',
       'canada', 'halloween'], dtype=object)

### Exclude Halloween since it is not an observed holiday

In [9]:
df = df[df.event != 'halloween']

### Create a feature submatrix with following fields that can be merged into full feature matrix
* MondayDayOff, alias mdo
* mdo_lag_1
* mdo_lead_1 to mdo_lead_4

* FridayDayOff, alias fdo
* fdo_lead_1
* fdo_lag_1 to fdo_lag_4

In [10]:
query = '''
        select
            d.date,
            mdo.event as mdo,
            fdo.event as fdo
from datefeatures d
left join specialdates mdo
    on d.date::timestamp::date = mdo.date
    and extract(dow from d.date) = 1
    and mdo.event <> 'halloween'
left join specialdates fdo
    on d.date::timestamp::date = fdo.date
    and extract(dow from d.date) = 5
    and fdo.event <> 'halloween'
order by d.date
        '''

df1 = pd_query(query)

In [11]:
df1.head()

Unnamed: 0,date,mdo,fdo
0,2007-01-01 00:00:00,newyears,
1,2007-01-01 00:05:00,newyears,
2,2007-01-01 00:10:00,newyears,
3,2007-01-01 00:15:00,newyears,
4,2007-01-01 00:20:00,newyears,


In [12]:
df1.mdo = ~pd.isnull(df1.mdo)
df1.fdo = ~pd.isnull(df1.fdo)

In [13]:
df1.head()

Unnamed: 0,date,mdo,fdo
0,2007-01-01 00:00:00,True,False
1,2007-01-01 00:05:00,True,False
2,2007-01-01 00:10:00,True,False
3,2007-01-01 00:15:00,True,False
4,2007-01-01 00:20:00,True,False


In [14]:
df1['mdo_lag1'] = df1.shift(288).mdo
df1.head()

Unnamed: 0,date,mdo,fdo,mdo_lag1
0,2007-01-01 00:00:00,True,False,
1,2007-01-01 00:05:00,True,False,
2,2007-01-01 00:10:00,True,False,
3,2007-01-01 00:15:00,True,False,
4,2007-01-01 00:20:00,True,False,


In [15]:
for i in [1, 2, 3, 4]:
    df1['mdo_lead{0}'.format(i)] = df1.shift(-288 * i).mdo
df1[df1.date > '2007-1-2'].head()

Unnamed: 0,date,mdo,fdo,mdo_lag1,mdo_lead1,mdo_lead2,mdo_lead3,mdo_lead4
289,2007-01-02 00:05:00,False,False,True,False,False,False,False
290,2007-01-02 00:10:00,False,False,True,False,False,False,False
291,2007-01-02 00:15:00,False,False,True,False,False,False,False
292,2007-01-02 00:20:00,False,False,True,False,False,False,False
293,2007-01-02 00:25:00,False,False,True,False,False,False,False


In [16]:
df1['fdo_lead1'] = df1.shift(-288).fdo
for i in [1, 2, 3, 4]:
    df1['fdo_lag{0}'.format(i)] = df1.shift(288 * i).fdo
df1[df1.date > '2008-7-5'].head()

Unnamed: 0,date,mdo,fdo,mdo_lag1,mdo_lead1,mdo_lead2,mdo_lead3,mdo_lead4,fdo_lead1,fdo_lag1,fdo_lag2,fdo_lag3,fdo_lag4
158689,2008-07-05 00:05:00,False,False,False,False,False,False,False,False,True,False,False,False
158690,2008-07-05 00:10:00,False,False,False,False,False,False,False,False,True,False,False,False
158691,2008-07-05 00:15:00,False,False,False,False,False,False,False,False,True,False,False,False
158692,2008-07-05 00:20:00,False,False,False,False,False,False,False,False,True,False,False,False
158693,2008-07-05 00:25:00,False,False,False,False,False,False,False,False,True,False,False,False


## Test model

In [17]:
import datetime
from BorderQuery import select_features, select_mungedata, select_mungedata_simple

def create_train_test(year, train_length=2):
    '''
    IN 
        years: list of years to predict
        train_length: number of years to train
    '''
    train_start = datetime.date(year - train_length, 1, 1).strftime('%Y-%m-%d')
    train_end = datetime.date(year, 1, 1).strftime('%Y-%m-%d')
    test_start = datetime.date(year, 1, 1).strftime('%Y-%m-%d')
    test_end = datetime.date(year + 1, 1, 1).strftime('%Y-%m-%d')
    return train_start, train_end, test_start, test_end

def run_incremental_join(model, munger_id, xing, train_start, train_end,
                    test_start, test_end, join_df=None):
    df_train = select_mungedata(munger_id, xing, train_start, train_end)
    X_test = select_features(test_start, test_end)
    actual = select_mungedata_simple(munger_id, xing, test_start, test_end)

    if join_df is not None:
        df_train = df_train.join(join_df)
        X_test = X_test.join(join_df)

#     return df_train
        
    grid = GridSearchCV(model, {})
    im = IncrementalModel(df_train, grid, categoricals=['event'])
    im.set_actual(actual.waittime)
    im.predict(X_test)
    
    return im

In [18]:
df1 = df1.set_index('date')

In [19]:
df1 = df1[(df1.index.minute == 0) | (df1.index.minute == 30)]

In [20]:
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96)
train_start, train_end, test_start, test_end = create_train_test(2015)
im = run_incremental_join(model, 3, 1, train_start, train_end, test_start, test_end, join_df=df1)

In [21]:
im.score()

{'baseline': 0.1059067680212642,
 'ensemble': 0.19072517240780962,
 'model': 0.15619347124690364}

### Increase training period to 4 years from 2 years

In [22]:
model = ExtraTreesRegressor(n_jobs=-1, n_estimators=96)
train_start, train_end, test_start, test_end = create_train_test(2015, train_length=4)
im = run_incremental_join(model, 3, 1, train_start, train_end, test_start, test_end, join_df=df1)

In [25]:
im.score()

{'baseline': 0.1059067680212642,
 'ensemble': 0.17846059664342961,
 'model': 0.13247236958927722}

## Long weekend features do not help
Perhaps additional data is needed, but since more data makes predictions worse, any benefits are counteracted.