In [116]:
import numpy as np
import pandas as pd
import os 
import datetime

In [173]:
def data_reader(data_dir=None):
    """Load the raw input tables from CSV files.

    Parameters
    ----------
    data_dir : str or os.PathLike, optional
        Directory containing ``train.csv``, ``metadata.csv``,
        ``holidays.csv`` and ``submission_frequency.csv``. When omitted,
        ``../data`` (relative to the current working directory) is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, meta, holiday, frequency)``, in that order.

    Raises
    ------
    FileNotFoundError
        If one of the four CSV files does not exist in ``data_dir``.
    """
    if data_dir is None:
        data_dir = os.path.join('..', 'data')

    # weather.csv is not loaded here.
    file_names = (
        'train.csv',
        'metadata.csv',
        'holidays.csv',
        'submission_frequency.csv',
    )
    train, meta, holiday, frequency = (
        pd.read_csv(os.path.join(data_dir, file_name))
        for file_name in file_names
    )

    return train, meta, holiday, frequency

In [174]:
def data_transfomer(train, holiday, frequency):
    """Normalise column types and derive calendar features.

    The inputs are not modified; transformed copies are returned. The
    function is idempotent: feeding its own outputs back in yields the
    same result, so the calling cell can be re-run safely.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a ``Timestamp`` column convertible to
        ``datetime64[ns]``.
    holiday : pandas.DataFrame
        Must contain a ``Date`` column convertible to ``datetime64[ns]``.
        An ``Unnamed: 0`` column is dropped when present.
    frequency : pandas.DataFrame
        Must contain ``ForecastPeriodNS``, holding the forecast period in
        nanoseconds (or the labels produced by an earlier call).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, holiday, frequency)`` where

        * ``train`` has ``Timestamp`` as ``datetime64[ns]`` plus
          ``Weekday`` (0 = Monday ... 6 = Sunday) and ``Date``
          (``datetime.date`` objects);
        * ``holiday`` has ``Date`` as ``datetime.date`` objects;
        * ``frequency`` has ``ForecastPeriodNS`` mapped to ``'15mins'``,
          ``'1hour'`` or ``'1day'``; any other value becomes NaN.
    """
    period_labels = {
        900000000000: '15mins',
        3600000000000: '1hour',
        86400000000000: '1day',
    }

    # deal with train df (on a copy, so the caller's frame stays raw)
    train = train.copy()
    train['Timestamp'] = train['Timestamp'].astype('datetime64[ns]')
    # The .dt accessors are vectorised; a per-row lambda is far slower.
    train['Weekday'] = train['Timestamp'].dt.weekday
    train['Date'] = train['Timestamp'].dt.date

    # deal with holiday df
    # errors='ignore' keeps a second run from raising KeyError once the
    # column has already been removed; drop() returns a new frame.
    holiday = holiday.drop(columns=['Unnamed: 0'], errors='ignore')
    holiday['Date'] = holiday['Date'].astype('datetime64[ns]').dt.date

    # deal with frequency df
    frequency = frequency.copy()
    raw_period = frequency['ForecastPeriodNS']
    # Values that already are labels are kept as-is; without this a second
    # run would map every label to NaN.
    already_labelled = raw_period.isin(list(period_labels.values()))
    frequency['ForecastPeriodNS'] = raw_period.map(period_labels).where(
        ~already_labelled, raw_period
    )

    return train, holiday, frequency

In [148]:
def data_merger(train, meta, holiday, frequency):
    """Merge the training observations with the lookup tables.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs ``ForecastId``, ``SiteId``, ``Date`` and ``Weekday``
        (0 = Monday ... 6 = Sunday).
    meta : pandas.DataFrame
        Needs ``SiteId`` and the seven ``<Day>IsDayOff`` columns
        (``MondayIsDayOff`` ... ``SundayIsDayOff``).
    holiday : pandas.DataFrame
        Needs ``SiteId``, ``Date`` and ``Holiday``.
    frequency : pandas.DataFrame
        Needs ``ForecastId``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``train`` (provided ``frequency`` and ``meta``
        are unique on their join keys), with the columns of the other
        tables attached plus:

        * ``OnAndOff`` - the site's day-off flag for the row's weekday,
          read from ``meta`` for all seven weekdays; NaN when the site is
          absent from ``meta``;
        * ``isHoliday`` - True when a holiday matched on
          ``(SiteId, Date)``.

    Notes
    -----
    When ``holiday`` lists several entries for the same site and date,
    only the first one is kept so the left merge cannot duplicate
    training rows.
    """
    day_off_to_weekday = {
        'MondayIsDayOff': 0,
        'TuesdayIsDayOff': 1,
        'WednesdayIsDayOff': 2,
        'ThursdayIsDayOff': 3,
        'FridayIsDayOff': 4,
        'SaturdayIsDayOff': 5,
        'SundayIsDayOff': 6,
    }
    day_off_columns = list(day_off_to_weekday)

    # create onoff feature: one row per (SiteId, Weekday) in long format.
    # All seven flags are used, so a Monday-Thursday day off recorded in
    # meta is honoured rather than being overwritten with False.
    onoff = meta[['SiteId'] + day_off_columns].melt(
        'SiteId', var_name='Weekday', value_name='OnAndOff'
    )
    onoff['Weekday'] = onoff['Weekday'].map(day_off_to_weekday)

    # drop unneeded columns in meta (now carried by onoff)
    site_meta = meta.drop(day_off_columns, axis=1)

    # A left merge emits one output row per matching right-hand row, so
    # the holiday table has to be unique on the join keys.
    holiday_unique = holiday.drop_duplicates(subset=['SiteId', 'Date'])

    # merge tables
    result = pd.merge(train, frequency, on='ForecastId', how='left')
    result = pd.merge(result, holiday_unique, on=['SiteId', 'Date'], how='left')
    result = pd.merge(result, onoff, on=['SiteId', 'Weekday'], how='left')
    result = pd.merge(result, site_meta, on='SiteId', how='left')

    # create isHoliday column
    result['isHoliday'] = result['Holiday'].notnull()

    return result

In [179]:
# Read the train, metadata, holiday and submission-frequency tables from disk.
train, meta, holiday, frequency = data_reader()

In [180]:
# Convert column types and derive Weekday/Date.
# NOTE: the input names are rebound to the outputs, so a second run of this
# cell passes already-transformed frames back into data_transfomer.
train, holiday, frequency = data_transfomer(train, holiday, frequency)

In [182]:
# Join the train, meta, holiday and frequency tables into a single frame.
train_final = data_merger(train, meta, holiday, frequency)

In [185]:
# train_final.isnull().sum()/train_final.shape[0]

In [184]:
# Preview the first rows of the merged frame.
train_final.head()

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Weekday,Date,ForecastPeriodNS,Holiday,OnAndOff,Surface,Sampling,BaseTemperature,isHoliday
0,744519,1,2014-09-03,1,909655.5,2,2014-09-03,1day,,False,1387.205119,15.0,18.0,False
1,7627564,1,2014-09-04,1,1748273.0,3,2014-09-04,1day,,False,1387.205119,15.0,18.0,False
2,7034705,1,2014-09-05,1,,4,2014-09-05,1day,,False,1387.205119,15.0,18.0,False
3,5995486,1,2014-09-06,1,,5,2014-09-06,1day,,True,1387.205119,15.0,18.0,False
4,7326510,1,2014-09-07,1,,6,2014-09-07,1day,,True,1387.205119,15.0,18.0,False


In [186]:
# One subset of the merged frame per forecast-period label.
train_15min = train_final.loc[train_final['ForecastPeriodNS'].eq('15mins')]
train_1hour = train_final.loc[train_final['ForecastPeriodNS'].eq('1hour')]
train_1day = train_final.loc[train_final['ForecastPeriodNS'].eq('1day')]

In [187]:
# (rows, columns) of the 15-minute subset.
train_15min.shape

(5250620, 14)

In [188]:
# (rows, columns) of the 1-hour subset.
train_1hour.shape

(1260136, 14)

In [189]:
# (rows, columns) of the 1-day subset.
train_1day.shape

(52864, 14)

---

# Moving Average

* Append the test data to the training data
* Build lag-1 to lag-3 features, grouped by `SiteId` and `ForecastId`