In [2]:
import numpy as np
import pandas as pd
import os 
import datetime

In [3]:
def data_reader(data_dir=None):
    """Load the raw input tables from CSV files.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that contains the CSV files. When omitted, ``../data``
        (relative to the current working directory) is used.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, meta, holiday, frequency, test)``, read from ``train.csv``,
        ``metadata.csv``, ``holidays.csv``, ``submission_frequency.csv`` and
        ``submission_format.csv`` respectively.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when one of the files is missing.
    """
    if data_dir is None:
        data_dir = os.path.join('..', 'data')

    # Order matters: it defines the order of the returned tuple.
    # weather.csv is deliberately not loaded.
    file_names = (
        'train.csv',
        'metadata.csv',
        'holidays.csv',
        'submission_frequency.csv',
        'submission_format.csv',
    )
    train, meta, holiday, frequency, test = (
        pd.read_csv(os.path.join(data_dir, file_name)) for file_name in file_names
    )

    return train, meta, holiday, frequency, test

In [4]:
def data_transfomer(train, holiday, frequency, test):
    """Convert column types and derive calendar columns on the raw tables.

    The input frames are left untouched; transformed copies are returned.
    The function can safely be applied to its own output (for example when
    the notebook cell is re-run): a missing ``'Unnamed: 0'`` column is
    ignored and period labels that are already mapped are kept as they are.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs a ``Timestamp`` column convertible to ``datetime64[ns]``.
    holiday : pandas.DataFrame
        Needs a ``Date`` column convertible to ``datetime64[ns]``. An
        ``'Unnamed: 0'`` column is dropped when present.
    frequency : pandas.DataFrame
        Needs a ``ForecastPeriodNS`` column with the period in nanoseconds.
    test : pandas.DataFrame
        Needs a ``Timestamp`` column convertible to ``datetime64[ns]``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, holiday, frequency, test)`` where

        * ``train`` has ``Timestamp`` as datetime plus ``Weekday``
          (Monday is 0) and ``Date`` (``datetime.date`` objects),
        * ``holiday`` has ``Date`` as ``datetime.date`` objects,
        * ``frequency`` has ``ForecastPeriodNS`` as ``'15mins'``, ``'1hour'``
          or ``'1day'`` (anything else becomes NaN),
        * ``test`` has ``Timestamp`` as datetime.
    """
    period_labels = {900000000000: '15mins',
                     3600000000000: '1hour',
                     86400000000000: '1day'}
    # Identity entries keep labels that were already mapped by an earlier call,
    # so calling the function twice does not turn them into NaN.
    period_lookup = dict(period_labels)
    period_lookup.update({label: label for label in period_labels.values()})

    # deal with train df (vectorised .dt accessors instead of per-row apply)
    train_stamp = train['Timestamp'].astype('datetime64[ns]')
    train = train.assign(Timestamp=train_stamp,
                         Weekday=train_stamp.dt.weekday,
                         Date=train_stamp.dt.date)

    # deal with test df
    test = test.assign(Timestamp=test['Timestamp'].astype('datetime64[ns]'))

    # deal with holiday df: drop the column if it is there, never fail if it is not
    holiday = holiday.drop(columns=['Unnamed: 0'], errors='ignore')
    holiday = holiday.assign(Date=holiday['Date'].astype('datetime64[ns]').dt.date)

    # deal with frequency df
    frequency = frequency.assign(
        ForecastPeriodNS=frequency['ForecastPeriodNS'].map(
            lambda period: period_lookup.get(period, np.nan)))

    return train, holiday, frequency, test

In [5]:
def data_merger(train, meta, holiday, frequency):
    """Left-join the training rows with frequency, holiday, day-off and site data.

    Parameters
    ----------
    train : pandas.DataFrame
        Needs ``ForecastId``, ``SiteId``, ``Date`` and ``Weekday`` columns,
        with ``Weekday`` numbered Monday = 0 ... Sunday = 6.
    meta : pandas.DataFrame
        Needs ``SiteId`` and the seven ``<Day>IsDayOff`` columns.
    holiday : pandas.DataFrame
        Needs ``SiteId``, ``Date`` and ``Holiday`` columns.
    frequency : pandas.DataFrame
        Needs a ``ForecastId`` column.

    Returns
    -------
    pandas.DataFrame
        ``train`` enriched with the columns of ``frequency``, ``holiday`` and
        ``meta`` (minus the ``<Day>IsDayOff`` columns), plus

        * ``OnAndOff``: the site's day-off flag for the row's weekday,
        * ``isHoliday``: ``True`` where a holiday row matched.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # List position is the weekday number: Monday is 0 ... Sunday is 6.
    day_off_columns = ['MondayIsDayOff', 'TuesdayIsDayOff', 'WednesdayIsDayOff',
                       'ThursdayIsDayOff', 'FridayIsDayOff', 'SaturdayIsDayOff',
                       'SundayIsDayOff']
    weekday_number = {column: number for number, column in enumerate(day_off_columns)}

    # create onoff feature: one row per (SiteId, Weekday) for ALL seven days,
    # so Monday-Thursday flags from the metadata are honoured as well
    onoff = (meta[['SiteId'] + day_off_columns]
             .melt('SiteId')
             .rename(columns={'variable': 'Weekday', 'value': 'OnAndOff'}))
    onoff['Weekday'] = onoff['Weekday'].map(weekday_number)

    # drop unneeded columns in meta (rebinding only; the caller's frame is kept)
    meta = meta.drop(day_off_columns, axis=1)

    # merge tables
    # NOTE(review): if holiday holds several rows for one (SiteId, Date), the
    # left join repeats the matching train rows - confirm the table is unique.
    result = pd.merge(train, frequency, on='ForecastId', how='left')
    result = pd.merge(result, holiday, on=['SiteId', 'Date'], how='left')
    result = pd.merge(result, onoff, on=['SiteId', 'Weekday'], how='left')
    result = pd.merge(result, meta, on='SiteId', how='left')

    # create isHoliday column
    result['isHoliday'] = result['Holiday'].notnull()

    # Fallback only: Monday-Thursday rows that found no flag (site absent from
    # meta) default to False, as before. Known flags are left alone.
    no_flag_midweek = result['Weekday'].isin([0, 1, 2, 3]) & result['OnAndOff'].isnull()
    result.loc[no_flag_midweek, 'OnAndOff'] = False

    return result

In [6]:
# Load the five raw tables: train, site metadata, holidays, forecast frequency, submission format.
train, meta, holiday, frequency, test = data_reader()

In [7]:
# Convert timestamp/date columns, add Weekday and Date to train, and label the forecast periods.
train, holiday, frequency, test = data_transfomer(train, holiday, frequency, test)

In [8]:
# Join train with forecast frequency, holidays, day-off flags and site metadata.
train_final = data_merger(train, meta, holiday, frequency)

In [8]:
# train_final.isnull().sum()/train_final.shape[0]

In [9]:
# Preview the first rows of the merged training table.
train_final.head()

Unnamed: 0,obs_id,SiteId,Timestamp,ForecastId,Value,Weekday,Date,ForecastPeriodNS,Holiday,OnAndOff,Surface,Sampling,BaseTemperature,isHoliday
0,744519,1,2014-09-03,1,909655.5,2,2014-09-03,1day,,False,1387.205119,15.0,18.0,False
1,7627564,1,2014-09-04,1,1748273.0,3,2014-09-04,1day,,False,1387.205119,15.0,18.0,False
2,7034705,1,2014-09-05,1,,4,2014-09-05,1day,,False,1387.205119,15.0,18.0,False
3,5995486,1,2014-09-06,1,,5,2014-09-06,1day,,True,1387.205119,15.0,18.0,False
4,7326510,1,2014-09-07,1,,6,2014-09-07,1day,,True,1387.205119,15.0,18.0,False


In [10]:
# One subset per forecast granularity, selected on the ForecastPeriodNS label.
train_15min = train_final.loc[train_final['ForecastPeriodNS'].eq('15mins')]
train_1hour = train_final.loc[train_final['ForecastPeriodNS'].eq('1hour')]
train_1day = train_final.loc[train_final['ForecastPeriodNS'].eq('1day')]

In [11]:
# (rows, columns) of the 15-minute subset.
train_15min.shape

(5250620, 14)

In [12]:
# (rows, columns) of the 1-hour subset.
train_1hour.shape

(1260136, 14)

In [13]:
# (rows, columns) of the 1-day subset.
train_1day.shape

(52864, 14)

In [14]:
# Write each subset to its own comma-separated file; the row index is not written.
train_15min.to_csv('../data/train_15min.csv', index=False)
train_1hour.to_csv('../data/train_1hour.csv', index=False)
train_1day.to_csv('../data/train_1day.csv', index=False)

---

## Other messy code

In [None]:
# fl_ma['ValueLag1'] = fl_ma.groupby(['ForecastId'])['Value'].shift(1)
# fl_ma['ValueLag2'] = fl_ma.groupby(['ForecastId'])['Value'].shift(2)
# fl_ma['ValueLag3'] = fl_ma.groupby(['ForecastId'])['Value'].shift(3)
# fl_ma.head()

In [62]:
# import matplotlib.pyplot as plt
# import seaborn as sns

In [61]:
# a = ma_df.query("ForecastId==1")[['Timestamp','Value']]
# a.set_index('Timestamp')
# a.plot()

In [60]:
# lag=10
# for index, row in ma_df.query("ForecastId==1").iterrows():
#     if row['isTrain']==False:
#         ma_df.loc[index, 'Value'] = ma_df.loc[index-lag:index-1,'Value'].mean()

In [59]:
# b = ma_df.query("ForecastId==1")[['Timestamp','Value']]
# b.set_index('Timestamp')
# b.plot()