# Customer Multivariate Time Series 

<br>
Kanru Wang Jan 2019
<br>
<br>
<br>
<br>

In [73]:
import pandas as pd
import numpy as np

from datetime import datetime
from datetime import timedelta

<br>
<br>

Rows with index 14231, 11884 are manually corrected in the raw training data.

In [311]:
dateparse_1 = lambda x: datetime.strptime(x, '%d/%m/%Y %H:%M')
dateparse_2 = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')

train_raw = pd.read_csv('train.csv', index_col = 0, parse_dates = ['start_time'], date_parser=dateparse_1)
test_raw = pd.read_csv('test.csv', index_col = 0, parse_dates = ['start_time'], date_parser=dateparse_2)

<br>
<br>





It is clear that var_bu, var_hl and end_time are useless. Also total_cnt will be modelled as the sum of others + cust.

Drop them.

In [312]:
train_raw = train_raw.drop(['var_bu', 'var_hl', 'end_time', 'total_cnt'], axis = 1)
test_raw = test_raw.drop(['var_bu', 'var_hl', 'end_time'], axis = 1)

<br>
<br>


In order for ARIMA to handle seasonality well, plug in the missing rows, so that each hour has its own row.

In [313]:
train_raw.shape

(15919, 9)

In [314]:
test_raw.shape

(1460, 7)

<br>
<br>


Start from the training set

In [315]:
base_train = train_raw.loc[15919, 'start_time']
# 670 days * 24 hours = 16080
full_datetime_list_train = [base_train - timedelta(hours=x) for x in range(0, 16080)]

In [316]:
full_datetime_df_train = pd.DataFrame(full_datetime_list_train)
full_datetime_df_train.columns = ['start_time']
full_datetime_df_train = full_datetime_df_train.sort_values('start_time')

In [317]:
train_raw = train_raw.reset_index(drop = False)
train_expanded = pd.merge(full_datetime_df_train,
                          train_raw,
                          on='start_time', 
                          how='left')

<br>
<br>

Do the same for the test set

In [318]:
base_test = test_raw.loc[17379, 'start_time']
# 61 days * 24 hours = 1464
full_datetime_list_test = [base_test - timedelta(hours=x) for x in range(0, 1464)]

In [319]:
full_datetime_df_test = pd.DataFrame(full_datetime_list_test)
full_datetime_df_test.columns = ['start_time']
full_datetime_df_test = full_datetime_df_test.sort_values('start_time')

In [320]:
test_raw = test_raw.reset_index(drop = False)
test_expanded = pd.merge(full_datetime_df_test,
                         test_raw,
                         on='start_time', 
                         how='left')

<br>
<br>


Combine and then fill NA. Fill forward then backward instead of interpolation to avoid peeking into the future.

Notice that values to be predicted in the test dataset are temporarily filled anyway.

In [413]:
combined = pd.concat([train_expanded, test_expanded], axis=0, ignore_index = True, sort=False)
combined.loc[np.isnan(combined['idx']), 'idx'] = 999999
combined = combined.apply(lambda x: x.fillna(method = "ffill", axis = 0).\
                                      fillna(method = "bfill",axis = 0))

<br>
<br>

Assume that only the last 24 * 7 hours' feature values have impact on the current hour's result value.

The list_of_df_by_window is a list of dataframes each is a sliding window of 24 * 7 + 1 = 169 hours. 

As you will see in the later process, the first 24 * 7 hours will be used to form features. For the last one hour, there is no feature, except for "others" and "cust" which are to be forecasted. 

The total number of dataframes in list_of_df_by_window should be: number of rows in dataset - window length + 1.

In [414]:
window_size = 24 * 7 + 1

def get_list_of_window_sized_df(df):
    return [df.iloc[i: i + window_size] for i in range(len(df.index) - window_size + 1)]

In [415]:
list_of_df_by_window = get_list_of_window_sized_df(combined)

<br>
<br>
An example of such windows.

In [416]:
list_of_df_by_window[0].head()

Unnamed: 0,start_time,idx,var_w,var_t,var_at,var_m,var_hm,var_wd,others,cust
0,2011-01-01 00:00:00,1.0,1000.0,119.68,68.79,17.0,40.5,0.0,3.0,13.0
1,2011-01-01 01:00:00,2.0,1000.0,118.04,67.27,44.0,40.0,0.0,8.0,32.0
2,2011-01-01 02:00:00,3.0,1000.0,118.04,67.27,44.0,40.0,0.0,5.0,27.0
3,2011-01-01 03:00:00,4.0,1000.0,119.68,68.79,44.0,37.5,0.0,3.0,10.0
4,2011-01-01 04:00:00,5.0,1000.0,119.68,68.79,44.0,37.5,0.0,0.0,1.0


In [333]:
list_of_df_by_window[0].tail()

Unnamed: 0,start_time,idx,var_w,var_t,var_at,var_m,var_hm,var_wd,others,cust
164,2011-01-07 20:00:00,158.0,1000.0,114.76,61.21,38.0,23.5,3.50075,1.0,50.0
165,2011-01-07 21:00:00,159.0,1000.0,114.76,59.7,38.0,23.5,4.49905,0.0,39.0
166,2011-01-07 22:00:00,160.0,2000.0,114.76,59.7,38.0,21.5,5.5007,2.0,34.0
167,2011-01-07 23:00:00,161.0,2000.0,114.76,59.7,38.0,25.5,5.5007,1.0,14.0
168,2011-01-08 00:00:00,162.0,2000.0,114.76,59.7,38.0,25.5,5.5007,1.0,24.0


<br>
<br>

### Transform to Tidy format

The next step is to transform each window into a row. The list_of_df_by_stock_by_window_flatten_reindexed_unstacked is a list of such rows.

In [417]:
list_of_df_by_window_reindexed_unstacked = \
    [window.reset_index(drop = True).unstack()
     for window in list_of_df_by_window]

<br>
<br>
Concatenate rows into a dataframe.

In [418]:
tidy_table = pd.concat(list_of_df_by_window_reindexed_unstacked, axis=1).T

In [419]:
tidy_table.shape

(17376, 1690)

In [420]:
tidy_table.head()

Unnamed: 0_level_0,start_time,start_time,start_time,start_time,start_time,start_time,start_time,start_time,start_time,start_time,...,cust,cust,cust,cust,cust,cust,cust,cust,cust,cust
Unnamed: 0_level_1,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,167,168
0,2011-01-01 00:00:00,2011-01-01 01:00:00,2011-01-01 02:00:00,2011-01-01 03:00:00,2011-01-01 04:00:00,2011-01-01 05:00:00,2011-01-01 06:00:00,2011-01-01 07:00:00,2011-01-01 08:00:00,2011-01-01 09:00:00,...,63,82,178,116,92,50,39,34,14,24
1,2011-01-01 01:00:00,2011-01-01 02:00:00,2011-01-01 03:00:00,2011-01-01 04:00:00,2011-01-01 05:00:00,2011-01-01 06:00:00,2011-01-01 07:00:00,2011-01-01 08:00:00,2011-01-01 09:00:00,2011-01-01 10:00:00,...,82,178,116,92,50,39,34,14,24,15
2,2011-01-01 02:00:00,2011-01-01 03:00:00,2011-01-01 04:00:00,2011-01-01 05:00:00,2011-01-01 06:00:00,2011-01-01 07:00:00,2011-01-01 08:00:00,2011-01-01 09:00:00,2011-01-01 10:00:00,2011-01-01 11:00:00,...,178,116,92,50,39,34,14,24,15,13
3,2011-01-01 03:00:00,2011-01-01 04:00:00,2011-01-01 05:00:00,2011-01-01 06:00:00,2011-01-01 07:00:00,2011-01-01 08:00:00,2011-01-01 09:00:00,2011-01-01 10:00:00,2011-01-01 11:00:00,2011-01-01 12:00:00,...,116,92,50,39,34,14,24,15,13,7
4,2011-01-01 04:00:00,2011-01-01 05:00:00,2011-01-01 06:00:00,2011-01-01 07:00:00,2011-01-01 08:00:00,2011-01-01 09:00:00,2011-01-01 10:00:00,2011-01-01 11:00:00,2011-01-01 12:00:00,2011-01-01 13:00:00,...,92,50,39,34,14,24,15,13,7,1


<br>
<br>
Flatten the column names.

In [421]:
tidy_table.columns = [col[0] + '_' + str(col[1]) for col in tidy_table.columns.values]

<br>
<br>
Take a look:

In [422]:
tidy_table.columns[0:169]

Index(['start_time_0', 'start_time_1', 'start_time_2', 'start_time_3',
       'start_time_4', 'start_time_5', 'start_time_6', 'start_time_7',
       'start_time_8', 'start_time_9',
       ...
       'start_time_159', 'start_time_160', 'start_time_161', 'start_time_162',
       'start_time_163', 'start_time_164', 'start_time_165', 'start_time_166',
       'start_time_167', 'start_time_168'],
      dtype='object', length=169)

In [423]:
tidy_table.columns[169 : 169 * 2]

Index(['idx_0', 'idx_1', 'idx_2', 'idx_3', 'idx_4', 'idx_5', 'idx_6', 'idx_7',
       'idx_8', 'idx_9',
       ...
       'idx_159', 'idx_160', 'idx_161', 'idx_162', 'idx_163', 'idx_164',
       'idx_165', 'idx_166', 'idx_167', 'idx_168'],
      dtype='object', length=169)

In [424]:
tidy_table.columns[169 * 2 : 169 * 3]

Index(['var_w_0', 'var_w_1', 'var_w_2', 'var_w_3', 'var_w_4', 'var_w_5',
       'var_w_6', 'var_w_7', 'var_w_8', 'var_w_9',
       ...
       'var_w_159', 'var_w_160', 'var_w_161', 'var_w_162', 'var_w_163',
       'var_w_164', 'var_w_165', 'var_w_166', 'var_w_167', 'var_w_168'],
      dtype='object', length=169)

In [425]:
tidy_table.columns[169 * 3 : 169 * 4]

Index(['var_t_0', 'var_t_1', 'var_t_2', 'var_t_3', 'var_t_4', 'var_t_5',
       'var_t_6', 'var_t_7', 'var_t_8', 'var_t_9',
       ...
       'var_t_159', 'var_t_160', 'var_t_161', 'var_t_162', 'var_t_163',
       'var_t_164', 'var_t_165', 'var_t_166', 'var_t_167', 'var_t_168'],
      dtype='object', length=169)

In [426]:
tidy_table.columns[169 * 4 : 169 * 5]

Index(['var_at_0', 'var_at_1', 'var_at_2', 'var_at_3', 'var_at_4', 'var_at_5',
       'var_at_6', 'var_at_7', 'var_at_8', 'var_at_9',
       ...
       'var_at_159', 'var_at_160', 'var_at_161', 'var_at_162', 'var_at_163',
       'var_at_164', 'var_at_165', 'var_at_166', 'var_at_167', 'var_at_168'],
      dtype='object', length=169)

In [427]:
tidy_table.columns[169 * 5 : 169 * 6]

Index(['var_m_0', 'var_m_1', 'var_m_2', 'var_m_3', 'var_m_4', 'var_m_5',
       'var_m_6', 'var_m_7', 'var_m_8', 'var_m_9',
       ...
       'var_m_159', 'var_m_160', 'var_m_161', 'var_m_162', 'var_m_163',
       'var_m_164', 'var_m_165', 'var_m_166', 'var_m_167', 'var_m_168'],
      dtype='object', length=169)

In [428]:
tidy_table.columns[169 * 6 : 169 * 7]

Index(['var_hm_0', 'var_hm_1', 'var_hm_2', 'var_hm_3', 'var_hm_4', 'var_hm_5',
       'var_hm_6', 'var_hm_7', 'var_hm_8', 'var_hm_9',
       ...
       'var_hm_159', 'var_hm_160', 'var_hm_161', 'var_hm_162', 'var_hm_163',
       'var_hm_164', 'var_hm_165', 'var_hm_166', 'var_hm_167', 'var_hm_168'],
      dtype='object', length=169)

In [429]:
tidy_table.columns[169 * 7 : 169 * 8]

Index(['var_wd_0', 'var_wd_1', 'var_wd_2', 'var_wd_3', 'var_wd_4', 'var_wd_5',
       'var_wd_6', 'var_wd_7', 'var_wd_8', 'var_wd_9',
       ...
       'var_wd_159', 'var_wd_160', 'var_wd_161', 'var_wd_162', 'var_wd_163',
       'var_wd_164', 'var_wd_165', 'var_wd_166', 'var_wd_167', 'var_wd_168'],
      dtype='object', length=169)

In [430]:
tidy_table.columns[169 * 8 : 169 * 9]

Index(['others_0', 'others_1', 'others_2', 'others_3', 'others_4', 'others_5',
       'others_6', 'others_7', 'others_8', 'others_9',
       ...
       'others_159', 'others_160', 'others_161', 'others_162', 'others_163',
       'others_164', 'others_165', 'others_166', 'others_167', 'others_168'],
      dtype='object', length=169)

In [431]:
tidy_table.columns[169 * 9 : 169 * 10]

Index(['cust_0', 'cust_1', 'cust_2', 'cust_3', 'cust_4', 'cust_5', 'cust_6',
       'cust_7', 'cust_8', 'cust_9',
       ...
       'cust_159', 'cust_160', 'cust_161', 'cust_162', 'cust_163', 'cust_164',
       'cust_165', 'cust_166', 'cust_167', 'cust_168'],
      dtype='object', length=169)

<br>
<br>

### Feature engineering

As a starting point...

In [432]:
tidy_more_features = tidy_table[['start_time_168', 'idx_168', 'cust_168', 'others_168']]

Then define a function to generate more features. Specifically, using the sum of the value of 2 ** n hours:

- Sum of last 001 hours =
- Sum of last 002 hours ==
- Sum of last 004 hours ====
- Sum of last 008 hours ========
- ...
- ...
- ...
- Sum of last 128 hours ======================================================
- Sum of last 168 hours ====================================================================

In [433]:
def add_more_rolling_features(df1, df2, string, start, end):
    df1.loc[:, string + '_168'] = df2.loc[:, string + '_168']
    df1.loc[:, string + '_last_2'] = df2.iloc[:, end - 1 - 2 ** 1 : end - 1].sum(axis=1)
    df1.loc[:, string + '_last_4'] = df2.iloc[:, end - 1 - 2 ** 2 : end - 1].sum(axis=1)
    df1.loc[:, string + '_last_8'] = df2.iloc[:, end - 1 - 2 ** 3 : end - 1].sum(axis=1)
    df1.loc[:, string + '_last_16'] = df2.iloc[:, end - 1 - 2 ** 4 : end - 1].sum(axis=1)
    df1.loc[:, string + '_last_32'] = df2.iloc[:, end - 1 - 2 ** 5 : end - 1].sum(axis=1)
    df1.loc[:, string + '_last_64'] = df2.iloc[:, end - 1 - 2 ** 6 : end - 1].sum(axis=1)
    df1.loc[:, string + '_last_128'] = df2.iloc[:, end - 1 - 2 ** 7 : end - 1].sum(axis=1)
    df1.loc[:, string + '_last_168'] = df2.iloc[:, start : end - 1].sum(axis=1)
    return df1

In [434]:
tidy_more_features = add_more_rolling_features(tidy_more_features, tidy_table, 'var_w', 169 * 2, 169 * 3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [435]:
tidy_more_features.head()

Unnamed: 0,start_time_168,idx_168,cust_168,others_168,var_w_168,var_w_last_2,var_w_last_4,var_w_last_8,var_w_last_16,var_w_last_32,var_w_last_64,var_w_last_128,var_w_last_168
0,2011-01-08 00:00:00,162,24,1,2000,4000.0,6000.0,12000.0,25000.0,52000.0,89000.0,157000.0,231000.0
1,2011-01-08 01:00:00,163,15,1,2000,4000.0,7000.0,12000.0,26000.0,53000.0,90000.0,156000.0,232000.0
2,2011-01-08 02:00:00,164,13,3,2000,4000.0,8000.0,12000.0,27000.0,54000.0,91000.0,157000.0,233000.0
3,2011-01-08 03:00:00,165,7,0,3000,4000.0,8000.0,13000.0,28000.0,55000.0,92000.0,157000.0,234000.0
4,2011-01-08 04:00:00,166,1,0,3000,5000.0,9000.0,15000.0,29000.0,57000.0,94000.0,159000.0,236000.0


<br>
<br>
Let's apply the same generation function to the rest of the useful features.

In [436]:
tidy_more_features = add_more_rolling_features(tidy_more_features, tidy_table, 'var_w', 169 * 2, 169 * 3)
tidy_more_features = add_more_rolling_features(tidy_more_features, tidy_table, 'var_t', 169 * 3, 169 * 4)
tidy_more_features = add_more_rolling_features(tidy_more_features, tidy_table, 'var_at', 169 * 4, 169 * 5)
tidy_more_features = add_more_rolling_features(tidy_more_features, tidy_table, 'var_m', 169 * 5, 169 * 6)
tidy_more_features = add_more_rolling_features(tidy_more_features, tidy_table, 'var_hm', 169 * 6, 169 * 7)
tidy_more_features = add_more_rolling_features(tidy_more_features, tidy_table, 'var_wd', 169 * 7, 169 * 8)

In [437]:
tidy_more_features.columns

Index(['start_time_168', 'idx_168', 'cust_168', 'others_168', 'var_w_168',
       'var_w_last_2', 'var_w_last_4', 'var_w_last_8', 'var_w_last_16',
       'var_w_last_32', 'var_w_last_64', 'var_w_last_128', 'var_w_last_168',
       'var_t_168', 'var_t_last_2', 'var_t_last_4', 'var_t_last_8',
       'var_t_last_16', 'var_t_last_32', 'var_t_last_64', 'var_t_last_128',
       'var_t_last_168', 'var_at_168', 'var_at_last_2', 'var_at_last_4',
       'var_at_last_8', 'var_at_last_16', 'var_at_last_32', 'var_at_last_64',
       'var_at_last_128', 'var_at_last_168', 'var_m_168', 'var_m_last_2',
       'var_m_last_4', 'var_m_last_8', 'var_m_last_16', 'var_m_last_32',
       'var_m_last_64', 'var_m_last_128', 'var_m_last_168', 'var_hm_168',
       'var_hm_last_2', 'var_hm_last_4', 'var_hm_last_8', 'var_hm_last_16',
       'var_hm_last_32', 'var_hm_last_64', 'var_hm_last_128',
       'var_hm_last_168', 'var_wd_168', 'var_wd_last_2', 'var_wd_last_4',
       'var_wd_last_8', 'var_wd_last_16', 'var_w

<br>
<br>

### Export to csv

In [438]:
tidy_more_features.to_csv("combined_clean.csv", index = False)