In [1]:
import numpy as np
import pandas as pd
import os
import datetime
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_log_error
import os

Plot Style

In [2]:
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True, figsize=(11, 4))
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
plot_params = dict(
    color="0.75",
    style=".-",
    markeredgecolor="0.25",
    markerfacecolor="0.25",
    legend=False,
)
%config InlineBackend.figure_format = 'retina'

Load Files

In [3]:
# Directory holding the competition data; each input CSV sits directly inside it.
IN_FP = '../input/store-sales-time-series-forecasting'
TRAIN_FP, TEST_FP, HOLIDAYS_FP = (
    os.path.join(IN_FP, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'holidays_events.csv')
)

In [4]:
# Keyword arguments shared by all three CSV reads: parse `date` as datetimes.
_date_read_kwargs = dict(parse_dates=['date'], infer_datetime_format=True)
# Index layout shared by the training and test frames.
_series_index = ['store_nbr', 'family', 'date']

# --- Training data -----------------------------------------------------------
store_sales = pd.read_csv(
    TRAIN_FP,
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={'store_nbr': 'category', 'family': 'category', 'sales': 'float32'},
    **_date_read_kwargs,
)
# Convert the dates to daily periods, then index by (store, family, date).
store_sales = (
    store_sales
    .assign(date=store_sales['date'].dt.to_period('D'))
    .set_index(_series_index)
    .sort_index()
)

# Mean of every column per day, keeping only the sales column.
average_sales = store_sales.groupby('date').mean().loc[:, 'sales']

# Per-family daily means for 2017, one column per family.
family_sales = (
    store_sales
    .groupby(['family', 'date'])
    .mean()
    .unstack('family')
    .loc['2017']
)

# --- Test data ---------------------------------------------------------------
test = pd.read_csv(
    TEST_FP,
    dtype={'store_nbr': 'category', 'family': 'category', 'onpromotion': 'uint32'},
    **_date_read_kwargs,
)
test = (
    test
    .assign(date=test['date'].dt.to_period('D'))
    .set_index(_series_index)
    .sort_index()
)

# --- Holidays and events -----------------------------------------------------
holidays_events = pd.read_csv(
    HOLIDAYS_FP,
    dtype={
        'type': 'category',
        'locale': 'category',
        'locale_name': 'category',
        'description': 'category',
        'transferred': 'bool',
    },
    **_date_read_kwargs,
)
# Remap the 2013-04-29 entry to 2013-03-29.
# NOTE(review): presumably a data-entry correction in the source file - confirm.
holidays_events['date'] = holidays_events['date'].replace(
    {'2013-04-29': pd.to_datetime('2013-03-29')}
)
holidays_events = holidays_events.set_index('date').to_period('D')

Create Calendar

In [5]:
# Build a working-day calendar from the holiday table.
# https://www.kaggle.com/dkomyagin/simple-ts-ridge-rf

calender = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31')).to_period('D')
calender['dofw'] = calender.index.dayofweek
# Days with dofw above 4 are weekends, so they start out as non-working days.
calender['wd'] = calender['dofw'] <= 4

# Only national holidays are considered here; keep the first event per date.
df_hev = holidays_events.loc[holidays_events['locale'] == 'National']
df_hev = df_hev.groupby(df_hev.index).first()

calender = calender.merge(df_hev, how='left', left_index=True, right_index=True)

# Holiday-table overrides of the weekday default, applied in this order.
calender.loc[calender['type'] == 'Bridge', 'wd'] = False
calender.loc[calender['type'] == 'Work Day', 'wd'] = True
calender.loc[calender['type'] == 'Transfer', 'wd'] = False
# A holiday that was not transferred is a day off; a transferred one is worked.
_is_holiday = calender['type'] == 'Holiday'
calender.loc[_is_holiday & (calender['transferred'] == False), 'wd'] = False
calender.loc[_is_holiday & (calender['transferred'] == True), 'wd'] = True

Problem definition

In [6]:
# Print the indexed training and test frames for a quick look at their layout.
print("Training Data\n", store_sales, "\n\n")
print("Test Data\n", test)

Training Data
                                      sales  onpromotion
store_nbr family     date                              
1         AUTOMOTIVE 2013-01-01   0.000000            0
                     2013-01-02   2.000000            0
                     2013-01-03   3.000000            0
                     2013-01-04   3.000000            0
                     2013-01-05   5.000000            0
...                                    ...          ...
9         SEAFOOD    2017-08-11  23.830999            0
                     2017-08-12  16.859001            4
                     2017-08-13  20.000000            0
                     2017-08-14  17.000000            0
                     2017-08-15  16.000000            0

[3000888 rows x 2 columns] 


Test Data
                                       id  onpromotion
store_nbr family     date                            
1         AUTOMOTIVE 2017-08-16  3000888            0
                     2017-08-17  3002670            0

## Forecast Requirements and Considerations
- 16 day forecast with 1 step lead time from 16th August 2017 to 31st August 2017
- Large parts of information missing prior to June 2015 so removed from training set
- Validation using the last 16 days of the training set

In [7]:
_ONE_DAY = datetime.timedelta(days=1)

# Training window used while validating.
train_start_day = datetime.datetime(2015, 6, 16)
train_end_day = datetime.datetime(2017, 7, 30)

# Last day of the full training data; the validation window runs up to it.
full_train_end_day = datetime.datetime(2017, 8, 15)
val_start_day = train_end_day + _ONE_DAY
val_end_day = full_train_end_day

# 16-day forecast window that follows the training data.
test_start_day = full_train_end_day + _ONE_DAY
test_end_day = test_start_day + 15 * _ONE_DAY

## Boosted Hybrid Class

In [9]:
class BoostedHybrid:
    """Two-stage hybrid model: ``model1`` fits the target, ``model2`` fits its residuals.

    Usage order is ``fit1`` -> ``fit2`` -> ``predict``. ``fit1`` trains
    ``model1`` on a wide target frame (one column per series) and stores the
    stacked residuals in ``y_resid``. ``fit2`` trains ``model2`` on those
    residuals. ``predict`` adds both models' predictions and returns them in
    the wide layout.

    Attributes:
        model1, model2: estimators exposing ``fit(X, y)`` and ``predict(X)``.
        y_columns: columns of the target frame seen by ``fit1``.
        stack_cols: column level(s) used to stack/unstack between layouts.
        y_resid: stacked residuals of ``model1`` from the last ``fit1`` call.
    """

    def __init__(self, model1, model2):
        self.model1 = model1
        self.model2 = model2
        self.y_columns = None
        self.stack_cols = None
        self.y_resid = None

    def _rows_per_step(self):
        """Number of stacked rows per row of the wide target frame."""
        # Replaces the previously hard-coded 1782. Assumes the stacked frames
        # hold one row per target column for every date - TODO confirm for
        # data containing missing values, which stacking may drop.
        return len(self.y_columns)

    def fit1(self, X1, y, stack_cols=None):
        """Fit ``model1`` on ``(X1, y)`` and store the stacked residuals.

        Args:
            X1: feature frame indexed like ``y``.
            y: wide target frame; its columns are remembered for ``predict``.
            stack_cols: column level(s) of ``y`` to stack into the row index.
        """
        self.model1.fit(X1, y)

        y_fit = pd.DataFrame(
            self.model1.predict(X1),
            index=X1.index,
            columns=y.columns,
        )

        # Record the target layout here, where `y` is actually available
        # (fit2 used to read an undefined `y`).
        self.y_columns = y.columns
        if stack_cols is not None:
            self.stack_cols = stack_cols

        self.y_resid = (y - y_fit).stack(stack_cols).squeeze()

    def fit2(self, X2, ignore_first_n_rows, stack_cols=None):
        """Fit ``model2`` on the residuals stored by ``fit1``.

        Args:
            X2: stacked feature frame, row-aligned with ``y_resid``.
            ignore_first_n_rows: number of leading wide rows to skip; the
                matching ``ignore_first_n_rows * rows-per-step`` stacked rows
                are dropped from both ``X2`` and the residuals.
            stack_cols: column level(s) used later by ``predict``.

        Raises:
            RuntimeError: if ``fit1`` has not been called yet.
        """
        if self.y_resid is None or self.y_columns is None:
            raise RuntimeError("fit1 must be called before fit2")

        skip = ignore_first_n_rows * self._rows_per_step()
        self.model2.fit(X2.iloc[skip:, :], self.y_resid.iloc[skip:])
        if stack_cols is not None:
            self.stack_cols = stack_cols

    def predict(self, X1, X2, ignore_first_n_rows):
        """Return ``model1`` + ``model2`` predictions in the wide layout.

        Args:
            X1: wide-layout features for ``model1``.
            X2: stacked features for ``model2``.
            ignore_first_n_rows: leading wide rows to skip (and the matching
                stacked rows of ``X2``).
        """
        X1_kept = X1.iloc[ignore_first_n_rows:, :]
        y_pred = pd.DataFrame(
            self.model1.predict(X1_kept),
            index=X1_kept.index,
            columns=self.y_columns,
        )

        y_pred = y_pred.stack(self.stack_cols).squeeze()
        skip = ignore_first_n_rows * self._rows_per_step()
        y_pred += self.model2.predict(X2.iloc[skip:, :])

        return y_pred.unstack(self.stack_cols)
        
        

## X1 Preparation
Creation of features for trend and seasonality

In [11]:
def create_X1_dp_features(df):
    """Return the ``sales`` target of ``df`` and a DeterministicProcess on its index.

    The process is configured with a constant, an order-1 trend, seasonal
    terms and a ``CalendarFourier(freq='M', order=4)`` additional term, with
    ``drop=True``.

    Returns:
        tuple: ``(y, dp)`` - the ``sales`` selection and the unevaluated process.
    """
    target = df.loc[:, 'sales']
    process = DeterministicProcess(
        index=target.index,
        constant=True,
        order=1,
        seasonal=True,
        additional_terms=[CalendarFourier(freq='M', order=4)],
        drop=True,
    )
    return target, process

def create_X1_day_features(X1, start_date, end_date, is_test_set=True):
    """Add calendar-driven day features to the deterministic feature frame.

    Args:
        X1: date-indexed feature frame; it is not modified.
        start_date, end_date: inclusive bounds used to slice the global
            ``calender`` frame. The slice must have as many rows as ``X1``
            because its values are assigned positionally.
        is_test_set: when true, the index of the result is named ``'date'``.

    Returns:
        A new frame with ``NewYear``, ``Christmas`` and ``wd`` columns plus
        one dummy column per ``type`` value.
    """
    # Work on a new frame in both branches so the caller's input is never
    # mutated (previously it was when is_test_set was False).
    X1 = X1.rename_axis('date') if is_test_set else X1.copy()

    dates = X1.index
    X1['NewYear'] = (dates.dayofyear == 1)
    # Flag 25 December of every year rather than a hard-coded 2013-2016 list,
    # so dates beyond 2016 are handled too.
    X1['Christmas'] = (dates.month == 12) & (dates.day == 25)

    # Slice the calendar once and reuse it for both columns.
    calendar_window = calender.loc[start_date:end_date]
    X1['wd'] = calendar_window['wd'].values
    X1['type'] = calendar_window['type'].values

    return pd.get_dummies(X1, columns=['type'], drop_first=False)

def create_X1(df, start_date, end_date):
    """Build the model-1 training features for ``df``.

    Returns:
        tuple: ``(X1, y, dp)`` - in-sample deterministic terms with day
        features added, the target, and the deterministic process itself.
    """
    y, dp = create_X1_dp_features(df)
    in_sample_terms = dp.in_sample()
    X1 = create_X1_day_features(
        in_sample_terms, start_date, end_date, is_test_set=False
    )
    return X1, y, dp

## X2 Preparation
Creation of features for categorical relationship identification

In [14]:
def encode_categoricals(df, columns):
    """Label-encode each of ``columns`` in ``df`` in place and return ``df``."""
    for column_name in columns:
        # The encoder is fitted from scratch for every column.
        df[column_name] = LabelEncoder().fit_transform(df[column_name])
    return df

def create_X2_lags(ts, lags, lead_time=1, name='y', stack_cols=None):
    """Build a long frame of lagged copies of ``ts``.

    Args:
        ts: stacked series; ``stack_cols`` are unstacked into columns first.
        lags: number of lag columns to create.
        lead_time: first lag used; lags run from ``lead_time`` to
            ``lead_time + lags - 1``.
        name: prefix of the generated ``{name}_lag_{i}`` columns.
        stack_cols: index levels identifying a series; they are label-encoded
            in the result.

    Returns:
        Frame indexed by ``date`` and sorted by ``stack_cols``.
    """
    wide = ts.unstack(stack_cols)

    shifted = {}
    for step in range(lead_time, lags + lead_time):
        shifted[f'{name}_lag_{step}'] = wide.shift(step, freq="D")

    lagged = pd.concat(shifted, axis=1).stack(stack_cols).reset_index()
    lagged = encode_categoricals(lagged, stack_cols)
    return lagged.set_index('date').sort_values(by=stack_cols)

def create_X2_promo_features(df, X2):
    """Add rolling statistics of ``promo_lag_1`` and left-merge them onto ``X2``.

    Args:
        df: lagged promotion frame with a ``promo_lag_1`` column and the
            ``date`` / ``store_nbr`` / ``family`` keys; it is not modified.
        X2: frame to enrich, carrying the same three keys.

    Returns:
        ``X2`` left-merged with ``df`` plus ``promo_mean_rolling_7`` and
        ``promo_mean_rolling_91``.
    """
    lag_1 = df['promo_lag_1']
    featured = df.assign(
        promo_mean_rolling_7=lag_1.rolling(window=7, center=False).mean(),
        # NOTE(review): named "mean" but computed as a rolling median; the
        # column name is kept so downstream feature names do not change.
        # `.bfill()` replaces the deprecated `fillna(method='bfill')`.
        promo_mean_rolling_91=lag_1.rolling(window=91, center=False).median().bfill(),
    )
    return X2.merge(featured, on=['date', 'store_nbr', 'family'], how='left')

def create_X2_lag_features(df, X2):
    """Add rolling statistics of ``y_res_lag_1`` and left-merge them onto ``X2``.

    Args:
        df: lagged residual frame with a ``y_res_lag_1`` column and the
            ``date`` / ``store_nbr`` / ``family`` keys; it is not modified.
        X2: frame to enrich, carrying the same three keys.

    Returns:
        ``X2`` left-merged with ``df`` plus ``y_mean_rolling_7`` and
        ``y_median_rolling_91``.
    """
    # Read from the `df` argument: the previous code referenced
    # `shifted_y_df`, a name that only exists inside the caller.
    lag_1 = df['y_res_lag_1']
    featured = df.assign(
        y_mean_rolling_7=lag_1.rolling(window=7, center=False).mean(),
        # `.bfill()` replaces the deprecated `fillna(method='bfill')`.
        y_median_rolling_91=lag_1.rolling(window=91, center=False).median().bfill(),
    )
    return X2.merge(featured, on=['date', 'store_nbr', 'family'], how='left')

def create_X2_other_features(df):
    """Add wage-day flags derived from the date index, in place, and return ``df``.

    ``wage_day`` is true on the 15th and on the last day of the month;
    ``wage_day_lag_1`` is true on the 1st and the 16th.
    """
    day_of_month = df.index.day
    is_month_end = day_of_month == df.index.daysinmonth
    is_mid_month = day_of_month == 15

    df['wage_day'] = is_month_end | is_mid_month
    df['wage_day_lag_1'] = day_of_month.isin([1, 16])
    return df

def create_X2(df, y_resid):
    """Assemble the model-2 feature frame.

    Args:
        df: stacked promotion data indexed by date, ``store_nbr`` and
            ``family``.
        y_resid: stacked model-1 residuals with the same index levels.

    Returns:
        Frame with label-encoded store/family keys, wage-day flags, and the
        lagged / rolling promotion and residual features.
    """
    stack_columns = ['store_nbr', 'family']

    # Call the helper by its real name, `create_X2_lags`, and use the single
    # `stack_columns` list throughout (the misspelt variants raised NameError).
    shifted_promo_df = create_X2_lags(
        df.squeeze(), lags=2, name='promo', stack_cols=stack_columns
    )
    shifted_y_df = create_X2_lags(
        y_resid, lags=2, name='y_res', stack_cols=stack_columns
    )

    # Move the series keys out of the index so they can be encoded and merged on.
    X2 = encode_categoricals(df.reset_index(stack_columns), stack_columns)
    X2 = create_X2_other_features(X2)
    X2 = create_X2_promo_features(shifted_promo_df, X2)
    X2 = create_X2_lag_features(shifted_y_df, X2)

    return X2
    

## DirRec Forecasting Strategy

Based on initial testing, the combination of `LinearRegression()` and `KNeighborsRegressor()` gave the best performance on balance, so it is used as a starting point. [Found here](https://www.kaggle.com/jameskeogh/store-sales-time-series-forecasting)

In [None]:
# Passed as `ignore_first_n_rows` to the hybrid model's fit2/predict calls.
max_lag = 7
# Stage-1 and stage-2 estimators for the hybrid model.
model1, model2 = LinearRegression(), KNeighborsRegressor()

In [15]:
# Both windows are inclusive of their end day, hence the +1.
training_days = (train_end_day - train_start_day).days + 1
validation_days = (val_end_day - val_start_day).days + 1
print(f'Training set of {training_days} days')
print(f'Validation set of {validation_days} days')

# Pivot to the wide layout once and slice both windows from it.
store_sales_wide = store_sales.unstack(['store_nbr', 'family'])
store_sales_in_date_range = store_sales_wide.loc[train_start_day:train_end_day]
store_data_in_val_range = store_sales_wide.loc[val_start_day:val_end_day]

# Validation targets come straight from the validation slice; the previous
# `y[...]` relied on a variable that no earlier cell defines.
y_val = store_data_in_val_range.loc[:, 'sales']

val_model = BoostedHybrid(model1=model1, model2=model2)

In [None]:
# Stage 1: deterministic and calendar features -> model1, which also
# produces the residuals used by stage 2.
# `store_sales_in_date_range` is the frame defined in the split cell; the
# previous `store_sales_date_range` was never defined.
X1_train, y_train, dp_val = create_X1(store_sales_in_date_range, train_start_day, train_end_day)
val_model.fit1(X1_train, y_train, stack_cols=['store_nbr', 'family'])

# Stage 2: promotion and residual-lag features -> model2 on the residuals.
X2_train = create_X2(
    store_sales_in_date_range.drop('sales', axis=1).stack(['store_nbr', 'family']),
    val_model.y_resid,
)
val_model.fit2(X2_train, max_lag, stack_cols=['store_nbr', 'family'])

# In-sample fit, clipped at zero.
y_fit = val_model.predict(X1_train, X2_train, max_lag).clip(0.0)

In [None]:
dp_X1_val_date_range = dp_val.out_of_sample(steps=validation_days)
# The in-sample deterministic terms do not change between steps.
dp_in_sample = dp_val.in_sample()

# One RMSLE per validation day; previously each value was overwritten by the
# next step and never reported.
rmsle_by_step = []

for step in range(validation_days):
    # Last validation day included at this step.
    step_end_day = val_start_day + pd.Timedelta(days=step)

    dp_steps_so_far = dp_X1_val_date_range.loc[val_start_day:step_end_day, :]

    X1_combined_dp = pd.concat([dp_in_sample, dp_steps_so_far])
    X2_combined = pd.concat([
        store_sales_in_date_range,
        store_data_in_val_range.loc[val_start_day:step_end_day, :],
    ])

    X1_val = create_X1_day_features(X1_combined_dp, train_start_day, step_end_day)
    X2_val = create_X2(
        X2_combined.drop('sales', axis=1).stack(['store_nbr', 'family']),
        val_model.y_resid,
    )

    # Predict up to the current step, then refit on the training targets
    # extended with the predictions made so far.
    y_pred_combined = val_model.predict(X1_val, X2_val, max_lag).clip(0.0)
    y_plus_y_val = pd.concat([y_train, y_pred_combined.iloc[-(step+1):]])

    val_model.fit1(X1_val, y_plus_y_val, stack_cols=['store_nbr', 'family'])
    val_model.fit2(X2_val, max_lag, stack_cols=['store_nbr', 'family'])

    # Score only the newest day against its actual sales.
    rmsle_valid = mean_squared_log_error(y_val.iloc[step:step+1], y_pred_combined.iloc[-1:]) ** 0.5
    rmsle_by_step.append(rmsle_valid)

print(f'Mean validation RMSLE over {len(rmsle_by_step)} steps: {np.mean(rmsle_by_step):.5f}')