In [None]:
# Python Imports
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.deterministic import DeterministicProcess, CalendarFourier
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_log_error
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from sktime.forecasting.tbats import TBATS

In [None]:
# Create paths
# Every input CSV is resolved against one data directory (relative to the
# current working directory).
root_path = './data'
train_path, oil_path, holiday_path = (
    os.path.join(root_path, csv_name)
    for csv_name in ('train.csv', 'oil.csv', 'holidays_events.csv')
)

In [None]:
# dates
# All boundaries are inclusive ISO date strings.

# Training window: always starts on the same day ...
start_train_date = '2016-07-16'
# ... and ends the day before whichever hold-out period is being forecast.
end_train_val_date = '2017-07-15'
end_train_test_date = '2017-08-15'

# Validation hold-out window
start_val_date, end_val_date = '2017-07-16', '2017-07-31'

# Test (forecast) window
start_test_date, end_test_date = '2017-08-16', '2017-08-31'

In [None]:
# Load in the train dataset
# The 'date' column is parsed as datetimes and becomes the frame's index.
base_train_df = pd.read_csv(
    train_path,
    parse_dates=['date'],
    index_col='date',
)
# Preview the first 15 rows
base_train_df.head(15)

In [None]:
# Convert the data to log1p
# .copy() gives an independent frame, so the transform below never writes into
# a slice of base_train_df (avoids SettingWithCopyWarning).
log_train_df = base_train_df.loc[
    base_train_df.index >= start_train_date, ['store_nbr', 'family', 'sales']
].copy()
log_train_df['sales'] = np.log1p(log_train_df['sales'])

# Clean up new years data and add christmas
families = log_train_df.family.unique()
stores = log_train_df.store_nbr.unique()

# One placeholder row per (store, family) for 2016-12-25 (presumably absent
# from train.csv -- confirm). The date is a Timestamp, not a string, so the
# concatenated date index keeps a single datetime dtype instead of becoming a
# mix of Timestamp and str values.
xmas_date = pd.Timestamp('2016-12-25')
xmas_df = pd.DataFrame(
    [[xmas_date, store, family, 0.0] for store in stores for family in families],
    columns=['date', 'store_nbr', 'family', 'sales'],
).set_index('date')
log_train_df = pd.concat([log_train_df, xmas_df])

# Sorting restores chronological order (the placeholder rows were appended at
# the end) and lets the per-key lookups below use a sorted MultiIndex.
log_train_df = (
    log_train_df.reset_index()
    .set_index(['date', 'store_nbr', 'family'])
    .sort_index()
)

# Each (target, donor) pair overwrites the target day's sales with the
# following day's value for the same store and family.
fill_pairs = [
    (pd.Timestamp('2017-01-01'), pd.Timestamp('2017-01-02')),
    (xmas_date, pd.Timestamp('2016-12-26')),
]
for store in stores:
    for family in families:
        for target_date, donor_date in fill_pairs:
            log_train_df.loc[(target_date, store, family), 'sales'] = (
                log_train_df.loc[(donor_date, store, family), 'sales']
            )

# Back to a plain date index with store_nbr / family as ordinary columns.
log_train_df = log_train_df.reset_index().set_index('date')

log_train_df.head(15)

In [None]:
# Create a plot of each family averaged
for family in families:
    # log1p sales of every store for this product family
    family_sales = log_train_df.loc[log_train_df.family == family, 'sales']
    # one point per day: the mean over all rows sharing that date
    daily_mean = family_sales.reset_index().groupby('date').sales.mean()
    plt.plot(daily_mean)
    plt.suptitle(family)
    plt.show()

In [None]:
# Load in the oil data
date_range = pd.date_range('2013-01-01', end_test_date)
base_oil_df = pd.read_csv(oil_path, index_col='date', parse_dates=['date'])
# Trailing 7-row mean of the oil price (NaN until 7 non-missing rows are available).
base_oil_df['rolling_avg'] = base_oil_df.dcoilwtico.rolling(window=7, center=False).mean()

# Re-index onto every calendar day; days without an oil row start out as NaN.
avg_oil_df = pd.DataFrame(index=date_range)
avg_oil_df = avg_oil_df.join(base_oil_df)
# Carry the last known average forward. Assigning the result back (instead of
# an in-place fillna on the attribute-accessed column) is what actually
# updates the frame under pandas copy-on-write, and avoids the deprecated
# fillna(method=...) form.
avg_oil_df['rolling_avg'] = avg_oil_df['rolling_avg'].ffill()
# 1 for Monday-Friday, 0 for Saturday/Sunday.
avg_oil_df['wd'] = (avg_oil_df.index.dayofweek <= 4).astype(int)

# add fourier terms
fourier_w = CalendarFourier('W', 4)
fourier_a = CalendarFourier('A', 4)
fourier_m = CalendarFourier('M', 4)
dp = DeterministicProcess(
    index=pd.date_range(start_train_date, end_val_date),
    order=0,
    constant=False,
    additional_terms=[fourier_m, fourier_a, fourier_w]
)
dp_df = dp.in_sample()
# Exogenous matrix: Fourier terms plus the oil average and weekday flag,
# aligned on the daily index from start_train_date to end_val_date.
exog_df = dp_df.join(avg_oil_df.loc[:, ['rolling_avg', 'wd']])
exog_df.head()

In [None]:
# Get the holiday information
# Read with pandas defaults: no index column or date parsing is requested here.
base_holiday_df = pd.read_csv(filepath_or_buffer=holiday_path)
base_holiday_df

In [None]:
# Run adfuller test
def run_adfuller(df, f, s):
    """Print the augmented Dickey-Fuller results for one store/family series.

    Parameters
    ----------
    df : DataFrame with 'family', 'store_nbr' and 'sales' columns.
    f : family value to select.
    s : store_nbr value to select.

    Returns
    -------
    None. The first five elements of the adfuller result are printed.
    """
    selector = (df.family == f) & (df.store_nbr == s)
    result = adfuller(df.loc[selector, 'sales'], autolag='AIC')

    # Labels are paired positionally with the first four result elements.
    labels = (
        "1. ADF : ",
        "2. P-Value : ",
        "3. Num Of Lags : ",
        "4. Num Of Observations Used For ADF Regression and Critical Values Calculation :",
    )
    for position, label in enumerate(labels):
        print(label, result[position])

    # The fifth element is a mapping that is printed one entry per line.
    print("5. Critical Values :")
    for level, threshold in result[4].items():
        print("\t", level, ": ", threshold)
#
# for family in families:
#     print(f'Family: {family}')
#     for store in stores:
#         print(f'Store: {store}')
#         run_adfuller(log_train_df, family, store)

In [None]:
# Grid Search the best params
def grid_search(train_series, test_series):
    """Exhaustively search SARIMAX orders and keep the lowest validation RMSLE.

    Every combination of (p, d, q), (P, D, Q, m) and trend listed below is fit
    on ``train_series.sales`` and scored on a forecast of ``len(test_series)``
    steps. The grid has 3*2*3*4*3*2*3*3 = 3888 combinations, so one call is
    expensive.

    Parameters
    ----------
    train_series : DataFrame with a 'sales' column holding log1p-scaled values.
    test_series : DataFrame with a 'sales' column (log1p-scaled) covering the
        steps immediately after ``train_series``.

    Returns
    -------
    (best_error, best_cfg)
        ``best_cfg`` is ``[(p, d, q), (P, D, Q, m), trend]`` for the lowest
        RMSLE on the original (expm1) scale. If no configuration could be fit
        and scored, returns ``(float('inf'), None)``.
    """
    from itertools import product

    p_params = [0, 1, 2]
    d_params = [0, 1]
    q_params = [0, 1, 2]
    t_params = ['n', 'c', 't', 'ct']
    P_params = [0, 1, 2]
    D_params = [0, 1]
    Q_params = [0, 1, 2]
    m_params = [7, 14, 28]

    # Loop-invariant pieces of the forecast/scoring step.
    forecast_start = len(train_series)
    forecast_end = len(train_series) + len(test_series) - 1
    actual_sales = np.expm1(test_series.sales)

    # inf rather than an arbitrary cap, so the best config is never discarded
    # just because its error exceeds a magic threshold.
    best_error = float('inf')
    best_cfg = None

    # product() iterates in the same order as the equivalent nested loops
    # (last factor varies fastest), so ties resolve to the same first config.
    grid = product(p_params, d_params, q_params, t_params,
                   P_params, D_params, Q_params, m_params)
    for p, d, q, t, P, D, Q, m in grid:
        cfg = [(p, d, q), (P, D, Q, m), t]
        try:
            model = SARIMAX(train_series.sales, order=cfg[0], seasonal_order=cfg[1], trend=cfg[2],
                            enforce_stationarity=False, enforce_invertibility=False)
            fit_model = model.fit(disp=False)
            predictions = fit_model.predict(start=forecast_start, end=forecast_end, dynamic=True)
            # Negative log1p forecasts would map to negative sales; floor at 0.
            predictions = np.maximum(predictions, 0)
            # sqrt(MSLE) == RMSLE; avoids the `squared=False` keyword, which
            # newer scikit-learn releases deprecate and remove.
            error = np.sqrt(mean_squared_log_error(actual_sales, np.expm1(predictions)))
        except (ValueError, np.linalg.LinAlgError):
            # A single configuration that cannot be fit or yields non-finite
            # forecasts must not abort the whole search; skip it.
            continue
        if error < best_error:
            best_error = error
            best_cfg = cfg
    return best_error, best_cfg

results_dict = dict()

# Move 'date' from the index into a column. Guarded so that re-running this
# cell does not reset the index a second time.
if 'date' not in log_train_df.columns:
    log_train_df = log_train_df.reset_index()
# Ensure one datetime dtype so the window comparisons below are well defined
# even if string dates were mixed in upstream.
log_train_df['date'] = pd.to_datetime(log_train_df['date'])

# The train / validation windows are identical for every series, so slice
# them once instead of re-evaluating the date masks per store and family.
in_train_window = (log_train_df.date >= start_train_date) & (log_train_df.date <= end_train_val_date)
in_val_window = (log_train_df.date >= start_val_date) & (log_train_df.date <= end_val_date)
train_rows = log_train_df.loc[in_train_window, ['date', 'store_nbr', 'family', 'sales']]
val_rows = log_train_df.loc[in_val_window, ['date', 'store_nbr', 'family', 'sales']]

for family in families:
    print(f'Running for {family}')
    results_dict[family] = dict()
    family_train = train_rows.loc[train_rows.family == family]
    family_val = val_rows.loc[val_rows.family == family]
    for store in tqdm(stores):
        # Sort by date: rows appended to the frame after loading (e.g. the
        # added Christmas rows) would otherwise sit at the end of the series
        # and break the chronological order the model relies on.
        train = (
            family_train.loc[family_train.store_nbr == store, ['date', 'sales']]
            .sort_values('date', kind='stable')
            .reset_index(drop=True)
        )
        val = (
            family_val.loc[family_val.store_nbr == store, ['date', 'sales']]
            .sort_values('date', kind='stable')
            .reset_index(drop=True)
        )
        err, cfg = grid_search(train, val)
        results_dict[family][store] = {
            'error': err,
            'cfg': cfg
        }

print(results_dict)

In [None]:
# Once best model is found for each store and family, run and fit model on full train data

In [None]:
# With each trained model, create the predictions