In [11]:
%pylab inline
import pandas as pd
import plotly.express as px
from tqdm import tqdm_notebook
from tqdm.autonotebook import tqdm

from sktime.forecasting.model_selection import temporal_train_test_split, SlidingWindowSplitter
from sktime.performance_metrics.forecasting import smape_loss, mase_loss
from sktime.utils.plotting.forecasting import plot_ys
from sktime.forecasting.naive import NaiveForecaster
from sktime.forecasting.trend import PolynomialTrendForecaster
from sktime.forecasting.arima import AutoARIMA
from sktime.forecasting.compose import EnsembleForecaster, ReducedRegressionForecaster, TransformedTargetForecaster
from sktime.transformers.single_series.detrend import Deseasonalizer, Detrender
from xgboost import XGBRegressor
import swifter

# Register tqdm with pandas so that `.progress_apply` (used by the forecasting
# cells below) is available and renders a progress bar.
tqdm.pandas()

Populating the interactive namespace from numpy and matplotlib


## Objective:

Generate 28-day forecasts for __42,840 time series__, one for every series at each of the following aggregation levels (the bold number is the count of series at that level):

* Unit sales of all products, aggregated for all stores/states - __1__
* Unit sales of all products, aggregated for each State - __3__
* Unit sales of all products, aggregated for each store - __10__
* Unit sales of all products, aggregated for each category - __3__
* Unit sales of all products, aggregated for each department - __7__
* Unit sales of all products, aggregated for each State and category - __9__
* Unit sales of all products, aggregated for each State and department - __21__
* Unit sales of all products, aggregated for each store and category - __30__
* Unit sales of all products, aggregated for each store and department - __70__
* Unit sales of product x, aggregated for all stores/states - __3,049__
* Unit sales of product x, aggregated for each State - __9,147__
* Unit sales of product x, aggregated for each store - __30,490__

In [2]:
# Load the raw sales and price tables, plus a calendar table that was
# pickled by an earlier processing step.
# NOTE(review): unpickling can execute arbitrary code - only load pickle
# files produced by a trusted source.
df_sales_train = pd.read_csv('../data/sales_train_evaluation.csv')
df_sell_prices = pd.read_csv('../data/sell_prices.csv')
df_calendar = pd.read_pickle('../data/processed/calendar_OHE.pkl')

### Data Preprocessing

In [3]:
# Build a key of the form `<item_id>_<store_id>_evaluation` for every price row.
# NOTE(review): presumably this mirrors the `id` column of df_sales_train so the
# two tables can be joined - confirm against the sales file.
df_sell_prices['id'] = df_sell_prices['item_id'] + '_' + df_sell_prices['store_id'] + '_evaluation'

In [180]:
def generate_naive_xgb_forecast(data, params=None, step_length=1, window_length=730, horizon=28):
    """Forecast the next ``horizon`` steps of one series with a reduced XGBoost regressor.

    An ``XGBRegressor`` is wrapped in sktime's ``ReducedRegressionForecaster``
    using the recursive strategy, fitted on ``data`` and asked for steps
    ``1..horizon`` ahead. The prediction is clipped to the range observed in
    ``data`` and rounded to whole numbers.

    Parameters
    ----------
    data : pd.Series
        The series to forecast.
    params : dict, optional
        Keyword arguments for ``XGBRegressor``. When omitted, a linear
        booster with a Poisson count objective is used.
    step_length : int, default 1
        Passed to ``ReducedRegressionForecaster`` as ``step_length``.
    window_length : int, default 730
        Passed to ``ReducedRegressionForecaster`` as ``window_length``.
    horizon : int, default 28
        Number of steps ahead to forecast.

    Returns
    -------
    pd.Series
        The rounded forecast, bounded by ``data.min()`` and ``data.max()``.
    """
    if params is None:
        params = {
            'booster': 'gblinear',
            'n_estimators': 800,
            'eta': 0.01,
            'objective': 'count:poisson',
            # Optional GPU settings, disabled by default:
            # 'gpu_id': 0,
            # 'tree_method': 'gpu_hist',
        }
    regressor = XGBRegressor(**params)
    forecaster = ReducedRegressionForecaster(
        regressor,
        window_length=window_length,
        strategy="recursive",
        step_length=step_length,
    )
    fh = np.arange(1, horizon + 1)
    forecaster.fit(data, fh)
    y_pred = forecaster.predict(fh)
    # Keep the forecast inside the historically observed range and make it
    # integer-valued. (The original also called `clip()` with no bounds
    # first, which had no effect and has been dropped.)
    return y_pred.clip(data.min(), data.max()).round()

In [27]:
def generate_simple_xgb_forecast(data, params=None, step_length=7, sp=12, window_length=730, horizon=28):
    """Forecast the next ``horizon`` steps of one series with a small sktime pipeline.

    The pipeline removes an additive seasonal component, removes a linear
    trend, and then forecasts the remainder with an ``XGBRegressor`` wrapped
    in a recursive ``ReducedRegressionForecaster``. Unlike
    ``generate_naive_xgb_forecast`` the result is returned as-is (neither
    clipped nor rounded).

    Parameters
    ----------
    data : pd.Series
        The series to forecast.
    params : dict, optional
        Keyword arguments for ``XGBRegressor``. When omitted, a
        squared-error objective with row/column subsampling is used.
    step_length : int, default 7
        Passed to ``ReducedRegressionForecaster`` as ``step_length``.
    sp : int, default 12
        Seasonal period handed to ``Deseasonalizer``.
        NOTE(review): 12 is the usual choice for monthly data; if ``data``
        is daily, a weekly period (7) is probably intended - confirm before
        changing, since it alters every forecast.
    window_length : int, default 730
        Passed to ``ReducedRegressionForecaster`` as ``window_length``.
    horizon : int, default 28
        Number of steps ahead to forecast.

    Returns
    -------
    pd.Series
        The forecast for steps ``1..horizon`` ahead.
    """
    if params is None:
        params = {
            'n_estimators': 400,
            'eta': 0.01,
            'objective': 'reg:squarederror',
            'colsample_bytree': 0.5,
            'subsample': 0.5,
            'max_depth': 13,
            # Optional GPU settings, disabled by default:
            # 'gpu_id': 0,
            # 'tree_method': 'gpu_hist',
            'eval_metric': 'rmse'
        }
    regressor = XGBRegressor(**params)

    forecaster = TransformedTargetForecaster([
        ("deseasonalise", Deseasonalizer(model="additive", sp=sp)),
        ("detrend", Detrender(forecaster=PolynomialTrendForecaster(degree=1))),
        ("forecast", ReducedRegressionForecaster(
            regressor=regressor,
            window_length=window_length,
            strategy="recursive",
            step_length=step_length,
        )),
    ])
    fh = np.arange(1, horizon + 1)
    forecaster.fit(data)
    return forecaster.predict(fh)

In [145]:
# Run the naive XGBoost forecaster on every row, using all `d_*` columns as
# that row's history. Each row is re-indexed from 0 before fitting.
results = (
    df_sales_train
    .loc[:, df_sales_train.columns.str.startswith('d_')]
    .copy()
    .progress_apply(
        lambda sales_row: generate_naive_xgb_forecast(sales_row.reset_index(drop=True)),
        axis=1,
    )
)

HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))




In [146]:
# Relabel the forecast columns 1941..1968 as F1..F28.
tmp = pd.DataFrame(results).rename(
    columns={1940 + step: f'F{step}' for step in range(1, 29)}
)

In [147]:
# Attach the series ids. The assignment aligns on the row index, so this
# relies on `tmp` and `df_sales_train` sharing the same index.
tmp['id'] = df_sales_train['id'].copy()

In [148]:
# Put `id` first, followed by every column whose name contains 'F'.
tmp = tmp[['id'] + tmp.columns[tmp.columns.str.contains('F')].tolist()]

In [161]:
# Make sure the ids carry the `evaluation` suffix.
# NOTE(review): the ids come from sales_train_evaluation.csv and presumably
# already end in `_evaluation`, which would make this a no-op - confirm.
tmp['id'] = tmp['id'].str.replace('validation', 'evaluation')

In [170]:
# Replace any missing values with 0.
tmp = tmp.fillna(0)

In [171]:
# Build the `validation` half of the submission from the observed sales on
# days d_1914..d_1941, relabelled F1..F28, for the ids present in `tmp`.
# Written as one chain so the id rewrite never assigns into a filtered slice
# (the original did, which triggers pandas' SettingWithCopyWarning).
tmp2 = (
    df_sales_train
    .loc[
        df_sales_train['id'].isin(tmp['id']),
        ['id'] + [f'd_{day}' for day in range(1914, 1942)],
    ]
    .rename(columns={f'd_{1913 + step}': f'F{step}' for step in range(1, 29)})
    .assign(id=lambda frame: frame['id'].str.replace('evaluation', 'validation'))
)

In [172]:
# Stack the validation rows on top of the evaluation forecasts and write the
# submission file. `pd.concat` replaces `DataFrame.append`, which is
# deprecated and removed in pandas 2.0.
pd.concat([tmp2, tmp]).to_csv('../data/results/poisson_xgb_submission.csv', index=False)

In [181]:
# Second naive-XGBoost run, with step_length=28, over days d_1..d_1941.
# NOTE(review): this history includes d_1914..d_1941, the days used
# elsewhere in this notebook as the validation targets; a held-out validation
# forecast would normally stop at d_1913 - confirm this is intended.
results_valid = (
    df_sales_train
    .loc[:, [f'd_{day}' for day in range(1, 1942)]]
    .copy()
    .progress_apply(
        lambda sales_row: generate_naive_xgb_forecast(
            sales_row.reset_index(drop=True), step_length=28),
        axis=1,
    )
)

HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))




In [185]:
# Label these forecasts with `validation` ids derived from the ids in `tmp`.
# The assignment aligns on the row index.
results_valid['id'] = tmp['id'].str.replace('evaluation', 'validation')

In [204]:
# Combine the evaluation forecasts with the validation forecasts (columns
# 1941..1968 relabelled F1..F28), zero-fill gaps and write the submission.
# `pd.concat` replaces `DataFrame.append`, which is deprecated and removed in
# pandas 2.0. This writes to the same path as the earlier submission cell and
# overwrites that file.
pd.concat([
    tmp,
    results_valid.rename(columns={1940 + step: f'F{step}' for step in range(1, 29)}),
]).fillna(0).to_csv('../data/results/poisson_xgb_submission.csv', index=False)

### Simple Pipeline

In [28]:
# Run the deseasonalise/detrend/XGBoost pipeline on every row, using all
# `d_*` columns as that row's history. Each row is re-indexed from 0 first.
results = (
    df_sales_train
    .loc[:, df_sales_train.columns.str.startswith('d_')]
    .copy()
    .progress_apply(
        lambda sales_row: generate_simple_xgb_forecast(sales_row.reset_index(drop=True)),
        axis=1,
    )
)

HBox(children=(FloatProgress(value=0.0, max=30490.0), HTML(value='')))




In [34]:
# Summary statistics of the largest forecast value in each row.
results.max(axis=1).describe()

count    30490.000000
mean         2.371461
std          3.951908
min          0.008202
25%          0.710368
50%          1.307806
75%          2.536052
max        146.460925
dtype: float64

In [35]:
# Summary statistics of the largest observed value across the `d_*` columns of each row.
df_sales_train[df_sales_train.columns[df_sales_train.columns.str.startswith('d_')]].max(axis=1).describe()

count    30490.000000
mean        13.530764
std         22.180518
min          1.000000
25%          5.000000
50%          8.000000
75%         14.000000
max        763.000000
dtype: float64