In [15]:
# Setup feedback system
from learntools.core import binder
# Hand this notebook's global namespace to the learntools binder
# (presumably so exercise checks can inspect variables defined here -- TODO confirm).
binder.bind(globals())
# NOTE(review): the exercise-specific star import below is commented out,
# so none of its names are brought into this notebook.
# from learntools.time_series.ex6 import *

# Setup notebook
from pathlib import Path
import ipywidgets as widgets
from learntools.time_series.style import *  # plot style settings
from learntools.time_series.utils import (create_multistep_example,
                                          load_multistep_data,
                                          make_lags,
                                          make_multistep_target,
                                          plot_multistep)

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import RegressorChain
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor


# Name of the competition data directory.
# NOTE(review): the reads below load 'train.csv' / 'test.csv' from the current
# working directory and do not use `comp_dir` -- confirm which location is intended.
comp_dir = Path('store-sales-time-series-forecasting')

# Training data: one row per (store, product family, day).
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0 (consistent format inference is the default) and only emits a warning.
store_sales = pd.read_csv(
    'train.csv',
    usecols=['store_nbr', 'family', 'date', 'sales', 'onpromotion'],
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'sales': 'float32',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
)
# Daily periods instead of timestamps, then a sorted three-level index.
store_sales['date'] = store_sales.date.dt.to_period('D')
store_sales = store_sales.set_index(['store_nbr', 'family', 'date']).sort_index()

# Average over stores for every (family, date) pair, pivot the families into
# columns, and keep only the rows dated 2017.
family_sales = (
    store_sales
    .groupby(['family', 'date'])
    .mean()
    .unstack('family')
    .loc['2017']
)

# Test data gets the same date handling and index layout as the training data.
test = pd.read_csv(
    'test.csv',
    dtype={
        'store_nbr': 'category',
        'family': 'category',
        'onpromotion': 'uint32',
    },
    parse_dates=['date'],
)
test['date'] = test.date.dt.to_period('D')
test = test.set_index(['store_nbr', 'family', 'date']).sort_index()



In [16]:
# Print each frame under a heading underlined with dashes of matching length,
# with a blank gap between the two sections.
sections = [("Training Data", store_sales), ("Test Data", test)]
for position, (heading, frame) in enumerate(sections):
    if position:
        print("\n")
    print(heading, "\n" + "-" * len(heading) + "\n", frame)

Training Data 
-------------
                                      sales  onpromotion
store_nbr family     date                              
1         AUTOMOTIVE 2013-01-01   0.000000            0
                     2013-01-02   2.000000            0
                     2013-01-03   3.000000            0
                     2013-01-04   3.000000            0
                     2013-01-05   5.000000            0
...                                    ...          ...
9         SEAFOOD    2017-08-11  23.830999            0
                     2017-08-12  16.859001            4
                     2017-08-13  20.000000            0
                     2017-08-14  17.000000            0
                     2017-08-15  16.000000            0

[3000888 rows x 2 columns]


Test Data 
---------
                                       id  onpromotion
store_nbr family     date                            
1         AUTOMOTIVE 2017-08-16  3000888            0
                     2017-08

In [19]:
# Build the multistep dataset for Store Sales from the wide 'sales' frame.
daily_sales = family_sales.loc[:, 'sales']

# Features: 4 lag features (shown later as y_lag_1 .. y_lag_4);
# rows containing missing values are dropped.
lag_features = make_lags(daily_sales, lags=4).dropna()

# Target: 16 forecast steps (shown later as y_step_1 .. y_step_16);
# rows containing missing values are dropped.
multistep_target = make_multistep_target(daily_sales, steps=16).dropna()

# Keep only the index labels present in both the target and the features.
y, X = multistep_target.align(lag_features, join='inner', axis=0)


In [20]:
# Reshape features and target from wide (one column per family) to long
# (one row per date/family pair) and label-encode the family names.
le = LabelEncoder()

# Wide to long, then move 'family' out of the index into a regular column.
X_long = X.stack('family').reset_index('family')
# Replace the family names with their integer labels.
X = X_long.assign(family=le.fit_transform(X_long['family']))

# The target keeps 'family' as an index level.
y = y.stack('family')

display(y)

Unnamed: 0_level_0,Unnamed: 1_level_0,y_step_1,y_step_2,y_step_3,y_step_4,y_step_5,y_step_6,y_step_7,y_step_8,y_step_9,y_step_10,y_step_11,y_step_12,y_step_13,y_step_14,y_step_15,y_step_16
date,family,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2017-01-05,AUTOMOTIVE,6.333333,6.018518,10.259259,9.388889,5.944445,4.777778,6.314815,5.388889,5.240741,8.500000,10.259259,6.407407,5.685185,5.703704,4.777778,5.148148
2017-01-05,BABY CARE,0.351852,0.277778,0.259259,0.240741,0.444444,0.240741,0.277778,0.296296,0.296296,0.388889,0.425926,0.314815,0.166667,0.222222,0.129630,0.166667
2017-01-05,BEAUTY,5.925926,6.518518,10.037037,11.611111,5.648148,6.500000,5.277778,4.370370,4.703704,7.777778,9.037037,5.648148,5.351852,4.740741,3.981482,4.592593
2017-01-05,BEVERAGES,3258.796387,3507.277832,4848.518555,5503.647949,3448.203613,3171.740723,3046.870361,2693.722168,3226.037109,4667.296387,5580.611328,3700.370361,3409.796387,3263.462891,2676.573975,3003.555664
2017-01-05,BOOKS,0.407407,0.537037,0.481481,0.722222,0.500000,0.518519,0.481481,0.388889,0.444444,0.574074,0.555556,0.388889,0.500000,0.407407,0.277778,0.351852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-07-31,POULTRY,364.955658,403.601349,377.313965,316.436096,533.497070,416.454010,464.596558,344.051727,313.780884,305.270203,278.819855,468.857361,354.342773,379.801208,344.398285,325.679810
2017-07-31,PREPARED FOODS,84.698647,87.836800,88.735962,77.172997,91.886757,100.384964,102.248146,86.627441,77.344131,84.796539,78.791443,96.286926,84.693817,91.509422,86.062500,85.954132
2017-07-31,PRODUCE,2257.140625,2609.180176,3122.895752,1792.220947,2079.319580,2418.970215,2675.105713,2111.133545,2168.535400,2663.076172,1670.264893,2198.854492,2070.154785,2331.922363,2134.399902,2316.832764
2017-07-31,SCHOOL AND OFFICE SUPPLIES,30.111111,49.333332,57.481480,51.907406,63.222221,85.203705,100.277779,64.407410,59.759258,53.740742,42.962963,65.240738,67.481483,68.851852,52.333332,46.851852


In [21]:
# Show the long-format feature frame: the label-encoded 'family' column plus the lag columns.
display(X)

Unnamed: 0_level_0,family,y_lag_1,y_lag_2,y_lag_3,y_lag_4
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-05,0,6.833333,8.296296,11.481482,0.092593
2017-01-05,1,0.333333,0.296296,0.259259,0.037037
2017-01-05,2,6.888889,7.185185,11.648149,0.055556
2017-01-05,3,3911.833252,4507.814941,6208.055664,74.222221
2017-01-05,4,0.759259,0.814815,0.481481,0.000000
...,...,...,...,...,...
2017-07-31,28,464.615662,416.242065,528.171875,269.486877
2017-07-31,29,101.991165,100.165146,87.455833,77.199738
2017-07-31,30,2704.551758,2444.234375,2073.127686,1675.579346
2017-07-31,31,37.537037,24.907408,10.500000,4.518518


In [23]:
# RegressorChain and XGBRegressor are already imported in the setup cell.
# The chain fits one XGBoost regressor per target column; each regressor also
# receives the predictions of the regressors earlier in the chain as features.
model = RegressorChain(XGBRegressor())

# Fit on the lag features and produce in-sample forecasts. The plotting cell
# reads `y_pred`, which must share the index and columns of `y`.
model.fit(X, y)
y_pred = pd.DataFrame(
    model.predict(X),
    index=y.index,
    columns=y.columns,
).clip(0.0)  # sales cannot be negative, so floor the forecasts at zero

In [None]:
# Plot actual sales against the multistep forecasts, one figure per family.
START = '2017-04-01'  # first date shown in each plot
EVERY = 1             # passed to plot_multistep as its `every` argument

# Take the families from the forecasts themselves so every lookup below is
# guaranteed to succeed, and sort them so the figures come out in a stable order.
category = sorted(y_pred.index.get_level_values('family').unique())

for FAMILY in category:
    y_pred_ = y_pred.xs(FAMILY, level='family', axis=0).loc[START:]
    y_ = family_sales.loc[START:, 'sales'].loc[:, FAMILY]

    fig, ax = plt.subplots(1, 1, figsize=(11, 4))
    ax = y_.plot(**plot_params, ax=ax, alpha=0.5)
    ax = plot_multistep(y_pred_, ax=ax, every=EVERY)
    _ = ax.legend([FAMILY, FAMILY + ' Forecast'])

    # Render and release each figure inside the loop so dozens of figures are
    # not kept open at once (matplotlib warns past its open-figure threshold).
    plt.show()
    plt.close(fig)