In [12]:
%matplotlib inline
import mxnet as mx
from mxnet import gluon
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import os
from tqdm import tqdm
from pathlib import Path

np.set_printoptions(threshold=np.inf)

In [5]:
data_path = '../data'

calendar = pd.read_csv(f'{data_path}/calendar.csv')
sell_prices = pd.read_csv(f'{data_path}/sell_prices.csv')
sales_train_validation = pd.read_csv(f'{data_path}/sales_train_evaluation.csv')

In [21]:
single_prediction_length = 28
submission_prediction_length = 56

cal_features = calendar.drop(
    ['date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year', 'event_name_1', 'event_name_2', 'd'], 
    axis=1
)
cal_features['event_type_1'] = cal_features['event_type_1'].apply(lambda x: 0 if str(x)=="nan" else 1)
cal_features['event_type_2'] = cal_features['event_type_2'].apply(lambda x: 0 if str(x)=="nan" else 1)

test_cal_features = cal_features.values.T

train_cal_features = test_cal_features[:,:-submission_prediction_length-single_prediction_length]
test_cal_features = test_cal_features[:,:-submission_prediction_length]

test_cal_features_list = [test_cal_features] * len(sales_train_validation)
train_cal_features_list = [train_cal_features] * len(sales_train_validation)

In [55]:
cal_features

Unnamed: 0,event_type_1,event_type_2,snap_CA,snap_TX,snap_WI
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,1,1,0
4,0,0,1,0,1
...,...,...,...,...,...
1936,0,0,0,0,0
1937,0,0,0,0,0
1938,0,0,0,0,0
1939,0,0,0,0,0


In [54]:
test_cal_features.shape

(5, 1885)

In [53]:
test_cal_features_list[0].shape

(5, 1885)

In [25]:
state_ids = sales_train_validation["state_id"].astype('category').cat.codes.values
state_ids_un , state_ids_counts = np.unique(state_ids, return_counts=True)

store_ids = sales_train_validation["store_id"].astype('category').cat.codes.values
store_ids_un , store_ids_counts = np.unique(store_ids, return_counts=True)

cat_ids = sales_train_validation["cat_id"].astype('category').cat.codes.values
cat_ids_un , cat_ids_counts = np.unique(cat_ids, return_counts=True)

dept_ids = sales_train_validation["dept_id"].astype('category').cat.codes.values
dept_ids_un , dept_ids_counts = np.unique(dept_ids, return_counts=True)

item_ids = sales_train_validation["item_id"].astype('category').cat.codes.values
item_ids_un , item_ids_counts = np.unique(item_ids, return_counts=True)

stat_cat_list = [item_ids, dept_ids, cat_ids, store_ids, state_ids]

stat_cat = np.concatenate(stat_cat_list)
stat_cat = stat_cat.reshape(len(stat_cat_list), len(item_ids)).T

stat_cat_cardinalities = [len(item_ids_un), len(dept_ids_un), len(cat_ids_un), len(store_ids_un), len(state_ids_un)]

In [28]:
train_df = sales_train_validation.drop(["id","item_id","dept_id","cat_id","store_id","state_id"], axis=1)
train_target_values = train_df.values

test_target_values = train_target_values.copy()
train_target_values = [ts[:-single_prediction_length] for ts in train_df.values] 

In [68]:
m5_dates = [pd.Timestamp("2011-01-29") for _ in range(len(sales_train_validation))]

In [69]:

from gluonts.dataset.common import ListDataset
from gluonts.dataset.field_names import FieldName

train_ds = ListDataset([
    {
        FieldName.TARGET: target,
        FieldName.START: start,
        FieldName.FEAT_DYNAMIC_REAL: fdr,
        FieldName.FEAT_STATIC_CAT: fsc
    }
    for (target, start, fdr, fsc) in zip(train_target_values, # (30490, 1913)
                                         m5_dates, # (30490, )
                                         train_cal_features_list,
                                         stat_cat)
], freq="D")


In [76]:
train_ds[1]['feat_static_cat'].shape

(5,)

In [34]:
len(sales_train_validation)

30490

In [27]:
train_df

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,0,0,0,0,0,0,0,0,0,0,...,2,4,0,0,0,0,3,3,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,2,0,0,0,2,3,0,1
3,0,0,0,0,0,0,0,0,0,0,...,1,1,0,4,0,1,3,0,2,6
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,1,0,0,2,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0,0,2,2,0,3,1,4,1,0,...,1,0,3,0,1,1,0,0,1,1
30486,0,0,0,0,0,5,0,1,1,3,...,0,0,0,0,0,0,1,0,1,0
30487,0,6,0,2,2,4,1,8,5,2,...,0,0,1,2,0,1,0,1,0,2
30488,0,0,0,0,0,0,0,0,0,0,...,1,1,1,4,6,0,1,1,1,0


In [32]:
len(train_target_values)

30490