# M5 Forecasting

[Introduction](#Introduction)


[EDA](#EDA)

To-do
- Denoising

Statistical Model
- ARIMA
- Exponential Smoothing
- Theta Method

Machine Learning Model
- GBM
- LSTM
Multi-step ahead forecasting

# Introduction
## Goal
Predict Sales data provided by Walmart **28** days into the future

## Data
sales_train.csv: this is our main training data. It has 1 column for each of the 1941 days from 2011-01-29 to 2016-05-22, not including the validation period of 28 days until 2016-06-19. It also includes the IDs for item, department, category, store, and state. The number of rows is 30490, one for each combination of 3049 items and 10 stores.

sell_prices.csv: the store and item IDs together with the sales price of the item as a weekly average.

calendar.csv: dates together with related features like day of the week, month, year, and 3 binary flags for whether the stores in each state allowed purchases with SNAP food stamps at this date (1) or not (0).

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import os
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc

In [6]:
'''
util function
https://www.kaggle.com/ratan123/m5-forecasting-lightgbm-with-timeseries-splits
'''
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype spanning their values.

    Integer columns are tried against int8, int16, int32 and int64; float
    columns against float16 and float32, falling back to float64. The first
    dtype whose representable range contains the column's min and max is
    used. Columns whose dtype is not listed in ``numerics`` are left as-is.

    NOTE: the check is on *range* only. float16 keeps roughly three
    significant decimal digits, so float values may be rounded by the
    downcast.

    Args:
        df: DataFrame to shrink. Its columns are reassigned on this object.
        verbose: When true, print the final size and the percentage saved.

    Returns:
        The same ``df`` object with the downcast columns.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column spanning exactly -128..127
                # still fits in int8.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-sized frame reports 0% instead of NaN.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [2]:
def load_data(dir_name, sales_path='sales_train_validation.csv',
             calendar_path='calendar.csv', price_path='sell_prices.csv'):
    """Read the sales, calendar and price CSV files found in ``dir_name``.

    Args:
        dir_name: Directory containing the three files.
        sales_path: File name of the sales table.
        calendar_path: File name of the calendar table.
        price_path: File name of the sell-price table.

    Returns:
        A ``(sales, calendar, price)`` tuple of DataFrames.
    """
    file_names = (sales_path, calendar_path, price_path)
    # Disabled alternative: wrap each frame in reduce_mem_usage(...) before
    # returning it.
    return tuple(
        pd.read_csv(os.path.join(dir_name, file_name))
        for file_name in file_names
    )

# Load the three raw tables; the directory is resolved relative to the
# current working directory.
sales, calendar, price = load_data('../m5-forecasting-accuracy')

In [3]:
# Preview the first rows of the sales table (id columns plus one d_* column per day).
sales.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [5]:
# A negative n makes head() return every row except the last 10.
# NOTE(review): use head(10) if only the first ten rows were intended.
calendar.head(-10)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1954,2016-06-05,11619,Sunday,2,6,2016,d_1955,,,,,1,1,1
1955,2016-06-06,11619,Monday,3,6,2016,d_1956,,,,,1,1,1
1956,2016-06-07,11619,Tuesday,4,6,2016,d_1957,Ramadan starts,Religious,,,1,1,0
1957,2016-06-08,11619,Wednesday,5,6,2016,d_1958,,,,,1,0,1


## Merge

In [15]:
'''
Merge the dataframes
'''
# The pipeline below is disabled. It documents how the long-format `train_df`
# is built:
#   1. melt the wide sales table into one row per (series, day) with the
#      value stored in `demand`;
#   2. left-join the calendar on day == d;
#   3. left-join the prices on (store_id, item_id, wm_yr_wk).
# NOTE(review): the cells that follow use `train_df` without defining it, so
# these lines must be re-enabled (or merged_train.csv loaded) first.
# sales_train = pd.melt(sales, id_vars = ['id', 'item_id', 'dept_id',
#                                         'cat_id', 'store_id', 'state_id'],
#                       var_name = 'day',value_name = 'demand')
# train_df = pd.merge(sales_train, calendar, how='left',
#                     left_on=['day'], right_on=['d'])
# train_df = reduce_mem_usage(train_df)
# train_df = pd.merge(train_df, price, how='left', 
#                     on=['store_id','item_id','wm_yr_wk'])
# train_df.to_csv('merged_train.csv', index=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [27]:
# Downcast numeric columns of the merged frame to cut its memory footprint.
train_df = reduce_mem_usage(train_df)

Mem. usage decreased to 6953.16 Mb (32.1% reduction)


## RMean + WK/M lag

In [32]:
# Columns removed because they add nothing to the merged frame:
#   wm_yr_wk - only needed as the key for the price merge
#   date     - no additional information
#   d        - duplicates `day`
#   weekday  - duplicates `wday`
drop_col = ['wm_yr_wk', 'date', 'd', 'weekday']
train_df.drop(columns=drop_col, inplace=True)

In [14]:
# Display (rows, columns) of the merged frame.
train_df.shape

(58327370, 19)

In [38]:
# Lagged demand per series: each of the previous 7 days plus the value from
# 28 days earlier. Shifting inside groupby('id') keeps one series from
# leaking into the next.
lags = [*range(1, 8), 28]
for shift_by in lags:
    train_df[f"lag_{shift_by}"] = train_df.groupby('id')['demand'].shift(shift_by)


In [65]:
# Rolling means of the lagged demand, computed per series.
# `window` supplies both the lag to start from (lag_7 / lag_28) and the
# rolling window length, giving the columns rmean_<lag>_<window>.
window = [7, 28]
for win, lag in itertools.product(window, repeat=2):
    lag_col = f"lag_{lag}"
    train_df[f"rmean_{lag}_{win}"] = train_df.groupby('id')[lag_col].transform(
        lambda series: series.rolling(win).mean())
        

In [66]:
# Persist the merged frame with lag / rolling-mean features (row index omitted).
train_df.to_csv('../m5-forecasting-accuracy/merged_train.csv', index=False)

## Label Encoding

In [4]:
# Reload the merged frame from disk, then downcast its numeric columns again.
train_df = pd.read_csv('../m5-forecasting-accuracy/merged_train.csv')
print('finish loading dataframe')
print('starting reducing mem use')
train_df = reduce_mem_usage(train_df)


  interactivity=interactivity, compiler=compiler, result=result)


finish loading dataframe
starting reducing mem use
Mem. usage decreased to 7286.92 Mb (48.8% reduction)


In [7]:
# Boolean mask of missing cells; the displayed column-wise sum is the number
# of NaNs in each column.
check_na = train_df.isna()
check_na.sum()

id                     0
item_id                0
dept_id                0
cat_id                 0
store_id               0
state_id               0
day                    0
demand                 0
weekday                0
wday                   0
month                  0
year                   0
event_name_1    53631910
event_type_1    53631910
event_name_2    58205410
event_type_2    58205410
snap_CA                0
snap_TX                0
snap_WI                0
sell_price      12299413
lag_1              30499
lag_2              60989
lag_3              91479
lag_4             121969
lag_5             152459
lag_6             182949
lag_7             213439
lag_28            853729
rmean_7_7         396379
rmean_28_7       1036669
rmean_7_28       1036669
rmean_28_28      1676959
dtype: int64

In [12]:
# Replace missing event names/types with the literal string 'NaN' so that
# "no event" becomes a category of its own for the label encoder.
nan_feature = ["event_name_1", "event_name_2", "event_type_1",
               "event_type_2"]
for f in nan_feature:
    print(f"converting {f}")
    # Assign the whole column back. Chained indexing such as
    # train_df[f][mask] = ... may write to a temporary copy, which raises
    # SettingWithCopyWarning and has no effect under pandas copy-on-write.
    train_df[f] = train_df[f].fillna('NaN')


# Convert the categorical features to non-negative integer codes.
cat_feature = ['item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
               "event_name_1", "event_name_2", "event_type_1",
               "event_type_2"]

# One LabelEncoder per column, created on first access and kept in `d` so the
# mapping can be reused later.
d = defaultdict(LabelEncoder)
fit = train_df[cat_feature].apply(lambda x: d[x.name].fit_transform(x))

# Keep the non-categorical columns and append the encoded ones after them.
train_df = pd.concat([train_df.loc[:, ~train_df.columns.isin(cat_feature)],
                      fit], axis=1)
# # Inverse the encoded
# fit.apply(lambda x: d[x.name].inverse_transform(x))

# # Using the dictionary to label future data
# df.apply(lambda x: d[x.name].transform(x))



converting event_name_1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


converting event_name_2
converting event_type_1
converting event_type_2


In [34]:
# Inspect column dtypes after label encoding.
train_df.dtypes

id               object
day              object
demand            int16
weekday          object
wday               int8
month              int8
year              int16
snap_CA            int8
snap_TX            int8
snap_WI            int8
sell_price      float16
lag_1           float16
lag_2           float16
lag_3           float16
lag_4           float16
lag_5           float16
lag_6           float16
lag_7           float16
lag_28          float16
rmean_7_7       float16
rmean_28_7      float16
rmean_7_28      float16
rmean_28_28     float16
item_id           int64
dept_id           int64
store_id          int64
cat_id            int64
state_id          int64
event_name_1      int64
event_name_2      int64
event_type_1      int64
event_type_2      int64
dtype: object

In [36]:
# Persist the label-encoded frame (row index omitted).
train_df.to_csv('../m5-forecasting-accuracy/preprocessed_train.csv', index=False)

## Train LGBM

In [None]:
# Reload the label-encoded frame and show the dtypes pandas inferred from the CSV.
train_df = pd.read_csv('../m5-forecasting-accuracy/preprocessed_train.csv')
train_df.dtypes

In [7]:
# Downcast numeric columns again after the reload from CSV.
train_df = reduce_mem_usage(train_df)

Mem. usage decreased to 3838.15 Mb (73.0% reduction)


In [8]:
# Everything except the identifiers and the target is a model feature.
drop_col = ['id', 'day', 'weekday', 'demand']
is_feature = ~train_df.columns.isin(drop_col)
X_train = train_df[train_df.columns[is_feature]]
y_train = train_df['demand']


In [9]:
# Seed NumPy's global RNG so the hold-out sample is reproducible.
np.random.seed(12)

# Columns LightGBM should treat as categorical.
cat_feature = [
    'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id',
    "event_name_1", "event_name_2", "event_type_1", "event_type_2",
    'snap_CA', 'snap_TX', 'snap_WI',
]

# Hold out 2,000,000 rows drawn without replacement; all remaining rows train.
# NOTE(review): rows are sampled at random rather than by date, so validation
# rows are interleaved in time with training rows - confirm this is intended.
valid_inds = np.random.choice(X_train.index.values, 2_000_000, replace=False)
train_inds = np.setdiff1d(X_train.index.values, valid_inds)

dataset_options = {'categorical_feature': cat_feature, 'free_raw_data': False}
train_data = lgb.Dataset(X_train.loc[train_inds],
                         label=y_train.loc[train_inds],
                         **dataset_options)
valid_data = lgb.Dataset(X_train.loc[valid_inds],
                         label=y_train.loc[valid_inds],
                         **dataset_options)


In [None]:
# Unbind the large intermediates, then force a garbage-collection pass.
del train_df, X_train, y_train, valid_inds, train_inds
gc.collect()

In [12]:
# LightGBM training parameters.
params = {
    "objective": "poisson",
    # Declared once: a dict literal silently keeps only the last value of a
    # repeated key.
    "metric": ["rmse"],
    "force_row_wise": True,
    "learning_rate": 0.075,
    # "sub_feature": 0.8,
    "sub_row": 0.75,  # LightGBM alias of bagging_fraction
    "bagging_freq": 1,
    "lambda_l2": 0.1,
    # "nthread": 4,
    "verbosity": 1,
    "num_iterations": 1200,
    "num_leaves": 128,
    "min_data_in_leaf": 100,
}

In [None]:
# Train the booster and report the validation metric every 20 iterations.
# NOTE(review): newer LightGBM releases replace `verbose_eval` with the
# `lgb.log_evaluation` callback - confirm against the installed version.
m_lgb = lgb.train(
    params,
    train_data,
    valid_sets=[valid_data],
    verbose_eval=20,
)




[20]	valid_0's rmse: 14.1661
[40]	valid_0's rmse: 7.01214
[60]	valid_0's rmse: 3.90283
[80]	valid_0's rmse: 2.7241
[100]	valid_0's rmse: 2.33107
[120]	valid_0's rmse: 2.21216
[140]	valid_0's rmse: 2.16688
[160]	valid_0's rmse: 2.1472
[180]	valid_0's rmse: 2.13627
[200]	valid_0's rmse: 2.13128
[220]	valid_0's rmse: 2.12347
[240]	valid_0's rmse: 2.1183
[260]	valid_0's rmse: 2.11245
[280]	valid_0's rmse: 2.10618
[300]	valid_0's rmse: 2.09991
[320]	valid_0's rmse: 35.9053
[340]	valid_0's rmse: 22.4725
[360]	valid_0's rmse: 11.6977
[380]	valid_0's rmse: 7.49529
[400]	valid_0's rmse: 4.38698
[420]	valid_0's rmse: 3.18768
[440]	valid_0's rmse: 2.46608
[460]	valid_0's rmse: 2.30347
[480]	valid_0's rmse: 2.23412
[500]	valid_0's rmse: 2.17668
[520]	valid_0's rmse: 2.14594
[540]	valid_0's rmse: 2.11752
[560]	valid_0's rmse: 2.1072
[580]	valid_0's rmse: 2.10141
[600]	valid_0's rmse: 2.09706
[620]	valid_0's rmse: 2.08967
[640]	valid_0's rmse: 2.08255
[660]	valid_0's rmse: 2.07656
[680]	valid_0's rm