# M5 NIXTLA
Hacer forecast de las ventas con los datos de la competencia M5 utilizando `mlforecast` de Nixtla.

Replicar y entender la versión de nixtla

SOURCE: https://www.kaggle.com/code/lemuz90/m5-mlforecast-eval

In [1]:
import os

# Move the working directory one level up so that the relative data paths used
# by later cells resolve. This presumes the notebook is started from a folder
# directly below the repository root — TODO confirm.
# NOTE: the cell is not idempotent; running it again climbs one more level.
actual_path = os.path.abspath(os.getcwd())
# os.path.dirname replaces the manual split('/') / join, which only understood
# POSIX separators and produced an empty path when already at the root.
root_path = os.path.dirname(actual_path)
os.chdir(root_path)
print('root path: ', root_path)

root path:  /Users/joseortega/Documents/GitHub/forecasting-m5-dataset


In [2]:
from pathlib import Path

import lightgbm as lgb
import mlforecast
import numpy as np
import pandas as pd
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean, SeasonalRollingMean

In [3]:
# Show the installed mlforecast version.
mlforecast.__version__

'0.13.4'

### 1. Data setup

In [4]:
# Folder with the raw input CSV files; a relative path, so it is resolved
# against the current working directory.
input_path = Path('data/data_input_raw/')

#### calendar

In [5]:
# Compact dtypes for the calendar columns that are kept; `date` is added to
# the selection separately and parsed as a datetime.
cal_dtypes = dict(
    d='category',
    wm_yr_wk=np.uint16,
    event_name_1='category',
    event_type_1='category',
    event_name_2='category',
    event_type_2='category',
    snap_CA=np.uint8,
    snap_TX=np.uint8,
    snap_WI=np.uint8,
)

cal = pd.read_csv(
    input_path / 'calendar.csv',
    dtype=cal_dtypes,
    usecols=[*cal_dtypes, 'date'],
    parse_dates=['date'],
)

# Replace missing event values with an explicit 'nan' category. The category
# has to be registered first, otherwise fillna on a categorical would fail.
event_cols = [name for name in cal_dtypes if name.startswith('event')]

for col in event_cols:
    with_nan_level = cal[col].cat.add_categories('nan')
    cal[col] = with_nan_level.fillna('nan')

In [6]:
# Preview the first rows of the calendar frame.
cal.head()

Unnamed: 0,date,wm_yr_wk,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,d_1,,,,,0,0,0
1,2011-01-30,11101,d_2,,,,,0,0,0
2,2011-01-31,11101,d_3,,,,,0,0,0
3,2011-02-01,11101,d_4,,,,,1,1,0
4,2011-02-02,11101,d_5,,,,,1,0,1


#### prices

In [7]:
# Read the sell prices with compact dtypes: categorical store/item keys and
# small numeric types for the week key and the price.
prices_dtypes = dict(
    store_id='category',
    item_id='category',
    wm_yr_wk=np.uint16,
    sell_price=np.float32,
)
prices = pd.read_csv(
    input_path / 'sell_prices.csv',
    dtype=prices_dtypes,
)

In [8]:
# Preview the first rows of the prices frame.
prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In [9]:
# (rows, columns) of the prices frame.
prices.shape

(6841121, 4)

#### Sales

In [10]:
# Column dtypes for the wide sales table (one column per day).
# `item_id` reuses the categorical dtype already built for `prices`, so both
# frames carry the same item categories.
sales_dtypes = {
    'id': 'category',
    'item_id': prices.item_id.dtype,
    'dept_id': 'category',
    'cat_id': 'category',
    'store_id': 'category',
    'state_id': 'category',
    # Day columns are named d_1 ... d_1941; there is no d_0, so start at 1
    # instead of generating a dtype entry that matches no column.
    **{f'd_{i}': np.float32 for i in range(1, 1942)}
}
sales = pd.read_csv(
    input_path / 'sales_train_evaluation.csv',
    dtype=sales_dtypes,
)

In [11]:
# Preview the first rows of the wide sales frame.
sales.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0.0,0.0,...,2.0,4.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,1.0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,2.0,3.0,0.0,1.0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,4.0,0.0,1.0,3.0,0.0,2.0,6.0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,1.0,0.0


In [12]:
# (rows, columns) of the wide sales frame.
sales.shape

(30490, 1947)

In [13]:
# Reshape the sales into the long layout Nixtla needs: series id, time stamp,
# value. Every day column becomes a row, with the column name stored in `d`
# and the sold quantity in `y`.
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
long = sales.melt(id_vars=id_columns, var_name='d', value_name='y')

In [14]:
# Preview the first rows of the long-format sales frame.
long.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,y
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0


In [15]:
# (rows, columns) of the long-format sales frame.
long.shape

(59181090, 8)

In [16]:
# Inspect the distinct department ids (`dept_id`).
long['dept_id'].unique()

['HOBBIES_1', 'HOBBIES_2', 'HOUSEHOLD_1', 'HOUSEHOLD_2', 'FOODS_1', 'FOODS_2', 'FOODS_3']
Categories (7, object): ['HOBBIES_1', 'HOBBIES_2', 'HOUSEHOLD_1', 'HOUSEHOLD_2', 'FOODS_1', 'FOODS_2', 'FOODS_3']

In [17]:
# Inspect the distinct category ids (`cat_id`).
long['cat_id'].unique()

['HOBBIES', 'HOUSEHOLD', 'FOODS']
Categories (3, object): ['HOBBIES', 'HOUSEHOLD', 'FOODS']

In [18]:
# transformaciones en los datos

#%%time
%time

print(long.shape[0])
long['d'] = long['d'].astype(cal.d.dtype) # columna con el día
long = long.merge(cal, on=['d']) # merge con la data calendario
dates = sorted(long['date'].unique())
long = long.sort_values(['id', 'date'])

without_leading_zeros = long['y'].gt(0).groupby(long['id'], observed=True).transform('cummax')
above_min_date = long['date'] >= dates[-400]
keep_mask = without_leading_zeros & above_min_date
long = long[keep_mask]

print(long.shape[0])

CPU times: user 0 ns, sys: 1 μs, total: 1 μs
Wall time: 12.2 μs
59181090
12159132


In [19]:
# More transformations: attach the sell price of each row's store/item/week.
long = long.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'])

# Remember where the training data ends (week key and date) before the key
# columns are dropped.
last_wmyrwk, last_date_train = long['wm_yr_wk'].max(), long['date'].max()

# Drop the join keys and shuffle all rows with a fixed seed.
long = (
    long
    .drop(columns=['d', 'wm_yr_wk'])
    .sample(frac=1.0, random_state=0)
)

In [20]:
# Build X_df: the exogenous variables for the dates after the training data.

# Calendar rows strictly after the last training date.
is_future_day = cal['date'] > last_date_train
future_cal = cal[is_future_day]

# Prices from the last training week onwards, tagged with the series id
# built as '<item_id>_<store_id>_evaluation'.
from_last_week = prices['wm_yr_wk'] >= last_wmyrwk
future_prices = prices[from_last_week].copy()
item_part = future_prices['item_id'].astype(str)
store_part = future_prices['store_id'].astype(str)
future_prices['id'] = item_part + '_' + store_part + '_evaluation'

# One row per series and future day; the join keys are not needed afterwards.
exog = future_prices.merge(future_cal, on='wm_yr_wk')
X_df = exog.drop(columns=['store_id', 'item_id', 'wm_yr_wk', 'd'])

#### print del formato de los datos que se van a utilizar para entrenar

In [21]:
# Calendar rows after the last training date.
future_cal.head()

Unnamed: 0,date,wm_yr_wk,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
1941,2016-05-23,11617,d_1942,,,,,0,0,0
1942,2016-05-24,11617,d_1943,,,,,0,0,0
1943,2016-05-25,11617,d_1944,,,,,0,0,0
1944,2016-05-26,11617,d_1945,,,,,0,0,0
1945,2016-05-27,11617,d_1946,,,,,0,0,0


In [22]:
# Prices from the last training week onwards, with the series id attached.
future_prices.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,id
149,CA_1,HOBBIES_1_001,11617,8.38,HOBBIES_1_001_CA_1_evaluation
150,CA_1,HOBBIES_1_001,11618,8.38,HOBBIES_1_001_CA_1_evaluation
151,CA_1,HOBBIES_1_001,11619,8.38,HOBBIES_1_001_CA_1_evaluation
152,CA_1,HOBBIES_1_001,11620,8.38,HOBBIES_1_001_CA_1_evaluation
153,CA_1,HOBBIES_1_001,11621,8.38,HOBBIES_1_001_CA_1_evaluation


In [23]:
# Frame that joins the two previous frames.
# It holds the EXOGENOUS VARIABLES used to produce the forecast into the future.
X_df.head()

Unnamed: 0,sell_price,id,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-23,,,,,0,0,0
1,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-24,,,,,0,0,0
2,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-25,,,,,0,0,0
3,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-26,,,,,0,0,0
4,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-27,,,,,0,0,0


In [24]:
# Show the frame that will be used to train the model.
long.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,y,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
1687207,HOUSEHOLD_2_080_CA_2_evaluation,HOUSEHOLD_2_080,HOUSEHOLD_2,HOUSEHOLD,CA_2,CA,0.0,2015-06-11,,,,,0,1,1,8.98
2381879,FOODS_3_733_CA_2_evaluation,FOODS_3_733,FOODS_3,FOODS,CA_2,CA,1.0,2015-05-27,,,,,0,0,0,1.88
3042688,HOUSEHOLD_2_465_CA_3_evaluation,HOUSEHOLD_2_465,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,3.0,2015-08-02,,,,,1,0,1,7.24
4521052,FOODS_2_393_CA_4_evaluation,FOODS_2_393,FOODS_2,FOODS,CA_4,CA,0.0,2015-07-21,,,,,0,0,0,3.84
3119178,FOODS_1_173_CA_3_evaluation,FOODS_1_173,FOODS_1,FOODS,CA_3,CA,2.0,2015-10-31,Halloween,Cultural,,,0,0,0,2.0


In [25]:
# List the columns of the training frame.
long.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'y',
       'date', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price'],
      dtype='object')

### 2. Training

In [26]:
# Keyword arguments forwarded to lgb.LGBMRegressor when the forecaster is built.
model_params = dict(
    verbose=-1,
    num_threads=4,
    force_col_wise=True,
    num_leaves=256,
    n_estimators=50,
)

In [27]:
# Forecaster definition: one LightGBM model fed with lag, rolling and date features.
fcst = MLForecast(
    models=[lgb.LGBMRegressor(**model_params)],
    freq='D',
    # Lags 7, 14, ..., 56.
    lags=list(range(7, 57, 7)),
    lag_transforms={
        1: [ExpandingMean()],
        # The same four transforms on top of lags 7, 14 and 28. The comprehension
        # builds fresh transform objects for every lag, exactly as writing the
        # three lists out by hand does.
        **{
            lag: [
                *(RollingMean(window) for window in (7, 14, 28)),
                SeasonalRollingMean(7, 4),
            ]
            for lag in (7, 14, 28)
        },
    },
    date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week'],
    num_threads=4,
)

In [28]:
# Build the feature frame so that every feature the model will use can be inspected.
static_cols = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
example_process_data = fcst.preprocess(
    long,
    id_col='id',
    time_col='date',
    target_col='y',
    static_features=static_cols,
)

In [29]:
# Preview the first rows of the preprocessed feature frame.
example_process_data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,y,date,event_name_1,event_type_1,...,rolling_mean_lag28_window_size7,rolling_mean_lag28_window_size14,rolling_mean_lag28_window_size28,seasonal_rolling_mean_lag28_season_length7_window_size4,year,month,day,dayofweek,quarter,week
3042688,HOUSEHOLD_2_465_CA_3_evaluation,HOUSEHOLD_2_465,HOUSEHOLD_2,HOUSEHOLD,CA_3,CA,3.0,2015-08-02,,,...,2.428571,2.571429,2.642857,3.75,2015,8,2,6,3,31
4521052,FOODS_2_393_CA_4_evaluation,FOODS_2_393,FOODS_2,FOODS,CA_4,CA,0.0,2015-07-21,,,...,0.142857,0.214286,0.25,0.0,2015,7,21,1,3,30
3119178,FOODS_1_173_CA_3_evaluation,FOODS_1_173,FOODS_1,FOODS,CA_3,CA,2.0,2015-10-31,Halloween,Cultural,...,8.857142,8.285714,7.107143,12.75,2015,10,31,5,4,44
7856130,HOUSEHOLD_2_263_TX_3_evaluation,HOUSEHOLD_2_263,HOUSEHOLD_2,HOUSEHOLD,TX_3,TX,0.0,2015-09-03,,,...,0.142857,0.214286,0.214286,0.75,2015,9,3,3,3,36
5426723,HOUSEHOLD_2_327_TX_1_evaluation,HOUSEHOLD_2_327,HOUSEHOLD_2,HOUSEHOLD,TX_1,TX,0.0,2016-02-28,,,...,0.428571,0.214286,0.214286,0.0,2016,2,28,6,1,8


In [30]:
# List every column of the preprocessed feature frame.
example_process_data.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'y',
       'date', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag7', 'lag14', 'lag21',
       'lag28', 'lag35', 'lag42', 'lag49', 'lag56', 'expanding_mean_lag1',
       'rolling_mean_lag7_window_size7', 'rolling_mean_lag7_window_size14',
       'rolling_mean_lag7_window_size28',
       'seasonal_rolling_mean_lag7_season_length7_window_size4',
       'rolling_mean_lag14_window_size7', 'rolling_mean_lag14_window_size14',
       'rolling_mean_lag14_window_size28',
       'seasonal_rolling_mean_lag14_season_length7_window_size4',
       'rolling_mean_lag28_window_size7', 'rolling_mean_lag28_window_size14',
       'rolling_mean_lag28_window_size28',
       'seasonal_rolling_mean_lag28_season_length7_window_size4', 'year',
       'month', 'day', 'dayofweek', 'quarter', 'week'],
      dtype='object')

In [31]:
#### ENTRENAR MODELO
%time
fcst.fit(
    long,
    id_col='id',
    time_col='date',
    target_col='y',
    static_features=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
) # OJO: A DIFERENCIA DE LOS EJEMPLOS DE NIXTLA, AQUÍ SI SE PASA EL ID COMO FEATURE CATEGÓRICA, SE REPITE LO DE SKFOREAST, ENTRENAR
# TODAS LAS SERIES JUNTAS PERO DEJAR UNA VARIABLES CATEGORICA PARA INDICAR QUÉ SERIES ES

CPU times: user 1 μs, sys: 1 μs, total: 2 μs
Wall time: 3.1 μs


MLForecast(models=[LGBMRegressor], freq=D, lag_features=['lag7', 'lag14', 'lag21', 'lag28', 'lag35', 'lag42', 'lag49', 'lag56', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size7', 'rolling_mean_lag7_window_size14', 'rolling_mean_lag7_window_size28', 'seasonal_rolling_mean_lag7_season_length7_window_size4', 'rolling_mean_lag14_window_size7', 'rolling_mean_lag14_window_size14', 'rolling_mean_lag14_window_size28', 'seasonal_rolling_mean_lag14_season_length7_window_size4', 'rolling_mean_lag28_window_size7', 'rolling_mean_lag28_window_size14', 'rolling_mean_lag28_window_size28', 'seasonal_rolling_mean_lag28_season_length7_window_size4'], date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week'], num_threads=4)

### 3. Forecasting

In [32]:
# Show the exogenous variables.
X_df.head()

Unnamed: 0,sell_price,id,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-23,,,,,0,0,0
1,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-24,,,,,0,0,0
2,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-25,,,,,0,0,0
3,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-26,,,,,0,0,0
4,8.38,HOBBIES_1_001_CA_1_evaluation,2016-05-27,,,,,0,0,0


In [33]:
# Predict the next 28 observations of each series, passing the future
# exogenous variables through X_df.
horizon = 28
preds = fcst.predict(h=horizon, X_df=X_df)
preds

Unnamed: 0,id,date,LGBMRegressor
0,HOBBIES_1_001_CA_1_evaluation,2016-05-23,0.851461
1,HOBBIES_1_001_CA_1_evaluation,2016-05-24,0.832224
2,HOBBIES_1_001_CA_1_evaluation,2016-05-25,0.796482
3,HOBBIES_1_001_CA_1_evaluation,2016-05-26,0.954522
4,HOBBIES_1_001_CA_1_evaluation,2016-05-27,1.020796
...,...,...,...
853715,FOODS_3_827_WI_3_evaluation,2016-06-15,1.674513
853716,FOODS_3_827_WI_3_evaluation,2016-06-16,1.500619
853717,FOODS_3_827_WI_3_evaluation,2016-06-17,1.667293
853718,FOODS_3_827_WI_3_evaluation,2016-06-18,2.184993


### 4. Submission
Transformar el dataframe de predicciones al formato de salida (submission) que exigía la competencia.

In [34]:
# Pivot the predictions to one row per series and one column per forecast
# date, then relabel the 28 date columns positionally as F1 ... F28.
n_days = 28
forecast_labels = [f'F{day}' for day in range(1, n_days + 1)]

wide = preds.pivot_table(index='id', columns='date', observed=True)
wide.columns = forecast_labels
wide.columns.name = None
wide.index.name = 'id'
wide

Unnamed: 0_level_0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HOBBIES_1_001_CA_1_evaluation,0.851461,0.832224,0.796482,0.954522,1.020796,1.380087,1.300216,0.886998,0.923657,0.945963,...,0.958056,1.346782,1.271043,0.840132,0.799373,0.801135,0.926332,0.941998,1.166318,1.096699
HOBBIES_1_002_CA_1_evaluation,0.307082,0.320370,0.297425,0.293146,0.344540,0.400814,0.391188,0.291923,0.285291,0.292966,...,0.341222,0.410181,0.462158,0.338453,0.332249,0.335902,0.331950,0.363919,0.424782,0.385183
HOBBIES_1_003_CA_1_evaluation,0.551267,0.529019,0.527277,0.599512,0.767732,0.800839,0.832197,0.585946,0.579314,0.601344,...,0.705381,0.782182,0.875032,0.571769,0.580318,0.567327,0.607562,0.707429,0.763306,0.774720
HOBBIES_1_004_CA_1_evaluation,1.799735,1.409688,1.517120,1.567734,1.672249,1.981546,2.582892,1.918410,1.552432,1.656689,...,1.800094,2.209181,2.488655,1.941070,1.576792,1.596811,1.622636,1.726317,2.174295,2.178862
HOBBIES_1_005_CA_1_evaluation,1.223445,1.176356,1.097823,1.099001,1.232422,1.330924,1.437891,1.101030,1.079816,1.085436,...,1.161343,1.342300,1.507308,1.071408,1.054121,1.028625,0.996503,1.120274,1.288593,1.351706
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
FOODS_3_823_WI_3_evaluation,0.539850,0.555965,0.545949,0.545715,0.607415,0.719007,0.741051,0.559853,0.571024,0.619864,...,0.611788,0.800886,0.918138,0.619066,0.621271,0.637605,0.572944,0.601671,0.760246,0.721334
FOODS_3_824_WI_3_evaluation,0.215571,0.193097,0.187057,0.238210,0.225311,0.253728,0.244779,0.237729,0.211708,0.267947,...,0.221890,0.302247,0.288210,0.258070,0.254260,0.266593,0.228705,0.236464,0.265666,0.234377
FOODS_3_825_WI_3_evaluation,0.716715,0.613589,0.682452,0.602612,0.658624,0.794682,0.884873,0.752493,0.658759,0.720721,...,0.705600,0.902983,0.957247,0.768277,0.717316,0.755056,0.626830,0.671610,0.803549,0.797235
FOODS_3_826_WI_3_evaluation,1.099772,1.199994,1.139607,1.156562,1.178022,1.354000,1.289993,1.134941,1.088194,1.070129,...,1.204321,1.392661,1.396314,1.235607,1.450005,1.175857,1.113025,1.133643,1.312658,1.257849
