## LGBM - ENTRENAR UN ÚNICO MODELO - FORECAST POR RECURSIVIDAD NIXTLA
## BASADO EN BASELINE - SE HACE LO MISMO PERO SIN REORDENAR (SHUFFLE) LAS FILAS DE TRAIN

PUNTOS DESTACADOS:
- Se entrena solo con las últimas 400 observaciones, comenzando cada serie cuando su valor deja de ser cero (la serie comienza con un valor distinto de cero)
- No se reordenan las filas (se mantiene el orden temporal), a diferencia del caso base donde se hace un shuffle

In [1]:
import os

# Make the repository root (the parent of the current working directory) the
# working directory, so relative paths such as 'data/...' resolve from there.
# os.path.dirname is separator-agnostic and returns the filesystem root
# (instead of an empty string that os.chdir rejects) when the cwd sits
# directly below the root.
# NOTE: re-running this cell moves one directory further up each time.
actual_path = os.path.abspath(os.getcwd())
root_path = os.path.dirname(actual_path)
# Path components of the root; kept because later code may still read it.
list_root_path = actual_path.split(os.sep)[:-1]
os.chdir(root_path)
print('root path: ', root_path)

root path:  /Users/joseortega/Documents/GitHub/forecasting-m5-dataset


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import skew, kurtosis, variation
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

import lightgbm as lgb
import mlforecast
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean, SeasonalRollingMean

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
mlforecast.__version__

'0.13.4'

### 1. Read raw files

In [5]:
folder_data = 'data/data_input_dtype/'

# Load the four input tables stored as pickles under `folder_data`.
# NOTE: unpickling executes arbitrary code; only load files from a trusted source.
df_calender = pd.read_pickle(f'{folder_data}calendar.pkl')
df_prices = pd.read_pickle(f'{folder_data}sell_prices.pkl')
df_sales = pd.read_pickle(f'{folder_data}sales_train_evaluation.pkl')
df_sample_output = pd.read_pickle(f'{folder_data}sample_submission.pkl')

In [6]:
df_calender.head(3)

Unnamed: 0,date,wm_yr_wk,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,d_1,,,,,0,0,0
1,2011-01-30,11101,d_2,,,,,0,0,0
2,2011-01-31,11101,d_3,,,,,0,0,0


In [7]:
df_prices.head(3)

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26


In [8]:
df_sales.head(3)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0.0,0.0,...,2.0,4.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,1.0
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0.0,0.0,...,0.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,0.0,0.0,2.0,3.0,0.0,1.0


### 2. Transformar df_sales a formato compatible con modelos de forecast timeseries

In [9]:
# Reshape sales from wide (one column per day: d_1, d_2, ...) to long format:
# one row per (series, day) with the day label in 'd' and the value in 'y'.
data = pd.melt(
    df_sales,
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='d',
    value_name='y',
)

In [10]:
# Shorten the series id by dropping everything after its last underscore
# (the trailing "_evaluation" seen in the raw ids) and store it as a category.
data['id'] = (
    data['id']
    .str.rsplit(pat='_', n=1)
    .str.get(0)
    .astype('category')
)

In [11]:
# cambiar tipo de dato a tipo category
# data['d'] = data['d'].astype(df_calender.d.dtype)

In [12]:
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,y
0,HOBBIES_1_001_CA_1,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
1,HOBBIES_1_002_CA_1,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
2,HOBBIES_1_003_CA_1,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
3,HOBBIES_1_004_CA_1,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0
4,HOBBIES_1_005_CA_1,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0


### 3. Merge con tabla calendario (key: "d")

In [13]:
# Attach the calendar columns (date, week id, events, SNAP flags) to every
# sales row, matching on the day label 'd' (default inner join).
data = pd.merge(data, df_calender, on='d')

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59181090 entries, 0 to 59181089
Data columns (total 17 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            category      
 1   item_id       category      
 2   dept_id       category      
 3   cat_id        category      
 4   store_id      category      
 5   state_id      category      
 6   d             object        
 7   y             float32       
 8   date          datetime64[ns]
 9   wm_yr_wk      uint16        
 10  event_name_1  category      
 11  event_type_1  category      
 12  event_name_2  category      
 13  event_type_2  category      
 14  snap_CA       uint8         
 15  snap_TX       uint8         
 16  snap_WI       uint8         
dtypes: category(10), datetime64[ns](1), float32(1), object(1), uint16(1), uint8(3)
memory usage: 2.0+ GB


In [15]:
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,y,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,HOBBIES_1_001_CA_1,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,,,,,0,0,0
1,HOBBIES_1_002_CA_1,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,,,,,0,0,0
2,HOBBIES_1_003_CA_1,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,,,,,0,0,0
3,HOBBIES_1_004_CA_1,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,,,,,0,0,0
4,HOBBIES_1_005_CA_1,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,2011-01-29,11101,,,,,0,0,0


### 4. Filtrar cada serie: fecha de inicio cuando los paquetes dejan de ser cero y 400 días de la última fecha. ENTRENAR USANDO ÚNICAMENTE LOS ÚLTIMOS 400 DÍAS DE DATOS

In [16]:
data.shape # tamaño antes de filtrar

(59181090, 17)

In [17]:
# Order rows by series and then chronologically, and renumber the index 0..n-1.
data = data.sort_values(by=['id', 'date'], ignore_index=True)

In [18]:
# Ascending list of the distinct dates present in the data.
dates = list(data['date'].unique())
dates.sort()

In [19]:
# Per-series mask that turns True at the first strictly positive 'y' and stays
# True afterwards, i.e. it marks everything except each series' leading zeros.
mask_without_leading_zeros = (data['y'] > 0).groupby(data['id'], observed=True).cummax()

In [20]:
# Mask that is True for rows dated on or after the 400th most recent distinct
# date, i.e. it KEEPS only the last 400 days present in `data` (it does not
# exclude them). The window is measured before the test period is removed
# further below, so it still counts those days.
mask_above_min_date = data['date'].ge(dates[-400])

In [21]:
# Combine both masks: keep rows that are inside the last-400-days window AND
# come at or after the first non-zero sale of their series.
keep_mask = mask_above_min_date & mask_without_leading_zeros
data = data.loc[keep_mask]

In [22]:
data.shape # shape despues de filtrsar

(12159132, 17)

In [23]:
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,y,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
1541,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1542,1.0,2015-04-19,11512,,,,,0,0,0
1542,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1543,0.0,2015-04-20,11512,,,,,0,0,0
1543,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1544,0.0,2015-04-21,11512,,,,,0,0,0
1544,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1545,0.0,2015-04-22,11512,,,,,0,0,0
1545,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1546,1.0,2015-04-23,11512,,,,,0,0,0


### 5. Merge con df precios (key: "store_id", "item_id", "wm_yr_wk")
- RECORDAR QUE LA **DATA DE PRECIOS** ESTÁ AGREGADA A NIVEL **SEMANAL** Y NO DIARIA
- AQUÍ LA DATA YA ESTÁ FILTRADA A LAS ÚLTIMAS 400 OBSERVACIONES POR SERIE. Se usa left join para ir a buscar info de precios solo de la data que se está usando

In [24]:
df_prices.head(3)

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26


In [25]:
data.head(3)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,y,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
1541,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1542,1.0,2015-04-19,11512,,,,,0,0,0
1542,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1543,0.0,2015-04-20,11512,,,,,0,0,0
1543,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1544,0.0,2015-04-21,11512,,,,,0,0,0


In [26]:
data.shape

(12159132, 17)

In [27]:
# Attach the weekly sell price to every daily row. The join is an explicit
# LEFT join so that no sales row is dropped if a price is missing (the
# default would be an inner join); the row count must stay unchanged.
data = data.merge(df_prices, how='left', on=['store_id', 'item_id', 'wm_yr_wk'])

In [28]:
data.shape

(12159132, 18)

### 6. Eliminar columnas no utilizadas. LA ÚNICA DIFERENCIA CON RESPECTO A BASELINE: AQUÍ NO SE REORDENAN (SHUFFLE) LAS FILAS

In [29]:
data.shape

(12159132, 18)

In [30]:
data.head(3)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,y,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1542,1.0,2015-04-19,11512,,,,,0,0,0,2.24
1,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1543,0.0,2015-04-20,11512,,,,,0,0,0,2.24
2,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1544,0.0,2015-04-21,11512,,,,,0,0,0,2.24


In [31]:
# Drop the day label and the week id: they were only needed as merge keys.
data = data.drop(['d', 'wm_yr_wk'], axis='columns')

In [32]:
# REORDENAR LAS FILAS DE FORMA ALEATORIA (SHUFFLE)

# - DIFERENCIA CON RESPECTO A BASELINE. EN ESTA PRUEBA NO SE REORDENAN 

# data = data.sample(frac=1.0, random_state=0)

In [33]:
# revisar no se pierda tamaño
data.shape

(12159132, 16)

In [34]:
# show data - validar que NO se reordenó (se mantiene el orden por serie y fecha)
data.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,y,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1.0,2015-04-19,,,,,0,0,0,2.24
1,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,0.0,2015-04-20,,,,,0,0,0,2.24
2,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,0.0,2015-04-21,,,,,0,0,0,2.24
3,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,0.0,2015-04-22,,,,,0,0,0,2.24
4,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1.0,2015-04-23,,,,,0,0,0,2.24


### 7. Obtener conjunto de datos de test
Se filtra de acuerdo al rango de fechas definidos para test. DUDAS: REVISAR NOTEBOOK QUE GENERA EL RANGO DE FECHAS DE TEST. FORECAST A 28 DÍAS A FUTURO

In [35]:
#### DEJAR COMENTADO CODIGO DEL EJEMPLO ORGINAL, filtraba hacia valores que no se conocen los reales por lo que no se puede comparar

# obtener última semana y pultimo año en los datos
#last_wmyrwk = data['wm_yr_wk'].max()
#last_date_train = data['date'].max()

In [36]:
# First/last date and first/last week id of the test window, taken from the
# "generar_data_test" notebook.
date_start_test, date_end_test = '2016-04-25', '2016-05-22'

week_start_test, week_end_test = 11613, 11617

In [37]:
data.shape

(12159132, 16)

In [38]:
# Keep only rows strictly before the first test date, so the model is not
# trained on the period it will be evaluated on.
data = data.loc[data['date'].lt(date_start_test)]

In [39]:
data.shape

(11305412, 16)

In [40]:
data['date'].max() # revisar que la ultima fecha corresponde al dia anterior de inicio data test

Timestamp('2016-04-24 00:00:00')

### 8. Generar X_df, dataframe con las variables exógenas

In [41]:
# Calendar rows inside the test window (both end dates included).
mask_calender_test = df_calender['date'].between(date_start_test, date_end_test)
future_calender = df_calender.loc[mask_calender_test]
future_calender.head(3)

Unnamed: 0,date,wm_yr_wk,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
1913,2016-04-25,11613,d_1914,,,,,0,0,0
1914,2016-04-26,11613,d_1915,,,,,0,0,0
1915,2016-04-27,11613,d_1916,,,,,0,0,0


In [42]:
future_calender['date'].min()

Timestamp('2016-04-25 00:00:00')

In [43]:
future_calender['date'].max()

Timestamp('2016-05-22 00:00:00')

In [44]:
# Price rows for the weeks covered by the test window (both ends included).
# The bounds come from week_start_test / week_end_test instead of repeating
# the literals, and the result is an independent copy because a column is
# added to it afterwards.
mask_prices_test = df_prices['wm_yr_wk'].between(week_start_test, week_end_test)
future_prices = df_prices[mask_prices_test].copy()
future_prices.head(3)

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
145,CA_1,HOBBIES_1_001,11613,8.38
146,CA_1,HOBBIES_1_001,11614,8.38
147,CA_1,HOBBIES_1_001,11615,8.38


In [45]:
future_prices['wm_yr_wk'].min()

11613

In [46]:
future_prices['wm_yr_wk'].max()

11617

In [47]:
# Build the series id as "<item_id>_<store_id>"; the price table is the only
# exogenous table from which the id can be derived. `assign` returns a new
# frame, which avoids writing into a filtered slice of df_prices.
future_prices = future_prices.assign(
    id=future_prices['item_id'].astype(str) + '_' + future_prices['store_id'].astype(str)
)
future_prices.head(3)

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,id
145,CA_1,HOBBIES_1_001,11613,8.38,HOBBIES_1_001_CA_1
146,CA_1,HOBBIES_1_001,11614,8.38,HOBBIES_1_001_CA_1
147,CA_1,HOBBIES_1_001,11615,8.38,HOBBIES_1_001_CA_1


In [48]:
# X_df: future values of the exogenous variables. Expand the weekly prices to
# one row per day through the calendar, then drop keys that are not features.
X_df = pd.merge(future_prices, future_calender, on='wm_yr_wk')
X_df = X_df.drop(columns=['store_id', 'item_id', 'wm_yr_wk', 'd'])
X_df.head()

Unnamed: 0,sell_price,id,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,8.38,HOBBIES_1_001_CA_1,2016-04-25,,,,,0,0,0
1,8.38,HOBBIES_1_001_CA_1,2016-04-26,,,,,0,0,0
2,8.38,HOBBIES_1_001_CA_1,2016-04-27,,,,,0,0,0
3,8.38,HOBBIES_1_001_CA_1,2016-04-28,,,,,0,0,0
4,8.38,HOBBIES_1_001_CA_1,2016-04-29,,,,,0,0,0


## II. ENTRENAR MODELO

### 1. Definir y entrenar modelo NIXTLA

In [49]:
# print data
data.head(3)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,y,date,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1.0,2015-04-19,,,,,0,0,0,2.24
1,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,0.0,2015-04-20,,,,,0,0,0,2.24
2,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,0.0,2015-04-21,,,,,0,0,0,2.24


In [50]:
# Hyperparameters passed to the LightGBM regressor.
model_params = dict(
    verbose=-1,
    num_threads=4,
    force_col_wise=True,
    num_leaves=256,
    n_estimators=50,
)

In [51]:
# Wrap a LightGBM regressor in an MLForecast pipeline for daily data.
fcst = MLForecast(
    models=[lgb.LGBMRegressor(**model_params)],
    freq='D',
    # Weekly lags: 7, 14, ..., 56.
    lags=list(range(7, 57, 7)),
    lag_transforms={
        1: [ExpandingMean()],
        # Same set of rolling statistics on the 7-, 14- and 28-day lags.
        **{
            lag: [RollingMean(7), RollingMean(14), RollingMean(28), SeasonalRollingMean(7, 4)]
            for lag in (7, 14, 28)
        },
    },
    date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week'],
    num_threads=4,
)

In [52]:
# Build the full feature table the model will see, only to inspect it.
process_data = fcst.preprocess(
    df=data,
    id_col='id',
    time_col='date',
    target_col='y',
    # Static features are declared for the training frame only; the future
    # exogenous frame does not need them.
    static_features=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
)

In [53]:
process_data.head(3)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,y,date,event_name_1,event_type_1,...,rolling_mean_lag28_window_size7,rolling_mean_lag28_window_size14,rolling_mean_lag28_window_size28,seasonal_rolling_mean_lag28_season_length7_window_size4,year,month,day,dayofweek,quarter,week
56,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,2.0,2015-06-14,,,...,0.428571,0.571429,0.607143,0.5,2015,6,14,6,2,24
57,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1.0,2015-06-15,,,...,0.428571,0.428571,0.607143,0.75,2015,6,15,0,2,25
58,FOODS_1_001_CA_1,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,0.0,2015-06-16,NBAFinalsEnd,Sporting,...,0.285714,0.428571,0.607143,0.25,2015,6,16,1,2,25


In [54]:
process_data.columns

Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'y',
       'date', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'lag7', 'lag14', 'lag21',
       'lag28', 'lag35', 'lag42', 'lag49', 'lag56', 'expanding_mean_lag1',
       'rolling_mean_lag7_window_size7', 'rolling_mean_lag7_window_size14',
       'rolling_mean_lag7_window_size28',
       'seasonal_rolling_mean_lag7_season_length7_window_size4',
       'rolling_mean_lag14_window_size7', 'rolling_mean_lag14_window_size14',
       'rolling_mean_lag14_window_size28',
       'seasonal_rolling_mean_lag14_season_length7_window_size4',
       'rolling_mean_lag28_window_size7', 'rolling_mean_lag28_window_size14',
       'rolling_mean_lag28_window_size28',
       'seasonal_rolling_mean_lag28_season_length7_window_size4', 'year',
       'month', 'day', 'dayofweek', 'quarter', 'week'],
      dtype='object')

In [55]:
# REVISAR COMO LO HACE QUE AL APLICAR DELAY NO SE PIERDEN LOS DATOS QUE SE DEBERÍAN PERDER AL APLICAR DELAYS
process_data[['id', 'date','y', 'lag7']].sort_values(by = ['id', 'date'])

Unnamed: 0,id,date,y,lag7
56,FOODS_1_001_CA_1,2015-06-14,2.0,1.0
57,FOODS_1_001_CA_1,2015-06-15,1.0,0.0
58,FOODS_1_001_CA_1,2015-06-16,0.0,0.0
59,FOODS_1_001_CA_1,2015-06-17,0.0,0.0
60,FOODS_1_001_CA_1,2015-06-18,0.0,2.0
...,...,...,...,...
12159099,HOUSEHOLD_2_516_WI_3,2016-04-20,0.0,0.0
12159100,HOUSEHOLD_2_516_WI_3,2016-04-21,0.0,0.0
12159101,HOUSEHOLD_2_516_WI_3,2016-04-22,0.0,0.0
12159102,HOUSEHOLD_2_516_WI_3,2016-04-23,0.0,0.0


In [56]:
%%time

#### TRAIN THE MODEL
# Same column roles and static features as in the preprocess inspection.
fcst.fit(
    df=data,
    id_col='id',
    time_col='date',
    target_col='y',
    static_features=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
)

CPU times: user 2min 14s, sys: 7.06 s, total: 2min 21s
Wall time: 38.8 s


MLForecast(models=[LGBMRegressor], freq=D, lag_features=['lag7', 'lag14', 'lag21', 'lag28', 'lag35', 'lag42', 'lag49', 'lag56', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size7', 'rolling_mean_lag7_window_size14', 'rolling_mean_lag7_window_size28', 'seasonal_rolling_mean_lag7_season_length7_window_size4', 'rolling_mean_lag14_window_size7', 'rolling_mean_lag14_window_size14', 'rolling_mean_lag14_window_size28', 'seasonal_rolling_mean_lag14_season_length7_window_size4', 'rolling_mean_lag28_window_size7', 'rolling_mean_lag28_window_size14', 'rolling_mean_lag28_window_size28', 'seasonal_rolling_mean_lag28_season_length7_window_size4'], date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week'], num_threads=4)

### 2. Hacer forecast

In [57]:
# Forecast the next 28 steps of every series, feeding the future exogenous
# values, and relabel the three output columns (by position).
preds = fcst.predict(h=28, X_df=X_df)
preds = preds.set_axis(['id', 'date', 'forecast'], axis=1)

In [58]:
# Replace every forecast that is not >= 0 with zero.
preds['forecast'] = preds['forecast'].mask(~(preds['forecast'] >= 0), 0)

In [59]:
# Reshape to the evaluation layout: one row per series, one column per date,
# with 'id' restored as a regular column.
preds = preds.pivot(index='id', columns='date', values='forecast').reset_index()

In [60]:
# Sort rows by series id in ascending order.
preds = preds.sort_values('id')

In [61]:
preds.head(3)

date,id,2016-04-25 00:00:00,2016-04-26 00:00:00,2016-04-27 00:00:00,2016-04-28 00:00:00,2016-04-29 00:00:00,2016-04-30 00:00:00,2016-05-01 00:00:00,2016-05-02 00:00:00,2016-05-03 00:00:00,...,2016-05-13 00:00:00,2016-05-14 00:00:00,2016-05-15 00:00:00,2016-05-16 00:00:00,2016-05-17 00:00:00,2016-05-18 00:00:00,2016-05-19 00:00:00,2016-05-20 00:00:00,2016-05-21 00:00:00,2016-05-22 00:00:00
0,FOODS_1_001_CA_1,1.026181,0.887429,0.916707,0.819761,1.019347,1.086427,1.094861,1.029805,0.856382,...,0.829713,1.142925,1.072765,0.895709,0.765372,0.792784,0.736079,0.866319,1.01994,1.054733
1,FOODS_1_001_CA_2,1.219081,1.215585,1.039519,1.199324,1.260039,1.423519,1.469733,1.057801,1.088085,...,1.13715,1.411194,1.755387,1.064455,1.11341,1.12409,1.13679,1.182283,1.515207,1.708119
2,FOODS_1_001_CA_3,1.034165,1.013107,1.009146,1.00234,1.112304,1.463326,1.034355,0.832312,0.835188,...,0.998525,1.351079,1.515065,0.86863,0.855912,0.869451,0.867788,0.97306,1.334868,1.274801


In [62]:
preds.shape

(30490, 29)

### 3. EVALUAR MÉTRICA

In [63]:
# Load the actual values of the test window (produced by the notebook
# 1_generate_data_test.ipynb).
folder_data_modeling = 'data/data_input_dtype/'
data_test_true = pd.read_pickle(f'{folder_data_modeling}data_test_true.pkl')

In [64]:
# Give the forecast frame the same column labels as the actuals ('id',
# 'd_1914', ...). Labels are assigned by position, so first translate every
# forecast date to its calendar day label and verify that the resulting order
# matches the actuals, instead of assuming it does.
date_to_day = dict(zip(df_calender['date'], df_calender['d'].astype(str)))
# Labels that are not calendar dates ('id', or day labels when this cell is
# re-run after the rename) are kept unchanged.
translated_columns = [str(date_to_day.get(col, col)) for col in preds.columns]
expected_columns = [str(col) for col in data_test_true.columns]
if translated_columns != expected_columns:
    raise ValueError(
        'Forecast columns do not line up with data_test_true columns: '
        f'{translated_columns} != {expected_columns}'
    )
preds.columns = data_test_true.columns

In [65]:
data_test_true

Unnamed: 0,id,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
1612,FOODS_1_001_CA_1,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,6.0,...,2.0,3.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4661,FOODS_1_001_CA_2,0.0,3.0,0.0,0.0,0.0,1.0,4.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,2.0,0.0
7710,FOODS_1_001_CA_3,1.0,0.0,1.0,0.0,8.0,1.0,0.0,0.0,1.0,...,1.0,2.0,2.0,0.0,0.0,1.0,0.0,3.0,2.0,2.0
10759,FOODS_1_001_CA_4,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
13808,FOODS_1_001_TX_1,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,5.0,0.0,2.0,2.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16856,HOUSEHOLD_2_516_TX_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
19905,HOUSEHOLD_2_516_TX_3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0
22954,HOUSEHOLD_2_516_WI_1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
26003,HOUSEHOLD_2_516_WI_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
preds

Unnamed: 0,id,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,FOODS_1_001_CA_1,1.026181,0.887429,0.916707,0.819761,1.019347,1.086427,1.094861,1.029805,0.856382,...,0.829713,1.142925,1.072765,0.895709,0.765372,0.792784,0.736079,0.866319,1.019940,1.054733
1,FOODS_1_001_CA_2,1.219081,1.215585,1.039519,1.199324,1.260039,1.423519,1.469733,1.057801,1.088085,...,1.137150,1.411194,1.755387,1.064455,1.113410,1.124090,1.136790,1.182283,1.515207,1.708119
2,FOODS_1_001_CA_3,1.034165,1.013107,1.009146,1.002340,1.112304,1.463326,1.034355,0.832312,0.835188,...,0.998525,1.351079,1.515065,0.868630,0.855912,0.869451,0.867788,0.973060,1.334868,1.274801
3,FOODS_1_001_CA_4,0.345787,0.278383,0.264300,0.258485,0.316057,0.330121,0.392468,0.339696,0.335173,...,0.345229,0.383753,0.401305,0.333980,0.303839,0.303839,0.304470,0.333650,0.418983,0.434683
4,FOODS_1_001_TX_1,0.273532,0.220577,0.214762,0.213194,0.225021,0.224255,0.229559,0.391727,0.397112,...,0.424744,0.491221,0.493931,0.409389,0.406431,0.406431,0.407063,0.427551,0.498834,0.504625
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,HOUSEHOLD_2_516_TX_2,0.315264,0.305358,0.334784,0.317662,0.311814,0.331569,0.347744,0.300461,0.312183,...,0.320235,0.377600,0.368798,0.294937,0.290560,0.301337,0.298546,0.316949,0.340132,0.345923
30486,HOUSEHOLD_2_516_TX_3,0.168665,0.174676,0.165880,0.149289,0.159029,0.136041,0.147575,0.182282,0.175245,...,0.180484,0.189563,0.192272,0.162942,0.158565,0.158565,0.155917,0.174320,0.196715,0.214114
30487,HOUSEHOLD_2_516_WI_1,0.118956,0.116805,0.116805,0.116805,0.139521,0.126122,0.140594,0.137218,0.142603,...,0.192635,0.216261,0.218971,0.151097,0.148139,0.148139,0.148369,0.189349,0.215684,0.221475
30488,HOUSEHOLD_2_516_WI_2,0.087104,0.083535,0.083535,0.128174,0.127997,0.130244,0.147593,0.140723,0.144689,...,0.205390,0.225823,0.227785,0.188596,0.184219,0.195828,0.159186,0.177589,0.199983,0.212785


In [67]:
from utils.metrics import calculate_metrics_wrmse_v1
from utils.metrics import calculate_metrics_wrmse_v2
from utils.metrics import save_value_metric_csv

In [68]:
%%time
# Compute the weighted root mean square error (implementation v1).
calculate_metrics_wrmse_v1(df_pred=preds, df_true=data_test_true)

CPU times: user 24 s, sys: 124 ms, total: 24.1 s
Wall time: 23.9 s


4.153350061798857

In [69]:
%%time
# Compute the weighted root mean square error (implementation v2).
calculate_metrics_wrmse_v2(df_pred=preds, df_true=data_test_true)

CPU times: user 3.15 s, sys: 135 ms, total: 3.29 s
Wall time: 3.23 s


4.153350061798857

In [70]:
# Keep the weighted RMSE of this run so it can be stored afterwards.
METRICA = calculate_metrics_wrmse_v2(df_pred=preds, df_true=data_test_true)

METRICA

# Metric obtained in this run: 4.15

# Apparently it is better not to shuffle the training rows.

4.153350061798857

In [71]:
# Store the metric in a CSV for easy comparison with other models.
# This run is the baseline model WITHOUT shuffling the training rows.
save_value_metric_csv(name_model='baseline_sin_shuffle', metric_value=METRICA)