## Exogenous variables
Cómo usar variables exógenas

Source: https://nixtlaverse.nixtla.io/mlforecast/docs/how-to-guides/exogenous_features.html

In [1]:
import lightgbm as lgb
import pandas as pd
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.utils import generate_daily_series, generate_prices_for_series

### Data

#### generar data

In [2]:
series = generate_daily_series(
    100, equal_ends=True, n_static_features=2
).rename(columns={'static_1': 'product_id'})
series.head()

Unnamed: 0,unique_id,ds,y,static_0,product_id
0,id_00,2000-10-05,39.811983,79,45
1,id_00,2000-10-06,103.274013,79,45
2,id_00,2000-10-07,176.574744,79,45
3,id_00,2000-10-08,258.9879,79,45
4,id_00,2000-10-09,344.940404,79,45


In [3]:
series['unique_id'].unique()

['id_00', 'id_01', 'id_02', 'id_03', 'id_04', ..., 'id_95', 'id_96', 'id_97', 'id_98', 'id_99']
Length: 100
Categories (100, object): ['id_00', 'id_01', 'id_02', 'id_03', ..., 'id_96', 'id_97', 'id_98', 'id_99']

In [4]:
series[series['unique_id'] == 'id_00']['product_id'].unique() # cantidad de valors únicos de la variable exógena para una serie

[45]
Categories (63, int64): [1, 3, 7, 9, ..., 96, 97, 98, 99]

In [5]:
series.shape # tamaño del dataframe de series

(27003, 5)

#### generar data de variables exógenas que cambian a lo largo del tiempo
Por lo tanto, deben ser agregadas para realizar el forecast

In [6]:
prices_catalog = generate_prices_for_series(series)
prices_catalog.head()

Unnamed: 0,ds,unique_id,price
0,2000-10-05,id_00,0.548814
1,2000-10-06,id_00,0.715189
2,2000-10-07,id_00,0.602763
3,2000-10-08,id_00,0.544883
4,2000-10-09,id_00,0.423655


In [7]:
prices_catalog.shape # tamaño variable exógena generada. mismo tamaño que dataframe con las series de tiempo

(27703, 3)

#### unir data ds y data exógena

In [8]:
print('máxima fecha de cada time serie')
for id in series['unique_id'].unique().tolist():
    max_date = series[series['unique_id'] == id]['ds'].max()
    print(f'{id}: {max_date}')

máxima fecha de cada time serie
id_00: 2001-05-14 00:00:00
id_01: 2001-05-14 00:00:00
id_02: 2001-05-14 00:00:00
id_03: 2001-05-14 00:00:00
id_04: 2001-05-14 00:00:00
id_05: 2001-05-14 00:00:00
id_06: 2001-05-14 00:00:00
id_07: 2001-05-14 00:00:00
id_08: 2001-05-14 00:00:00
id_09: 2001-05-14 00:00:00
id_10: 2001-05-14 00:00:00
id_11: 2001-05-14 00:00:00
id_12: 2001-05-14 00:00:00
id_13: 2001-05-14 00:00:00
id_14: 2001-05-14 00:00:00
id_15: 2001-05-14 00:00:00
id_16: 2001-05-14 00:00:00
id_17: 2001-05-14 00:00:00
id_18: 2001-05-14 00:00:00
id_19: 2001-05-14 00:00:00
id_20: 2001-05-14 00:00:00
id_21: 2001-05-14 00:00:00
id_22: 2001-05-14 00:00:00
id_23: 2001-05-14 00:00:00
id_24: 2001-05-14 00:00:00
id_25: 2001-05-14 00:00:00
id_26: 2001-05-14 00:00:00
id_27: 2001-05-14 00:00:00
id_28: 2001-05-14 00:00:00
id_29: 2001-05-14 00:00:00
id_30: 2001-05-14 00:00:00
id_31: 2001-05-14 00:00:00
id_32: 2001-05-14 00:00:00
id_33: 2001-05-14 00:00:00
id_34: 2001-05-14 00:00:00
id_35: 2001-05-14 00:00

In [9]:
prices_catalog.head(2)

Unnamed: 0,ds,unique_id,price
0,2000-10-05,id_00,0.548814
1,2000-10-06,id_00,0.715189


In [10]:
print('máxima fecha de variables exógenas')
for id in prices_catalog['unique_id'].unique().tolist():
    max_date = prices_catalog[prices_catalog['unique_id'] == id]['ds'].max()
    print(f'{id}: {max_date}')

máxima fecha de variables exógenas
id_00: 2001-05-21 00:00:00
id_01: 2001-05-21 00:00:00
id_02: 2001-05-21 00:00:00
id_03: 2001-05-21 00:00:00
id_04: 2001-05-21 00:00:00
id_05: 2001-05-21 00:00:00
id_06: 2001-05-21 00:00:00
id_07: 2001-05-21 00:00:00
id_08: 2001-05-21 00:00:00
id_09: 2001-05-21 00:00:00
id_10: 2001-05-21 00:00:00
id_11: 2001-05-21 00:00:00
id_12: 2001-05-21 00:00:00
id_13: 2001-05-21 00:00:00
id_14: 2001-05-21 00:00:00
id_15: 2001-05-21 00:00:00
id_16: 2001-05-21 00:00:00
id_17: 2001-05-21 00:00:00
id_18: 2001-05-21 00:00:00
id_19: 2001-05-21 00:00:00
id_20: 2001-05-21 00:00:00
id_21: 2001-05-21 00:00:00
id_22: 2001-05-21 00:00:00
id_23: 2001-05-21 00:00:00
id_24: 2001-05-21 00:00:00
id_25: 2001-05-21 00:00:00
id_26: 2001-05-21 00:00:00
id_27: 2001-05-21 00:00:00
id_28: 2001-05-21 00:00:00
id_29: 2001-05-21 00:00:00
id_30: 2001-05-21 00:00:00
id_31: 2001-05-21 00:00:00
id_32: 2001-05-21 00:00:00
id_33: 2001-05-21 00:00:00
id_34: 2001-05-21 00:00:00
id_35: 2001-05-21 00

In [11]:
# al hacer el append se puede ver que no todas las fechas de las variables exógenas están en el dataframe
series_with_prices = series.merge(prices_catalog, how='left')
series_with_prices.head()

Unnamed: 0,unique_id,ds,y,static_0,product_id,price
0,id_00,2000-10-05,39.811983,79,45,0.548814
1,id_00,2000-10-06,103.274013,79,45,0.715189
2,id_00,2000-10-07,176.574744,79,45,0.602763
3,id_00,2000-10-08,258.9879,79,45,0.544883
4,id_00,2000-10-09,344.940404,79,45,0.423655


### ES NECESARIO INDICAR LAS EXÓGENAS QUE SON ESTÁTICAS Y LAS QUE SON DINÁMICAS. 
Estáticas: su valor es el mismo para LA SERIE A LA QUE ESTÁ ASOCIADA a lo largo de todo el horizonte de tiempo. Las variables exógenas fijas se asume que su valor será EL MISMO a lo largo de cualquier horizonte de forecast


**This dataframe will be passed to MLForecast.fit (or MLForecast.preprocess). However, since the price is dynamic we have to tell that method that only static_0 and product_id are static.**

In [12]:
fcst = MLForecast(
    models=lgb.LGBMRegressor(n_jobs=1, random_state=0, verbosity=-1),
    freq='D',
    lags=[7],
    lag_transforms={
        1: [ExpandingMean()],
        7: [RollingMean(window_size=14)],
    },
    date_features=['dayofweek', 'month'],
    num_threads=2,
)

In [13]:
# al momento de entrenar es neceario indicar variables exógenas CUYO VALOR ES FIJO PARA CADA TIMESERIE
fcst.fit(series_with_prices, static_features=['static_0', 'product_id'])

MLForecast(models=[LGBMRegressor], freq=D, lag_features=['lag7', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size14'], date_features=['dayofweek', 'month'], num_threads=2)

In [14]:
fcst

MLForecast(models=[LGBMRegressor], freq=D, lag_features=['lag7', 'expanding_mean_lag1', 'rolling_mean_lag7_window_size14'], date_features=['dayofweek', 'month'], num_threads=2)

In [15]:
# obtener info de las features usadas para entrenar el modelo
fcst.ts.features_order_

['static_0',
 'product_id',
 'price',
 'lag7',
 'expanding_mean_lag1',
 'rolling_mean_lag7_window_size14',
 'dayofweek',
 'month']

So in order to update the price in each timestep we just call MLForecast.predict with our forecast horizon and pass the prices catalog through X_df

In [16]:
preds = fcst.predict(h=7, X_df=prices_catalog) #, al hacer el forecast es necesario indicar el df con las variable exógenas que NO son estáticas
preds.head(8)

Unnamed: 0,unique_id,ds,LGBMRegressor
0,id_00,2001-05-15,418.930093
1,id_00,2001-05-16,499.487368
2,id_00,2001-05-17,20.321885
3,id_00,2001-05-18,102.310778
4,id_00,2001-05-19,185.340281
5,id_00,2001-05-20,261.021399
6,id_00,2001-05-21,338.568701
7,id_01,2001-05-15,117.351403


#### Predecir con el df de variables exógenas solo con los valores para el fcst
- FUNCIONA
- NIXTLA FILTRA POR EL TIMESTAMP. La columna ds debe estar correcta para hacer el join y con eso basta

In [17]:
# filtrar dataframe de variables exógenas solo con los valores que se necesitan para forecast
exogenous_df_fcst = prices_catalog.groupby('unique_id').tail(7)
exogenous_df_fcst

Unnamed: 0,ds,unique_id,price
222,2001-05-15,id_00,0.273542
223,2001-05-16,id_00,0.798047
224,2001-05-17,id_00,0.185636
225,2001-05-18,id_00,0.952792
226,2001-05-19,id_00,0.687488
...,...,...,...
27698,2001-05-17,id_99,0.682296
27699,2001-05-18,id_99,0.123657
27700,2001-05-19,id_99,0.068762
27701,2001-05-20,id_99,0.324157


In [18]:
#, al hacer el forecast es necesario indicar el df con las variable exógenas que NO son estáticas
preds_v2 = fcst.predict(h=7, X_df=exogenous_df_fcst) 
preds_v2.head(8)

Unnamed: 0,unique_id,ds,LGBMRegressor
0,id_00,2001-05-15,418.930093
1,id_00,2001-05-16,499.487368
2,id_00,2001-05-17,20.321885
3,id_00,2001-05-18,102.310778
4,id_00,2001-05-19,185.340281
5,id_00,2001-05-20,261.021399
6,id_00,2001-05-21,338.568701
7,id_01,2001-05-15,117.351403


#### Con esto, ya queda listo el uso de una feature exógena dinámicas

## PARTE 2: Generating exogenous features
Se puede observar que se necesita definir obligatoriamente las columnas
- unique_id
- ds
- y

Donde estas columnas representan el timestamp de la serie y además CUAL es la serie de tiempo a modelar. LOS VALORES QUE ESTÁN EN LA COLUMNA "y" es la serie de tiempo a la que se le pueden hacer transformaciones. 

El RESTO DE COLUMNAS en el dataframe son variables exógenas (la cual si no es estática se necesita especificar). Entonces, **SIMPLEMENTE SE INDICA EL NOMBRE DE LA COLUMNA QUE ES EXÓGENA PERO NO SE PUEDEN HACER TRANSFORMACIONES A DICHAS VARIABLES** de la misma forma que se hacen trasformaciones en la serie de tiempo target

Para poder realizar trasformaciones en las variables exógenas, ESTO SE TIENE QUE HACER ANTES. Nixtla ofrece un par de funciones para obtener variables exógenas interesantes para fcst de series de tiempo, ej transformada de fourier:

In [19]:
from sklearn.linear_model import LinearRegression
from utilsforecast.feature_engineering import fourier

In [20]:
series.head()

Unnamed: 0,unique_id,ds,y,static_0,product_id
0,id_00,2000-10-05,39.811983,79,45
1,id_00,2000-10-06,103.274013,79,45
2,id_00,2000-10-07,176.574744,79,45
3,id_00,2000-10-08,258.9879,79,45
4,id_00,2000-10-09,344.940404,79,45


Now we’d like to add some fourier terms to model the seasonality. We can do that with the following:

In [21]:
transformed_df, future_df = fourier(series, freq='D', season_length=7, k=2, h=7)

In [22]:
transformed_df # dataframe de train con más features exógenas (cuyo valor cambia en el tiempo)

Unnamed: 0,unique_id,ds,y,static_0,product_id,sin1_7,sin2_7,cos1_7,cos2_7
0,id_00,2000-10-05,39.811983,79,45,-0.974927,0.433893,-0.222526,-0.900964
1,id_00,2000-10-06,103.274013,79,45,-0.781834,-0.974926,0.623486,-0.222530
2,id_00,2000-10-07,176.574744,79,45,-0.000004,-0.000009,1.000000,1.000000
3,id_00,2000-10-08,258.987900,79,45,0.781829,0.974930,0.623493,-0.222513
4,id_00,2000-10-09,344.940404,79,45,0.974929,-0.433877,-0.222517,-0.900972
...,...,...,...,...,...,...,...,...,...
26998,id_99,2001-05-10,453.400509,69,35,-0.974931,0.433860,-0.222508,-0.900980
26999,id_99,2001-05-11,30.229478,69,35,-0.781833,-0.974927,0.623488,-0.222524
27000,id_99,2001-05-12,101.313713,69,35,0.000014,0.000028,1.000000,1.000000
27001,id_99,2001-05-13,145.724335,69,35,0.781831,0.974928,0.623491,-0.222519


In [23]:
future_df # se observa que el dataframe futuro parte de la fecha siguiente de la última observación

Unnamed: 0,unique_id,ds,sin1_7,sin2_7,cos1_7,cos2_7
0,id_00,2001-05-15,0.433884,-0.781832,-0.900969,0.623489
1,id_00,2001-05-16,-0.433898,0.781851,-0.900962,0.623466
2,id_00,2001-05-17,-0.974928,0.433883,-0.222521,-0.900969
3,id_00,2001-05-18,-0.781821,-0.974935,0.623502,-0.222490
4,id_00,2001-05-19,0.000001,0.000002,1.000000,1.000000
...,...,...,...,...,...,...
695,id_99,2001-05-17,-0.974928,0.433883,-0.222521,-0.900969
696,id_99,2001-05-18,-0.781821,-0.974935,0.623502,-0.222490
697,id_99,2001-05-19,0.000001,0.000002,1.000000,1.000000
698,id_99,2001-05-20,0.781842,0.974920,0.623477,-0.222554


We can now train using only these features (and the static ones).

In [24]:
fcst2 = MLForecast(models=LinearRegression(), freq='D')
fcst2.fit(transformed_df, static_features=['static_0', 'product_id']) # entrenar con dataframe con más features

MLForecast(models=[LinearRegression], freq=D, lag_features=[], date_features=[], num_threads=1)

In [25]:
# predecir 
fcst2.predict(h=7, X_df=future_df)

Unnamed: 0,unique_id,ds,LinearRegression
0,id_00,2001-05-15,250.202585
1,id_00,2001-05-16,241.510858
2,id_00,2001-05-17,248.653650
3,id_00,2001-05-18,268.419808
4,id_00,2001-05-19,277.885312
...,...,...,...
695,id_99,2001-05-17,216.127666
696,id_99,2001-05-18,235.893824
697,id_99,2001-05-19,245.359328
698,id_99,2001-05-20,238.807376


## PARTE 3: Transformar variables exógenas
La misma idea anterior, solo que se utilizan funciones que trasformar variables exógenas SIN GENERAR LA PREDICCIÓN.

Adicionalmente, **la función permite realizar las clasicas transformaciones de series de tiempo a las variables exógenas de forma fácil** 


SOURCE: https://nixtlaverse.nixtla.io/mlforecast/docs/how-to-guides/transforming_exog.html

In [26]:
from mlforecast.lag_transforms import ExpandingMean

from mlforecast.feature_engineering import transform_exog

In [29]:
prices = prices_catalog

In [40]:
# GENERAR TRANSFORMACIONES DE SERIES DE TIEMPO A LAS VARIABLES EXÓGENAS
transformed_prices = transform_exog(prices, lags=[7], lag_transforms={1: [ExpandingMean()]})
transformed_prices.head(10)

Unnamed: 0,ds,unique_id,price,price_lag7,price_expanding_mean_lag1
0,2000-10-05,id_00,0.548814,,
1,2000-10-06,id_00,0.715189,,0.548814
2,2000-10-07,id_00,0.602763,,0.632001
3,2000-10-08,id_00,0.544883,,0.622255
4,2000-10-09,id_00,0.423655,,0.602912
5,2000-10-10,id_00,0.645894,,0.567061
6,2000-10-11,id_00,0.437587,,0.5802
7,2000-10-12,id_00,0.891773,0.548814,0.559827
8,2000-10-13,id_00,0.963663,0.715189,0.60132
9,2000-10-14,id_00,0.383442,0.602763,0.64158


In [41]:
# UNIR VARIABLES EXÓGENAS TRANSFORMADAS A DATAFRAME PARA ENTRENAMIENTO
series_with_prices = series.merge(transformed_prices, on=['unique_id', 'ds'])
series_with_prices.head(10)

Unnamed: 0,unique_id,ds,y,static_0,product_id,price,price_lag7,price_expanding_mean_lag1
0,id_00,2000-10-05,39.811983,79,45,0.548814,,
1,id_00,2000-10-06,103.274013,79,45,0.715189,,0.548814
2,id_00,2000-10-07,176.574744,79,45,0.602763,,0.632001
3,id_00,2000-10-08,258.9879,79,45,0.544883,,0.622255
4,id_00,2000-10-09,344.940404,79,45,0.423655,,0.602912
5,id_00,2000-10-10,413.520305,79,45,0.645894,,0.567061
6,id_00,2000-10-11,506.990093,79,45,0.437587,,0.5802
7,id_00,2000-10-12,12.68807,79,45,0.891773,0.548814,0.559827
8,id_00,2000-10-13,111.133819,79,45,0.963663,0.715189,0.60132
9,id_00,2000-10-14,197.982842,79,45,0.383442,0.602763,0.64158


In [42]:
# entrenar. 
# No variables exógenas estáticas
# Eliminar los nulos para poder entrenar modelo
fcst = MLForecast(
    models=[LinearRegression()],
    freq='D',
    lags=[1],
    date_features=['dayofweek'],
)

In [43]:
fcst.preprocess(series_with_prices, static_features=[], dropna=True).head()

Unnamed: 0,unique_id,ds,y,static_0,product_id,price,price_lag7,price_expanding_mean_lag1,lag1,dayofweek
1,id_00,2000-10-06,103.274013,79,45,0.715189,,0.548814,39.811983,4
2,id_00,2000-10-07,176.574744,79,45,0.602763,,0.632001,103.274013,5
3,id_00,2000-10-08,258.9879,79,45,0.544883,,0.622255,176.574744,6
4,id_00,2000-10-09,344.940404,79,45,0.423655,,0.602912,258.9879,0
5,id_00,2000-10-10,413.520305,79,45,0.645894,,0.567061,344.940404,1


It’s important to note that the dropna argument only considers the null values generated by the lag features based on the target. If you want to drop all rows containing null values you have to do that in your original series.

In [44]:
series_with_prices2 = series_with_prices.dropna()

In [45]:
fcst.preprocess(series_with_prices2, dropna=True, static_features=[]).head()

Unnamed: 0,unique_id,ds,y,static_0,product_id,price,price_lag7,price_expanding_mean_lag1,lag1,dayofweek
8,id_00,2000-10-13,111.133819,79,45,0.963663,0.715189,0.60132,12.68807,4
9,id_00,2000-10-14,197.982842,79,45,0.383442,0.602763,0.64158,111.133819,5
10,id_00,2000-10-15,266.501075,79,45,0.791725,0.544883,0.615766,197.982842,6
11,id_00,2000-10-16,320.542865,79,45,0.528895,0.423655,0.631763,266.501075,0
12,id_00,2000-10-17,424.913844,79,45,0.568045,0.645894,0.62319,320.542865,1


In [46]:
fcst.fit(series_with_prices2, static_features=[])

MLForecast(models=[LinearRegression], freq=D, lag_features=['lag1'], date_features=['dayofweek'], num_threads=1)

In [47]:
fcst.predict(1, X_df=transformed_prices).head()

KeyError: "['static_0', 'product_id'] not in index"

**In this example we have prices for the next 7 days, if you try to forecast a longer horizon you’ll get an error.**

In [49]:
from fastcore.test import test_fail

ModuleNotFoundError: No module named 'fastcore'

In [50]:
test_fail(lambda: fcst.predict(8, X_df=transformed_prices), contains='Found missing inputs in X_df')

NameError: name 'test_fail' is not defined