# Central Hidroeléctrica Yacyretá
## 10 - Análisis de los DF de los modelo de predicción con algoritmos de aprendizaje automático  
### Fecha: 20-Enero-22

In [1]:
# Para que funcione más rápidamente el intellisense
#%config Completer.use_jedi = False

In [2]:
#Cargo numpy y pandas para trabajar con dataframes
import pandas as pd
pd.options.display.max_rows = None

import numpy as np
from datetime import date, timedelta
#import datetime

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (16, 6)

In [3]:
# Imports necesarios para que funcione iplot() como método de pandas
%matplotlib inline
import plotly.graph_objs as go
from  plotly.offline import plot
import chart_studio.plotly as py
import cufflinks as cf
cf.go_offline()
from plotly.offline import download_plotlyjs, init_notebook_mode, plot,iplot
init_notebook_mode(connected='true')

In [4]:
# Planto una semilla para repetitibilidad
import random
random.seed(170162)
np.random.seed(170162)

In [5]:
import pprint
from tabulate import tabulate
import os

In [6]:
from sktime.utils.plotting import plot_series    

## Modelo por desplazamiento

In [7]:
# Directorio de trabajo
os.getcwd()

'C:\\Users\\josel\\OneDrive\\Documents\\Python\\tesis'

In [8]:
# Especifico el nombre del archivo con el dataset unificado de la CHY y el path
path = './output/TFI 08G - Modelo por desplazamiento, random_state/2022-06-05_17-41/'
file_name = 'metricas_df.csv'

# Leo el dataset creado en el notebook anterior
metricas_df = pd.read_csv(path + file_name)

In [9]:
metricas_df = metricas_df[['regresor', 'lags', 'fh', 'MAE', 'RMSE']]

In [10]:
# Busco el mínimo MAE por fh
min_metricas_df = pd.DataFrame()
fh_s = [56, 28, 14, 7]
for fh in fh_s:
    metricas_df_fh = metricas_df[metricas_df['fh']==fh]
    min_metricas_df = pd.concat([min_metricas_df, metricas_df_fh[metricas_df_fh['MAE']==metricas_df_fh['MAE'].min()].head(1)])

In [11]:
min_metricas_df.sort_values('fh').style.set_properties(**{'text-align': 'left'}).hide_index()

regresor,lags,fh,MAE,RMSE
XGBRegressor,20,7,0.046314,0.052527
XGBRegressor,10,14,0.077188,0.086894
RandomForestRegressor,40,28,0.083811,0.099624
KNeighborsRegressor,20,56,0.088393,0.156881


In [12]:
# Creo un df adicional con los valores de min_metricas_df más una columna adicional con el nombre del modelo
min_metricas_df_desplazamiento = min_metricas_df.copy()
min_metricas_df_desplazamiento['modelo'] = 'Desplazamiento'

In [13]:
min_metricas_df_desplazamiento.sort_values('fh')

Unnamed: 0,regresor,lags,fh,MAE,RMSE,modelo
134,XGBRegressor,20,7,0.046314,0.052527,Desplazamiento
107,XGBRegressor,10,14,0.077188,0.086894,Desplazamiento
41,RandomForestRegressor,40,28,0.083811,0.099624,Desplazamiento
18,KNeighborsRegressor,20,56,0.088393,0.156881,Desplazamiento


In [14]:
# Muestro todas los 4 mejores registros para cada horizonte, de 7 y de 56 días
metricas_df.loc[(metricas_df.fh==7) | (metricas_df.fh==56)] \
.sort_values(['MAE'], ascending=True)[['regresor', 'lags', 'fh', 'MAE', 'RMSE']] \
.groupby('fh').head(4).style.set_properties(**{'text-align': 'left'}).hide_index()

regresor,lags,fh,MAE,RMSE
XGBRegressor,20,7,0.046314,0.052527
LGBMRegressor,20,7,0.056664,0.059468
XGBRegressor,40,7,0.072174,0.085822
MLPRegressor,10,7,0.080116,0.081512
KNeighborsRegressor,20,56,0.088393,0.156881
KNeighborsRegressor,30,56,0.09175,0.169967
KNeighborsRegressor,10,56,0.09225,0.162111
LGBMRegressor,10,56,0.092551,0.167566


In [15]:
# Ploteo los mejores 4 desempeños por horizonte de 7 y de 56 días
metricas_df.loc[(metricas_df.fh==7) | (metricas_df.fh==56)] \
.sort_values(['MAE'], ascending=True)[['regresor', 'lags', 'fh', 'MAE', 'RMSE']] \
.groupby('fh').head(4) \
.pivot_table(index=['fh', 'lags'], columns=['regresor'], values='MAE').iplot(kind='bar'
                                                                             ,title='Modelo por desplazamiento, los cuatro mejores por horizonte de pronóstico'
                                                                             ,xTitle='(horizonte de pronóstico, versiones retrazadas)'
                                                                             ,yTitle='MAE mínimo (metros)'
                                                                             ,dimensions=(950, 300))

In [16]:
# Ploteo los mejores 4 desempeños por horizonte de 7 y de 56 días
metricas_df.loc[(metricas_df.fh==7) | (metricas_df.fh==56)] \
.sort_values(['MAE'], ascending=True)[['regresor', 'lags', 'fh', 'MAE', 'RMSE']] \
.groupby('fh').head(2) \
.pivot_table(index=['fh', 'lags'], columns=['regresor'], values='MAE').iplot(kind='bar'
                                                                             ,title='Modelo por desplazamiento, los cuatro mejores por horizonte de pronóstico'
                                                                             ,xTitle='(horizonte de pronóstico, versiones retrazadas)'
                                                                             ,yTitle='MAE mínimo (metros)'
                                                                             ,dimensions=(950, 300))

In [17]:
# Creo un df con los registros con los mínimos MAE
metricas_df_minMAE = metricas_df.sort_values('MAE').groupby('fh').head(1).sort_values('fh')

In [18]:
# Impresión del df con los mejores registros (mínimo MAE, por horizonte)
metricas_df_minMAE.style.set_properties(**{'text-align': 'left'}).hide_index()

regresor,lags,fh,MAE,RMSE
XGBRegressor,20,7,0.046314,0.052527
XGBRegressor,10,14,0.077188,0.086894
RandomForestRegressor,40,28,0.083811,0.099624
KNeighborsRegressor,20,56,0.088393,0.156881


In [19]:
# Leo el df con las y_preds de cada predicción de los notebooks anteriores
file_name = 'y_preds_df.csv'

# Leo el dataset creado en el notebook anterior
y_preds_df_index = pd.read_csv(path + file_name)

In [20]:
# Convierto la columna FECHA de object a datetime64
y_preds_df_index['FECHA'] = pd.to_datetime(y_preds_df_index['FECHA'])

# Hago que la columna FECHA sea el índice
y_preds_df_index.set_index('FECHA', inplace=True)

# Si fuera necesario imputar NAN
y_preds_df_index = y_preds_df_index.asfreq('D', method='ffill')

In [21]:
y_preds_df_index

Unnamed: 0_level_0,56_y_test,56_40_KNeighborsRegressor,56_40_Lasso,56_40_LGBMRegressor,56_40_LinearRegression,56_40_MLPRegressor,56_40_RandomForestRegressor,56_40_Ridge,56_40_SVR,56_40_XGBRegressor,...,7_20_XGBRegressor,7_10_KNeighborsRegressor,7_10_Lasso,7_10_LGBMRegressor,7_10_LinearRegression,7_10_MLPRegressor,7_10_RandomForestRegressor,7_10_Ridge,7_10_SVR,7_10_XGBRegressor
FECHA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-05-06,23.76,23.618,22.936618,23.73154,23.76119,23.676291,23.7571,23.758846,23.65293,23.68656,...,,,,,,,,,,
2021-05-07,23.78,23.562,22.936618,23.694426,23.74861,23.625401,23.749,23.746159,23.589384,23.693813,...,,,,,,,,,,
2021-05-08,23.78,23.486,22.936618,23.669329,23.733374,23.532227,23.7176,23.730727,23.538625,23.497341,...,,,,,,,,,,
2021-05-09,23.8,23.486,22.936618,23.656506,23.708881,23.525605,23.6604,23.706162,23.499944,23.421236,...,,,,,,,,,,
2021-05-10,23.72,23.466,22.936618,23.619776,23.668943,23.54371,23.6347,23.666338,23.459575,23.337328,...,,,,,,,,,,
2021-05-11,23.7,23.472,22.936618,23.58714,23.626018,23.564528,23.6583,23.624286,23.45082,23.339577,...,,,,,,,,,,
2021-05-12,23.68,23.478,22.936618,23.54899,23.597302,23.576906,23.6538,23.595902,23.447615,23.348307,...,,,,,,,,,,
2021-05-13,23.67,23.478,22.936618,23.549779,23.579701,23.542834,23.6317,23.578285,23.441389,23.424885,...,,,,,,,,,,
2021-05-14,23.66,23.514,22.936618,23.542859,23.566205,23.546444,23.6194,23.565003,23.420766,23.508265,...,,,,,,,,,,
2021-05-15,23.72,23.514,22.936618,23.506295,23.559936,23.514143,23.612,23.558411,23.402585,23.522404,...,,,,,,,,,,


In [22]:
list(y_preds_df_index.columns)

['56_y_test',
 '56_40_KNeighborsRegressor',
 '56_40_Lasso',
 '56_40_LGBMRegressor',
 '56_40_LinearRegression',
 '56_40_MLPRegressor',
 '56_40_RandomForestRegressor',
 '56_40_Ridge',
 '56_40_SVR',
 '56_40_XGBRegressor',
 '56_30_KNeighborsRegressor',
 '56_30_Lasso',
 '56_30_LGBMRegressor',
 '56_30_LinearRegression',
 '56_30_MLPRegressor',
 '56_30_RandomForestRegressor',
 '56_30_Ridge',
 '56_30_SVR',
 '56_30_XGBRegressor',
 '56_20_KNeighborsRegressor',
 '56_20_Lasso',
 '56_20_LGBMRegressor',
 '56_20_LinearRegression',
 '56_20_MLPRegressor',
 '56_20_RandomForestRegressor',
 '56_20_Ridge',
 '56_20_SVR',
 '56_20_XGBRegressor',
 '56_10_KNeighborsRegressor',
 '56_10_Lasso',
 '56_10_LGBMRegressor',
 '56_10_LinearRegression',
 '56_10_MLPRegressor',
 '56_10_RandomForestRegressor',
 '56_10_Ridge',
 '56_10_SVR',
 '56_10_XGBRegressor',
 '28_y_test',
 '28_40_KNeighborsRegressor',
 '28_40_Lasso',
 '28_40_LGBMRegressor',
 '28_40_LinearRegression',
 '28_40_MLPRegressor',
 '28_40_RandomForestRegressor',


In [23]:
# Veo el índice de metricas_df_minMAE
metricas_df_minMAE.index

Int64Index([134, 107, 41, 18], dtype='int64')

In [24]:
# Genero los nombres e columnas para cada registro del df de MAE mínimos
columnas = []
columnas.append('56_y_test')
for i in metricas_df_minMAE.index:
    columnas.append(str(metricas_df_minMAE.loc[i]['fh'])+'_'+str(metricas_df_minMAE.loc[i]['lags'])+'_'+str(metricas_df_minMAE.loc[i]['regresor']))

list(columnas)    

['56_y_test',
 '7_20_XGBRegressor',
 '14_10_XGBRegressor',
 '28_40_RandomForestRegressor',
 '56_20_KNeighborsRegressor']

In [25]:
round(min_metricas_df[min_metricas_df['fh']==7]['MAE'].values[0], 4)

0.0463

In [26]:
y_preds_df_index[[columnas[0], columnas[1]]].iloc[-14:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Modelo por desplazamiento, mejor predictor a 7 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==7]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 350)
                                                )

In [27]:
y_preds_df_index[[columnas[0], columnas[2]]].iloc[-28:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Modelo por desplazamiento, mejor predictor a 14 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==14]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 350)
                                                )

In [28]:
y_preds_df_index[[columnas[0], columnas[3]]].iloc[-56:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Modelo por desplazamiento, mejor predictor a 28 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==28]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 350)
                                                )

In [29]:
y_preds_df_index[[columnas[0], columnas[4]]].iloc[-56:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Modelo por desplazamiento, mejor predictor a 56 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==56]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 350)
                                                )

## Ensamble de modelos diarios (offset - variable)

In [30]:
# Especifico el nombre del archivo con el dataset unificado de la CHY y el path
path = './output/TFI 09G - Offset Variable, random_state/2022-06-04_13-41/'
file_name = 'metricas_df.csv'

# Leo el dataset creado en el notebook anterior
metricas_df = pd.read_csv(path + file_name)

In [31]:
metricas_df

Unnamed: 0,fh,lags,regresor,MAE,MSE,RMSE,MASE
0,56,40,KNeighborsRegressor,0.318857,0.153452,0.391729,4.182118
1,56,40,Lasso,0.78804,0.648174,0.805093,10.335905
2,56,40,LGBMRegressor,0.238568,0.084722,0.291071,3.129046
3,56,40,LinearRegression,0.351783,0.15949,0.399361,4.613975
4,56,40,MLPRegressor,0.184925,0.049034,0.221436,2.425465
5,56,40,RandomForestRegressor,0.349631,0.14309,0.378272,4.58575
6,56,40,Ridge,0.352608,0.159891,0.399864,4.624797
7,56,40,SVR,0.260438,0.078295,0.279812,3.41589
8,56,40,XGBRegressor,0.507308,0.34357,0.586149,6.653835
9,56,30,KNeighborsRegressor,0.331393,0.160435,0.400543,4.346536


In [32]:
# Busco el mínimo MAE
min_metricas_df = pd.DataFrame()
fh_s = [56, 28, 14, 7]
for fh in fh_s:
    metricas_df_fh = metricas_df[metricas_df['fh']==fh]
    min_metricas_df = pd.concat([min_metricas_df, metricas_df_fh[metricas_df_fh['MAE']==metricas_df_fh['MAE'].min()].head(1)])

In [33]:
min_metricas_df.sort_values('fh').style.set_properties(**{'text-align': 'left'}).hide_index()

fh,lags,regresor,MAE,MSE,RMSE,MASE
7,20,MLPRegressor,0.069504,0.005638,0.075086,0.914549
14,30,MLPRegressor,0.056369,0.005552,0.07451,0.74078
28,30,MLPRegressor,0.061443,0.005659,0.075227,0.805811
56,10,MLPRegressor,0.093691,0.02707,0.164531,1.228847


In [34]:
# Creo un df adicional con los valores de min_metricas_df más una columna adicional con el nombre del modelo
min_metricas_df_offset = min_metricas_df.copy()
min_metricas_df_offset['modelo'] = 'Ensamble'

In [35]:
min_metricas_df_offset.sort_values('fh').style.set_properties(**{'text-align': 'left'}).hide_index()

fh,lags,regresor,MAE,MSE,RMSE,MASE,modelo
7,20,MLPRegressor,0.069504,0.005638,0.075086,0.914549,Ensamble
14,30,MLPRegressor,0.056369,0.005552,0.07451,0.74078,Ensamble
28,30,MLPRegressor,0.061443,0.005659,0.075227,0.805811,Ensamble
56,10,MLPRegressor,0.093691,0.02707,0.164531,1.228847,Ensamble


In [36]:
# Muestro todas los 4 mejores registros para cada horizonte, de 7 y de 56 días
metricas_df.loc[(metricas_df.fh==7) | (metricas_df.fh==56)] \
.sort_values(['MAE'], ascending=True)[['regresor', 'lags', 'fh', 'MAE', 'RMSE']] \
.groupby('fh').head(4).style.set_properties(**{'text-align': 'left'}).hide_index()

regresor,lags,fh,MAE,RMSE
MLPRegressor,20,7,0.069504,0.075086
XGBRegressor,10,7,0.071549,0.097546
MLPRegressor,10,7,0.075249,0.080368
SVR,40,7,0.084388,0.094725
MLPRegressor,10,56,0.093691,0.164531
MLPRegressor,20,56,0.100005,0.172538
MLPRegressor,30,56,0.100554,0.166794
MLPRegressor,40,56,0.184925,0.221436


In [37]:
# Ploteo los mejores 4 desempeños por horizonte de 7 y de 56 días
metricas_df.loc[(metricas_df.fh==7) | (metricas_df.fh==56)] \
.sort_values(['MAE'], ascending=True)[['regresor', 'lags', 'fh', 'MAE', 'RMSE']] \
.groupby('fh').head(4) \
.pivot_table(index=['fh', 'lags'], columns=['regresor'], values='MAE').iplot(kind='bar'
                                                                             ,title='Ensamble de modelos diarios, los cuatro mejores por horizonte de pronóstico'
                                                                             ,xTitle='(horizonte de pronóstico, versiones retrazadas)'
                                                                             ,yTitle='MAE mínimo (metros)'
                                                                             ,dimensions=(950, 300))

In [38]:
metricas_df.loc[(metricas_df.fh==7) | (metricas_df.fh==56)].sort_values(['fh', 'MAE']).groupby('fh').head(2) \
.pivot_table(index=['fh', 'lags'], columns=['regresor'], values='MAE').iplot(kind='bar'
                                                                             ,title='Modelo por desplazamiento - Desempeño de los dos mejores modelos por horizonte de pronóstico'
                                                                             ,xTitle='(horizonte de pronóstico, versiones retrazadas)'
                                                                             ,yTitle='MAE mínimo (metros)'
                                                                             ,dimensions=(950, 350))

In [39]:
# Creo un df con los registros con los mínimos MAE
metricas_df_minMAE = metricas_df.sort_values('MAE').groupby('fh').head(1).sort_values('fh')

In [40]:
# Impresión del df con los mejores registros (mínimo MAE, por horizonte)
metricas_df_minMAE.style.set_properties(**{'text-align': 'left'}).hide_index()

fh,lags,regresor,MAE,MSE,RMSE,MASE
7,20,MLPRegressor,0.069504,0.005638,0.075086,0.914549
14,30,MLPRegressor,0.056369,0.005552,0.07451,0.74078
28,30,MLPRegressor,0.061443,0.005659,0.075227,0.805811
56,10,MLPRegressor,0.093691,0.02707,0.164531,1.228847


In [41]:
# Leo el df con las y_preds de cada predicción de los notebooks anteriores
file_name = 'y_preds_df.csv'

# Leo el dataset creado en el notebook anterior
y_preds_df_index = pd.read_csv(path + file_name)

In [42]:
y_preds_df_index

Unnamed: 0,FECHA,56_40_y_test,56_40_KNeighborsRegressor,56_40_Lasso,56_40_LGBMRegressor,56_40_LinearRegression,56_40_MLPRegressor,56_40_RandomForestRegressor,56_40_Ridge,56_40_SVR,...,7_10_y_test,7_10_KNeighborsRegressor,7_10_Lasso,7_10_LGBMRegressor,7_10_LinearRegression,7_10_MLPRegressor,7_10_RandomForestRegressor,7_10_Ridge,7_10_SVR,7_10_XGBRegressor
0,2021-05-06,23.76,23.62,22.936392,23.695464,23.761088,23.656902,23.748,23.758734,23.648218,...,,,,,,,,,,
1,2021-05-07,23.78,23.558,22.93668,23.545008,23.747151,23.742023,23.6401,23.744592,23.650608,...,,,,,,,,,,
2,2021-05-08,23.78,23.486,22.936931,23.586102,23.734404,23.807553,23.5237,23.731923,23.631023,...,,,,,,,,,,
3,2021-05-09,23.8,23.48,22.937177,23.587312,23.715031,23.586137,23.422,23.712308,23.604036,...,,,,,,,,,,
4,2021-05-10,23.72,23.466,22.937441,23.625398,23.668391,23.662033,23.4153,23.666229,23.562691,...,,,,,,,,,,
5,2021-05-11,23.7,23.472,22.937733,23.653335,23.620015,23.730807,23.4056,23.618268,23.527353,...,,,,,,,,,,
6,2021-05-12,23.68,23.478,22.938107,23.622728,23.586618,23.669185,23.3946,23.584914,23.510397,...,,,,,,,,,,
7,2021-05-13,23.67,23.514,22.938496,23.529389,23.573026,23.754463,23.3619,23.571268,23.525905,...,,,,,,,,,,
8,2021-05-14,23.66,23.546,22.938892,23.552823,23.568916,23.70383,23.4162,23.567517,23.528226,...,,,,,,,,,,
9,2021-05-15,23.72,23.586,22.93924,23.382726,23.575599,23.625119,23.3723,23.573408,23.546433,...,,,,,,,,,,


In [43]:
# Convierto la columna FECHA de object a datetime64
y_preds_df_index['FECHA'] = pd.to_datetime(y_preds_df_index['FECHA'])

# Hago que la columna FECHA sea el índice
y_preds_df_index.set_index('FECHA', inplace=True)

# Si fuera necesario imputar NAN
y_preds_df_index = y_preds_df_index.asfreq('D', method='ffill')

In [44]:
list(y_preds_df_index.columns)

['56_40_y_test',
 '56_40_KNeighborsRegressor',
 '56_40_Lasso',
 '56_40_LGBMRegressor',
 '56_40_LinearRegression',
 '56_40_MLPRegressor',
 '56_40_RandomForestRegressor',
 '56_40_Ridge',
 '56_40_SVR',
 '56_40_XGBRegressor',
 '56_30_y_test',
 '56_30_KNeighborsRegressor',
 '56_30_Lasso',
 '56_30_LGBMRegressor',
 '56_30_LinearRegression',
 '56_30_MLPRegressor',
 '56_30_RandomForestRegressor',
 '56_30_Ridge',
 '56_30_SVR',
 '56_30_XGBRegressor',
 '56_20_y_test',
 '56_20_KNeighborsRegressor',
 '56_20_Lasso',
 '56_20_LGBMRegressor',
 '56_20_LinearRegression',
 '56_20_MLPRegressor',
 '56_20_RandomForestRegressor',
 '56_20_Ridge',
 '56_20_SVR',
 '56_20_XGBRegressor',
 '56_10_y_test',
 '56_10_KNeighborsRegressor',
 '56_10_Lasso',
 '56_10_LGBMRegressor',
 '56_10_LinearRegression',
 '56_10_MLPRegressor',
 '56_10_RandomForestRegressor',
 '56_10_Ridge',
 '56_10_SVR',
 '56_10_XGBRegressor',
 '28_40_y_test',
 '28_40_KNeighborsRegressor',
 '28_40_Lasso',
 '28_40_LGBMRegressor',
 '28_40_LinearRegression'

In [45]:
# Veo el índice de metricas_df_minMAE
metricas_df_minMAE.index

Int64Index([130, 85, 49, 31], dtype='int64')

In [46]:
# Genero los nombres e columnas para cada registro del df de MAE mínimos
columnas = []
columnas.append('56_40_y_test')
for i in metricas_df_minMAE.index:
    columnas.append(str(metricas_df_minMAE.loc[i]['fh'])+'_'+str(metricas_df_minMAE.loc[i]['lags'])+'_'+str(metricas_df_minMAE.loc[i]['regresor']))

list(columnas)    

['56_40_y_test',
 '7_20_MLPRegressor',
 '14_30_MLPRegressor',
 '28_30_MLPRegressor',
 '56_10_MLPRegressor']

In [47]:
round(min_metricas_df[min_metricas_df['fh']==7]['MAE'].values[0], 4)

0.0695

In [48]:
y_preds_df_index[[columnas[0], columnas[1]]].iloc[-14:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios, mejor predictor a 7 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==7]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 300)
                                                )

In [49]:
y_preds_df_index[[columnas[0], columnas[2]]].iloc[-28:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios, mejor predictor a 14 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==14]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 300)
                                                )

In [50]:
y_preds_df_index[[columnas[0], columnas[3]]].iloc[-56:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios, mejor predictor a 28 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==28]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 300)
                                                )

In [51]:
y_preds_df_index[[columnas[0], columnas[4]]].iloc[-112:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios, mejor predictor a 56 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==56]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 300)
                                                )

## Ensamble de modelos diarios (offset - variable) más Feature Engineering

In [52]:
# Especifico el nombre del archivo con el dataset unificado de la CHY y el path
path = './output/TFI 09I - Univariate Feature Engineering/2022-06-04_13-46/'
file_name = 'metricas_df.csv'

# Leo el dataset creado en el notebook anterior
metricas_df = pd.read_csv(path + file_name)

In [53]:
metricas_df

Unnamed: 0,fh,lags,regresor,MAE,MSE,RMSE,MASE
0,56,40,KNeighborsRegressor,0.093107,0.028581,0.16906,1.22119
1,56,40,Lasso,0.68324,0.493159,0.702253,8.961351
2,56,40,LGBMRegressor,1.184844,5.488417,2.342737,15.540373
3,56,40,LinearRegression,0.288508,0.107481,0.327843,3.784062
4,56,40,MLPRegressor,1.239336,1.657778,1.287547,16.255083
5,56,40,RandomForestRegressor,0.302017,0.122614,0.350163,3.961243
6,56,40,Ridge,0.27229,0.10014,0.31645,3.57134
7,56,40,SVR,0.731413,0.580951,0.762202,9.593184
8,56,40,XGBRegressor,0.354256,0.158025,0.397524,4.646409
9,56,30,KNeighborsRegressor,0.093107,0.028581,0.16906,1.22119


In [54]:
# Busco el mínimo MAE
min_metricas_df = pd.DataFrame()
fh_s = [56, 28, 14, 7]
for fh in fh_s:
    metricas_df_fh = metricas_df[metricas_df['fh']==fh]
    min_metricas_df = pd.concat([min_metricas_df, metricas_df_fh[metricas_df_fh['MAE']==metricas_df_fh['MAE'].min()].head(1)])

In [55]:
min_metricas_df.sort_values('fh').style.set_properties(**{'text-align': 'left'}).hide_index()

fh,lags,regresor,MAE,MSE,RMSE,MASE
7,40,KNeighborsRegressor,0.082286,0.007053,0.08398,1.082729
14,40,XGBRegressor,0.068977,0.007422,0.086151,0.906472
28,40,LinearRegression,0.081197,0.010901,0.104406,1.064887
56,40,KNeighborsRegressor,0.093107,0.028581,0.16906,1.22119


In [56]:
# Creo un df adicional con los valores de min_metricas_df más una columna adicional con el nombre del modelo
min_metricas_df_featureEngineering = min_metricas_df.copy()
min_metricas_df_featureEngineering['modelo'] = 'Ensamble+featEng'

In [57]:
min_metricas_df_featureEngineering.sort_values('fh')

Unnamed: 0,fh,lags,regresor,MAE,MSE,RMSE,MASE,modelo
108,7,40,KNeighborsRegressor,0.082286,0.007053,0.08398,1.082729,Ensamble+featEng
80,14,40,XGBRegressor,0.068977,0.007422,0.086151,0.906472,Ensamble+featEng
39,28,40,LinearRegression,0.081197,0.010901,0.104406,1.064887,Ensamble+featEng
0,56,40,KNeighborsRegressor,0.093107,0.028581,0.16906,1.22119,Ensamble+featEng


In [58]:
# Muestro todas los 4 mejores registros para cada horizonte, de 7 y de 56 días
metricas_df.loc[(metricas_df.fh==7) | (metricas_df.fh==56)] \
.sort_values(['MAE'], ascending=True)[['regresor', 'lags', 'fh', 'MAE', 'RMSE']] \
.groupby('fh').head(4).style.set_properties(**{'text-align': 'left'}).hide_index()

regresor,lags,fh,MAE,RMSE
KNeighborsRegressor,10,7,0.082286,0.08398
KNeighborsRegressor,20,7,0.082286,0.08398
KNeighborsRegressor,30,7,0.082286,0.08398
KNeighborsRegressor,40,7,0.082286,0.08398
KNeighborsRegressor,10,56,0.093107,0.16906
KNeighborsRegressor,20,56,0.093107,0.16906
KNeighborsRegressor,40,56,0.093107,0.16906
KNeighborsRegressor,30,56,0.093107,0.16906


In [59]:
# Ploteo los mejores 4 desempeños por horizonte de 7 y de 56 días
metricas_df.loc[(metricas_df.fh==7) | (metricas_df.fh==56)] \
.sort_values(['MAE'], ascending=True)[['regresor', 'lags', 'fh', 'MAE', 'RMSE']] \
.groupby('fh').head(4) \
.pivot_table(index=['fh', 'lags'], columns=['regresor'], values='MAE').iplot(kind='bar'
                                                                             ,title='Ensamble de modelos diarios con feature-engineering, los cuatro mejores por horizonte de pronóstico'
                                                                             ,xTitle='(horizonte de pronóstico, versiones retrazadas)'
                                                                             ,yTitle='MAE mínimo (metros)'
                                                                             ,dimensions=(950, 300))

In [60]:
metricas_df.loc[(metricas_df.fh==7) | (metricas_df.fh==56)].sort_values(['fh', 'MAE']).groupby('fh').head(2) \
.pivot_table(index=['fh', 'lags'], columns=['regresor'], values='MAE').iplot(kind='bar'
                                                                             ,title='Modelo por desplazamiento - Desempeño de los dos mejores modelos por horizonte de pronóstico'
                                                                             ,xTitle='(horizonte de pronóstico, versiones retrazadas)'
                                                                             ,yTitle='MAE mínimo (metros)'
                                                                             ,dimensions=(950, 350))

In [61]:
# Creo un df con los registros con los mínimos MAE
metricas_df_minMAE = metricas_df.sort_values('MAE').groupby('fh').head(1).sort_values('fh')

In [62]:
# Impresión del df con los mejores registros (mínimo MAE, por horizonte)
metricas_df_minMAE.style.set_properties(**{'text-align': 'left'}).hide_index()

fh,lags,regresor,MAE,MSE,RMSE,MASE
7,10,KNeighborsRegressor,0.082286,0.007053,0.08398,1.082729
14,40,XGBRegressor,0.068977,0.007422,0.086151,0.906472
28,40,LinearRegression,0.081197,0.010901,0.104406,1.064887
56,10,KNeighborsRegressor,0.093107,0.028581,0.16906,1.22119


In [63]:
# Leo el df con las y_preds de cada predicción de los notebooks anteriores
file_name = 'y_preds_df.csv'

# Leo el dataset creado en el notebook anterior
y_preds_df_index = pd.read_csv(path + file_name)

In [64]:
y_preds_df_index

Unnamed: 0,FECHA,56_40_y_test,56_40_KNeighborsRegressor,56_40_Lasso,56_40_LGBMRegressor,56_40_LinearRegression,56_40_MLPRegressor,56_40_RandomForestRegressor,56_40_Ridge,56_40_SVR,...,7_10_y_test,7_10_KNeighborsRegressor,7_10_Lasso,7_10_LGBMRegressor,7_10_LinearRegression,7_10_MLPRegressor,7_10_RandomForestRegressor,7_10_Ridge,7_10_SVR,7_10_XGBRegressor
0,2021-05-06,23.76,23.764,23.136153,23.749744,23.75639,22.285159,23.763,23.772532,23.629112,...,,,,,,,,,,
1,2021-05-07,23.78,23.764,23.107865,23.745013,23.736837,23.15207,23.7149,23.762545,23.423403,...,,,,,,,,,,
2,2021-05-08,23.78,23.764,23.081244,23.635606,23.734523,22.337113,23.68395,23.757041,23.394285,...,,,,,,,,,,
3,2021-05-09,23.8,23.764,23.058398,23.593869,23.724292,22.530942,23.5742,23.742654,23.34199,...,,,,,,,,,,
4,2021-05-10,23.72,23.764,23.057996,22.623191,23.671691,22.930565,23.5707,23.700798,23.282572,...,,,,,,,,,,
5,2021-05-11,23.7,23.764,23.05754,23.594098,23.633725,22.250624,23.6033,23.653541,23.238039,...,,,,,,,,,,
6,2021-05-12,23.68,23.764,23.056915,22.658331,23.60149,22.289711,23.5373,23.619426,23.187149,...,,,,,,,,,,
7,2021-05-13,23.67,23.764,23.05626,23.783281,23.58143,21.825416,23.6529,23.613514,23.161606,...,,,,,,,,,,
8,2021-05-14,23.66,23.764,23.055588,25.635575,23.590478,22.319153,23.6334,23.616137,23.044429,...,,,,,,,,,,
9,2021-05-15,23.72,23.764,23.05501,23.539198,23.606476,23.348234,23.5986,23.630885,23.004315,...,,,,,,,,,,


In [65]:
# Convierto la columna FECHA de object a datetime64
y_preds_df_index['FECHA'] = pd.to_datetime(y_preds_df_index['FECHA'])

# Hago que la columna FECHA sea el índice
y_preds_df_index.set_index('FECHA', inplace=True)

# Si fuera necesario imputar NAN
y_preds_df_index = y_preds_df_index.asfreq('D', method='ffill')

In [66]:
list(y_preds_df_index.columns)

['56_40_y_test',
 '56_40_KNeighborsRegressor',
 '56_40_Lasso',
 '56_40_LGBMRegressor',
 '56_40_LinearRegression',
 '56_40_MLPRegressor',
 '56_40_RandomForestRegressor',
 '56_40_Ridge',
 '56_40_SVR',
 '56_40_XGBRegressor',
 '56_30_y_test',
 '56_30_KNeighborsRegressor',
 '56_30_Lasso',
 '56_30_LGBMRegressor',
 '56_30_LinearRegression',
 '56_30_MLPRegressor',
 '56_30_RandomForestRegressor',
 '56_30_Ridge',
 '56_30_SVR',
 '56_30_XGBRegressor',
 '56_20_y_test',
 '56_20_KNeighborsRegressor',
 '56_20_Lasso',
 '56_20_LGBMRegressor',
 '56_20_LinearRegression',
 '56_20_MLPRegressor',
 '56_20_RandomForestRegressor',
 '56_20_Ridge',
 '56_20_SVR',
 '56_20_XGBRegressor',
 '56_10_y_test',
 '56_10_KNeighborsRegressor',
 '56_10_Lasso',
 '56_10_LGBMRegressor',
 '56_10_LinearRegression',
 '56_10_MLPRegressor',
 '56_10_RandomForestRegressor',
 '56_10_Ridge',
 '56_10_SVR',
 '56_10_XGBRegressor',
 '28_40_y_test',
 '28_40_KNeighborsRegressor',
 '28_40_Lasso',
 '28_40_LGBMRegressor',
 '28_40_LinearRegression'

In [67]:
# Veo el índice de metricas_df_minMAE
metricas_df_minMAE.index

Int64Index([135, 80, 39, 27], dtype='int64')

In [68]:
# Genero los nombres e columnas para cada registro del df de MAE mínimos
columnas = []
columnas.append('56_40_y_test')
for i in metricas_df_minMAE.index:
    columnas.append(str(metricas_df_minMAE.loc[i]['fh'])+'_'+str(metricas_df_minMAE.loc[i]['lags'])+'_'+str(metricas_df_minMAE.loc[i]['regresor']))

list(columnas)    

['56_40_y_test',
 '7_10_KNeighborsRegressor',
 '14_40_XGBRegressor',
 '28_40_LinearRegression',
 '56_10_KNeighborsRegressor']

In [69]:
round(min_metricas_df[min_metricas_df['fh']==7]['MAE'].values[0], 4)

0.0823

In [70]:
y_preds_df_index[[columnas[0], columnas[1]]].iloc[-14:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios con feat-eng, mejor predictor a 7 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==7]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 300)
                                                )

In [71]:
y_preds_df_index[[columnas[0], columnas[2]]].iloc[-28:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios con feat-eng, mejor predictor a 14 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==14]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 300)
                                                )

In [72]:
y_preds_df_index[[columnas[0], columnas[3]]].iloc[-56:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios con feat-eng, mejor predictor a 28 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==28]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 300)
                                                )

In [73]:
y_preds_df_index[[columnas[0], columnas[4]]].iloc[-56:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios con feat-eng, mejor predictor a 56 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==56]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 300)
                                                )

## Multivariable, Granger Causality Test

In [74]:
# Especifico el nombre del archivo con el dataset unificado de la CHY y el path
path = './output/TFI 09J - Multivariable with Granger Causality, random_state/2022-06-04_13-50/'
file_name = 'metricas_df.csv'

# Leo el dataset creado en el notebook anterior
metricas_df = pd.read_csv(path + file_name)

In [75]:
metricas_df

Unnamed: 0,fh,lags,regresor,MAE,MSE,RMSE,MASE
0,56,40,KNeighborsRegressor,0.643714,0.461575,0.679393,8.442932
1,56,40,Lasso,0.787078,0.646986,0.804354,10.323292
2,56,40,LGBMRegressor,0.220546,0.066478,0.257833,2.892672
3,56,40,LinearRegression,0.357212,0.161091,0.401361,4.685173
4,56,40,MLPRegressor,1.167752,2.264759,1.504912,15.316197
5,56,40,RandomForestRegressor,0.342159,0.139552,0.373567,4.487744
6,56,40,Ridge,0.356921,0.161058,0.40132,4.681356
7,56,40,SVR,0.794642,0.658568,0.811522,10.422493
8,56,40,XGBRegressor,0.409718,0.224039,0.473327,5.373852
9,56,30,KNeighborsRegressor,0.643714,0.461575,0.679393,8.442932


In [76]:
# Busco el mínimo MAE
min_metricas_df = pd.DataFrame()
fh_s = [56, 28, 14, 7]
for fh in fh_s:
    metricas_df_fh = metricas_df[metricas_df['fh']==fh]
    min_metricas_df = pd.concat([min_metricas_df, metricas_df_fh[metricas_df_fh['MAE']==metricas_df_fh['MAE'].min()].head(1)])

In [77]:
min_metricas_df.sort_values('fh').style.set_properties(**{'text-align': 'left'}).hide_index()

fh,lags,regresor,MAE,MSE,RMSE,MASE
7,40,Ridge,0.125003,0.016652,0.129043,1.644814
14,10,LGBMRegressor,0.104597,0.016028,0.126602,1.374572
28,40,LGBMRegressor,0.200651,0.054775,0.234041,2.631496
56,40,LGBMRegressor,0.220546,0.066478,0.257833,2.892672


In [78]:
# Creo un df adicional con los valores de min_metricas_df más una columna adicional con el nombre del modelo
min_metricas_df_multivariable = min_metricas_df.copy()
min_metricas_df_multivariable['modelo'] = 'Ensamble+exógenas'

In [79]:
min_metricas_df_multivariable.sort_values('fh')

Unnamed: 0,fh,lags,regresor,MAE,MSE,RMSE,MASE,modelo
114,7,40,Ridge,0.125003,0.016652,0.129043,1.644814,Ensamble+exógenas
101,14,10,LGBMRegressor,0.104597,0.016028,0.126602,1.374572,Ensamble+exógenas
38,28,40,LGBMRegressor,0.200651,0.054775,0.234041,2.631496,Ensamble+exógenas
2,56,40,LGBMRegressor,0.220546,0.066478,0.257833,2.892672,Ensamble+exógenas


In [80]:
# Muestro todas los 4 mejores registros para cada horizonte, de 7 y de 56 días
metricas_df.loc[(metricas_df.fh==7) | (metricas_df.fh==56)] \
.sort_values(['MAE'], ascending=True)[['regresor', 'lags', 'fh', 'MAE', 'RMSE']] \
.groupby('fh').head(4).style.set_properties(**{'text-align': 'left'}).hide_index()

regresor,lags,fh,MAE,RMSE
Ridge,40,7,0.125003,0.129043
LinearRegression,40,7,0.126541,0.130551
Ridge,30,7,0.137421,0.141339
LinearRegression,30,7,0.138863,0.142784
LGBMRegressor,40,56,0.220546,0.257833
LGBMRegressor,30,56,0.271625,0.300049
RandomForestRegressor,40,56,0.342159,0.373567
LGBMRegressor,20,56,0.347497,0.383712


In [81]:
# Ploteo los mejores 4 desempeños por horizonte de 7 y de 56 días
metricas_df.loc[(metricas_df.fh==7) | (metricas_df.fh==56)] \
.sort_values(['MAE'], ascending=True)[['regresor', 'lags', 'fh', 'MAE', 'RMSE']] \
.groupby('fh').head(4) \
.pivot_table(index=['fh', 'lags'], columns=['regresor'], values='MAE').iplot(kind='bar'
                                                                             ,title='Ensamble de modelos diarios con variables exógenas, los cuatro mejores por horizonte de pronóstico'
                                                                             ,xTitle='(horizonte de pronóstico, versiones retrazadas)'
                                                                             ,yTitle='MAE mínimo (metros)'
                                                                             ,dimensions=(950, 300))

In [82]:
metricas_df[(metricas_df.fh==7) | (metricas_df.fh==56)].sort_values(['fh', 'MAE']).groupby('fh').head(9).pivot_table(index=['fh', 'lags'], columns=['regresor'], values='MAE').iplot(kind='bar'
                                                                                                                                         ,title='Modelo de offset-variable con variables exógenas, los 3 mejores por horizonte de pronóstico'
                                                                                                                                         ,xTitle='(horizonte de pronóstico, versiones retrazadas)'
                                                                                                                                         ,yTitle='MAE mínimo (metros)'
                                                                                                                                         ,dimensions=(950, 300))

In [83]:
# Creo un df con los registros con los mínimos MAE
metricas_df_minMAE = metricas_df.sort_values('MAE').groupby('fh').head(1).sort_values('fh')

In [84]:
# Impresión del df con los mejores registros (mínimo MAE, por horizonte)
metricas_df_minMAE.style.set_properties(**{'text-align': 'left'}).hide_index()

fh,lags,regresor,MAE,MSE,RMSE,MASE
7,40,Ridge,0.125003,0.016652,0.129043,1.644814
14,10,LGBMRegressor,0.104597,0.016028,0.126602,1.374572
28,40,LGBMRegressor,0.200651,0.054775,0.234041,2.631496
56,40,LGBMRegressor,0.220546,0.066478,0.257833,2.892672


In [85]:
# Leo el df con las y_preds de cada predicción de los notebooks anteriores
file_name = 'y_preds_df.csv'

# Leo el dataset creado en el notebook anterior
y_preds_df_index = pd.read_csv(path + file_name)

In [86]:
y_preds_df_index

Unnamed: 0,FECHA,56_40_y_test,56_40_KNeighborsRegressor,56_40_Lasso,56_40_LGBMRegressor,56_40_LinearRegression,56_40_MLPRegressor,56_40_RandomForestRegressor,56_40_Ridge,56_40_SVR,...,7_10_y_test,7_10_KNeighborsRegressor,7_10_Lasso,7_10_LGBMRegressor,7_10_LinearRegression,7_10_MLPRegressor,7_10_RandomForestRegressor,7_10_Ridge,7_10_SVR,7_10_XGBRegressor
0,2021-05-06,23.76,23.35,22.937932,23.736883,23.749199,25.659958,23.7344,23.748307,22.928265,...,,,,,,,,,,
1,2021-05-07,23.78,23.366,22.943425,23.677531,23.721693,23.07637,23.6804,23.723009,22.928642,...,,,,,,,,,,
2,2021-05-08,23.78,23.292,22.946031,23.661436,23.704186,22.874838,23.5616,23.70706,22.928964,...,,,,,,,,,,
3,2021-05-09,23.8,23.266,22.945606,23.646293,23.687609,21.98609,23.4823,23.690635,22.929279,...,,,,,,,,,,
4,2021-05-10,23.72,23.26,22.94158,23.423756,23.647387,22.614637,23.4637,23.650175,22.929616,...,,,,,,,,,,
5,2021-05-11,23.7,23.258,22.937375,23.477458,23.61143,24.554692,23.4376,23.612917,22.930003,...,,,,,,,,,,
6,2021-05-12,23.68,23.302,22.939416,23.557064,23.58547,24.384157,23.428,23.586055,22.930399,...,,,,,,,,,,
7,2021-05-13,23.67,23.356,22.940132,23.507274,23.569347,24.006553,23.3917,23.569342,22.930789,...,,,,,,,,,,
8,2021-05-14,23.66,23.36,22.942816,23.497581,23.562191,23.874687,23.4217,23.562492,22.931183,...,,,,,,,,,,
9,2021-05-15,23.72,23.234,22.9431,23.364505,23.564311,26.738816,23.4075,23.564458,22.931576,...,,,,,,,,,,


In [87]:
# Convierto la columna FECHA de object a datetime64
y_preds_df_index['FECHA'] = pd.to_datetime(y_preds_df_index['FECHA'])

# Hago que la columna FECHA sea el índice
y_preds_df_index.set_index('FECHA', inplace=True)

# Si fuera necesario imputar NAN
y_preds_df_index = y_preds_df_index.asfreq('D', method='ffill')

In [88]:
list(y_preds_df_index.columns)

['56_40_y_test',
 '56_40_KNeighborsRegressor',
 '56_40_Lasso',
 '56_40_LGBMRegressor',
 '56_40_LinearRegression',
 '56_40_MLPRegressor',
 '56_40_RandomForestRegressor',
 '56_40_Ridge',
 '56_40_SVR',
 '56_40_XGBRegressor',
 '56_30_y_test',
 '56_30_KNeighborsRegressor',
 '56_30_Lasso',
 '56_30_LGBMRegressor',
 '56_30_LinearRegression',
 '56_30_MLPRegressor',
 '56_30_RandomForestRegressor',
 '56_30_Ridge',
 '56_30_SVR',
 '56_30_XGBRegressor',
 '56_20_y_test',
 '56_20_KNeighborsRegressor',
 '56_20_Lasso',
 '56_20_LGBMRegressor',
 '56_20_LinearRegression',
 '56_20_MLPRegressor',
 '56_20_RandomForestRegressor',
 '56_20_Ridge',
 '56_20_SVR',
 '56_20_XGBRegressor',
 '56_10_y_test',
 '56_10_KNeighborsRegressor',
 '56_10_Lasso',
 '56_10_LGBMRegressor',
 '56_10_LinearRegression',
 '56_10_MLPRegressor',
 '56_10_RandomForestRegressor',
 '56_10_Ridge',
 '56_10_SVR',
 '56_10_XGBRegressor',
 '28_40_y_test',
 '28_40_KNeighborsRegressor',
 '28_40_Lasso',
 '28_40_LGBMRegressor',
 '28_40_LinearRegression'

In [89]:
# Veo el índice de metricas_df_minMAE
metricas_df_minMAE.index

Int64Index([114, 101, 38, 2], dtype='int64')

In [90]:
# Genero los nombres e columnas para cada registro del df de MAE mínimos
columnas = []
columnas.append('56_40_y_test')
for i in metricas_df_minMAE.index:
    columnas.append(str(metricas_df_minMAE.loc[i]['fh'])+'_'+str(metricas_df_minMAE.loc[i]['lags'])+'_'+str(metricas_df_minMAE.loc[i]['regresor']))

list(columnas)    

['56_40_y_test',
 '7_40_Ridge',
 '14_10_LGBMRegressor',
 '28_40_LGBMRegressor',
 '56_40_LGBMRegressor']

In [91]:
round(min_metricas_df[min_metricas_df['fh']==7]['MAE'].values[0], 4)

0.125

In [92]:
y_preds_df_index[[columnas[0], columnas[1]]].iloc[-14:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios con variables exógenas, mejor predictor a 7 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==7]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 350)
                                                )

In [93]:
y_preds_df_index[[columnas[0], columnas[2]]].iloc[-28:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios con variables exógenas, mejor predictor a 14 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==14]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 350)
                                                )

In [94]:
y_preds_df_index[[columnas[0], columnas[3]]].iloc[-56:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios con variables exógenas, mejor predictor a 28 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==28]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 350)
                                                )

In [95]:
y_preds_df_index[[columnas[0], columnas[4]]].iloc[-56:].iplot(mode='lines+markers', size=5.0
                                                 ,title='Ensamble de modelos diarios con variables exógenas, mejor predictor a 56 días, MAE: ' + str(round(min_metricas_df[min_metricas_df['fh']==56]['MAE'].values[0], 4))
                                                 ,yTitle='metros'
                                                 ,xTitle='Fecha'
                                                 ,dimensions=(950, 350)
                                                )

### Comparación de desempeño de los modelos

In [96]:
# Concatenos los dos datasets con los MAE mínimos para una graficación
# Creo un df adicional con los valores de min_metricas_df más una columna adicional con el nombre del modelo
min_metricas_df_todos = pd.concat([
                                    min_metricas_df_desplazamiento
                                   ,min_metricas_df_offset
                                   ,min_metricas_df_featureEngineering
                                   ,min_metricas_df_multivariable
                                  ])

In [97]:
min_metricas_df_todos

Unnamed: 0,regresor,lags,fh,MAE,RMSE,modelo,MSE,MASE
18,KNeighborsRegressor,20,56,0.088393,0.156881,Desplazamiento,,
41,RandomForestRegressor,40,28,0.083811,0.099624,Desplazamiento,,
107,XGBRegressor,10,14,0.077188,0.086894,Desplazamiento,,
134,XGBRegressor,20,7,0.046314,0.052527,Desplazamiento,,
31,MLPRegressor,10,56,0.093691,0.164531,Ensamble,0.02707,1.228847
49,MLPRegressor,30,28,0.061443,0.075227,Ensamble,0.005659,0.805811
85,MLPRegressor,30,14,0.056369,0.07451,Ensamble,0.005552,0.74078
130,MLPRegressor,20,7,0.069504,0.075086,Ensamble,0.005638,0.914549
0,KNeighborsRegressor,40,56,0.093107,0.16906,Ensamble+featEng,0.028581,1.22119
39,LinearRegression,40,28,0.081197,0.104406,Ensamble+featEng,0.010901,1.064887


In [98]:
# Frecuencia de aparición de los regresores
min_metricas_df_todos['regresor'].value_counts()

MLPRegressor             4
KNeighborsRegressor      3
XGBRegressor             3
LGBMRegressor            3
RandomForestRegressor    1
LinearRegression         1
Ridge                    1
Name: regresor, dtype: int64

In [99]:
# Ordeno el df para ver qué modelo/regresor/lags tiene el menor MAE para cada fh
min_metricas_df_todos.sort_values(['fh', 'MAE'])

Unnamed: 0,regresor,lags,fh,MAE,RMSE,modelo,MSE,MASE
134,XGBRegressor,20,7,0.046314,0.052527,Desplazamiento,,
130,MLPRegressor,20,7,0.069504,0.075086,Ensamble,0.005638,0.914549
108,KNeighborsRegressor,40,7,0.082286,0.08398,Ensamble+featEng,0.007053,1.082729
114,Ridge,40,7,0.125003,0.129043,Ensamble+exógenas,0.016652,1.644814
85,MLPRegressor,30,14,0.056369,0.07451,Ensamble,0.005552,0.74078
80,XGBRegressor,40,14,0.068977,0.086151,Ensamble+featEng,0.007422,0.906472
107,XGBRegressor,10,14,0.077188,0.086894,Desplazamiento,,
101,LGBMRegressor,10,14,0.104597,0.126602,Ensamble+exógenas,0.016028,1.374572
49,MLPRegressor,30,28,0.061443,0.075227,Ensamble,0.005659,0.805811
39,LinearRegression,40,28,0.081197,0.104406,Ensamble+featEng,0.010901,1.064887


In [100]:
min_metricas_df_todos.sort_values(['fh', 'MAE']).iplot(kind='bar'
                                                       ,y=['MAE']
                                                       ,x=['fh', 'modelo']
                                                       ,colors=['red']
                                                      , sortbars=True)

#### Comparativo de desempeño

In [101]:
min_metricas_df_todos.loc[(min_metricas_df_todos.fh==7) | (min_metricas_df_todos.fh==56)] \
    .pivot_table(index=['fh'], columns=['modelo'], values='MAE') \
    .iplot(kind='bar', xTitle='Dias del pronóstico', yTitle='MAE mínimo (metros)'
           ,title='Comparativo de desempeño de los modelos de pronóstico, más allá del conjunto de datos'
           ,dimensions=(950, 300))

In [102]:
# Imprimo, agrupados por fh, los registros ordenados por MAE, de menor a mayor
min_metricas_df_todos.loc[(min_metricas_df_todos.fh==7) | (min_metricas_df_todos.fh==56)] \
.groupby(['fh']).apply(lambda x: x.sort_values(['MAE'], ascending = True)) \
.reset_index(drop=True)[['regresor', 'lags', 'fh', 'MAE', 'RMSE', 'modelo']] \
.style.set_properties(**{'text-align': 'left'}).hide_index()

regresor,lags,fh,MAE,RMSE,modelo
XGBRegressor,20,7,0.046314,0.052527,Desplazamiento
MLPRegressor,20,7,0.069504,0.075086,Ensamble
KNeighborsRegressor,40,7,0.082286,0.08398,Ensamble+featEng
Ridge,40,7,0.125003,0.129043,Ensamble+exógenas
KNeighborsRegressor,20,56,0.088393,0.156881,Desplazamiento
KNeighborsRegressor,40,56,0.093107,0.16906,Ensamble+featEng
MLPRegressor,10,56,0.093691,0.164531,Ensamble
LGBMRegressor,40,56,0.220546,0.257833,Ensamble+exógenas


In [103]:
# Pivoteo el df para poder graficar mejor en barras
min_metricas_df_todos.pivot_table(index=['fh'], columns='modelo', values=['MAE']).iplot(kind='bar'
                                                                                     ,xTitle='Dias del pronóstico'
                                                                                     ,yTitle='MAE mínimo (metros)'
                                                                                     ,title='Comparativo de desempeño de los modelos de pronóstico, más allá del conjunto de datos'
                                                                                     ,sortbars=True
                                                                                     ,dimensions=(950, 350))


#### Mejores modelos

In [104]:
# Selecciono los mejores modelos para cada fh
df_mejores_modelos = pd.DataFrame()
for i in range(0, len(min_metricas_df_todos.groupby('fh').min('MAE'))):
    df_mejores_modelos = pd.concat([df_mejores_modelos, min_metricas_df_todos[min_metricas_df_todos.MAE == min_metricas_df_todos.groupby('fh').min('MAE').iloc[i][1]]])

In [105]:
df_mejores_modelos[['fh', 'modelo', 'regresor', 'lags', 'MAE']].style.set_properties(**{'text-align': 'left'}).hide_index()  

fh,modelo,regresor,lags,MAE
7,Desplazamiento,XGBRegressor,20,0.046314
14,Ensamble,MLPRegressor,30,0.056369
28,Ensamble,MLPRegressor,30,0.061443
56,Desplazamiento,KNeighborsRegressor,20,0.088393
