# Cargamos datos transformados

In [1]:
import pandas as pd
from pathlib import Path

file_path = '../data/interim/df_fleca_semana'
df_fleca_semana = pd.read_parquet(file_path)
df_fleca_semana.head(30)

Unnamed: 0,semana,total,base_imponible,unidades_vendidas
0,2023-01-08,5627.68,5145.61,3075.54
1,2023-01-15,4392.67,4020.09,2569.91
2,2023-01-22,4803.98,4406.36,2771.42
3,2023-01-29,4397.29,4022.09,2528.31
4,2023-02-05,4320.69,3952.34,2520.7
5,2023-02-12,4081.89,3732.84,2385.15
6,2023-02-19,4559.06,4173.84,2659.92
7,2023-02-26,4356.34,3986.09,2572.95
8,2023-03-05,5004.76,4575.75,2892.58
9,2023-03-12,4458.26,4075.4,2585.22


In [3]:
# Vamos a predecir con 53 lags
n_lags = 53

# Creamos un dataframe para los meses ordenados
df_fleca_semana = df_fleca_semana.sort_values('semana').reset_index(drop=True)

# Creamos las columnas de los lags. Se podría hacer también con la libreria skforecast.org en lugar de pandas, pero no es necesario
# y así evitamos dependencias innecesarias.
for lag in range(n_lags, 0, -1):
    df_fleca_semana[f'semana_previa_{lag}'] = df_fleca_semana['base_imponible'].shift(lag)

df_fleca_semana.head(25)

Unnamed: 0,semana,total,base_imponible,unidades_vendidas,semana_previa_53,semana_previa_52,semana_previa_51,semana_previa_50,semana_previa_49,semana_previa_48,...,semana_previa_10,semana_previa_9,semana_previa_8,semana_previa_7,semana_previa_6,semana_previa_5,semana_previa_4,semana_previa_3,semana_previa_2,semana_previa_1
0,2023-01-08,5627.68,5145.61,3075.54,,,,,,,...,,,,,,,,,,
1,2023-01-15,4392.67,4020.09,2569.91,,,,,,,...,,,,,,,,,,5145.61
2,2023-01-22,4803.98,4406.36,2771.42,,,,,,,...,,,,,,,,,5145.61,4020.09
3,2023-01-29,4397.29,4022.09,2528.31,,,,,,,...,,,,,,,,5145.61,4020.09,4406.36
4,2023-02-05,4320.69,3952.34,2520.7,,,,,,,...,,,,,,,5145.61,4020.09,4406.36,4022.09
5,2023-02-12,4081.89,3732.84,2385.15,,,,,,,...,,,,,,5145.61,4020.09,4406.36,4022.09,3952.34
6,2023-02-19,4559.06,4173.84,2659.92,,,,,,,...,,,,,5145.61,4020.09,4406.36,4022.09,3952.34,3732.84
7,2023-02-26,4356.34,3986.09,2572.95,,,,,,,...,,,,5145.61,4020.09,4406.36,4022.09,3952.34,3732.84,4173.84
8,2023-03-05,5004.76,4575.75,2892.58,,,,,,,...,,,5145.61,4020.09,4406.36,4022.09,3952.34,3732.84,4173.84,3986.09
9,2023-03-12,4458.26,4075.4,2585.22,,,,,,,...,,5145.61,4020.09,4406.36,4022.09,3952.34,3732.84,4173.84,3986.09,4575.75


# Añadimos target

In [None]:
# La columna target srá la base imponible de la semana actual
df_fleca_semana['target'] = df_fleca_semana['base_imponible'].shift(-1)
df_fleca_semana.head(25)


Unnamed: 0,semana,total,base_imponible,unidades_vendidas,semana_previa_53,semana_previa_52,semana_previa_51,semana_previa_50,semana_previa_49,semana_previa_48,...,semana_previa_9,semana_previa_8,semana_previa_7,semana_previa_6,semana_previa_5,semana_previa_4,semana_previa_3,semana_previa_2,semana_previa_1,target
0,2023-01-08,5627.68,5145.61,3075.54,,,,,,,...,,,,,,,,,,4020.09
1,2023-01-15,4392.67,4020.09,2569.91,,,,,,,...,,,,,,,,,5145.61,4406.36
2,2023-01-22,4803.98,4406.36,2771.42,,,,,,,...,,,,,,,,5145.61,4020.09,4022.09
3,2023-01-29,4397.29,4022.09,2528.31,,,,,,,...,,,,,,,5145.61,4020.09,4406.36,3952.34
4,2023-02-05,4320.69,3952.34,2520.7,,,,,,,...,,,,,,5145.61,4020.09,4406.36,4022.09,3732.84
5,2023-02-12,4081.89,3732.84,2385.15,,,,,,,...,,,,,5145.61,4020.09,4406.36,4022.09,3952.34,4173.84
6,2023-02-19,4559.06,4173.84,2659.92,,,,,,,...,,,,5145.61,4020.09,4406.36,4022.09,3952.34,3732.84,3986.09
7,2023-02-26,4356.34,3986.09,2572.95,,,,,,,...,,,5145.61,4020.09,4406.36,4022.09,3952.34,3732.84,4173.84,4575.75
8,2023-03-05,5004.76,4575.75,2892.58,,,,,,,...,,5145.61,4020.09,4406.36,4022.09,3952.34,3732.84,4173.84,3986.09,4075.4
9,2023-03-12,4458.26,4075.4,2585.22,,,,,,,...,5145.61,4020.09,4406.36,4022.09,3952.34,3732.84,4173.84,3986.09,4575.75,4604.77


# Eliminamos valores nulos

In [5]:
#Eliminamos la columna base imponible original y las filas con NaN (las primeras n_lags)
df_fleca_semana = df_fleca_semana.drop(columns=['base_imponible'])
df_fleca_semana = df_fleca_semana.dropna().reset_index(drop=True)
df_fleca_semana.head(25)

Unnamed: 0,semana,total,unidades_vendidas,semana_previa_53,semana_previa_52,semana_previa_51,semana_previa_50,semana_previa_49,semana_previa_48,semana_previa_47,...,semana_previa_9,semana_previa_8,semana_previa_7,semana_previa_6,semana_previa_5,semana_previa_4,semana_previa_3,semana_previa_2,semana_previa_1,target
0,2024-01-14,3909.62,2179.92,5145.61,4020.09,4406.36,4022.09,3952.34,3732.84,4173.84,...,4243.769173,4547.02,3946.75,3888.56,4603.02,3434.63,3240.28,3918.68,3817.57,3790.88
1,2024-01-21,4138.38,2259.595,4020.09,4406.36,4022.09,3952.34,3732.84,4173.84,3986.09,...,4547.02,3946.75,3888.56,4603.02,3434.63,3240.28,3918.68,3817.57,3578.16,4096.18
2,2024-01-28,4480.32,2418.645,4406.36,4022.09,3952.34,3732.84,4173.84,3986.09,4575.75,...,3946.75,3888.56,4603.02,3434.63,3240.28,3918.68,3817.57,3578.16,3790.88,3750.64
3,2024-02-04,4101.25,2189.5,4022.09,3952.34,3732.84,4173.84,3986.09,4575.75,4075.4,...,3888.56,4603.02,3434.63,3240.28,3918.68,3817.57,3578.16,3790.88,4096.18,3977.76
4,2024-02-11,4349.59,2294.89,3952.34,3732.84,4173.84,3986.09,4575.75,4075.4,4604.77,...,4603.02,3434.63,3240.28,3918.68,3817.57,3578.16,3790.88,4096.18,3750.64,4243.04
5,2024-02-18,4641.75,2466.765,3732.84,4173.84,3986.09,4575.75,4075.4,4604.77,4476.88,...,3434.63,3240.28,3918.68,3817.57,3578.16,3790.88,4096.18,3750.64,3977.76,3986.91
6,2024-02-25,4362.64,2334.21,4173.84,3986.09,4575.75,4075.4,4604.77,4476.88,4892.79,...,3240.28,3918.68,3817.57,3578.16,3790.88,4096.18,3750.64,3977.76,4243.04,4027.12
7,2024-03-03,4404.71,2373.26,3986.09,4575.75,4075.4,4604.77,4476.88,4892.79,9146.54,...,3918.68,3817.57,3578.16,3790.88,4096.18,3750.64,3977.76,4243.04,3986.91,3973.31
8,2024-03-10,4343.3,2312.72,4575.75,4075.4,4604.77,4476.88,4892.79,9146.54,5647.79,...,3817.57,3578.16,3790.88,4096.18,3750.64,3977.76,4243.04,3986.91,4027.12,4315.62
9,2024-03-17,4719.9,2538.17,4075.4,4604.77,4476.88,4892.79,9146.54,5647.79,4416.02,...,3578.16,3790.88,4096.18,3750.64,3977.76,4243.04,3986.91,4027.12,3973.31,4651.66


# Seleccionamos las columnas de interés

In [6]:
# Eliminamos la columna mes, total y unidades vendidas, ya que no es necesaria para el modelo
df_fleca_semana = df_fleca_semana.drop(columns=['total', 'unidades_vendidas', 'semana'])

df_fleca_semana.head(25)

Unnamed: 0,semana_previa_53,semana_previa_52,semana_previa_51,semana_previa_50,semana_previa_49,semana_previa_48,semana_previa_47,semana_previa_46,semana_previa_45,semana_previa_44,...,semana_previa_9,semana_previa_8,semana_previa_7,semana_previa_6,semana_previa_5,semana_previa_4,semana_previa_3,semana_previa_2,semana_previa_1,target
0,5145.61,4020.09,4406.36,4022.09,3952.34,3732.84,4173.84,3986.09,4575.75,4075.4,...,4243.769173,4547.02,3946.75,3888.56,4603.02,3434.63,3240.28,3918.68,3817.57,3790.88
1,4020.09,4406.36,4022.09,3952.34,3732.84,4173.84,3986.09,4575.75,4075.4,4604.77,...,4547.02,3946.75,3888.56,4603.02,3434.63,3240.28,3918.68,3817.57,3578.16,4096.18
2,4406.36,4022.09,3952.34,3732.84,4173.84,3986.09,4575.75,4075.4,4604.77,4476.88,...,3946.75,3888.56,4603.02,3434.63,3240.28,3918.68,3817.57,3578.16,3790.88,3750.64
3,4022.09,3952.34,3732.84,4173.84,3986.09,4575.75,4075.4,4604.77,4476.88,4892.79,...,3888.56,4603.02,3434.63,3240.28,3918.68,3817.57,3578.16,3790.88,4096.18,3977.76
4,3952.34,3732.84,4173.84,3986.09,4575.75,4075.4,4604.77,4476.88,4892.79,9146.54,...,4603.02,3434.63,3240.28,3918.68,3817.57,3578.16,3790.88,4096.18,3750.64,4243.04
5,3732.84,4173.84,3986.09,4575.75,4075.4,4604.77,4476.88,4892.79,9146.54,5647.79,...,3434.63,3240.28,3918.68,3817.57,3578.16,3790.88,4096.18,3750.64,3977.76,3986.91
6,4173.84,3986.09,4575.75,4075.4,4604.77,4476.88,4892.79,9146.54,5647.79,4416.02,...,3240.28,3918.68,3817.57,3578.16,3790.88,4096.18,3750.64,3977.76,4243.04,4027.12
7,3986.09,4575.75,4075.4,4604.77,4476.88,4892.79,9146.54,5647.79,4416.02,5247.66,...,3918.68,3817.57,3578.16,3790.88,4096.18,3750.64,3977.76,4243.04,3986.91,3973.31
8,4575.75,4075.4,4604.77,4476.88,4892.79,9146.54,5647.79,4416.02,5247.66,5827.58,...,3817.57,3578.16,3790.88,4096.18,3750.64,3977.76,4243.04,3986.91,4027.12,4315.62
9,4075.4,4604.77,4476.88,4892.79,9146.54,5647.79,4416.02,5247.66,5827.58,5097.56,...,3578.16,3790.88,4096.18,3750.64,3977.76,4243.04,3986.91,4027.12,3973.31,4651.66


# Dividimos en X e y

In [7]:
# Hacemos un split de X e y
X = df_fleca_semana.drop(columns=['target'])
y = df_fleca_semana['target']
   

# Paquetizamos en una función