### Imports

In [1]:
import pandas as pd
import polars as pl
import numpy as np
#import matplotlib.pyplot as plt
#from feature_baggingV2 import FeatureBaggingWithHyperparamTuning
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
#import lightgbm as lgb
import datetime
from sklearn.preprocessing import RobustScaler
#from keras.models import Sequential
#from keras.layers import LSTM, Dense
import labolibrary as labo

import os, sys, gc, time, warnings, pickle, random

### Función para escalar/desescalar y métrica

In [2]:
# Robust-scale one group and register the fitted scaler
def scale_group(group):
    """Scale the values of ``group`` with a freshly fitted ``RobustScaler``.

    The fitted scaler is saved in the module-level ``scalers`` dict under
    ``group.name`` so that ``inverse_scale_group`` can undo the scaling later.

    NOTE(review): the key is ``group.name`` alone, so scaling another column
    whose groups carry the same names replaces the scaler stored here.

    Returns a Series with the scaled values and the index and name of ``group``.
    """
    group_scaler = RobustScaler()
    as_column = group.values.reshape(-1, 1)  # the scaler is fed a single-column 2-D array
    scaled_column = group_scaler.fit_transform(as_column)
    scalers[group.name] = group_scaler
    return pd.Series(scaled_column.flatten(), index=group.index, name=group.name)

# Undo scale_group for one group using the scaler registered for it
def inverse_scale_group(group):
    """Map scaled values of ``group`` back through the stored scaler.

    Looks up the scaler saved in the module-level ``scalers`` dict under
    ``group.name`` (raises ``KeyError`` when none is stored) and applies its
    ``inverse_transform``.

    Returns a Series with the restored values and the index and name of ``group``.
    """
    stored_scaler = scalers[group.name]
    restored_column = stored_scaler.inverse_transform(group.values.reshape(-1, 1))
    return pd.Series(restored_column.flatten(), index=group.index, name=group.name)

### Archivos

In [3]:
# Data directory; the bucket location is kept (disabled) for reference.
#DATOS_DIR = '~/buckets/b1/datasets/'
DATOS_DIR = '../data/'

# The four inputs are tab-separated text files, read in this order.
df_sell_in, df_predecir, df_tb_stocks, df_tb_productos = (
    pd.read_csv(DATOS_DIR+file_name, sep='\t')
    for file_name in (
        'sell-in.txt',
        'productos_a_predecir.txt',
        'tb_stocks.txt',
        'tb_productos_descripcion.txt',
    )
)

Si quiero predecir algún mes en específico y filtrar los productos con más de 3 meses de venta, descomentar las líneas correspondientes de la siguiente celda.

In [4]:
# Optional experiment setup (every line is disabled): cut the history at a given
# period, blank the target for selected periods, and keep only products with
# more than 3 counted rows.
#df_sell_in = df_sell_in[df_sell_in['periodo']<201808]
#df_sell_in['tn_2'] = np.where(df_sell_in['periodo'].isin([201805, 201804]), np.nan, df_sell_in['tn_2']) 
#df_sell_in = df_sell_in[df_sell_in['periodo']<201806]
#sales_counts = df_sell_in.groupby('product_id')['periodo'].count() # Count the non-null 'periodo' entries per product_id
#products_with_more_than_3_months = sales_counts[sales_counts > 3].index # Products whose count is greater than 3
#df_sell_in = df_sell_in[df_sell_in['product_id'].isin(products_with_more_than_3_months)] # Keep only those products in the original DataFrame

### Preprocesamiento

In [8]:
# ADD ZERO SALES: the cells below build the customer/product/period grid and fill the gaps with 0.
# Display the installed polars version.
pl.__version__

'1.0.0'

In [9]:
# Total tons per (period, product, customer), computed with polars.
df_group = (
    pl.DataFrame(df_sell_in)
    .group_by(['periodo','product_id','customer_id'])
    .agg(pl.col('tn').sum().alias('tn_sum'))
)

# First and last period with sales for each product.
df_fechas = df_group.group_by('product_id').agg(
    pl.col('periodo').min().alias('periodo_min'),
    pl.col('periodo').max().alias('periodo_max'),
)

In [10]:
# Distinct customer_id, product_id and periodo values found in the sell-in data.
unique_customer_ids, unique_product_ids, unique_periodos = (
    df_sell_in[column].unique()
    for column in ('customer_id', 'product_id', 'periodo')
)

# One single-column polars frame per dimension.
df_customers = pl.DataFrame({'customer_id': unique_customer_ids})
df_products = pl.DataFrame({'product_id': unique_product_ids})
df_periods = pl.DataFrame({'periodo': unique_periodos})

# Cross joins: every customer with every product, then with every period.
df_all_combinations = (
    df_customers
    .join(df_products, how='cross')
    .join(df_periods, how='cross')
)

In [11]:
# Attach each product's first/last sales period to every combination row.
df_all_combinations_2 = df_all_combinations.join(
    df_fechas,
    how='left',
    on='product_id',
)
df_all_combinations_2

customer_id,product_id,periodo,periodo_min,periodo_max
i64,i64,i64,i64,i64
10234,20524,201701,201701,201912
10234,20524,201702,201701,201912
10234,20524,201703,201701,201912
10234,20524,201704,201701,201912
10234,20524,201705,201701,201912
…,…,…,…,…
10572,20770,201908,201912,201912
10572,20770,201909,201912,201912
10572,20770,201910,201912,201912
10572,20770,201911,201912,201912


In [12]:
# Keep only the rows whose period lies inside the product's
# [periodo_min, periodo_max] range (both ends included).
filtered_df = df_all_combinations_2.filter(
    pl.col('periodo').is_between(pl.col('periodo_min'), pl.col('periodo_max'))
)
filtered_df

customer_id,product_id,periodo,periodo_min,periodo_max
i64,i64,i64,i64,i64
10234,20524,201701,201701,201912
10234,20524,201702,201701,201912
10234,20524,201703,201701,201912
10234,20524,201704,201701,201912
10234,20524,201705,201701,201912
…,…,…,…,…
10572,20728,201911,201911,201912
10572,20728,201912,201911,201912
10572,20792,201912,201912,201912
10572,20854,201912,201912,201912


In [13]:
# polars copy of the pandas sell-in frame, used as the right side of the join below.
df_sell = pl.DataFrame(df_sell_in)

In [14]:
# Left-join the real sales onto the full grid, drop the unused flag column and
# turn the gaps (combinations without a sale) into zeros.
df_complete = (
    filtered_df
    .join(df_sell, on=['customer_id', 'product_id','periodo'], how='left')
    .drop("plan_precios_cuidados")
    .fill_null(0)
)

In [15]:
# Spot check: one product/customer pair in the completed grid.
df_complete.filter(
    pl.col("product_id") == 21276,
    pl.col("customer_id") == 10550,
)

customer_id,product_id,periodo,periodo_min,periodo_max,cust_request_qty,cust_request_tn,tn
i64,i64,i64,i64,i64,i64,f64,f64
10550,21276,201903,201903,201912,0,0.0,0.0
10550,21276,201904,201903,201912,0,0.0,0.0
10550,21276,201905,201903,201912,0,0.0,0.0
10550,21276,201906,201903,201912,0,0.0,0.0
10550,21276,201907,201903,201912,0,0.0,0.0
10550,21276,201908,201903,201912,0,0.0,0.0
10550,21276,201909,201903,201912,1,0.00075,0.00075
10550,21276,201910,201903,201912,0,0.0,0.0
10550,21276,201911,201903,201912,2,0.00371,0.00371
10550,21276,201912,201903,201912,0,0.0,0.0


In [16]:
# Same product/customer pair in the raw sell-in data, for comparison.
df_sell.filter(
    pl.col("product_id") == 21276,
    pl.col("customer_id") == 10550,
)

periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn
i64,i64,i64,i64,i64,f64,f64
201909,10550,21276,0,1,0.00075,0.00075
201911,10550,21276,0,2,0.00371,0.00371


In [17]:
# Back to pandas for the rest of the notebook. The name is rebound, so this
# cell cannot be re-run on its own: df_complete is then no longer a polars frame.
df_complete = df_complete.to_pandas()

In [18]:
# Parse the YYYYMM periods. The stock period is moved one month back
# (original note: the stock table uses a different month) -- presumably to
# line it up with the sales month; TODO confirm.
stock_period = pd.to_datetime(df_tb_stocks['periodo'], format='%Y%m')
df_tb_stocks['periodo'] = stock_period - pd.DateOffset(months=1)
df_complete['periodo'] = pd.to_datetime(df_complete['periodo'], format='%Y%m')

# Explicit dtypes for ids and numeric columns, frame by frame.
dtype_plan = (
    (df_tb_stocks, {'product_id': int, 'stock_final': float}),
    (df_tb_productos, {'product_id': int, 'sku_size': int}),
    (df_complete, {
        'product_id': int,
        'customer_id': int,
        'cust_request_qty': int,
        'cust_request_tn': float,
        'tn': float,
    }),
)
for frame, column_types in dtype_plan:
    for column, target_type in column_types.items():
        frame[column] = frame[column].astype(target_type)
#df_complete['plan_precios_cuidados'] = df_complete['plan_precios_cuidados'].astype(bool)

In [19]:
# From here on df_sell_in names the completed (zero-filled) frame.
# This binds the same object as df_complete; no copy is made.
df_sell_in = df_complete

### Consolidar Datos

In [20]:
#### Group and scale
scalers = {}  # registry that scale_group fills, keyed by group name
df_sell_in['weight'] = df_sell_in['tn']  # keep the unscaled tons
tn_by_product = df_sell_in.groupby('product_id')['tn']
df_sell_in['tn'] = tn_by_product.transform(scale_group)

In [21]:
# Target (final class): scaled tn two rows ahead within each
# (customer_id, product_id) series. Shifting per series stops the last two
# periods of one series from receiving the first values of the next one; they
# become NaN instead.
# Relies on the rows of each series being ordered by periodo -- TODO confirm.
series_tn = df_sell_in.groupby(['customer_id', 'product_id'], sort=False)['tn']
df_sell_in['tn_2'] = series_tn.shift(-2)

# Scale the remaining numeric features per product (each column once).
# scale_group stores its scaler in `scalers` under the product_id alone, so
# these calls would replace the entries registered before this cell. Snapshot
# them first and restore them afterwards.
registered_scalers = dict(scalers)
df_sell_in['cust_request_qty'] = df_sell_in.groupby('product_id')['cust_request_qty'].transform(scale_group)
df_sell_in['cust_request_tn'] = df_sell_in.groupby('product_id')['cust_request_tn'].transform(scale_group)
df_tb_stocks['stock_final'] = df_tb_stocks.groupby('product_id')['stock_final'].transform(scale_group)
scalers.clear()
scalers.update(registered_scalers)

In [22]:
# Add the product attributes to the sell-in rows (left join on product_id).
df_sell_in_merged = df_sell_in.merge(df_tb_productos, how='left', on='product_id')
# Add the stock of the same product and period (left join on both keys).
df_final = df_sell_in_merged.merge(df_tb_stocks, how='left', on=['product_id', 'periodo'])

# Date column derived from 'periodo', plus the quarter of 'periodo'.
df_final['fecha'] = pd.to_datetime(df_final['periodo'], format='%Y%m')
df_final["trimestre"] = df_final['periodo'].dt.quarter

# Use 'fecha' as the index, expressed as monthly periods.
df_final = df_final.set_index('fecha')
df_final.index = df_final.index.to_period('M')

In [23]:
# Keep only the rows of the products that have to be predicted.
is_target_product = df_final['product_id'].isin(df_predecir['product_id'])
df_final = df_final.loc[is_target_product]

### Clase de LightGBM

In [24]:
# New target: change from the current tn to tn_2.
df_final['diff_tn_tn2'] = df_final['tn_2'].sub(df_final['tn'])

## FE

### Lags

In [25]:
# Lag features for 'cust_request_qty', 'cust_request_tn', 'stock_final' and 'tn'.
# Each lag is taken inside its own (customer_id, product_id) series. Shifting
# the whole frame would carry the last rows of one series into the first rows
# of the next one.
# Relies on the rows of each series being ordered by period -- TODO confirm.
n_lags = 36
lag_sources = ['cust_request_qty', 'cust_request_tn', 'stock_final', 'tn']
# Build the grouping once; it is reused for every column and every lag.
series_groups = df_final.groupby(['customer_id', 'product_id'], sort=False)
for lag in range(1, n_lags + 1):
    for source in lag_sources:
        df_final[f'{source}_lag_{lag}'] = series_groups[source].shift(lag)
    

  df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
  df_final[f'tn_lag_{lag}'] = df_final['tn'].shift(lag)
  df_final[f'cust_request_qty_lag_{lag}'] = df_final['cust_request_qty'].shift(lag)
  df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
  df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
  df_final[f'tn_lag_{lag}'] = df_final['tn'].shift(lag)
  df_final[f'cust_request_qty_lag_{lag}'] = df_final['cust_request_qty'].shift(lag)
  df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
  df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
  df_final[f'tn_lag_{lag}'] = df_final['tn'].shift(lag)
  df_final[f'cust_request_qty_lag_{lag}'] = df_final['cust_request_qty'].shift(lag)
  df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
  df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
  df_final[f'tn_lag_{lag}'] = df_final

In [26]:
# DELTA BETWEEN LAGS TWO STEPS APART: value at lag k+2 minus value at lag k.
delta_sources = ('cust_request_tn', 'stock_final', 'tn')
for lag in range(1, n_lags-1):
    older = lag + 2
    for source in delta_sources:
        df_final[f'delta_{source}_{lag}_{older}'] = (
            df_final[f'{source}_lag_{older}'] - df_final[f'{source}_lag_{lag}']
        )

  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_f

In [27]:
# DELTA BETWEEN CONSECUTIVE LAGS: value at lag k+1 minus value at lag k.
delta_sources = ('cust_request_tn', 'stock_final', 'tn')
for lag in range(1, n_lags):
    older = lag + 1
    for source in delta_sources:
        df_final[f'delta_{source}_{lag}_{older}'] = (
            df_final[f'{source}_lag_{older}'] - df_final[f'{source}_lag_{lag}']
        )

  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_f

In [28]:
# Ratio features: value at lag k divided by the sum of the two more recent
# lags (k-1 and k-2), for k = 3 .. n_lags.
ratio_sources = ('cust_request_tn', 'stock_final', 'tn')
for lag in range(3, n_lags + 1):
    for source in ratio_sources:
        denominator = (
            df_final[f'{source}_lag_{lag-1}'] + df_final[f'{source}_lag_{lag-2}']
        )
        # A zero denominator would produce +/-inf in the feature; turn it into
        # NaN so the ratio is simply missing for that row.
        df_final[f'ratio_{source}_{lag}'] = (
            df_final[f'{source}_lag_{lag}'] / denominator.replace(0, np.nan)
        )

  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_fin

In [29]:
# Move the index back into a regular column and return to a default integer index.
df_final = df_final.reset_index()

  df_final.reset_index(inplace=True)


### Medias Móviles

In [30]:
# MOVING AVERAGES
rolling_windows = [3, 6, 9, 12, 24, 36]

# Moving averages of 'tn' inside each (customer_id, product_id) series.
# Grouping by product_id alone lets a window run over the rows of different
# customers of the same product, mixing unrelated series at every boundary.
# The result is aligned back through the row index, which must be unique
# (it is the default integer index after the preceding reset_index).
series_keys = ['customer_id', 'product_id']
tn_by_series = df_final.groupby(series_keys, sort=False)['tn']
for window in rolling_windows:
    rolled_mean = tn_by_series.rolling(window, min_periods=1).mean()
    df_final[f'rolling_mean_tn_{window}'] = rolled_mean.droplevel(series_keys)
   

  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())


### FE

In [31]:
# Calendar features read from the 'periodo' timestamps.
periodo_parts = df_final['periodo'].dt
df_final['year'] = periodo_parts.year
df_final['month'] = periodo_parts.month
df_final['quarter'] = periodo_parts.quarter

  df_final['year'] = df_final['periodo'].dt.year
  df_final['month'] = df_final['periodo'].dt.month
  df_final['quarter'] = df_final.periodo.dt.quarter


In [32]:
# Dummy variables: is the current tn the max / the min of the last i rows?
months = [3, 6, 9, 12]

# Rolling max/min of 'tn' inside each (customer_id, product_id) series.
# Grouping by product_id alone lets a window run over the rows of different
# customers of the same product.
# The result is aligned back through the row index, which must be unique
# (it is the default integer index after the earlier reset_index).
series_keys = ['customer_id', 'product_id']
tn_by_series = df_final.groupby(series_keys, sort=False)['tn']
for i in months:
    window = tn_by_series.rolling(i, min_periods=1)
    df_final[f'max_{i}m'] = window.max().droplevel(series_keys)
    df_final[f'min_{i}m'] = window.min().droplevel(series_keys)
    # Build the dummies: 1 when the current value equals the window extreme.
    df_final[f'dummy_max_{i}m'] = np.where(df_final['tn'] == df_final[f'max_{i}m'], 1, 0)
    df_final[f'dummy_min_{i}m'] = np.where(df_final['tn'] == df_final[f'min_{i}m'], 1, 0)

  df_final[f'max_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).max())
  df_final[f'min_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).min())
  df_final[f'dummy_max_{i}m'] = np.where(df_final['tn'] == df_final[f'max_{i}m'], 1, 0)
  df_final[f'dummy_min_{i}m'] = np.where(df_final['tn'] == df_final[f'min_{i}m'], 1, 0)
  df_final[f'max_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).max())
  df_final[f'min_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).min())
  df_final[f'dummy_max_{i}m'] = np.where(df_final['tn'] == df_final[f'max_{i}m'], 1, 0)
  df_final[f'dummy_min_{i}m'] = np.where(df_final['tn'] == df_final[f'min_{i}m'], 1, 0)
  df_final[f'max_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).max())
  df_final[f'min_{i}m'] = df_final.groupby('product_id')['

In [33]:
# scale_group stores every fitted scaler in `scalers` under the product_id
# alone, so the two scaling calls below would replace the entries registered
# before this cell (the dict is pickled at the end of the notebook).
# Snapshot them here and restore them afterwards.
registered_scalers = dict(scalers)

# Sales per quarter number and product.
df_final['tn_trimestre'] = df_final.groupby(['trimestre', 'product_id'])['tn'].transform('sum')
df_final['tn_trimestre_escalada'] = df_final.groupby('product_id')['tn_trimestre'].transform(scale_group)

# Sales per quarter number, customer and product.
df_final['tn_trimestre_customer'] = df_final.groupby(['trimestre','customer_id', 'product_id'])['tn'].transform('sum')
# NOTE(review): the 'ttn_' prefix looks like a typo for 'tn_'; the name is kept
# because later consumers of the dataset may rely on it.
df_final['ttn_trimestre_customer_escalada'] = df_final.groupby('product_id')['tn_trimestre_customer'].transform(scale_group)

scalers.clear()
scalers.update(registered_scalers)

  df_final['tn_trimestre'] = df_final.groupby(['trimestre', 'product_id'])['tn'].transform('sum')
  df_final['tn_trimestre_escalada'] = df_final.groupby('product_id')['tn_trimestre'].transform(scale_group)
  df_final['tn_trimestre_customer'] = df_final.groupby(['trimestre','customer_id', 'product_id'])['tn'].transform('sum')
  df_final['ttn_trimestre_customer_escalada'] = df_final.groupby('product_id')['tn_trimestre_customer'].transform(scale_group)


In [34]:
# Total tn of each product in each period, repeated on every row of that pair.
tn_by_period_product = df_final.groupby(['periodo', 'product_id'])['tn']
df_final['tn_product_id'] = tn_by_period_product.transform('sum')

  df_final['tn_product_id'] = df_final.groupby(['periodo', 'product_id'])['tn'].transform('sum')


### FE variables externas

In [35]:
# Read the exported external variables.
df_exported = pd.read_excel(DATOS_DIR+'23variables_externas.xlsx')

# df_final['fecha'] holds monthly periods: the index was converted with
# to_period('M') and later restored as a column by reset_index. A datetime64
# key never equals a Period key, so the left join would attach only NaN.
# Convert the external dates to monthly periods as well.
df_exported['fecha'] = pd.to_datetime(df_exported['fecha']).dt.to_period('M')

# Join on the month. validate='many_to_one' raises if a month appears more than
# once in the external file instead of silently multiplying the rows.
# NOTE(review): assumes one row per month in the external file -- confirm.
# NOTE(review): df_merged is not used by the later cells visible in this notebook.
df_merged = pd.merge(df_final, df_exported, on='fecha', how='left', validate='many_to_one')


MemoryError: Unable to allocate 102. MiB for an array with shape (1, 13357875) and data type float64

In [None]:
#df_final['plan_precios_cuidados'] =df_final['plan_precios_cuidados'].astype(bool)
#df_final['dias_fin_trimestre'] = df_final['dias_fin_trimestre'].dt.days.astype(int) # SLOW!
# Remove the 'fecha' column from the feature frame.
df_final = df_final.drop(columns=['fecha'])

## Inicio Train

In [None]:
# Categorical product attributes: stored with the pandas 'category' dtype.
categorical_columns = ["cat1", "cat2", "cat3", "brand", "descripcion"]
for column in categorical_columns:
    df_final[column] = df_final[column].astype("category")

# Optional one-hot encoding. This step used cat1_dummies, cat2_dummies, ...
# whose definitions were commented out, so the cell raised NameError.
# The dummies are now built here and the whole step is opt-in.
# NOTE(review): the default keeps the category columns -- confirm that this is
# the intended encoding for the model.
ONE_HOT_ENCODE = False
if ONE_HOT_ENCODE:
    dummy_frames = [
        pd.get_dummies(df_final[column], prefix=column, drop_first=True)
        for column in categorical_columns
    ]
    # Append the dummy variables and drop the encoded source columns
    # ('descripcion' was not in the original drop list and is kept).
    df_final = pd.concat([df_final, *dummy_frames], axis=1)
    df_final = df_final.drop(columns=['cat1', 'cat2', 'cat3', 'brand'])

# Index the frame by monthly period and order it chronologically.
df_final = df_final.set_index('periodo')
df_final.index = df_final.index.to_period('M')
df_final = df_final.sort_index()


In [None]:
# Save the `scalers` dict (the scalers registered by scale_group) as a binary
# pickle file. pickle is already imported in the imports cell at the top.
scalers_path = DATOS_DIR+'/scalers.pkl'
with open(scalers_path, 'wb') as scalers_file:
    pickle.dump(scalers, scalers_file)

# Write the engineered dataset as parquet.
dataset_path = DATOS_DIR+'/FE_02_dataset.parquet'
df_final.to_parquet(dataset_path, engine='pyarrow')