### Imports

In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#from feature_baggingV2 import FeatureBaggingWithHyperparamTuning
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
#import lightgbm as lgb
import datetime
from sklearn.preprocessing import MinMaxScaler

import os, sys, gc, time, warnings, pickle, psutil, random

### Función para escalar/desescalar y métrica

In [2]:
# Función para escalar y devolver una serie
def minmax_scale_group(group):
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(group.values.reshape(-1, 1)).flatten()
    scalers[group.name] = scaler  # Almacenar el escalador para este grupo
    return pd.Series(scaled_values, index=group.index)

# Función para desescalar y devolver una serie
def inverse_minmax_scale_group(group):
    scaler = scalers[group.name]
    inversed_values = scaler.inverse_transform(group.values.reshape(-1, 1)).flatten()
    return pd.Series(inversed_values, index=group.index)

In [3]:
# Definir la métrica personalizada
def multinacional_metric(y_true, y_pred):
    return abs(sum(y_true - y_pred)) / sum(y_true)

### Archivos

In [4]:
#DATOS_DIR = '~/buckets/b1/datasets/'
DATOS_DIR = '../data/'
df_sell_in = pd.read_csv(DATOS_DIR+'sell-in.txt', sep='\t')
df_predecir = pd.read_csv(DATOS_DIR+'productos_a_predecir.txt', sep='\t')
df_tb_stocks = pd.read_csv(DATOS_DIR+'tb_stocks.txt', sep='\t')
df_tb_productos = pd.read_csv(DATOS_DIR+'tb_productos_descripcion.txt', sep='\t')

Si quiero predecir algun mes en especifico y filtrar los productos con más de 3 meses de venta

In [5]:
#df_sell_in = df_sell_in[df_sell_in['periodo']<201808]
#df_sell_in['tn_2'] = np.where(df_sell_in['periodo'].isin([201805, 201804]), np.nan, df_sell_in['tn_2']) 
#df_sell_in = df_sell_in[df_sell_in['periodo']<201806]
#sales_counts = df_sell_in.groupby('product_id')['periodo'].count() # Contar el número de meses de ventas por product_id
#products_with_more_than_3_months = sales_counts[sales_counts > 3].index # Filtrar los productos con más de 3 meses de ventas
#df_sell_in = df_sell_in[df_sell_in['product_id'].isin(products_with_more_than_3_months)] # Filtrar el DataFrame original para incluir solo estos productos

### Preprocesamiento

In [6]:
df_tb_stocks['periodo'] = pd.to_datetime(df_tb_stocks['periodo'], format='%Y%m')
df_tb_stocks['periodo'] = df_tb_stocks['periodo'] - pd.DateOffset(months=1) #mes diferente en stock


df_tb_stocks['product_id'] = df_tb_stocks['product_id'].astype(int)
df_tb_stocks['stock_final'] = df_tb_stocks['stock_final'].astype(float)
df_tb_productos['product_id'] = df_tb_productos['product_id'].astype(int)
df_tb_productos['sku_size'] = df_tb_productos['sku_size'].astype(int)
df_sell_in['periodo'] = pd.to_datetime(df_sell_in['periodo'], format='%Y%m')
df_sell_in['product_id'] = df_sell_in['product_id'].astype(int)
df_sell_in['customer_id'] = df_sell_in['customer_id'].astype(int)
df_sell_in['cust_request_qty'] = df_sell_in['cust_request_qty'].astype(int)
df_sell_in['cust_request_tn'] = df_sell_in['cust_request_tn'].astype(float)
df_sell_in['tn'] = df_sell_in['tn'].astype(float)
df_sell_in['plan_precios_cuidados'] = df_sell_in['plan_precios_cuidados'].astype(bool)

### Consolidar Datos

In [7]:
df_sell_in['tn_2'] = df_sell_in['tn'].shift(-2) #OBTENGO TN 2 MESES ADELANTE (CLASE FINAL)

In [8]:
# Join tb_productos to sell_in on product_id
df_sell_in_merged = pd.merge(df_sell_in, df_tb_productos, on='product_id', how='left')
# Join tb_stocks to sell_in_merged on both product_id and periodo
df_final = pd.merge(df_sell_in_merged, df_tb_stocks, on=['product_id', 'periodo'], how='left')

#Convertir 'periodo' a formato de fecha y Calcular el trimestre desde 'periodo'
df_final['fecha'] = pd.to_datetime(df_final['periodo'], format='%Y%m')
df_final["trimestre"] = df_final.periodo.dt.quarter

#Establecer 'fecha' como índice y convertir a período mensual
df_final.set_index('fecha', inplace=True)
df_final.index = df_final.index.to_period('M')

In [9]:
df_final = df_final[df_final['product_id'].isin(df_predecir['product_id'])] #Filtrar solo los productos a predecir

### Clase de LightGBM

In [10]:
df_final['diff_tn_tn2'] =  df_final['tn_2'] - df_final['tn'] #NUEVA CLASE

In [11]:
df_final

Unnamed: 0_level_0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn,tn_2,cat1,cat2,cat3,brand,sku_size,descripcion,stock_final,trimestre,diff_tn_tn2
fecha,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2017-01,2017-01-01,10234,20524,False,2,0.05300,0.05300,0.03028,HC,VAJILLA,Cristalino,Importado,500.0,Abrillantador,,1,-0.02272
2017-01,2017-01-01,10032,20524,False,1,0.13628,0.13628,0.02271,HC,VAJILLA,Cristalino,Importado,500.0,Abrillantador,,1,-0.11357
2017-01,2017-01-01,10217,20524,False,1,0.03028,0.03028,1.54452,HC,VAJILLA,Cristalino,Importado,500.0,Abrillantador,,1,1.51424
2017-01,2017-01-01,10125,20524,False,1,0.02271,0.02271,0.01514,HC,VAJILLA,Cristalino,Importado,500.0,Abrillantador,,1,-0.00757
2017-01,2017-01-01,10012,20524,False,11,1.54452,1.54452,0.10600,HC,VAJILLA,Cristalino,Importado,500.0,Abrillantador,,1,-1.43852
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12,2019-12-01,10105,20853,False,1,0.02230,0.02230,0.02898,PC,CABELLO,Shampoo Bebe,NIVEA,200.0,Sabor 1,,4,0.00668
2019-12,2019-12-01,10092,20853,False,1,0.00669,0.00669,0.01561,PC,CABELLO,Shampoo Bebe,NIVEA,200.0,Sabor 1,,4,0.00892
2019-12,2019-12-01,10006,20853,False,7,0.02898,0.02898,0.01561,PC,CABELLO,Shampoo Bebe,NIVEA,200.0,Sabor 1,,4,-0.01337
2019-12,2019-12-01,10018,20853,False,4,0.01561,0.01561,,PC,CABELLO,Shampoo Bebe,NIVEA,200.0,Sabor 1,,4,


## FE

### Lags

In [12]:
# Create lag variables for 'cust_request_qty', 'cust_request_tn', and 'tn'
n_lags = 36
for lag in range(1, n_lags + 1):
    df_final[f'cust_request_qty_lag_{lag}'] = df_final['cust_request_qty'].shift(lag)
    df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
    df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
    df_final[f'tn_lag_{lag}'] = df_final['tn'].shift(lag)

  df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
  df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
  df_final[f'tn_lag_{lag}'] = df_final['tn'].shift(lag)
  df_final[f'cust_request_qty_lag_{lag}'] = df_final['cust_request_qty'].shift(lag)
  df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
  df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
  df_final[f'tn_lag_{lag}'] = df_final['tn'].shift(lag)
  df_final[f'cust_request_qty_lag_{lag}'] = df_final['cust_request_qty'].shift(lag)
  df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
  df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
  df_final[f'tn_lag_{lag}'] = df_final['tn'].shift(lag)
  df_final[f'cust_request_qty_lag_{lag}'] = df_final['cust_request_qty'].shift(lag)
  df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
  df_final[f'stock_final_lag_{

In [13]:
#DELTA DE LAGS 2 ANTERIORES

for lag in range(1, n_lags-1):
    df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
        df_final[f'cust_request_tn_lag_{lag+2}'] - df_final[f'cust_request_tn_lag_{lag}']
    )
    df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
        df_final[f'stock_final_lag_{lag+2}'] - df_final[f'stock_final_lag_{lag}']
    )
    df_final[f'delta_tn_{lag}_{lag+2}'] = (
        df_final[f'tn_lag_{lag+2}'] - df_final[f'tn_lag_{lag}']
    )

  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_f

In [14]:
#DELTA DE LAGS 1 ANTERIOR

for lag in range(1, n_lags):
    df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
        df_final[f'cust_request_tn_lag_{lag+1}'] - df_final[f'cust_request_tn_lag_{lag}']
    )
    df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
        df_final[f'stock_final_lag_{lag+1}'] - df_final[f'stock_final_lag_{lag}']
    )
    df_final[f'delta_tn_{lag}_{lag+1}'] = (
        df_final[f'tn_lag_{lag+1}'] - df_final[f'tn_lag_{lag}']
    )

  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_f

In [15]:
#Mes actual / (lag 2 + lag 3) 

# Calcular el ratio del valor actual con respecto a la suma de los dos lags anteriores
for lag in range(3, n_lags + 1):
    df_final[f'ratio_cust_request_tn_{lag}'] = (
        df_final[f'cust_request_tn_lag_{lag}'] / (
            df_final[f'cust_request_tn_lag_{lag-1}'] + df_final[f'cust_request_tn_lag_{lag-2}']
        )
    )
    df_final[f'ratio_stock_final_{lag}'] = (
        df_final[f'stock_final_lag_{lag}'] / (
            df_final[f'stock_final_lag_{lag-1}'] + df_final[f'stock_final_lag_{lag-2}']
        )
    )
    df_final[f'ratio_tn_{lag}'] = (
        df_final[f'tn_lag_{lag}'] / (
            df_final[f'tn_lag_{lag-1}'] + df_final[f'tn_lag_{lag-2}']
        )
    )

  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_fin

In [16]:
df_final.reset_index(inplace=True)

  df_final.reset_index(inplace=True)


### Medias Móviles

In [17]:
#MEDIAS MOVILES
rolling_windows = [3, 6, 9, 12, 24, 36]

# Agrupamos por 'product_id' y calculamos las medias móviles para 'tn'
for window in rolling_windows:
    df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())

  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())


### FE

In [18]:
# Datetime features
df_final['year'] = df_final['periodo'].dt.year
df_final['month'] = df_final['periodo'].dt.month
df_final['quarter'] = df_final.periodo.dt.quarter

  df_final['year'] = df_final['periodo'].dt.year
  df_final['month'] = df_final['periodo'].dt.month
  df_final['quarter'] = df_final.periodo.dt.quarter


In [19]:
#Variables Dummies si es el max o el min de cierta cantidad de meses
months = [3, 6, 9, 12]

# Agrupamos por 'product_id' y calculamos las medias móviles para 'tn'
for i in months:
    df_final[f'max_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).max())
    df_final[f'min_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).min())
    # Crear las dummies
    df_final[f'dummy_max_{i}m'] = np.where(df_final['tn'] == df_final[f'max_{i}m'], 1, 0)
    df_final[f'dummy_min_{i}m'] = np.where(df_final['tn'] == df_final[f'min_{i}m'], 1, 0)

  df_final[f'max_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).max())
  df_final[f'min_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).min())
  df_final[f'dummy_max_{i}m'] = np.where(df_final['tn'] == df_final[f'max_{i}m'], 1, 0)
  df_final[f'dummy_min_{i}m'] = np.where(df_final['tn'] == df_final[f'min_{i}m'], 1, 0)
  df_final[f'max_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).max())
  df_final[f'min_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).min())
  df_final[f'dummy_max_{i}m'] = np.where(df_final['tn'] == df_final[f'max_{i}m'], 1, 0)
  df_final[f'dummy_min_{i}m'] = np.where(df_final['tn'] == df_final[f'min_{i}m'], 1, 0)
  df_final[f'max_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).max())
  df_final[f'min_{i}m'] = df_final.groupby('product_id')['

In [20]:
#Calcular ventas por trimestre
df_final['tn_trimestre'] = df_final.groupby(['trimestre', 'product_id'])['tn'].transform('sum')
#Calcular ventas por trimestre por cliente
df_final['tn_trimestre_customer'] = df_final.groupby(['trimestre','customer_id', 'product_id'])['tn'].transform('sum')

  df_final['tn_trimestre'] = df_final.groupby(['trimestre', 'product_id'])['tn'].transform('sum')
  df_final['tn_trimestre_customer'] = df_final.groupby(['trimestre','customer_id', 'product_id'])['tn'].transform('sum')


In [21]:
df_final['tn_product_id'] = df_final.groupby(['periodo', 'product_id'])['tn'].transform('sum')

  df_final['tn_product_id'] = df_final.groupby(['periodo', 'product_id'])['tn'].transform('sum')


### FE variables externas

In [22]:
# Leer el archivo exportado
df_exported = pd.read_excel('23variables_externas.xlsx')

# Asegúrate de que las columnas de fecha estén en el formato datetime
df_exported['fecha'] = pd.to_datetime(df_exported['fecha'])

# Unir los DataFrames por la columna de fecha
df_merged = pd.merge(df_final, df_exported, on='fecha', how='left')

# Mostrar el DataFrame unido
print(df_merged.head())

     fecha    periodo  customer_id  product_id  plan_precios_cuidados  \
0  2017-01 2017-01-01        10234       20524                  False   
1  2017-01 2017-01-01        10032       20524                  False   
2  2017-01 2017-01-01        10217       20524                  False   
3  2017-01 2017-01-01        10125       20524                  False   
4  2017-01 2017-01-01        10012       20524                  False   

   cust_request_qty  cust_request_tn       tn     tn_2 cat1  ...  \
0                 2          0.05300  0.05300  0.03028   HC  ...   
1                 1          0.13628  0.13628  0.02271   HC  ...   
2                 1          0.03028  0.03028  1.54452   HC  ...   
3                 1          0.02271  0.02271  0.01514   HC  ...   
4                11          1.54452  1.54452  0.10600   HC  ...   

  Promedio de uva_diario eph_continua_tasa_desempleo_total Promedio de merval  \
0                    NaN                               NaN             

In [23]:
df_final

Unnamed: 0,fecha,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn,tn_2,cat1,...,min_9m,dummy_max_9m,dummy_min_9m,max_12m,min_12m,dummy_max_12m,dummy_min_12m,tn_trimestre,tn_trimestre_customer,tn_product_id
0,2017-01,2017-01-01,10234,20524,False,2,0.05300,0.05300,0.03028,HC,...,0.05300,1,1,0.05300,0.05300,1,1,64.00631,0.12871,6.48085
1,2017-01,2017-01-01,10032,20524,False,1,0.13628,0.13628,0.02271,HC,...,0.05300,1,0,0.13628,0.05300,1,0,64.00631,1.36281,6.48085
2,2017-01,2017-01-01,10217,20524,False,1,0.03028,0.03028,1.54452,HC,...,0.03028,0,1,0.13628,0.03028,0,1,64.00631,0.09842,6.48085
3,2017-01,2017-01-01,10125,20524,False,1,0.02271,0.02271,0.01514,HC,...,0.02271,0,1,0.13628,0.02271,0,1,64.00631,0.03785,6.48085
4,2017-01,2017-01-01,10012,20524,False,11,1.54452,1.54452,0.10600,HC,...,0.02271,1,0,1.54452,0.02271,1,0,64.00631,7.38191,6.48085
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2293476,2019-12,2019-12-01,10105,20853,False,1,0.02230,0.02230,0.02898,PC,...,0.00446,0,0,0.62426,0.00446,0,0,18.55436,0.10479,2.89842
2293477,2019-12,2019-12-01,10092,20853,False,1,0.00669,0.00669,0.01561,PC,...,0.00446,0,0,0.62426,0.00446,0,0,18.55436,0.01561,2.89842
2293478,2019-12,2019-12-01,10006,20853,False,7,0.02898,0.02898,0.01561,PC,...,0.00446,0,0,0.62426,0.00446,0,0,18.55436,0.66662,2.89842
2293479,2019-12,2019-12-01,10018,20853,False,4,0.01561,0.01561,,PC,...,0.00446,0,0,0.62426,0.00446,0,0,18.55436,0.31213,2.89842


In [24]:
df_final['plan_precios_cuidados'] =df_final['plan_precios_cuidados'].astype(bool)
#df_final['dias_fin_trimestre'] = df_final['dias_fin_trimestre'].dt.days.astype(int) #TARDA!
df_final = df_final.drop(columns=['fecha'])

## Inicio Train

In [25]:
# #Cambiar las variables categoricas y hacer one-hot encoding

df_final["cat1"] = df_final["cat1"].astype("category")
df_final["cat2"] = df_final["cat2"].astype("category")
df_final["cat3"] = df_final["cat3"].astype("category")
df_final["brand"] = df_final["brand"].astype("category")
df_final["descripcion"] = df_final["descripcion"].astype("category")

# # Encode categorical variables explicitly. One-hot encoding
cat1_dummies = pd.get_dummies(df_final['cat1'], prefix='cat1', drop_first=True)
cat2_dummies = pd.get_dummies(df_final['cat2'], prefix='cat2', drop_first=True)
cat3_dummies = pd.get_dummies(df_final['cat3'], prefix='cat3', drop_first=True)
brand_dummies = pd.get_dummies(df_final['brand'], prefix='brand', drop_first=True)
descripcion_dummies = pd.get_dummies(df_final['descripcion'], prefix='descripcion', drop_first=True)

# # Concatenate the dummy variables to the DataFrame and drop the original categorical columns
df_final= pd.concat([df_final, cat1_dummies, cat2_dummies, cat3_dummies, brand_dummies, descripcion_dummies], axis=1)
df_final.drop(columns=['cat1', 'cat2', 'cat3', 'brand'], inplace=True)

df_final.set_index('periodo', inplace=True)
df_final.index = df_final.index.to_period('M')
df_final.sort_index(inplace=True)


In [26]:

df_final.to_parquet(DATOS_DIR+'/FE_dataset-CARLA.parquet', engine='pyarrow')  