### Imports

In [1]:
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
#from feature_baggingV2 import FeatureBaggingWithHyperparamTuning
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
#import lightgbm as lgb
import datetime
from sklearn.preprocessing import RobustScaler, PowerTransformer
from keras.models import Sequential
from keras.layers import LSTM, Dense
import labolibrary as labo

import os, sys, gc, time, warnings, pickle, psutil, random

### Función para escalar/desescalar y métrica

In [2]:
# Function to center, scale, and return a series
def scale_group(group):
    #scaler = RobustScaler()
    scaler = PowerTransformer()
    scaled_values = scaler.fit_transform(group.values.reshape(-1, 1)).flatten()
    scalers[group.name] = scaler  # Store the scaler for this group
    return pd.Series(scaled_values, index=group.index, name=group.name)

# Function to inverse transform (de-scale) and decenter, and return a series
def inverse_scale_group(group):
    group_name = group.name
    scaler = scalers[group_name]
    inversed_centered_values = scaler.inverse_transform(group.values.reshape(-1, 1)).flatten()
    original_values = inversed_centered_values
    return pd.Series(original_values, index=group.index, name=group_name)



### Archivos

In [3]:
#DATOS_DIR = '~/buckets/b1/datasets/'
DATOS_DIR = '../data/'
df_sell_in = pd.read_csv(DATOS_DIR+'sell-in.txt', sep='\t')
df_predecir = pd.read_csv(DATOS_DIR+'productos_a_predecir.txt', sep='\t')
df_tb_stocks = pd.read_csv(DATOS_DIR+'tb_stocks.txt', sep='\t')
df_tb_productos = pd.read_csv(DATOS_DIR+'tb_productos_descripcion.txt', sep='\t')

Si quiero predecir algun mes en especifico y filtrar los productos con más de 3 meses de venta

In [4]:
#df_sell_in = df_sell_in[df_sell_in['periodo']<201808]
#df_sell_in['tn_2'] = np.where(df_sell_in['periodo'].isin([201805, 201804]), np.nan, df_sell_in['tn_2']) 
#df_sell_in = df_sell_in[df_sell_in['periodo']<201806]
#sales_counts = df_sell_in.groupby('product_id')['periodo'].count() # Contar el número de meses de ventas por product_id
#products_with_more_than_3_months = sales_counts[sales_counts > 3].index # Filtrar los productos con más de 3 meses de ventas
#df_sell_in = df_sell_in[df_sell_in['product_id'].isin(products_with_more_than_3_months)] # Filtrar el DataFrame original para incluir solo estos productos

### Preprocesamiento

In [5]:
df_tb_stocks['periodo'] = pd.to_datetime(df_tb_stocks['periodo'], format='%Y%m')
df_tb_stocks['periodo'] = df_tb_stocks['periodo'] - pd.DateOffset(months=1) #mes diferente en stock
df_tb_stocks['product_id'] = df_tb_stocks['product_id'].astype(int)
df_tb_stocks['stock_final'] = df_tb_stocks['stock_final'].astype(float)
df_tb_productos['product_id'] = df_tb_productos['product_id'].astype(int)
df_tb_productos['sku_size'] = df_tb_productos['sku_size'].astype(int)
df_sell_in['periodo'] = pd.to_datetime(df_sell_in['periodo'], format='%Y%m')
df_sell_in['product_id'] = df_sell_in['product_id'].astype(int)
df_sell_in['customer_id'] = df_sell_in['customer_id'].astype(int)
df_sell_in['cust_request_qty'] = df_sell_in['cust_request_qty'].astype(int)
df_sell_in['cust_request_tn'] = df_sell_in['cust_request_tn'].astype(float)
df_sell_in['tn'] = df_sell_in['tn'].astype(float)
df_sell_in['plan_precios_cuidados'] = df_sell_in['plan_precios_cuidados'].astype(bool)

### Consolidar Datos

In [6]:
#### Agrupar y escalar
scalers = {}
df_sell_in['weight'] = df_sell_in['tn']
df_sell_in['tn'] = df_sell_in.groupby('product_id')['tn'].transform(scale_group)
df_sell_in['tn_inv'] = df_sell_in.groupby('product_id')['tn'].transform(inverse_scale_group)


In [7]:

df_sell_in['tn_2'] = df_sell_in['tn'].shift(-2)
df_sell_in['cust_request_qty'] = df_sell_in.groupby('product_id')['cust_request_qty'].transform(scale_group)  #OBTENGO TN 2 MESES ADELANTE (CLASE FINAL)
df_sell_in['cust_request_tn'] = df_sell_in.groupby('product_id')['cust_request_tn'].transform(scale_group)  #OBTENGO TN 2 MESES ADELANTE (CLASE FINAL)
df_sell_in['cust_request_qty'] = df_sell_in.groupby('product_id')['cust_request_qty'].transform(scale_group)  #OBTENGO TN 2 MESES ADELANTE (CLASE FINAL)
df_tb_stocks['stock_final'] = df_tb_stocks.groupby('product_id')['stock_final'].transform(scale_group)  #OBTENGO TN 2 MESES ADELANTE (CLASE FINAL)

  

In [8]:
# Join tb_productos to sell_in on product_id
df_sell_in_merged = pd.merge(df_sell_in, df_tb_productos, on='product_id', how='left')
# Join tb_stocks to sell_in_merged on both product_id and periodo
df_final = pd.merge(df_sell_in_merged, df_tb_stocks, on=['product_id', 'periodo'], how='left')

#Convertir 'periodo' a formato de fecha y Calcular el trimestre desde 'periodo'
df_final['fecha'] = pd.to_datetime(df_final['periodo'], format='%Y%m')
df_final["trimestre"] = df_final.periodo.dt.quarter

#Establecer 'fecha' como índice y convertir a período mensual
df_final.set_index('fecha', inplace=True)
df_final.index = df_final.index.to_period('M')

In [9]:
df_final = df_final[df_final['product_id'].isin(df_predecir['product_id'])] #Filtrar solo los productos a predecir

### Clase de LightGBM

In [10]:
df_final['diff_tn_tn2'] =  df_final['tn_2'] - df_final['tn'] #NUEVA CLASE


## FE

### Lags

In [11]:
# Create lag variables for 'cust_request_qty', 'cust_request_tn', and 'tn'
n_lags = 36
for lag in range(1, n_lags + 1):
    df_final[f'cust_request_qty_lag_{lag}'] = df_final['cust_request_qty'].shift(lag)
    
    df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
    
    df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
   
    df_final[f'tn_lag_{lag}'] = df_final['tn'].shift(lag)
    

  df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
  df_final[f'tn_lag_{lag}'] = df_final['tn'].shift(lag)
  df_final[f'cust_request_qty_lag_{lag}'] = df_final['cust_request_qty'].shift(lag)
  df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
  df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
  df_final[f'tn_lag_{lag}'] = df_final['tn'].shift(lag)
  df_final[f'cust_request_qty_lag_{lag}'] = df_final['cust_request_qty'].shift(lag)
  df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
  df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
  df_final[f'tn_lag_{lag}'] = df_final['tn'].shift(lag)
  df_final[f'cust_request_qty_lag_{lag}'] = df_final['cust_request_qty'].shift(lag)
  df_final[f'cust_request_tn_lag_{lag}'] = df_final['cust_request_tn'].shift(lag)
  df_final[f'stock_final_lag_{lag}'] = df_final['stock_final'].shift(lag)
  df_final[f'tn_lag_{lag}'] = df_final

In [12]:
#DELTA DE LAGS 2 ANTERIORES

for lag in range(1, n_lags-1):
    df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
        df_final[f'cust_request_tn_lag_{lag+2}'] - df_final[f'cust_request_tn_lag_{lag}']
    )
    df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
        df_final[f'stock_final_lag_{lag+2}'] - df_final[f'stock_final_lag_{lag}']
    )
    df_final[f'delta_tn_{lag}_{lag+2}'] = (
        df_final[f'tn_lag_{lag+2}'] - df_final[f'tn_lag_{lag}']
    )

  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_final[f'delta_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+2}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+2}'] = (
  df_f

In [13]:
#DELTA DE LAGS 1 ANTERIOR

for lag in range(1, n_lags):
    df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
        df_final[f'cust_request_tn_lag_{lag+1}'] - df_final[f'cust_request_tn_lag_{lag}']
    )
    df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
        df_final[f'stock_final_lag_{lag+1}'] - df_final[f'stock_final_lag_{lag}']
    )
    df_final[f'delta_tn_{lag}_{lag+1}'] = (
        df_final[f'tn_lag_{lag+1}'] - df_final[f'tn_lag_{lag}']
    )

  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_final[f'delta_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_cust_request_tn_{lag}_{lag+1}'] = (
  df_final[f'delta_stock_final_{lag}_{lag+1}'] = (
  df_f

In [14]:
#Mes actual / (lag 2 + lag 3) 

# Calcular el ratio del valor actual con respecto a la suma de los dos lags anteriores
for lag in range(3, n_lags + 1):
    df_final[f'ratio_cust_request_tn_{lag}'] = (
        df_final[f'cust_request_tn_lag_{lag}'] / (
            df_final[f'cust_request_tn_lag_{lag-1}'] + df_final[f'cust_request_tn_lag_{lag-2}']
        )
    )
    df_final[f'ratio_stock_final_{lag}'] = (
        df_final[f'stock_final_lag_{lag}'] / (
            df_final[f'stock_final_lag_{lag-1}'] + df_final[f'stock_final_lag_{lag-2}']
        )
    )
    df_final[f'ratio_tn_{lag}'] = (
        df_final[f'tn_lag_{lag}'] / (
            df_final[f'tn_lag_{lag-1}'] + df_final[f'tn_lag_{lag-2}']
        )
    )

  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_final[f'ratio_cust_request_tn_{lag}'] = (
  df_final[f'ratio_stock_final_{lag}'] = (
  df_final[f'ratio_tn_{lag}'] = (
  df_fin

In [15]:
df_final.reset_index(inplace=True)

  df_final.reset_index(inplace=True)


### Medias Móviles

In [16]:
#MEDIAS MOVILES
rolling_windows = [3, 6, 9, 12, 24, 36]

# Agrupamos por 'product_id' y calculamos las medias móviles para 'tn'
for window in rolling_windows:
    df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
   

  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())
  df_final[f'rolling_mean_tn_{window}'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(window, min_periods=1).mean())


### FE

In [17]:
# Datetime features
df_final['year'] = df_final['periodo'].dt.year
df_final['month'] = df_final['periodo'].dt.month
df_final['quarter'] = df_final.periodo.dt.quarter

  df_final['year'] = df_final['periodo'].dt.year
  df_final['month'] = df_final['periodo'].dt.month
  df_final['quarter'] = df_final.periodo.dt.quarter


In [18]:
#Variables Dummies si es el max o el min de cierta cantidad de meses
months = [3, 6, 9, 12]

# Agrupamos por 'product_id' y calculamos las medias móviles para 'tn'
for i in months:
    df_final[f'max_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).max())
    df_final[f'min_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).min())
    # Crear las dummies
    df_final[f'dummy_max_{i}m'] = np.where(df_final['tn'] == df_final[f'max_{i}m'], 1, 0)
    df_final[f'dummy_min_{i}m'] = np.where(df_final['tn'] == df_final[f'min_{i}m'], 1, 0)

  df_final[f'max_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).max())
  df_final[f'min_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).min())
  df_final[f'dummy_max_{i}m'] = np.where(df_final['tn'] == df_final[f'max_{i}m'], 1, 0)
  df_final[f'dummy_min_{i}m'] = np.where(df_final['tn'] == df_final[f'min_{i}m'], 1, 0)
  df_final[f'max_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).max())
  df_final[f'min_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).min())
  df_final[f'dummy_max_{i}m'] = np.where(df_final['tn'] == df_final[f'max_{i}m'], 1, 0)
  df_final[f'dummy_min_{i}m'] = np.where(df_final['tn'] == df_final[f'min_{i}m'], 1, 0)
  df_final[f'max_{i}m'] = df_final.groupby('product_id')['tn'].transform(lambda x: x.rolling(i, min_periods=1).max())
  df_final[f'min_{i}m'] = df_final.groupby('product_id')['

In [19]:
#Calcular ventas por trimestre
df_final['tn_trimestre'] = df_final.groupby(['trimestre', 'product_id'])['tn'].transform('sum')

df_final['tn_trimestre_escalada'] = df_final.groupby('product_id')['tn_trimestre'].transform(scale_group) 
#Calcular ventas por trimestre por cliente
df_final['tn_trimestre_customer'] = df_final.groupby(['trimestre','customer_id', 'product_id'])['tn'].transform('sum')

df_final['ttn_trimestre_customer_escalada'] = df_final.groupby('product_id')['tn_trimestre_customer'].transform(scale_group) 

  df_final['tn_trimestre'] = df_final.groupby(['trimestre', 'product_id'])['tn'].transform('sum')
  df_final['tn_trimestre_escalada'] = df_final.groupby('product_id')['tn_trimestre'].transform(scale_group)
  df_final['tn_trimestre_customer'] = df_final.groupby(['trimestre','customer_id', 'product_id'])['tn'].transform('sum')
  df_final['ttn_trimestre_customer_escalada'] = df_final.groupby('product_id')['tn_trimestre_customer'].transform(scale_group)


In [20]:
df_final['tn_product_id'] = df_final.groupby(['periodo', 'product_id'])['tn'].transform('sum')

  df_final['tn_product_id'] = df_final.groupby(['periodo', 'product_id'])['tn'].transform('sum')


### FE variables externas

In [21]:
# Leer el archivo exportado
df_exported = pd.read_excel(DATOS_DIR+'23variables_externas.xlsx')

# Asegúrate de que las columnas de fecha estén en el formato datetime
df_exported['fecha'] = pd.to_datetime(df_exported['fecha'])

# Unir los DataFrames por la columna de fecha
df_merged = pd.merge(df_final, df_exported, on='fecha', how='left')


In [22]:
df_final['plan_precios_cuidados'] =df_final['plan_precios_cuidados'].astype(bool)
#df_final['dias_fin_trimestre'] = df_final['dias_fin_trimestre'].dt.days.astype(int) #TARDA!
df_final = df_final.drop(columns=['fecha'])

## Inicio Train

In [23]:
# #Cambiar las variables categoricas y hacer one-hot encoding

df_final["cat1"] = df_final["cat1"].astype("category")
df_final["cat2"] = df_final["cat2"].astype("category")
df_final["cat3"] = df_final["cat3"].astype("category")
df_final["brand"] = df_final["brand"].astype("category")
df_final["descripcion"] = df_final["descripcion"].astype("category")

# # Encode categorical variables explicitly. One-hot encoding
cat1_dummies = pd.get_dummies(df_final['cat1'], prefix='cat1', drop_first=True)
cat2_dummies = pd.get_dummies(df_final['cat2'], prefix='cat2', drop_first=True)
cat3_dummies = pd.get_dummies(df_final['cat3'], prefix='cat3', drop_first=True)
brand_dummies = pd.get_dummies(df_final['brand'], prefix='brand', drop_first=True)
descripcion_dummies = pd.get_dummies(df_final['descripcion'], prefix='descripcion', drop_first=True)

# # Concatenate the dummy variables to the DataFrame and drop the original categorical columns
df_final= pd.concat([df_final, cat1_dummies, cat2_dummies, cat3_dummies, brand_dummies, descripcion_dummies], axis=1)
df_final.drop(columns=['cat1', 'cat2', 'cat3', 'brand'], inplace=True)

df_final.set_index('periodo', inplace=True)
df_final.index = df_final.index.to_period('M')
df_final.sort_index(inplace=True)


In [24]:
import pickle

# Assuming scalers_dict is the dictionary containing the scalers
# Convert the scalers_dict to binary file
with open(DATOS_DIR+'/scalers.pkl', 'wb') as file:#
     pickle.dump(scalers, file)

df_final.to_parquet(DATOS_DIR+'/FE_02_dataset.parquet', engine='pyarrow')  

#### EJECUTAR

In [None]:
weight = df_final.groupby(['product_id', df_final.index])['weight'].transform('mean')
#df_final.drop(columns=['weight'], inplace=True)

In [None]:

### Filtrar datos
df_true = df_final.loc['2019-12-01':'2020-01-01']
df_final = df_final.loc['2018-01-01':'2019-11-01']


#Filtro de no compradores
#Step 1: Ensure the index is a datetime type
df_final.index = df_final.index.to_timestamp()
# Step 2: Determine the last date and calculate the date 3 months prior
ls_date  = df_final.index.max()
three_months_prior = ls_date - pd.DateOffset(months=3)

# Step 3: Filter the dataframe to include onl+y rows within the last 3 months
last_3_months_df = df_final[df_final.index >= three_months_prior]

# Step 4: Identify the unique client_id that have purchased within this period
active_clients = last_3_months_df['customer_id'].unique()

# Step 5: Filter the original dataframe to include only these client_id
df_final = df_final[df_final['customer_id'].isin(active_clients)]

df_final.index = pd.PeriodIndex(df_final.index, freq='M')


#Filtro test
#df_final = df_final[df_final['product_id'] < 20013]

In [None]:

df_final.columns = df_final.columns.str.replace(' ', '_').str.replace(r'[^A-Za-z0-9_]', '', regex=True)

In [None]:
# Correr Modelo
params={
        'boosting_type': 'gbdt',
        'objective': 'Regression',
        'metric':'rmse',
        'verbose': -1,
        #'n_jobs': -1,
        #'seed': 113,
        #'learning_rate': 0.2,
        #'bagging_fraction': 0.85,
        #'bagging_freq': 1, 
        #'colsample_bytree': 0.85,
        #'colsample_bynode': 0.85,
        #'min_data_per_leaf': 25,
        #'num_leaves': 200,
        #'lambda_l1': 0.5,
        #'lambda_l2': 0.5
}

predictions_all = pd.DataFrame(columns=['tn'])
products = df_final['product_id'].unique()
tot = len(products)
nro = 0
for producto in products:
    print(f'Fitting and predicting for product_id: {producto}')
    # Filtrar los datos del producto
    df_producto = df_final[df_final['product_id'] == producto]

    # Prepare data for LSTM on tn_2 only
    X = df_producto[['tn_2']].values.astype('float32')
    y = df_producto['tn_2'].values.astype('float32')
    X = X.reshape((X.shape[0], 1, X.shape[1]))
    #######################################################    
    # Define LSTM model
    model = Sequential()
    model.add(LSTM(50, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy')

    # Train LSTM model
    model.fit(X, y, epochs=10, batch_size=32, verbose=1)

    # Extract features from LSTM
    features = model.predict(X)

    # Prepare data for LightGBM
    # Convert LSTM features to DataFrame
    features_df = pd.DataFrame(features, index=df_producto.index, columns=[f'lstm_feature_{i}' for i in range(features.shape[1])])
    df_producto = pd.concat([df_producto, features_df], axis=1)
    df_producto['tn_2'] = y
    #############################################################################  
    #### Agrupar y escalar
   
    model, average_metric = labo.train_lightgbm_model(df_producto,params,metric='rmse')
    print("Overall rmse metric: ", average_metric)
    # Predict values for the entire dataset using the trained models
    # Prepare last data points for prediction
    last_data_points = df_producto[df_producto.index == df_producto.index.max()].copy()
    last_data_points.drop(columns=['tn_2'], inplace=True)
    # Predict the next month's value using the trained model
    predictions = labo.predict_next_month(model, last_data_points)
    preds = predictions.groupby('product_id')['tn_2'].transform(inverse_scale_group)
    predictions['tn'] = preds
    predictions.drop(columns=['tn_2'], inplace=True)
    predictions = predictions.reset_index()
    predictions =  predictions.groupby('product_id')['tn'].sum()
    predictions.columns = ['product_id', 'tn']
    predictions_all = pd.concat([predictions_all, predictions])
    print(predictions_all[-1:])


In [None]:

predictions_all['tn']=predictions_all['tn'].astype('float32')
predictions_all.index.names = ['product_id']
predictions_all.to_csv(DATOS_DIR+'/pred/0018-prediccion-rmse_scaled_CORRECT2-product_id_LSTM-No_Buyer.csv', index=True,header=True)
print("Overall custom metric: ", average_metric)
