In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb



In [2]:
df_tb_productos = pd.read_csv('tb_productos.txt', sep='\t')
df_sell_in = pd.read_csv('sell-in.txt', sep='\t')
df_predecir = pd.read_csv('productos_a_predecir.txt', sep='\t')
df_tb_stocks = pd.read_csv('tb_stocks.txt', sep='\t')

In [3]:
# Join tb_productos to sell_in on product_id
df_sell_in_merged = pd.merge(df_sell_in, df_tb_productos, on='product_id', how='left')
# Join tb_stocks to sell_in_merged on both product_id and periodo
df_final = pd.merge(df_sell_in_merged, df_tb_stocks, on=['product_id', 'periodo'], how='left')
df_final['fecha'] = pd.to_datetime(df_final['periodo'], format='%Y%m')

In [4]:
df_sell_in = df_sell_in[df_sell_in['periodo']>=2019] #Tomo los ultimos 12 meses para predecir

In [5]:
# Join tb_productos to sell_in on product_id
df_sell_in_merged = pd.merge(df_sell_in, df_tb_productos, on='product_id', how='left')
# Join tb_stocks to sell_in_merged on both product_id and periodo
df_final = pd.merge(df_sell_in_merged, df_tb_stocks, on=['product_id', 'periodo'], how='left')

In [6]:
productos_a_predecir = df_predecir['product_id'].unique()

df_final = df_sell_in[df_sell_in['product_id'].isin(productos_a_predecir)]

In [7]:
df_final['fecha'] = pd.to_datetime(df_final['periodo'], format='%Y%m')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final['fecha'] = pd.to_datetime(df_final['periodo'], format='%Y%m')


In [8]:
df_final = df_final.groupby(['product_id','fecha'])['tn'].sum().reset_index()

In [9]:
df_final_filtrado = df_final[df_final['product_id'].isin(productos_a_predecir)]
df_final_filtrado['fecha'] = pd.to_datetime(df_final_filtrado['fecha'])
ultima_fecha = df_final_filtrado['fecha'].max()

print(df_final_filtrado)

       product_id      fecha          tn
0           20001 2017-01-01   934.77222
1           20001 2017-02-01   798.01620
2           20001 2017-03-01  1303.35771
3           20001 2017-04-01  1069.96130
4           20001 2017-05-01  1502.20132
...           ...        ...         ...
22344       21276 2019-08-01     0.01265
22345       21276 2019-09-01     0.01856
22346       21276 2019-10-01     0.02079
22347       21276 2019-11-01     0.03341
22348       21276 2019-12-01     0.00892

[22349 rows x 3 columns]


In [11]:
df_final_filtrado['year'] = df_final_filtrado['fecha'].dt.year
df_final_filtrado['month'] = df_final_filtrado['fecha'].dt.month
df_final_filtrado = df_final_filtrado.drop(['fecha'], axis=1)

In [12]:
scalers = {}

# Función para escalar y devolver una serie
def minmax_scale_group(group):
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(group.values.reshape(-1, 1)).flatten()
    scalers[group.name] = scaler  # Almacenar el escalador para este grupo
    return pd.Series(scaled_values, index=group.index)

df_final_filtrado['valor_escalado'] = df_final_filtrado.groupby('product_id')['tn'].transform(minmax_scale_group)

In [13]:
df_final_filtrado.head(3)

Unnamed: 0,product_id,tn,year,month,valor_escalado
0,20001,934.77222,2017,1,0.091342
1,20001,798.0162,2017,2,0.0
2,20001,1303.35771,2017,3,0.337528


In [14]:
dataset_final = df_final_filtrado.drop(['tn'], axis=1)

In [15]:
X = dataset_final.drop(['valor_escalado'], axis=1)
y = dataset_final.valor_escalado

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=2, shuffle=False)

In [17]:
# Crear dataset de LightGBM
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Definir parámetros del modelo
params = {
    'objective': 'regression',
    'metric': 'rmse',
}

# Entrenar el modelo
model = lgb.train(params, 
                  train_data, 
                  valid_sets=[test_data],
                  num_boost_round=100,
                  callbacks=[lgb.early_stopping(stopping_rounds=10)])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000112 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 272
[LightGBM] [Info] Number of data points in the train set: 22347, number of used features: 3
[LightGBM] [Info] Start training from score 0.414666
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[35]	valid_0's rmse: 0.0951565


In [18]:
# Obtener la última fecha disponible

# Calcular la fecha del segundo mes siguiente
fecha_prediccion = ultima_fecha + pd.DateOffset(months=2)

# Agregar la columna de fecha a df_predecir
df_predecir['fecha'] = fecha_prediccion


# Extraer características de la fecha futura
df_predecir['year'] = df_predecir['fecha'].dt.year
df_predecir['month'] = df_predecir['fecha'].dt.month

X_future = df_predecir.drop(['fecha'], axis=1)

# Realizar la predicción
df_predecir['pred'] = model.predict(X_future)

In [19]:
df_predecir.head(6)

Unnamed: 0,product_id,fecha,year,month,pred
0,20001,2020-02-01,2020,2,0.29123
1,20002,2020-02-01,2020,2,0.29123
2,20003,2020-02-01,2020,2,0.29123
3,20004,2020-02-01,2020,2,0.29123
4,20005,2020-02-01,2020,2,0.29123
5,20006,2020-02-01,2020,2,0.29123


In [20]:
# Función para desescalar y devolver una serie
def inverse_minmax_scale_group(group):
    scaler = scalers[group.name]
    inversed_values = scaler.inverse_transform(group.values.reshape(-1, 1)).flatten()
    return pd.Series(inversed_values, index=group.index)

# Aplicar la desescalado a cada grupo de productos
df_predecir['tn'] = df_predecir.groupby('product_id')['pred'].transform(inverse_minmax_scale_group)

In [21]:
df_predecir.head(3)

Unnamed: 0,product_id,fecha,year,month,pred,tn
0,20001,2020-02-01,2020,2,0.29123,1234.040326
1,20002,2020-02-01,2020,2,0.29123,935.057206
2,20003,2020-02-01,2020,2,0.29123,943.092043


In [22]:
# Especificar las columnas a mantener
columnas_a_mantener = ['product_id','tn']

# Seleccionar solo las columnas especificadas
df_predecir = df_predecir.loc[:, columnas_a_mantener]

In [23]:
# Convertir la lista a un DataFrame
df_predictions = pd.DataFrame(df_predecir)
df_predictions['tn'] = df_predictions['tn'].clip(lower=0)
df_predictions = df_predictions.fillna(0)
df_predictions.to_csv('predicciones_escaladasMinMaxV2.csv', index=False, header=True)