- Dataset 36 meses para todos los productos completados con ceros

In [1]:
import pandas as pd
import numpy as np
import importlib
import gc
import sys
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Make the project's helper scripts importable. The membership check keeps
# the cell idempotent: re-running it no longer appends a duplicate entry to
# sys.path every time.
SCRIPTS_DIR = '../../notebooks/entregable/scripts'
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

import dataset
import preprocesamiento
import target
import feature_engineering

# Re-execute each helper module (same order as imported) so that edits made
# to the scripts after the first import take effect without a kernel restart.
for _modulo in (dataset, preprocesamiento, target, feature_engineering):
    importlib.reload(_modulo)

In [None]:
# Load the preprocessed base table; the cell output is its (rows, columns) shape.
df = pd.read_csv(
    "../../data/preprocessed/base.csv",
    sep=',',
)
df.shape

In [None]:
# Period x product grid, built by the project helper (implementation not
# visible from this notebook).
data = dataset.combinatoria_periodo_producto()

# Replace the datetime period with an integer yyyymm key (year * 100 + month).
data = data.assign(
    periodo=lambda d: d['periodo'].dt.year * 100 + d['periodo'].dt.month
)
data.shape

In [None]:
# Keep only the rows whose product_id appears in the "products to predict" file.
productos_ok = pd.read_csv("../../data/raw/product_id_apredecir201912.csv", sep="\t")
ids_a_predecir = productos_ok['product_id'].unique()
es_producto_ok = data['product_id'].isin(ids_a_predecir)
data = data.loc[es_producto_ok]
data

In [None]:
# --- Product attributes ---
# Keep a single row per product_id (first occurrence) before the left join.
productos = (
    pd.read_csv("../../data/raw/tb_productos.csv", sep='\t')
    .drop_duplicates(subset=['product_id'], keep='first')
)
data = pd.merge(data, productos, on="product_id", how='left')
del productos

# --- Stocks ---
# Sum stock_final to one row per (periodo, product_id) before joining.
stocks = pd.read_csv("../../data/raw/tb_stocks.csv", sep='\t')
stocks = stocks.groupby(["periodo", "product_id"], as_index=False).agg({"stock_final": "sum"})
data = pd.merge(data, stocks, on=['periodo', 'product_id'], how='left')
del stocks

# --- Sell-in ---
# Sum the sales measures to one row per (periodo, product_id) before joining.
sellin = pd.read_csv("../../data/raw/sell-in.csv", sep='\t')
sellin = sellin.groupby(["periodo", "product_id"], as_index=False).agg(
    {
        "tn": "sum",
        "plan_precios_cuidados": "sum",
        "cust_request_qty": "sum",
        "cust_request_tn": "sum",
    }
)
data = pd.merge(data, sellin, on=['periodo', 'product_id'], how='left')
del sellin

# The source frames were deleted above; ask the collector to reclaim them now.
gc.collect()

In [None]:
# Rows of the period x product grid with no matching sell-in record come out
# of the left join with NaN in `tn`. Report how many, then set them to zero.
tn_faltantes = data['tn'].isna()
print(f"Total de periodos con Nan debido a la combinatoria periodo_x_producto: {tn_faltantes.sum()}")
data['tn'] = data['tn'].mask(tn_faltantes, 0)

In [None]:
# Persist the completed period x product table (no index column written).
data.to_csv(
    "./datasets/periodo_x_producto.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

In [None]:
# Reload the table from the location the save cell writes to ("./datasets/").
# The previous path, "./periodo_x_producto.csv", is not produced by any
# visible cell, so a fresh Restart & Run All would fail or read a stale file.
data = pd.read_csv("./datasets/periodo_x_producto.csv", sep=',')

In [None]:
from neuralprophet import NeuralProphet
from tqdm import tqdm

# ---------------------
# 🛠 Initial setup
# ---------------------
# Reload the period x product table from the location the save cell writes
# to ("./datasets/"); "./periodo_x_producto.csv" is not produced by any
# visible cell of this notebook.
data = pd.read_csv("./datasets/periodo_x_producto.csv", sep=',')

# NeuralProphet works on a datetime column `ds` and a target column `y`:
# parse the integer yyyymm period into `ds`, rename tn -> y, and order the
# rows by product and date. Built as one chain so the cell is idempotent.
df = (
    data[['product_id', 'periodo', 'tn']]
    .assign(ds=lambda d: pd.to_datetime(d['periodo'].astype(str), format='%Y%m'))
    .drop(columns=['periodo'])
    .rename(columns={'tn': 'y'})
    .sort_values(['product_id', 'ds'])
)

# Products to forecast. Read from the same raw-data path the other cells of
# this notebook use for this file (it was "./" here only).
productos_ok = pd.read_csv("../../data/raw/product_id_apredecir201912.csv", sep="\t")

# Collects one dict per successfully forecast product.
predicciones = []


# ---------------------
# 🔁 Per-product loop
# ---------------------
from neuralprophet import set_random_seed

# Fixed seed: model training is stochastic, so without it every run of this
# cell produces different forecasts.
SEMILLA = 42

# tqdm (imported at the top of this cell) shows progress over the products.
for product_id in tqdm(productos_ok['product_id'].unique(), desc="NeuralProphet"):
    df_prod = df[df['product_id'] == product_id].sort_values('ds')

    if len(df_prod) < 6:
        continue  # skip products with too few observations

    try:
        # Re-seed before every fit so a product's forecast does not depend on
        # which other products were fitted (or failed) before it.
        set_random_seed(SEMILLA)

        # Yearly seasonality only; additive mode matters because the series
        # contain zeros.
        model = NeuralProphet(
            yearly_seasonality=True,
            weekly_seasonality=False,
            daily_seasonality=False,
            seasonality_mode='additive'
        )

        # Fit on this product's monthly series (freq='MS' = month start).
        model.fit(df_prod[['ds', 'y']], freq='MS', progress='off')

        # Extend the series two periods ahead and predict them.
        future = model.make_future_dataframe(df_prod[['ds', 'y']], periods=2)
        forecast = model.predict(future)

        # The last row is the month+2 forecast.
        forecast_mes2 = forecast.tail(1)

        predicciones.append({
            'product_id': product_id,
            'fecha_predicha': forecast_mes2['ds'].values[0],
            'yhat1': forecast_mes2['yhat1'].values[0]
        })
    except Exception as e:
        # Best effort: report the failing product and continue with the rest.
        print(f"⚠️ Producto {product_id} falló: {e}")

# ---------------------
# 📊 Final results
# ---------------------
# NOTE(review): the next cell reads "./datasets/neuralprophet_v4.csv", which no
# visible cell writes. If this frame is meant to be that file, persist it here;
# confirm before relying on Restart & Run All.
df_predicciones = pd.DataFrame(predicciones)
print(df_predicciones.head())

In [4]:
# Build the Kaggle submission: expose the forecast column `yhat1` as `tn`
# and write out only (product_id, tn).
df_kaggle = pd.read_csv("./datasets/neuralprophet_v4.csv", sep=',')
df_kaggle = df_kaggle.rename(columns={'yhat1': 'tn'})
df_kaggle[['product_id', 'tn']].to_csv(
    "./datasets/neuralprophet_v4_kaggle.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)

In [5]:
def promedio_12_meses_780p(
    ruta_ventas="./datasets/periodo_x_producto_con_target.csv",
    ruta_productos="../../data/raw/product_id_apredecir201912.csv",
    periodo_desde=201901,
    periodo_hasta=None,
):
    """Return the mean `tn` per product over a period window.

    Called with no arguments it behaves as before: it reads the default
    files, keeps periods >= 201901 and averages `tn` for the products listed
    in the "products to predict" file.

    Parameters
    ----------
    ruta_ventas : str
        Comma-separated CSV with at least `periodo`, `product_id` and `tn`.
    ruta_productos : str
        Tab-separated CSV with a `product_id` column (products to keep).
    periodo_desde : int
        First period (yyyymm, inclusive) of the averaging window.
    periodo_hasta : int or None
        Last period (yyyymm, inclusive); None leaves the window open-ended.

    Returns
    -------
    pandas.DataFrame
        Columns `product_id` and `tn` (the mean), one row per product.
    """
    ventas = pd.read_csv(ruta_ventas, sep=',', encoding='utf-8')
    productos_ok = pd.read_csv(ruta_productos, sep="\t")

    en_ventana = ventas['periodo'] >= periodo_desde
    if periodo_hasta is not None:
        en_ventana = en_ventana & (ventas['periodo'] <= periodo_hasta)

    # Filter with isin rather than an inner merge: duplicated ids in the
    # product list cannot multiply rows, and no extra columns are pulled in.
    es_producto_ok = ventas['product_id'].isin(productos_ok['product_id'])

    return (
        ventas.loc[en_ventana & es_producto_ok]
        .groupby('product_id', as_index=False)
        .agg({'tn': 'mean'})
    )

# Per-product mean of `tn` over periods >= 201901, restricted to the products
# listed in the "products to predict" file.
df_promedios = promedio_12_meses_780p()


In [6]:
# Attach each product's average to its forecast. Both frames have a `tn`
# column, so pandas applies its default suffixes: tn_x is the forecast
# (left frame) and tn_y is the average (right frame).
df_kaggle = pd.merge(df_kaggle, df_promedios, how='left', on='product_id')
df_kaggle

Unnamed: 0,product_id,fecha_predicha,tn_x,tn_y
0,20001,2020-02-01,978.498200,1454.732720
1,20002,2020-02-01,1085.884400,1175.437142
2,20003,2020-02-01,552.694200,784.976407
3,20004,2020-02-01,406.699340,627.215328
4,20005,2020-02-01,541.698850,668.270104
...,...,...,...,...
775,21263,2020-02-01,-0.026363,0.029993
776,21265,2020-02-01,0.067555,0.089541
777,21266,2020-02-01,0.074221,0.094659
778,21267,2020-02-01,0.019894,0.092835


In [7]:
# Where the forecast (tn_x) is negative, substitute the product's average
# (tn_y); then expose the result as `tn` and write the submission file.
es_negativo = df_kaggle['tn_x'] < 0
df_kaggle['tn_x'] = df_kaggle['tn_x'].mask(es_negativo, df_kaggle['tn_y'])
df_kaggle = df_kaggle.rename(columns={'tn_x': 'tn'})
df_kaggle[['product_id', 'tn']].to_csv(
    "./datasets/neuralprophet_v4_kaggle_sinnegativos.csv",
    sep=',',
    encoding='utf-8',
    index=False,
)