<a href="https://colab.research.google.com/github/joako-m-g/2048IA/blob/main/src/TabTransformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set Up

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# --- Imports ---
import torch
from torch.serialization import add_safe_globals
from omegaconf import DictConfig
from typing import Any
from pytorch_lightning import LightningModule
from pytorch_tabular.models.common.layers import Embedding1dLayer
from pytorch_tabular import TabularModel
from pytorch_tabular.config import DataConfig, TrainerConfig, OptimizerConfig
from pytorch_tabular.models.tab_transformer import TabTransformerConfig


In [4]:
!pip install pytorch-tabular[all]


Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 447, in run
    conflicts = self._determine_conflicts(to_install)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 578, in _determine_conflicts
    return check_install_conflicts(to_install)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/operations/check.py", line 101, in check_install_conflicts
    package_set, _ = create_package_set_from_installed()
              

KeyboardInterrupt: 

In [6]:
data = pd.read_csv('/content/drive/MyDrive/datos_unidos.csv')

## Rolling features

Agregamos medias y desviaciones, teniendo en cuenta las no ventas, para que el modelo comprenda mejor la dinamica temporal.

In [7]:

def rolling_sales_stats(df, windows=[7,30,90], col="TOTAL_SALES"):
    """
    Calcula medias y desviaciones móviles para cada SKU-Tienda.
    Considera días sin ventas como 0 dentro de la ventana, excluye el día actual.

    Parámetros:
    - df: DataFrame con columnas ['SKU','STORE_ID','DATE', col]
    - windows: lista de tamaños de ventana
    - col: columna sobre la cual calcular los RF

    Retorna:
    - df con nuevas columnas de medias y desviaciones móviles
    """
    df = df.copy()
    df["DATE"] = pd.to_datetime(df["DATE"])
    df.sort_values(["SKU","STORE_ID","DATE"], inplace=True)

    df["_tmp"] = df[col]

    for w in windows:
        mean_col = f"{col}_mean_{w}D"
        std_col  = f"{col}_std_{w}D"

        def rolling_func(x):
            # Crear rango completo de fechas para incluir días sin ventas
            idx = pd.date_range(start=x.index.min(), end=x.index.max())
            x_full = x.reindex(idx, fill_value=0)
            # Excluir día actual
            rolled = x_full.shift(1).rolling(w, min_periods=1)
            return pd.DataFrame({
                mean_col: rolled.mean(),
                std_col: rolled.std().fillna(0)
            }).reindex(x.index)  # dejar solo filas originales

        rolled_df = (
            df.groupby(["SKU","STORE_ID"])["_tmp"]
              .apply(rolling_func)
              .reset_index(level=[0,1], drop=True)
        )

        df[mean_col] = rolled_df[mean_col].values
        df[std_col]  = rolled_df[std_col].values

    df.drop(columns=["_tmp"], inplace=True)
    return df

In [8]:
data = rolling_sales_stats(data)

In [26]:
# Llenar NaN en numéricas con 0 y en categóricas con 'missing'
data[numerical_cols] = data[numerical_cols].fillna(0)
data[categorical_cols] = data[categorical_cols].fillna("missing")

In [21]:
print(data.columns)
print(len(data))

Index(['SKU', 'DATE', 'STORE_ID', 'PRICE', 'QUANTITY', 'TOTAL_SALES', 'REGION',
       'CITY', 'STATE', 'STORE_TYPE', 'OPENDATE', 'CLOSEDATE',
       'STORE_SUBGROUP_DATE_ID', 'CATEGORY', 'GROUP', 'SUBGROUP', 'GROUP_TYPE',
       'PRICE_GROUP_ID', 'BRAND', 'INITIAL_TICKET_PRICE', 'BASE_PRICE',
       'COSTOS', 'YEAR_OPEN', 'YEAR_CLOSE', 'MONTH_OPEN', 'MONTH_CLOSE',
       'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'WEEK', 'TOTAL_SALES_mean_7D',
       'TOTAL_SALES_std_7D', 'TOTAL_SALES_mean_30D', 'TOTAL_SALES_std_30D',
       'TOTAL_SALES_mean_90D', 'TOTAL_SALES_std_90D'],
      dtype='object')
6825953


## Primer acercamiento

Entrenamos y evaluamos el modelo con 10% de cada serie para ver un primer desempeño.

In [27]:
group_cols = ["SKU","STORE_ID"]

# --- Tomar el primer 10% de cada serie (orden cronológico) ---
data_sample = (
    data.groupby(group_cols, group_keys=False)
        .apply(lambda x: x.sort_values("DATE").iloc[:int(len(x)*0.1)])
        .reset_index(drop=True)
)

# --- Dividir 70/30 dentro de cada serie (primer 70% para train, resto para test) ---
train_list = []
test_list = []

for _, group in data_sample.groupby(group_cols):
    n_train = int(len(group) * 0.7)
    train_list.append(group.iloc[:n_train])   # primer 70%
    test_list.append(group.iloc[n_train:])    # resto 30%

train_data = pd.concat(train_list).reset_index(drop=True)
test_data  = pd.concat(test_list).reset_index(drop=True)

# --- Ordenar por SKU, STORE, DATE ---
train_data = train_data.sort_values(group_cols + ["DATE"]).reset_index(drop=True)
test_data  = test_data.sort_values(group_cols + ["DATE"]).reset_index(drop=True)


  .apply(lambda x: x.sort_values("DATE").iloc[:int(len(x)*0.1)])


In [28]:
print(train_data[numerical_cols].isna().sum())  # revisa NaN
print(train_data[target_col].isna().sum())      # target sin NaN
print(train_data[numerical_cols].describe())    # revisa outliers extremos


PRICE                   0
QUANTITY                0
TOTAL_SALES_mean_7D     0
TOTAL_SALES_std_7D      0
TOTAL_SALES_mean_30D    0
TOTAL_SALES_std_30D     0
TOTAL_SALES_mean_90D    0
TOTAL_SALES_std_90D     0
INITIAL_TICKET_PRICE    0
BASE_PRICE              0
COSTOS                  0
YEAR_OPEN               0
YEAR_CLOSE              0
MONTH_OPEN              0
MONTH_CLOSE             0
YEAR                    0
MONTH                   0
DAY                     0
WEEK                    0
dtype: int64
0
               PRICE       QUANTITY  TOTAL_SALES_mean_7D  TOTAL_SALES_std_7D  \
count  367178.000000  367178.000000             367178.0            367178.0   
mean       69.930710       3.944918                  0.0                 0.0   
std        84.829169       2.124560                  0.0                 0.0   
min         4.920000       1.000000                  0.0                 0.0   
25%        23.830000       2.000000                  0.0                 0.0   
50%        

Definimos columnas numericas y categoricas

In [29]:
categorical_cols = [
    "REGION", "CITY", "STATE", "STORE_TYPE", "CATEGORY", "GROUP",
    "SUBGROUP", "GROUP_TYPE", "PRICE_GROUP_ID", "BRAND",
    "DAY_OF_WEEK"  # mover aquí si es string o categoría
]

numerical_cols = [
    "PRICE", "QUANTITY", "TOTAL_SALES_mean_7D", "TOTAL_SALES_std_7D",
    "TOTAL_SALES_mean_30D", "TOTAL_SALES_std_30D",
    "TOTAL_SALES_mean_90D", "TOTAL_SALES_std_90D",
    "INITIAL_TICKET_PRICE", "BASE_PRICE", "COSTOS",
    "YEAR_OPEN", "YEAR_CLOSE", "MONTH_OPEN", "MONTH_CLOSE",
    "YEAR", "MONTH", "DAY", "WEEK"  # aquí solo floats/ints
]
# Target
target_col = "TOTAL_SALES"

# Features = todas las categóricas + numéricas
features = categorical_cols + numerical_cols

Instanciamos y configuramos el modelo

In [44]:
# Agregar todas las clases necesarias a los safe globals
torch.serialization.add_safe_globals([
    dict,  # El principal causante del error
    DictConfig,
    Any,
    LightningModule,
    Embedding1dLayer,
    ContainerMetadata,
    typing.Any
    # Agrega aquí otras clases que puedan ser necesarias
])

# --- Configuración de DataConfig ---
data_config = DataConfig(
    target=[target_col],
    continuous_cols=numerical_cols,
    categorical_cols=categorical_cols,
    num_workers=0
)

# --- Configuración de TrainerConfig ---
trainer_config = TrainerConfig(
    max_epochs=1,
    batch_size=1024,
    accelerator="gpu" if torch.cuda.is_available() else "cpu"
)

# --- Configuración de OptimizerConfig ---
optimizer_config = OptimizerConfig()  # Adam por defecto

# --- Configuración de TabTransformer (solo arquitectura básica y métricas) ---
model_config = TabTransformerConfig(
    task="regression",
    metrics=["mean_squared_error"]  # Métrica compatible con regresión
)

# --- Inicialización del modelo ---
tab_model = TabularModel(
    data_config=data_config,
    model_config=model_config,
    trainer_config=trainer_config,
    optimizer_config=optimizer_config,
    verbose=True
)

INFO:pytorch_tabular.tabular_model:Experiment Tracking is turned off


In [45]:
# --- Entrenamiento ---
tab_model.fit(train=train_data, validation=test_data)  # Forzar a guardar y cargar bien

INFO:lightning_fabric.utilities.seed:Seed set to 42
INFO:pytorch_tabular.tabular_model:Preparing the DataLoaders
INFO:pytorch_tabular.tabular_datamodule:Setting up the datamodule for regression task
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_encoded[col].fillna(self._imputed, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the ori

Output()

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


INFO:pytorch_tabular.tabular_model:Training the model completed
INFO:pytorch_tabular.tabular_model:Loading the best model


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL collections.defaultdict was not an allowed global by default. Please use `torch.serialization.add_safe_globals([defaultdict])` or the `torch.serialization.safe_globals([defaultdict])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [None]:
y_true = test_data[target_col].values          # valores reales
X_test = test_data.drop(columns=[target_col])  # eliminar la columna target

y_pred = tab_model.predict(X_test).detach().cpu().numpy()


# Calcular R²
r2 = r2_score(y_true, y_pred)
print("R²:", r2)