### Set Up

In [None]:
import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [None]:
# Load the merged dataset from CSV; the path is relative to the notebook's working directory.
data = pd.read_csv("../data/procesados/datos_unidos.csv")

In [None]:
# List the column names available in the loaded frame.
print(data.columns)

Index(['SKU', 'DATE', 'STORE_ID', 'PRICE', 'QUANTITY', 'TOTAL_SALES', 'REGION',
       'CITY', 'STATE', 'STORE_TYPE', 'OPENDATE', 'CLOSEDATE',
       'STORE_SUBGROUP_DATE_ID', 'CATEGORY', 'GROUP', 'SUBGROUP', 'GROUP_TYPE',
       'PRICE_GROUP_ID', 'BRAND', 'INITIAL_TICKET_PRICE', 'BASE_PRICE',
       'COSTOS', 'YEAR_OPEN', 'YEAR_CLOSE', 'MONTH_OPEN', 'MONTH_CLOSE',
       'YEAR', 'MONTH', 'DAY', 'DAY_OF_WEEK', 'WEEK'],
      dtype='object')


In [None]:
# Preview the first rows of the loaded frame.
print(data.head())

          SKU        DATE STORE_ID  PRICE  QUANTITY  TOTAL_SALES     REGION  \
0  BEAHASH001  2021-01-01   S00068  35.53        10       355.30       West   
1  BEAHASH001  2021-01-01   S00086  33.52         2        67.04  Southeast   
2  BEAHASH001  2021-01-01   S00124  37.61         2        75.22  Northeast   
3  BEAHASH001  2021-01-01   S00140  34.51         4       138.04  Southeast   
4  BEAHASH001  2021-01-02   S00013  33.77         1        33.77  Southwest   

         CITY STATE STORE_TYPE  ... COSTOS YEAR_OPEN YEAR_CLOSE MONTH_OPEN  \
0    Lakewood    CO    Express  ...  15.07      2018       2025         11   
1     Raleigh    NC    Express  ...  15.07      2011       2025         12   
2        Erie    PA     Outlet  ...  15.07      2018       2025          5   
3  Greenville    SC       Mall  ...  15.07      2011       2025          5   
4     El Paso    TX       Mall  ...  15.07      2019       2025         10   

  MONTH_CLOSE  YEAR MONTH DAY DAY_OF_WEEK  WEEK  
0     

In [None]:
# Per-column memory footprint in bytes. With the default deep=False, object (string)
# columns are reported by pointer size only, so the real footprint is larger.
print(data.memory_usage())

Index                          132
SKU                       54607624
DATE                      54607624
STORE_ID                  54607624
PRICE                     54607624
QUANTITY                  54607624
TOTAL_SALES               54607624
REGION                    54607624
CITY                      54607624
STATE                     54607624
STORE_TYPE                54607624
OPENDATE                  54607624
CLOSEDATE                 54607624
STORE_SUBGROUP_DATE_ID    54607624
CATEGORY                  54607624
GROUP                     54607624
SUBGROUP                  54607624
GROUP_TYPE                54607624
PRICE_GROUP_ID            54607624
BRAND                     54607624
INITIAL_TICKET_PRICE      54607624
BASE_PRICE                54607624
COSTOS                    54607624
YEAR_OPEN                 54607624
YEAR_CLOSE                54607624
MONTH_OPEN                54607624
MONTH_CLOSE               54607624
YEAR                      54607624
MONTH               

## Rolling Features

Incluimos medias móviles y desviaciones estándar para captar la dinámica temporal. Las calculamos teniendo en cuenta los días con cero ventas, es decir, completando las fechas faltantes de cada serie. Debido a limitaciones de recursos computacionales, no podemos expandir todo el dataset.

In [None]:
def _rolling_feature_names(col, mean_windows, std_windows):
    """Return the feature column names generated for ``col``, in creation order."""
    names = [f'{col}_lag_1', f'{col}_lag_7']
    names += [f'{col}_rolling_mean_{w}' for w in mean_windows]
    names += [f'{col}_rolling_std_{w}' for w in std_windows]
    return names


def create_rolling_features(
    df,
    definitions,
    mean_windows=None,
    std_windows=None
):
    """
    Build lag and rolling-window features per (SKU, STORE_ID) series on a gap-free daily calendar.

    Every series is expanded to one row per calendar day between its first and
    last observed DATE. Missing values in the ``definitions`` columns (inserted
    days as well as pre-existing NaN) are replaced by 0 before the features are
    computed. On inserted days all other columns except SKU, STORE_ID and DATE
    stay NaN, because they come from a left merge against the calendar.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'SKU', 'STORE_ID', 'DATE' and every column
        named in ``definitions``. The input frame is not modified.
        NOTE(review): assumes at most one row per (SKU, STORE_ID, DATE);
        duplicated dates are kept as duplicated rows by the merge.
    definitions : list of str
        Columns for which lags and rolling statistics are computed.
    mean_windows : list of int, optional
        Window lengths (in rows, i.e. days) for rolling means, e.g. [7, 30, 90].
    std_windows : list of int, optional
        Window lengths (in rows, i.e. days) for rolling standard deviations.

    Returns
    -------
    pandas.DataFrame
        The expanded frame sorted by SKU, STORE_ID, DATE with the added columns
        ``{col}_lag_1``, ``{col}_lag_7``, ``{col}_rolling_mean_{w}`` and
        ``{col}_rolling_std_{w}``. Rolling statistics are computed on the
        series shifted by one row, so each row only sees strictly earlier days.
        When ``df`` yields no (SKU, STORE_ID) group, an empty frame carrying
        the same feature columns is returned.
    """

    df = df.copy()
    df['DATE'] = pd.to_datetime(df['DATE'])

    # Normalise the optional window lists once so the loops below need no guards.
    mean_windows = mean_windows or []
    std_windows = std_windows or []

    result_dfs = []

    # One pass per (SKU, STORE_ID) series.
    for (sku, store), group in df.groupby(['SKU', 'STORE_ID']):
        # Full daily calendar between the first and last observation of the series.
        full_dates = pd.DataFrame({'DATE': pd.date_range(group['DATE'].min(), group['DATE'].max())})
        full_group = full_dates.merge(group, on='DATE', how='left')
        # Inserted days have no key values after the left merge; restore them.
        full_group['SKU'] = sku
        full_group['STORE_ID'] = store

        for col in definitions:
            full_group[col] = full_group[col].fillna(0)

            # Shift once: the lag-1 series is also the base of every rolling window.
            previous = full_group[col].shift(1)

            # Fixed lags
            full_group[f'{col}_lag_1'] = previous
            full_group[f'{col}_lag_7'] = full_group[col].shift(7)

            # Rolling means
            for w in mean_windows:
                full_group[f'{col}_rolling_mean_{w}'] = (
                    previous.rolling(window=w, min_periods=1).mean()
                )

            # Rolling standard deviations
            for w in std_windows:
                full_group[f'{col}_rolling_std_{w}'] = (
                    previous.rolling(window=w, min_periods=1).std()
                )

        result_dfs.append(full_group)

    if not result_dfs:
        # pd.concat([]) raises ValueError; return an empty frame with the expected schema.
        empty = df.iloc[0:0].copy()
        for col in definitions:
            for name in _rolling_feature_names(col, mean_windows, std_windows):
                empty[name] = pd.Series(index=empty.index, dtype='float64')
        return empty

    df_final = pd.concat(result_dfs, ignore_index=True)
    df_final = df_final.sort_values(['SKU', 'STORE_ID', 'DATE'])

    return df_final

## Linear Regression

Separamos las columnas categóricas y numéricas.

In [None]:
# Columns handled as categorical; they are one-hot encoded by the preprocessor further down.
categorical_cols = [
    'SKU', 'REGION', 'CITY', 'STATE',
    'STORE_TYPE', 'STORE_SUBGROUP_DATE_ID',
    'CATEGORY', 'GROUP', 'SUBGROUP', 'GROUP_TYPE',
    'PRICE_GROUP_ID', 'BRAND',
]

In [None]:
# Numeric predictor columns (passed through unchanged by the preprocessor).
# 'TOTAL_SALES' is intentionally NOT listed: it is the regression target, and keeping it
# among the predictors leaks the answer into X and makes the evaluation meaningless.
# NOTE(review): 'PRICE' and 'QUANTITY' together may still determine the target — confirm
# whether both should remain as predictors.
numeric_cols = [
    'PRICE',
    'QUANTITY',
    'INITIAL_TICKET_PRICE',
    'BASE_PRICE',
    'COSTOS',
    'YEAR_OPEN',
    'YEAR_CLOSE',
    'MONTH_OPEN',
    'MONTH_CLOSE',
    'YEAR',
    'MONTH',
    'DAY',
    'DAY_OF_WEEK',
    'WEEK'
]

In [None]:
# Date-typed columns, kept apart from the categorical and numeric lists.
date_cols = ['DATE', 'OPENDATE', 'CLOSEDATE']

Agregamos rolling features

In [None]:
# Add lag and rolling mean/std features (7-, 30- and 90-row windows) for the four columns below.
# This rebinds `data`, so re-running the cell applies the expansion to the already-expanded frame.
data = create_rolling_features(data, ['QUANTITY', 'PRICE', 'TOTAL_SALES', 'COSTOS'], mean_windows = [7, 30, 90],
std_windows = [7, 30, 90])

In [None]:
# List the columns again to confirm the new feature columns are present.
print(data.columns)

In [None]:
# Check how much memory the frame uses now. memory_usage must be called:
# without the parentheses the cell only prints the bound method object.
print(data.memory_usage())

In [None]:
# Persist the expanded frame. index=False keeps the positional index out of the file,
# so reloading it does not produce a spurious 'Unnamed: 0' column.
data.to_csv("../data/procesados/datos_rolling.csv", index=False)

Entrenamos con el primer 10 % de las filas de cada serie (SKU y tienda) para ver cómo se comporta el modelo.

In [None]:
# Keep the first 10% of the rows of every (SKU, STORE_ID) series, in the frame's current row order.
# The grouping key is 'STORE_ID' (the real column name); 'Store_ID' raises KeyError.
# A positional mask is used instead of groupby.apply so the grouping columns are always
# kept in the result, whatever the pandas version.
series_groups = data.groupby(['SKU', 'STORE_ID'])
# Same truncation as int(len(x) * 0.1) applied to each series.
rows_to_keep = (series_groups['SKU'].transform('size') * 0.1).astype(int)
df_10pct_series = data[series_groups.cumcount() < rows_to_keep]

In [None]:
# Regression target.
y = df_10pct_series['TOTAL_SALES']
# Predictor matrix: every column listed in categorical_cols followed by those in numeric_cols.
X = df_10pct_series[categorical_cols + numeric_cols]

In [None]:
# OneHotEncoder para variables categóricas (transforma texto en variables binarias)
# Numeric columns are forwarded untouched.
numeric_transformer = 'passthrough'

# Categorical columns become binary indicator columns; categories not seen
# during fit are ignored at transform time instead of raising.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column list to its own transformer.
preprocessor = ColumnTransformer([
    ('categorical', categorical_transformer, categorical_cols),
    ('numeric', numeric_transformer, numeric_cols),
])

In [None]:
# Definimos el Pipeline
# Combina el preprocesador y el modelo en un solo flujo
# Single estimator that chains the column preprocessing with the linear regressor,
# so fit/predict apply both steps in order.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

NameError: name 'Pipeline' is not defined

In [None]:
# Dividimos los datos
# Random 80/20 train/test split with a fixed seed for reproducibility.
# NOTE(review): the rows are dated observations; a random split mixes earlier and
# later dates across train and test — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'train_test_split' is not defined

In [None]:
# Fit the preprocessing + regression pipeline on the training split.
# NOTE(review): rows inserted by create_rolling_features may carry NaN in predictor
# columns; confirm they are handled before fitting.
model.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out split.
y_pred = model.predict(X_test)

In [None]:
# Métricas
# Evaluation on the held-out split: coefficient of determination and
# root mean squared error (square root of the MSE).
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

**TODO:** revisar e interpretar las métricas obtenidas.