In [169]:
import joblib
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error

import matplotlib.pyplot as plt
import datetime

In [150]:
def create_temporal_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add temporal features to the dataset, capturing short-term and long-term patterns.

    Features added:
    - 1-day lag value: captures immediate influence of the previous day's sales.
    - Cumulative sums for 7-day and 15-day windows: reflect weekly and bi-weekly consumption patterns.
    - Rolling means and standard deviations for 7-day and 15-day windows: track short-term and mid-term trends.
    - is_weekend: indicates if sales occurred on a weekend.
    - day_of_month: captures end-of-month or salary-related purchasing patterns.
    - is_holiday (if 'event_name_1' is available): identifies potential holiday-related sales spikes.

    Parameters:
    ----------
    df : pd.DataFrame
        DataFrame containing sales data with the following columns:
        - 'item_id': Item identifier.
        - 'store_id': Store identifier.
        - 'date': Sales date.
        - 'y': Sales value (target variable).

    Returns:
    --------
    pd.DataFrame
        DataFrame with new temporal feature columns added.
    """
    # Ensure the DataFrame is sorted for correct lag and rolling calculations
    df = df.sort_values(['item_id', 'store_id', 'date'])

    # Difference between current day and previous day
    df['delta_1'] = df.groupby(['item_id', 'store_id'], observed=False)['y'].diff(1)

    # Cumulative sum of the last 7, 15 and 30 days
    df['sum_7'] = sum(df.groupby(['item_id', 'store_id'], observed=False)['y'].shift(i) for i in range(1, 8))
    df['sum_15'] = sum(df.groupby(['item_id', 'store_id'], observed=False)['y'].shift(i) for i in range(1, 16))
    df['sum_30'] = sum(df.groupby(['item_id', 'store_id'], observed=False)['y'].shift(i) for i in range(1, 31))
    
    # Rolling mean and standard deviation for 7-day and 15-day windows
    for window in [7, 15]:
        df[f'rolling_mean_{window}'] = (
            df.groupby(['item_id', 'store_id'], observed=False)['y']
              .rolling(window=window, min_periods=1).mean()
              .shift(1)
              .reset_index(level=[0, 1], drop=True)
        )
        df[f'rolling_std_{window}'] = (
            df.groupby(['item_id', 'store_id'], observed=False)['y']
              .rolling(window=window, min_periods=1).std()
              .shift(1)
              .reset_index(level=[0, 1], drop=True)
        )


    return df

In [151]:
def create_date_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Add calendar-based features to the dataset.

    Features added:
    - Day of the week (categorical)
    - Month (categorical)
    - Event type (if available): identifies specific events or holidays that could impact sales.

    Parameters:
    ----------
    df : pd.DataFrame
        DataFrame containing a 'date' column.

    Returns:
    --------
    pd.DataFrame
        DataFrame with new date-based features.
    """
    # Day of the week (1 = Monday, 7 = Sunday)
    df['wday'] = df['date'].dt.dayofweek + 1
    df['wday'] = df['wday'].astype('category')

    # Month as a categorical variable
    df['month'] = df['date'].dt.month
    df['month'] = df['month'].astype('category')

    # # Indicates if the date falls on a weekend
    # df['is_weekend'] = (df['date'].dt.dayofweek >= 5).astype('category')

    # Day of the month: identifies end-of-month trends
    df['day_of_month'] = df['date'].dt.day


    # Event indicator: uses event_type_1 or assigns 'No_Event' if missing
    if 'event_type_1' in df.columns:
        # Remove temporariamente o tipo 'category' para preencher corretamente
        df['event_indicator'] = df['event_type_1'].astype(str).fillna('No_Event')
        
        # Substitui strings 'nan' que surgiram ao converter para string
        df['event_indicator'] = df['event_indicator'].replace('nan', 'No_Event')

        # Converte novamente para category com a categoria 'No_Event' incluída
        df['event_indicator'] = df['event_indicator'].astype('category')
    else:
        # Se a coluna não existir, cria a coluna com 'No_Event'
        df['event_indicator'] = pd.Series(['No_Event'] * len(df), dtype='category')

    return df


In [152]:
def remove_unused_columns(df: pd.DataFrame, columns_to_drop: list) -> pd.DataFrame:
    return df.drop(columns=columns_to_drop, axis=1)


## Data Loading and Preparation

In [153]:
# Load training data
train_data = pd.read_parquet('../data/train.snap.parquet')
train_data

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,y,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1542,1.0,2015-04-19,11512,,,,,0,0,0,2.240234
1,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1543,0.0,2015-04-20,11512,,,,,0,0,0,2.240234
2,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1544,0.0,2015-04-21,11512,,,,,0,0,0,2.240234
3,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1545,0.0,2015-04-22,11512,,,,,0,0,0,2.240234
4,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,d_1546,1.0,2015-04-23,11512,,,,,0,0,0,2.240234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12159127,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,d_1937,0.0,2016-05-18,11616,,,,,0,0,0,5.941406
12159128,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,d_1938,0.0,2016-05-19,11616,,,,,0,0,0,5.941406
12159129,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,d_1939,0.0,2016-05-20,11616,,,,,0,0,0,5.941406
12159130,HOUSEHOLD_2_516_WI_3_evaluation,HOUSEHOLD_2_516,HOUSEHOLD_2,HOUSEHOLD,WI_3,WI,d_1940,0.0,2016-05-21,11617,,,,,0,0,0,5.941406


In [154]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12159132 entries, 0 to 12159131
Data columns (total 18 columns):
 #   Column        Dtype         
---  ------        -----         
 0   id            category      
 1   item_id       category      
 2   dept_id       category      
 3   cat_id        category      
 4   store_id      category      
 5   state_id      category      
 6   d             category      
 7   y             float16       
 8   date          datetime64[ns]
 9   wm_yr_wk      uint16        
 10  event_name_1  category      
 11  event_type_1  category      
 12  event_name_2  category      
 13  event_type_2  category      
 14  snap_CA       uint8         
 15  snap_TX       uint8         
 16  snap_WI       uint8         
 17  sell_price    float16       
dtypes: category(11), datetime64[ns](1), float16(2), uint16(1), uint8(3)
memory usage: 360.9 MB


In [155]:
print(train_data['event_type_1'].value_counts(dropna=False))
print(train_data['event_type_2'].value_counts(dropna=False))

event_type_1
nan          11186101
Religious      304462
National       304080
Cultural       273379
Sporting        91110
Name: count, dtype: int64
event_type_2
nan          12159132
Cultural            0
Religious           0
Name: count, dtype: int64


## Feature Engineering

In [156]:
# Apply functions to train_data dataset
train_data = create_date_features(train_data)

# Creating a consolidated snap column to reduce dimensionality
train_data['snap'] = train_data[['snap_CA', 'snap_TX', 'snap_WI']].max(axis=1).astype('category')

# Removing redundant columns after snap consolidation
columns_to_drop = ['id', 'dept_id', 'cat_id', 'state_id', 'wm_yr_wk', 
                   'event_type_1', 'event_name_1', 'event_type_2', 'event_name_2', 
                   'snap_CA', 'snap_TX', 'snap_WI', ]
train_data = remove_unused_columns(train_data, columns_to_drop)

train_data.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,item_id,store_id,d,y,date,sell_price,wday,month,day_of_month,event_indicator,snap
0,FOODS_1_001,CA_1,d_1542,1.0,2015-04-19,2.240234,7,4,19,No_Event,0
1,FOODS_1_001,CA_1,d_1543,0.0,2015-04-20,2.240234,1,4,20,No_Event,0
2,FOODS_1_001,CA_1,d_1544,0.0,2015-04-21,2.240234,2,4,21,No_Event,0
3,FOODS_1_001,CA_1,d_1545,0.0,2015-04-22,2.240234,3,4,22,No_Event,0
4,FOODS_1_001,CA_1,d_1546,1.0,2015-04-23,2.240234,4,4,23,No_Event,0


In [157]:
# Apply temporal metric calculations
train_data = create_temporal_features(train_data)

cutoff_date = train_data['date'].min() + datetime.timedelta(days=30)
train_data = train_data[train_data['date'] > cutoff_date]


# Display the first rows to verify changes
train_data.head()

  has_large_values = (abs_vals > 1e6).any()


Unnamed: 0,item_id,store_id,d,y,date,sell_price,wday,month,day_of_month,event_indicator,snap,delta_1,sum_7,sum_15,sum_30,rolling_mean_7,rolling_std_7,rolling_mean_15,rolling_std_15
5718316,HOBBIES_1_001,CA_1,d_1573,0.0,2015-05-20,8.257812,3,5,20,No_Event,0,0.0,0.0,0.0,11.0,0.0,0.0,0.0,0.0
5718317,HOBBIES_1_001,CA_1,d_1574,0.0,2015-05-21,8.257812,4,5,21,No_Event,0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
5718318,HOBBIES_1_001,CA_1,d_1575,0.0,2015-05-22,8.257812,5,5,22,No_Event,0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
5718319,HOBBIES_1_001,CA_1,d_1576,0.0,2015-05-23,8.257812,6,5,23,No_Event,0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0
5718320,HOBBIES_1_001,CA_1,d_1577,0.0,2015-05-24,8.257812,7,5,24,No_Event,0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0


In [159]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11226877 entries, 5718316 to 5718284
Data columns (total 19 columns):
 #   Column           Dtype         
---  ------           -----         
 0   item_id          category      
 1   store_id         category      
 2   d                category      
 3   y                float16       
 4   date             datetime64[ns]
 5   sell_price       float16       
 6   wday             category      
 7   month            category      
 8   day_of_month     int32         
 9   event_indicator  category      
 10  snap             category      
 11  delta_1          float16       
 12  sum_7            float16       
 13  sum_15           float16       
 14  sum_30           float16       
 15  rolling_mean_7   float64       
 16  rolling_std_7    float64       
 17  rolling_mean_15  float64       
 18  rolling_std_15   float64       
dtypes: category(7), datetime64[ns](1), float16(6), float64(4), int32(1)
memory usage: 781.8 MB


## Training

### Split the dataset into training and validation sets 

| Dataset                      | Date Range                  | Purpose         |
|------------------------------|-----------------------------|-----------------|
| sales_train_validation.csv   | 2011-01-29 to 2016-04-24     | Training data   |
| sales_test_validation.csv    | 2016-04-25 to 2016-05-22     | Validation data |
| sales_test_evaluation.csv    | 2016-05-23 to 2016-06-19     | Test data       |


In [160]:
# Define the cutoff date for train-validation split
cutoff_date = '2016-04-25'

# Create copies to avoid modifying the original DataFrame
validation_data = train_data.query("date >= @cutoff_date").copy().reset_index(drop=True)
train_data = train_data.query("date < @cutoff_date").copy().reset_index(drop=True)

# Select features and target variable
features = [
    'item_id', 'store_id', 'wday', 'month', 'day_of_month', 'event_indicator', 'snap', 'sell_price',
    'delta_1', 'sum_7', 'sum_15', 'sum_30', 
    'rolling_mean_7', 'rolling_std_7', 'rolling_mean_15', 'rolling_std_15'
]

# Create feature matrices and target vectors
X_train, y_train = train_data[features], train_data['y']
X_val, y_val = validation_data[features], validation_data['y']

In [163]:
print(train_data['date'].max(), validation_data['date'].min())
print(validation_data.shape, train_data.shape)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

2016-04-24 00:00:00 2016-04-25 00:00:00
(853720, 19) (10373157, 19)
(10373157, 16) (10373157,)
(853720, 16) (853720,)


### Model Training


- **`enable_categorical=True`**: Simplifies data processing and improves model handling of categorical features.  
- **`early_stopping_rounds=10`**: Provides a balanced patience window to avoid unnecessary training.  
- **`min_child_weight=5`**: Reduces complexity by preventing splits on small data subsets, promoting generalization.  

In [165]:
model = xgb.XGBRegressor(
    enable_categorical=True,
    early_stopping_rounds=10,
    min_child_weight=5,
    random_state=33
)

model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=True)

[0]	validation_0-rmse:2.76666
[1]	validation_0-rmse:2.20629
[2]	validation_0-rmse:1.85416
[3]	validation_0-rmse:1.64568
[4]	validation_0-rmse:1.51985
[5]	validation_0-rmse:1.45126
[6]	validation_0-rmse:1.41174
[7]	validation_0-rmse:1.38274
[8]	validation_0-rmse:1.36297
[9]	validation_0-rmse:1.35165
[10]	validation_0-rmse:1.34380
[11]	validation_0-rmse:1.33752
[12]	validation_0-rmse:1.33285
[13]	validation_0-rmse:1.32809
[14]	validation_0-rmse:1.32544
[15]	validation_0-rmse:1.32420
[16]	validation_0-rmse:1.32194
[17]	validation_0-rmse:1.31906
[18]	validation_0-rmse:1.31518
[19]	validation_0-rmse:1.31390
[20]	validation_0-rmse:1.31101
[21]	validation_0-rmse:1.30755
[22]	validation_0-rmse:1.30542
[23]	validation_0-rmse:1.30447
[24]	validation_0-rmse:1.30376
[25]	validation_0-rmse:1.30224
[26]	validation_0-rmse:1.30136
[27]	validation_0-rmse:1.30017
[28]	validation_0-rmse:1.30053
[29]	validation_0-rmse:1.30004
[30]	validation_0-rmse:1.29950
[31]	validation_0-rmse:1.30040
[32]	validation_0-

In [167]:
# Save the model
joblib.dump(model, '../models/xgb_model.pkl')

['../models/xgb_model.pkl']

In [170]:
# Predictions on the training set
y_pred_train = model.predict(X_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))

print(f"MAE (train): {mae_train:.2f}")
print(f"RMSE (train): {rmse_train:.2f}")

# Predictions on the validation set
y_pred_val = model.predict(X_val)
mae_val = mean_absolute_error(y_val, y_pred_val)
rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))

print(f"MAE (validation): {mae_val:.2f}")
print(f"RMSE (validation): {rmse_val:.2f}")

MAE (train): 0.47
RMSE (train): 1.26
MAE (validation): 0.52
RMSE (validation): 1.29


**Comparison between MAE and RMSE:**
- **MAE:** Measures the average absolute error.  
- **RMSE:** Highlights large errors due to quadratic penalization.  

**Interpretation:**  
If the difference between **RMSE** and **MAE** is large, it indicates the presence of **outliers or significant prediction errors** that MAE does not emphasize.

### Test Set Predictions