In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

Data Loading and Preparation

In [None]:
# Load the merged dataset. When the file is missing, `df` is set to None so
# that the `if df is not None` guards in the following cells skip their work.
try:
    df = pd.read_csv('data/processed/merged_data.csv', parse_dates=['Date'])
except FileNotFoundError:
    df = None
    print("Error: 'merged_data.csv' not found. Please run the preprocessing script first.")

if df is not None:
    # Keep only the rows for Store 1, ordered chronologically by 'Date'.
    is_store_one = df['Store'] == 1
    store_df = df.loc[is_store_one].sort_values('Date')

    print(f"Data loaded for Store 1. Shape: {store_df.shape}")
    display(store_df.head())

Feature Engineering

In [None]:
if df is not None:
    # Rebuild the Store 1 frame from `df` instead of mutating the existing
    # `store_df`, so this cell is idempotent: re-running it must not stack a
    # second round of shifts and row drops on already-truncated data.
    store_df = df.loc[df['Store'] == 1].sort_values('Date')

    # Time-based features derived from the 'Date' column.
    store_df['Year'] = store_df['Date'].dt.year
    store_df['Month'] = store_df['Date'].dt.month
    store_df['WeekOfYear'] = store_df['Date'].dt.isocalendar().week.astype(int)
    store_df['DayOfYear'] = store_df['Date'].dt.dayofyear

    # Lag features (past sales values). shift(k) looks k ROWS back, so these
    # are "k weeks ago" only if the frame holds exactly one row per week.
    # NOTE(review): assumes one row per Date for this store — TODO confirm.
    weekly_sales = store_df['Weekly_Sales']
    store_df['Lag_1'] = weekly_sales.shift(1)    # previous row
    store_df['Lag_4'] = weekly_sales.shift(4)    # 4 rows back
    store_df['Lag_52'] = weekly_sales.shift(52)  # 52 rows back

    # Rolling statistics over the 4 rows *before* the current one; shifting
    # first keeps the current row's target out of its own features.
    previous_sales = weekly_sales.shift(1)
    store_df['Rolling_Mean_4'] = previous_sales.rolling(window=4).mean()
    store_df['Rolling_Std_4'] = previous_sales.rolling(window=4).std()

    # Drop only the rows made incomplete by the shifts above (or lacking a
    # target). A bare dropna() would also discard rows that have a NaN in any
    # unrelated column of the merged file.
    engineered_columns = ['Lag_1', 'Lag_4', 'Lag_52', 'Rolling_Mean_4', 'Rolling_Std_4']
    store_df = store_df.dropna(subset=['Weekly_Sales', *engineered_columns])

    print("Features created. Shape after dropping NaNs:", store_df.shape)
    display(store_df.head())

Define Features and Target & Split Data

In [None]:
if df is not None:
    # Target column and the model inputs (column order is preserved as-is).
    TARGET = 'Weekly_Sales'
    FEATURES = [
        # Time features
        'Year', 'Month', 'WeekOfYear', 'DayOfYear',
        # External features
        'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
        # Lag features
        'Lag_1', 'Lag_4', 'Lag_52',
        # Rolling window features
        'Rolling_Mean_4', 'Rolling_Std_4',
    ]

    X, y = store_df[FEATURES], store_df[TARGET]

    # Positional split in row order: first 70% train, next 15% validation,
    # and whatever remains is the test set.
    train_size = int(len(X) * 0.7)
    val_size = int(len(X) * 0.15)
    val_end = train_size + val_size

    X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
    X_val, y_val = X.iloc[train_size:val_end], y.iloc[train_size:val_end]
    X_test, y_test = X.iloc[val_end:], y.iloc[val_end:]

    for split_name, split_X in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
        print(f"{split_name} shape: {split_X.shape}")

Optuna Hyperparameter Search

In [None]:
if df is not None:
    def objective(trial):
        """Train one XGBoost candidate and return its validation RMSE.

        Hyperparameters are sampled from `trial`. The model is fitted on
        (X_train, y_train) with early stopping monitored on (X_val, y_val).
        The early-stopped best iteration is stored on the trial as the
        'best_iteration' user attribute so it can be reused later.

        Args:
            trial: the optuna trial used to sample hyperparameters.

        Returns:
            RMSE of the fitted model's predictions on the validation set.
        """
        params = {
            'objective': 'reg:squarederror',
            'n_estimators': trial.suggest_int('n_estimators', 200, 2000, step=100),
            'max_depth': trial.suggest_int('max_depth', 3, 12),
            'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
            'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
            'random_state': 42,
            # Early stopping is configured on the estimator: passing it to
            # fit() was deprecated in XGBoost 1.6 and removed in 2.0.
            'early_stopping_rounds': 50,
        }

        # Initialize and train the XGBoost model
        model = xgb.XGBRegressor(**params)
        model.fit(X_train, y_train,
                  eval_set=[(X_val, y_val)],
                  verbose=False)

        # Remember where early stopping settled; n_estimators is only the cap.
        trial.set_user_attr('best_iteration', int(model.best_iteration))

        preds = model.predict(X_val)

        rmse = np.sqrt(mean_squared_error(y_val, preds))
        return rmse


    # Seed the sampler so the sequence of suggested parameters is repeatable.
    # (The 600 s timeout can still cut the search short on a slow machine.)
    # The MedianPruner is kept from the original setup, but it has no effect
    # here because `objective` never calls trial.report().
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=42),
        pruner=optuna.pruners.MedianPruner(),
    )
    study.optimize(objective, n_trials=50, timeout=600)

    print("\n--- OPTIMIZATION FINISHED ---")
    print(f"Number of finished trials: {len(study.trials)}")

Analyze Results

In [None]:
if df is not None:
    print('Best trial found:')
    best_trial = study.best_trial
    print(f'  Value (Validation RMSE): {best_trial.value:,.2f}')
    print('  Best Params: ')
    for key, value in best_trial.params.items():
        print(f'    {key}: {value}')


    print("\n--- Evaluating final model on test set ---")
    # `best_trial.params` holds only the values suggested by the trial, so the
    # fixed settings used during the search have to be restored explicitly.
    final_params = {
        'objective': 'reg:squarederror',
        'random_state': 42,
        **best_trial.params,
    }

    # If the search recorded where early stopping settled, train that many
    # trees so the final model matches the one whose validation RMSE is
    # reported above. Otherwise fall back to the sampled n_estimators.
    best_iteration = best_trial.user_attrs.get('best_iteration')
    if best_iteration is not None:
        final_params['n_estimators'] = int(best_iteration) + 1

    final_model = xgb.XGBRegressor(**final_params)

    # Combine train and validation data to train the final model
    X_train_full = pd.concat([X_train, X_val])
    y_train_full = pd.concat([y_train, y_val])

    final_model.fit(X_train_full, y_train_full)

    test_preds = final_model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

    print(f"Final Model RMSE on unseen Test Data: {test_rmse:,.2f}")