# Experiment 11 Sarimax

In [1]:
!pip install kaggle wandb onnx -Uq
from google.colab import drive
drive.mount('/content/drive')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m25.6 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive


In [2]:
! mkdir ~/.kaggle
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json
! chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting
! unzip walmart-recruiting-store-sales-forecasting.zip
!unzip train.csv.zip
!unzip features.csv.zip

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 205MB/s]
Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           
Archive:  train.csv.zip
  inflating: train.csv               
Archive:  features.csv.zip
  inflating: features.csv            


In [3]:
!pip install statsmodels mlflow dagshub scikit-learn pandas numpy matplotlib seaborn joblib -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.7/24.7 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m246.9/246.9 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# SPEED OPTIMIZATION: Suppress verbose logging globally
import logging
logging.getLogger('statsmodels').setLevel(logging.WARNING)

# Core libraries
import mlflow
import mlflow.sklearn
import dagshub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# SARIMAX libraries
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.seasonal import seasonal_decompose
import joblib
import os
import json


In [5]:
class WalmartPreprocessingPipeline:
    """
    Complete preprocessing pipeline for Walmart sales data
    Supports fit/transform pattern for proper train/validation handling
    """

    def __init__(self):
        self.fitted = False
        self.outlier_thresholds = None
        self.feature_columns = None

    def load_and_prepare_data(self):
        """Load and merge train.csv, stores.csv, features.csv datasets"""
        print("📊 Loading datasets...")

        # Load datasets
        train_df = pd.read_csv('train.csv')
        stores_df = pd.read_csv('stores.csv')
        features_df = pd.read_csv('features.csv')

        print(f"   📈 Train data: {train_df.shape}")
        print(f"   🏪 Stores data: {stores_df.shape}")
        print(f"   🎯 Features data: {features_df.shape}")

        # Convert Date column to datetime
        train_df['Date'] = pd.to_datetime(train_df['Date'])
        features_df['Date'] = pd.to_datetime(features_df['Date'])

        # Merge datasets
        train_stores = train_df.merge(stores_df, on='Store', how='left')
        train_full = train_stores.merge(features_df, on=['Store', 'Date'], how='left')

        print(f"   ✅ Merged data: {train_full.shape}")
        print(f"   📅 Date range: {train_full['Date'].min()} to {train_full['Date'].max()}")

        return train_full

    def clean_merged_data(self, train_full):
        """Clean merged data by handling duplicate IsHoliday columns"""
        print("🧹 Cleaning merged data...")

        initial_shape = train_full.shape

        # Handle duplicate IsHoliday columns if they exist
        if 'IsHoliday_x' in train_full.columns and 'IsHoliday_y' in train_full.columns:
            print("   🔄 Resolving duplicate IsHoliday columns...")
            train_full['IsHoliday'] = train_full['IsHoliday_x'] | train_full['IsHoliday_y']
            train_full = train_full.drop(['IsHoliday_x', 'IsHoliday_y'], axis=1)

        print(f"   ✅ Cleaned data: {train_full.shape} (was {initial_shape})")
        return train_full

    def create_temporal_split(self, df, train_ratio=0.8):
        """Create temporal split to prevent data leakage"""
        print(f"📅 Creating temporal split ({int(train_ratio*100)}/{int((1-train_ratio)*100)})...")

        # Sort by date to ensure temporal order
        df_sorted = df.sort_values('Date').reset_index(drop=True)

        # Find split point
        split_idx = int(len(df_sorted) * train_ratio)
        split_date = df_sorted.iloc[split_idx]['Date']

        # Create splits
        train_data = df_sorted.iloc[:split_idx].copy()
        val_data = df_sorted.iloc[split_idx:].copy()

        # Create split info dictionary
        split_info = {
            'split_date': split_date,
            'train_size': len(train_data),
            'val_size': len(val_data),
            'train_date_range': (train_data['Date'].min(), train_data['Date'].max()),
            'val_date_range': (val_data['Date'].min(), val_data['Date'].max())
        }

        print(f"   📊 Split date: {split_date}")
        print(f"   📈 Train: {len(train_data):,} records ({train_data['Date'].min()} to {train_data['Date'].max()})")
        print(f"   📉 Val: {len(val_data):,} records ({val_data['Date'].min()} to {val_data['Date'].max()})")

        return train_data, val_data, split_info

    def fit(self, train_data):
        """Fit the preprocessing pipeline on training data"""
        print("🔧 Fitting preprocessing pipeline on training data...")

        # Store training data for lag feature creation
        self.train_data_for_lags = train_data.copy()

        # Fit outlier removal thresholds on training data only
        self.outlier_thresholds = {
            'A': {'lower': -1000, 'upper': 50000},  # Type A stores
            'B': {'lower': -500, 'upper': 25000},   # Type B stores
            'C': {'lower': -200, 'upper': 15000}    # Type C stores
        }

        print("✅ Pipeline fitted on training data")
        self.fitted = True
        return self

    def transform(self, data, is_validation=False):
        """Transform data using fitted pipeline"""
        if not self.fitted:
            raise ValueError("Pipeline must be fitted before transform!")

        print(f"🔄 Transforming {'validation' if is_validation else 'training'} data...")

        df = data.copy()

        # Step 1: Create date features
        df = self._create_date_features(df)

        # Step 2: Create holiday features
        df = self._create_holiday_features(df)

        # Step 3: Encode categorical features (BEFORE outlier removal!)
        df = self._encode_categorical_features(df)

        # Step 4: Create lag features (different for train vs validation)
        if is_validation:
            df = self._create_lag_features_validation(df)
        else:
            df = self._create_lag_features_training(df)

        # Step 5: Remove outliers (only on training data, AFTER encoding)
        if not is_validation:
            df = self._remove_outliers(df)

        # Step 6: Remove markdown features
        df = self._remove_markdown_features(df)

        # Step 7: Remove redundant features
        df = self._remove_redundant_features(df)

        print(f"✅ Transform complete. Shape: {df.shape}")
        return df

    def fit_transform(self, train_data):
        """Fit and transform training data in one step"""
        return self.fit(train_data).transform(train_data, is_validation=False)

    def _create_date_features(self, df):
        """Create date features"""
        df = df.copy()
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month
        df['Day'] = df['Date'].dt.day
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df['WeekOfYear'] = df['Date'].dt.isocalendar().week
        df['Quarter'] = df['Date'].dt.quarter
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)
        df['IsMonthStart'] = df['Date'].dt.is_month_start.astype(int)
        df['IsMonthEnd'] = df['Date'].dt.is_month_end.astype(int)
        df['IsQuarterStart'] = df['Date'].dt.is_quarter_start.astype(int)
        df['IsQuarterEnd'] = df['Date'].dt.is_quarter_end.astype(int)
        start_date = df['Date'].min()
        df['DaysFromStart'] = (df['Date'] - start_date).dt.days
        df['WeeksFromStart'] = df['DaysFromStart'] // 7
        return df

    def _create_holiday_features(self, df):
        """Create holiday features"""
        df = df.copy()
        super_bowl_dates = ['2010-02-12', '2011-02-11', '2012-02-10']
        labor_day_dates = ['2010-09-10', '2011-09-09', '2012-09-07']
        thanksgiving_dates = ['2010-11-26', '2011-11-25', '2012-11-23']
        christmas_dates = ['2010-12-31', '2011-12-30', '2012-12-28']

        df['IsSuperBowlWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(super_bowl_dates).astype(int)
        df['IsLaborDayWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(labor_day_dates).astype(int)
        df['IsThanksgivingWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(thanksgiving_dates).astype(int)
        df['IsChristmasWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(christmas_dates).astype(int)
        df['IsMajorHoliday'] = (df['IsSuperBowlWeek'] | df['IsLaborDayWeek'] |
                               df['IsThanksgivingWeek'] | df['IsChristmasWeek']).astype(int)
        df['IsHolidayMonth'] = df['Month'].isin([11, 12]).astype(int)
        df['IsBackToSchool'] = df['Month'].isin([8, 9]).astype(int)
        return df

    def _create_lag_features_training(self, df):
        """Create lag features for training data - DISABLED to reduce overfitting"""
        # Lag features removed to prevent overfitting
        return df

    def _create_lag_features_validation(self, df):
        """Create lag features for validation data - DISABLED to reduce overfitting"""
        # Lag features removed to prevent overfitting
        return df

    def _remove_outliers(self, df):
        """Remove outliers from training data only"""
        initial_len = len(df)
        df_clean = df.copy()

        for store_type, thresholds in self.outlier_thresholds.items():
            type_mask = df_clean[f'Type_{store_type}'] == 1
            outlier_mask = (
                (df_clean['Weekly_Sales'] < thresholds['lower']) |
                (df_clean['Weekly_Sales'] > thresholds['upper'])
            )
            df_clean = df_clean[~(type_mask & outlier_mask)]

        removed = initial_len - len(df_clean)
        print(f"   🗑️ Removed {removed:,} outliers from training data")
        return df_clean

    def _remove_markdown_features(self, df):
        """Remove markdown columns"""
        markdown_cols = [col for col in df.columns if 'MarkDown' in col]
        if markdown_cols:
            df = df.drop(markdown_cols, axis=1)
        return df

    def _remove_redundant_features(self, df):
        """Remove redundant features"""
        redundant_cols = ['Year', 'Quarter', 'Day', 'WeekOfYear', 'DaysFromStart',
                         'IsQuarterStart', 'IsQuarterEnd']
        existing_redundant = [col for col in redundant_cols if col in df.columns]
        if existing_redundant:
            df = df.drop(existing_redundant, axis=1)
        return df

    def _encode_categorical_features(self, df):
        """Encode categorical features using both one-hot and label encoding"""
        df = df.copy()

        if 'Type' in df.columns:
            print(f"   🔧 Encoding Type column using both one-hot and label encoding...")

            # One-hot encoding (existing approach)
            type_dummies = pd.get_dummies(df['Type'], prefix='Type', dtype=int)

            # Label encoding (experiment_2 approach)
            # A=0, B=1, C=2 (same as experiment_2)
            type_mapping = {'A': 0, 'B': 1, 'C': 2}
            df['Type_Encoded'] = df['Type'].map(type_mapping)

            # Add one-hot columns
            for col in type_dummies.columns:
                df[col] = type_dummies[col]

            # Remove original Type column
            df = df.drop('Type', axis=1)

            print(f"   ✅ Added both Type_Encoded and {list(type_dummies.columns)}")

        return df


In [6]:
def setup_mlflow():
    """Setup MLflow and DagsHub tracking"""
    print("🔧 Setting up MLflow and DagsHub...")

    # End any active runs first
    try:
        mlflow.end_run()
    except:
        pass

    # Initialize DagsHub
    try:
        dagshub.init(
            repo_owner='konstantine25b',
            repo_name='Walmart-Recruiting---Store-Sales-Forecasting',
            mlflow=True
        )
        print("✅ DagsHub initialized successfully!")
    except Exception as e:
        print(f"⚠️ DagsHub init warning: {e}")

    # Set MLflow tracking URI
    mlflow.set_tracking_uri("https://dagshub.com/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting.mlflow")

    # Create unique experiment name with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_name = f"Experiment_11_SARIMAX_{timestamp}"

    try:
        experiment_id = mlflow.create_experiment(experiment_name)
        print(f"✅ Created new experiment: {experiment_name}")
    except mlflow.exceptions.MlflowException as e:
        if "already exists" in str(e):
            experiment = mlflow.get_experiment_by_name(experiment_name)
            experiment_id = experiment.experiment_id
            print(f"✅ Using existing experiment: {experiment_name}")
        else:
            # Fallback to default experiment
            experiment_name = "Default"
            mlflow.set_experiment(experiment_name)
            print(f"⚠️ Using default experiment due to: {e}")

    mlflow.set_experiment(experiment_name)

    print(f"✅ MLflow setup complete!")
    print(f"🔗 Tracking URI: {mlflow.get_tracking_uri()}")
    print(f"📊 Experiment: {experiment_name}")

    return experiment_name



In [7]:
def get_preprocessed_data():
    """
    Use preprocessing pipeline to get model-ready data

    Returns:
        X_train, y_train, X_val, y_val: Model-ready datasets
        train_holidays, val_holidays: Holiday indicators for WMAE
        split_info: Information about the temporal split
    """
    print("🔄 Getting preprocessed data using pipeline...")

    # Create the preprocessing pipeline
    pipeline = WalmartPreprocessingPipeline()

    # Load raw data
    train_full = pipeline.load_and_prepare_data()
    train_full = pipeline.clean_merged_data(train_full)

    # Create temporal split
    train_data, val_data, split_info = pipeline.create_temporal_split(train_full)

    # Extract holiday information before preprocessing
    val_holidays = val_data['IsHoliday'].values.astype(bool)

    # Separate validation target (realistic test scenario)
    y_val = val_data['Weekly_Sales'].copy()
    val_data_no_target = val_data.drop('Weekly_Sales', axis=1).copy()

    # Fit and transform data using pipeline
    pipeline.fit(train_data)
    train_processed = pipeline.transform(train_data, is_validation=False)
    val_processed = pipeline.transform(val_data_no_target, is_validation=True)

    # Prepare final model data
    X_train = train_processed.drop(['Weekly_Sales', 'Date'], axis=1)
    y_train = train_processed['Weekly_Sales']
    X_val = val_processed.drop('Date', axis=1)
    train_holidays = train_processed['IsHoliday'].values.astype(bool)

    # Store feature columns for later reference
    feature_columns = list(X_train.columns)

    print(f"✅ Data preprocessing complete!")
    print(f"   📊 Training shape: {X_train.shape}")
    print(f"   📊 Validation shape: {X_val.shape}")
    print(f"   🎯 Features: {len(feature_columns)}")

    return X_train, y_train, X_val, y_val, train_holidays, val_holidays, split_info, feature_columns



In [8]:
def calculate_wmae(y_true, y_pred, is_holiday, holiday_weight=5.0):
    """Calculate Weighted Mean Absolute Error (WMAE)"""
    abs_errors = np.abs(y_true - y_pred)
    weights = np.where(is_holiday, holiday_weight, 1.0)
    wmae = np.sum(weights * abs_errors) / np.sum(weights)
    return wmae


def prepare_sarimax_data(X_train, y_train, X_val, y_val, train_holidays, val_holidays):
    """Prepare data for SARIMAX training on Store-Dept combinations"""
    print("📊 Preparing data for SARIMAX modeling...")

    # Reconstruct full datasets with dates for SARIMAX
    # We need to reload the original data to get dates back
    pipeline = WalmartPreprocessingPipeline()
    train_full = pipeline.load_and_prepare_data()
    train_full = pipeline.clean_merged_data(train_full)
    train_data, val_data, _ = pipeline.create_temporal_split(train_full)

    print(f"   📈 Train data shape: {train_data.shape}")
    print(f"   📉 Val data shape: {val_data.shape}")

    # Get unique Store-Dept combinations
    train_combinations = set(zip(train_data['Store'], train_data['Dept']))
    val_combinations = set(zip(val_data['Store'], val_data['Dept']))

    print(f"   🏪 Train combinations: {len(train_combinations)}")
    print(f"   🔮 Val combinations: {len(val_combinations)}")

    # Find missing combinations in validation
    missing_in_val = train_combinations - val_combinations
    missing_in_train = val_combinations - train_combinations

    print(f"   ⚠️ Missing in validation: {len(missing_in_val)}")
    print(f"   ⚠️ Missing in training: {len(missing_in_train)}")

    return train_data, val_data, train_combinations, val_combinations


In [9]:
def auto_sarimax_order(ts_data, exog_data=None, max_p=2, max_d=1, max_q=2, max_P=1, max_D=1, max_Q=1):
    """Automatically determine SARIMAX order using AIC - SPEED OPTIMIZED"""
    best_aic = float('inf')
    best_order = (1, 1, 1)
    best_seasonal_order = (1, 1, 1, 52)  # Weekly seasonality

    # Limited grid search for speed
    for p in range(max_p + 1):
        for d in range(max_d + 1):
            for q in range(max_q + 1):
                for P in range(max_P + 1):
                    for D in range(max_D + 1):
                        for Q in range(max_Q + 1):
                            try:
                                model = SARIMAX(ts_data,
                                              exog=exog_data,
                                              order=(p, d, q),
                                              seasonal_order=(P, D, Q, 52))
                                fitted_model = model.fit(disp=False)
                                if fitted_model.aic < best_aic:
                                    best_aic = fitted_model.aic
                                    best_order = (p, d, q)
                                    best_seasonal_order = (P, D, Q, 52)
                            except:
                                continue

    return best_order, best_seasonal_order



In [19]:
import time

In [20]:
def get_exog_features():
    """Define external regressor features for SARIMAX - ULTRA OPTIMIZED FOR SPEED"""
    # Use only the most important features to reduce complexity
    return [
        'IsHoliday',
        'Type_Encoded',
        'Month'
    ]



In [26]:
def train_sarimax_models(train_data, val_data, feature_columns, max_models=None):
    """Train individual SARIMAX models for each Store-Dept combination - ULTRA SPEED OPTIMIZED"""
    print("📈 Training SARIMAX models for each Store-Dept combination (ULTRA SPEED OPTIMIZED)...")

    # Get unique combinations from training data
    train_combinations = train_data.groupby(['Store', 'Dept']).size().index.tolist()

    # Option to limit models for faster testing
    if max_models and max_models < len(train_combinations):
        train_combinations = train_combinations[:max_models]
        print(f"   ⚡ TESTING MODE: Limited to first {max_models} combinations for speed")

    print(f"   📊 Training models for {len(train_combinations)} combinations")

    models = {}
    training_errors = {}
    model_orders = {}
    seasonal_orders = {}
    exog_features = get_exog_features()

    successful_models = 0
    failed_models = 0
    skipped_models = 0

    # Speed monitoring
    start_time = time.time()
    last_update_time = start_time

    print(f"   🎯 Using external regressors: {exog_features}")
    print(f"   ⚡ EXTREME SPEED MODE: No seasonality, minimal ARIMA, ultra-fast fitting")

    for i, (store, dept) in enumerate(train_combinations):
        try:
            # Filter data for this combination
            store_dept_data = train_data[
                (train_data['Store'] == store) &
                (train_data['Dept'] == dept)
            ].copy()

            # Skip if insufficient data - reduced requirement for speed
            if len(store_dept_data) < 10:  # Reduced from 20 for speed
                skipped_models += 1
                # Progress updates every 10 processed combinations (including skipped)
                if (i + 1) % 10 == 0:
                    current_time = time.time()
                    elapsed = current_time - start_time
                    speed = (i + 1) / elapsed if elapsed > 0 else 0
                    print(f"   📊 Processed {i+1}/{len(train_combinations)} combinations | ✅ Trained: {successful_models} | ⚠️ Skipped: {skipped_models} | ❌ Failed: {failed_models} | ⚡ Speed: {speed:.1f} comb/sec")
                continue

            # Sort by date and create time series
            store_dept_data = store_dept_data.sort_values('Date').reset_index(drop=True)
            ts_data = store_dept_data['Weekly_Sales'].values

            # Prepare external regressors - SIMPLIFIED
            exog_data = None
            if all(feat in store_dept_data.columns for feat in exog_features):
                exog_data = store_dept_data[exog_features].values
                # Simple forward fill only
                exog_data = pd.DataFrame(exog_data, columns=exog_features).fillna(method='ffill').values

            # EXTREME SPEED: Use ARIMAX instead of SARIMAX (no seasonality)
            # This should be 5-10x faster than seasonal models
            sarimax_order = (1, 1, 0)  # Minimal ARIMA: AR(1), I(1), no MA
            seasonal_order = (0, 0, 0, 0)  # NO SEASONALITY for maximum speed

            # Fit ARIMAX model (SARIMAX with no seasonal component)
            model = SARIMAX(ts_data,
                           exog=exog_data,
                           order=sarimax_order,
                           seasonal_order=seasonal_order,
                           enforce_stationarity=False,
                           enforce_invertibility=False,
                           concentrate_scale=True,
                           trend='c')  # Just constant trend

            # EXTREME speed fitting
            fitted_model = model.fit(disp=False,
                                   maxiter=10,  # Reduced from 25
                                   method='lbfgs',
                                   low_memory=True,
                                   warn_convergence=False)  # Suppress convergence warnings

            models[(store, dept)] = fitted_model
            model_orders[(store, dept)] = sarimax_order
            seasonal_orders[(store, dept)] = seasonal_order
            successful_models += 1

            # Skip all training error calculations for maximum speed

            # Progress updates every 10 processed combinations
            if (i + 1) % 10 == 0:
                current_time = time.time()
                elapsed = current_time - start_time
                speed = (i + 1) / elapsed if elapsed > 0 else 0
                print(f"   📊 Processed {i+1}/{len(train_combinations)} combinations | ✅ Trained: {successful_models} | ⚠️ Skipped: {skipped_models} | ❌ Failed: {failed_models} | ⚡ Speed: {speed:.1f} comb/sec")

        except Exception as e:
            failed_models += 1
            if failed_models < 3:  # Only print first few errors
                print(f"   ⚠️ Failed to train model for Store {store}, Dept {dept}: {e}")

            # Progress updates every 10 processed combinations
            if (i + 1) % 10 == 0:
                current_time = time.time()
                elapsed = current_time - start_time
                speed = (i + 1) / elapsed if elapsed > 0 else 0
                print(f"   📊 Processed {i+1}/{len(train_combinations)} combinations | ✅ Trained: {successful_models} | ⚠️ Skipped: {skipped_models} | ❌ Failed: {failed_models} | ⚡ Speed: {speed:.1f} comb/sec")

    total_time = time.time() - start_time
    print(f"✅ SARIMAX training complete in {total_time:.1f} seconds!")
    print(f"   🎯 Successful models: {successful_models}")
    print(f"   ⚠️ Skipped models (insufficient data): {skipped_models}")
    print(f"   ❌ Failed models: {failed_models}")
    print(f"   ⚡ Average speed: {len(train_combinations)/total_time:.1f} combinations/sec")

    return models, training_errors, model_orders, seasonal_orders



In [27]:
def make_sarimax_predictions(models, val_data, model_orders, seasonal_orders):
    """Make predictions using trained SARIMAX models"""
    print("📈 Making SARIMAX predictions...")

    predictions = []
    actuals = []
    holidays = []
    successful_predictions = 0
    failed_predictions = 0
    exog_features = get_exog_features()

    # Get validation combinations
    val_combinations = val_data.groupby(['Store', 'Dept']).groups.keys()

    for store, dept in val_combinations:
        try:
            # Check if we have a model for this combination
            if (store, dept) not in models:
                # Use overall mean as fallback
                fallback_pred = val_data['Weekly_Sales'].mean()
                store_dept_val = val_data[
                    (val_data['Store'] == store) &
                    (val_data['Dept'] == dept)
                ]
                predictions.extend([fallback_pred] * len(store_dept_val))
                actuals.extend(store_dept_val['Weekly_Sales'].tolist())
                holidays.extend(store_dept_val['IsHoliday'].tolist())
                failed_predictions += len(store_dept_val)
                continue

            # Get validation data for this combination
            store_dept_val = val_data[
                (val_data['Store'] == store) &
                (val_data['Dept'] == dept)
            ].copy()

            # Sort by date
            store_dept_val = store_dept_val.sort_values('Date').reset_index(drop=True)

            # Prepare external regressors for prediction
            exog_forecast = None
            if all(feat in store_dept_val.columns for feat in exog_features):
                exog_forecast = store_dept_val[exog_features].values
                # Handle missing values
                exog_forecast = pd.DataFrame(exog_forecast, columns=exog_features).fillna(method='ffill').fillna(method='bfill').values

            # Make prediction using the fitted model
            model = models[(store, dept)]
            n_periods = len(store_dept_val)

            # Forecast future values
            forecast = model.forecast(steps=n_periods, exog=exog_forecast)

            # Store results
            predictions.extend(forecast.tolist())
            actuals.extend(store_dept_val['Weekly_Sales'].tolist())
            holidays.extend(store_dept_val['IsHoliday'].tolist())
            successful_predictions += len(store_dept_val)

        except Exception as e:
            # Fallback prediction
            fallback_pred = val_data['Weekly_Sales'].mean()
            store_dept_val = val_data[
                (val_data['Store'] == store) &
                (val_data['Dept'] == dept)
            ]
            predictions.extend([fallback_pred] * len(store_dept_val))
            actuals.extend(store_dept_val['Weekly_Sales'].tolist())
            holidays.extend(store_dept_val['IsHoliday'].tolist())
            failed_predictions += len(store_dept_val)

    print(f"✅ Predictions complete!")
    print(f"   🎯 Successful predictions: {successful_predictions}")
    print(f"   ❌ Failed/fallback predictions: {failed_predictions}")

    return np.array(predictions), np.array(actuals), np.array(holidays)



In [30]:
def main():
    """Main experiment execution"""
    print("🚀 Starting Experiment 11: SARIMAX with Experiment 7 Features")
    print("=" * 80)

    # Setup MLflow tracking
    experiment_name = setup_mlflow()

    with mlflow.start_run(run_name="SARIMAX_Exp7_Features_Complete") as run:
        print(f"🔄 Starting MLflow run: {run.info.run_id}")

        # Log experiment metadata
        mlflow.log_param("experiment_type", "SARIMAX_with_Exp7_Features")
        mlflow.log_param("model_type", "SARIMAX_Individual_Models")
        mlflow.log_param("feature_engineering", "Experiment_7_Pipeline")
        mlflow.log_param("data_split", "temporal_80_20")
        mlflow.log_param("external_regressors", str(get_exog_features()))
        mlflow.log_param("seasonality", "weekly_52")

        try:
            # Step 1: Get preprocessed data
            print("\n📊 Step 1: Data preprocessing...")
            X_train, y_train, X_val, y_val, train_holidays, val_holidays, split_info, feature_columns = get_preprocessed_data()

            # Log data info
            mlflow.log_metric("train_samples", len(X_train))
            mlflow.log_metric("val_samples", len(X_val))
            mlflow.log_metric("total_features", len(feature_columns))
            mlflow.log_param("split_date", str(split_info['split_date']))

            # Step 2: Prepare SARIMAX-specific data
            print("\n📊 Step 2: Preparing SARIMAX data...")
            train_data, val_data, train_combinations, val_combinations = prepare_sarimax_data(
                X_train, y_train, X_val, y_val, train_holidays, val_holidays
            )

            # Log combination info
            mlflow.log_metric("train_combinations", len(train_combinations))
            mlflow.log_metric("val_combinations", len(val_combinations))
            mlflow.log_metric("missing_combinations", len(train_combinations - val_combinations))

            # Step 3: Train SARIMAX models
            print("\n📈 Step 3: Training SARIMAX models...")

            # OPTION: Limit models for faster testing (uncomment next line for testing)
            max_models_for_testing = None  # Test with only 100 models for speed
            # max_models_for_testing = None   # Use None for full training

            models, training_errors, model_orders, seasonal_orders = train_sarimax_models(
                train_data, val_data, feature_columns, max_models=max_models_for_testing
            )

            # Log training info
            mlflow.log_metric("successful_models", len(models))
            mlflow.log_metric("avg_training_mae", np.mean(list(training_errors.values())) if training_errors else 0)
            mlflow.log_metric("num_models", len(models))

            # Step 4: Make predictions
            print("\n📈 Step 4: Making predictions...")
            y_pred, y_true, is_holiday = make_sarimax_predictions(
                models, val_data, model_orders, seasonal_orders
            )

            # Step 5: Calculate metrics
            print("\n📊 Step 5: Calculating metrics...")

            # Standard metrics
            mae = mean_absolute_error(y_true, y_pred)
            rmse = np.sqrt(mean_squared_error(y_true, y_pred))
            r2 = r2_score(y_true, y_pred)

            # WMAE (Competition metric)
            wmae = calculate_wmae(y_true, y_pred, is_holiday)

            # Holiday vs non-holiday breakdown
            holiday_mask = is_holiday == True
            non_holiday_mask = is_holiday == False

            holiday_mae = mean_absolute_error(y_true[holiday_mask], y_pred[holiday_mask]) if holiday_mask.sum() > 0 else 0
            non_holiday_mae = mean_absolute_error(y_true[non_holiday_mask], y_pred[non_holiday_mask]) if non_holiday_mask.sum() > 0 else 0

            # Log all metrics
            mlflow.log_metric("mae", mae)
            mlflow.log_metric("rmse", rmse)
            mlflow.log_metric("r2_score", r2)
            mlflow.log_metric("wmae", wmae)
            mlflow.log_metric("holiday_mae", holiday_mae)
            mlflow.log_metric("non_holiday_mae", non_holiday_mae)
            mlflow.log_metric("holiday_samples", holiday_mask.sum())
            mlflow.log_metric("non_holiday_samples", non_holiday_mask.sum())

            # Step 6: Results summary
            print("\n" + "=" * 60)
            print("🎯 EXPERIMENT 11 RESULTS SUMMARY")
            print("=" * 60)
            print(f"📊 Validation Metrics:")
            print(f"   WMAE (Competition Metric): ${wmae:,.2f}")
            print(f"   MAE: ${mae:,.2f}")
            print(f"   RMSE: ${rmse:,.2f}")
            print(f"   R²: {r2:.4f}")
            print(f"\n📊 Holiday Breakdown:")
            print(f"   Holiday MAE: ${holiday_mae:,.2f} ({holiday_mask.sum():,} samples)")
            print(f"   Non-Holiday MAE: ${non_holiday_mae:,.2f} ({non_holiday_mask.sum():,} samples)")
            print(f"\n📊 Model Statistics:")
            print(f"   Successful models trained: {len(models):,}")
            print(f"   Store-Dept combinations: {len(train_combinations):,}")
            print(f"   External regressors: {len(get_exog_features())}")
            print(f"   Features used: {len(feature_columns):,}")

            # Step 7: Save artifacts
            print(f"\n💾 Saving model artifacts...")

            # Save model summary
            model_summary = {
                'experiment_name': experiment_name,
                'run_id': run.info.run_id,
                'models_trained': len(models),
                'feature_columns': feature_columns,
                'external_regressors': get_exog_features(),
                'model_orders': {f"{k[0]}_{k[1]}": v for k, v in model_orders.items()},
                'seasonal_orders': {f"{k[0]}_{k[1]}": v for k, v in seasonal_orders.items()},
                'metrics': {
                    'wmae': wmae,
                    'mae': mae,
                    'rmse': rmse,
                    'r2': r2
                }
            }

            with open('sarimax_exp11_summary.json', 'w') as f:
                json.dump(model_summary, f, indent=2, default=str)

            mlflow.log_artifact('sarimax_exp11_summary.json')

            print("✅ Experiment 11 completed successfully!")

        except Exception as e:
            print(f"❌ Experiment failed: {e}")
            mlflow.log_param("error", str(e))
            raise

    print("\n🎉 Experiment 11: SARIMAX with Experiment 7 Features - COMPLETE!")


In [31]:
if __name__ == "__main__":
    main()

🚀 Starting Experiment 11: SARIMAX with Experiment 7 Features
🔧 Setting up MLflow and DagsHub...


✅ DagsHub initialized successfully!
✅ Created new experiment: Experiment_11_SARIMAX_20250710_152458
✅ MLflow setup complete!
🔗 Tracking URI: https://dagshub.com/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting.mlflow
📊 Experiment: Experiment_11_SARIMAX_20250710_152458
🔄 Starting MLflow run: 4f7a4e33667a430f9e993993cdcb28db

📊 Step 1: Data preprocessing...
🔄 Getting preprocessed data using pipeline...
📊 Loading datasets...
   📈 Train data: (421570, 5)
   🏪 Stores data: (45, 3)
   🎯 Features data: (8190, 12)
   ✅ Merged data: (421570, 17)
   📅 Date range: 2010-02-05 00:00:00 to 2012-10-26 00:00:00
🧹 Cleaning merged data...
   🔄 Resolving duplicate IsHoliday columns...
   ✅ Cleaned data: (421570, 16) (was (421570, 17))
📅 Creating temporal split (80/19)...
   📊 Split date: 2012-04-13 00:00:00
   📈 Train: 337,256 records (2010-02-05 00:00:00 to 2012-04-13 00:00:00)
   📉 Val: 84,314 records (2012-04-13 00:00:00 to 2012-10-26 00:00:00)
🔧 Fitting preprocessing pipeline on training d