<a href="https://colab.research.google.com/github/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting/blob/lodia/model_exp_FX_Prophet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install kaggle



In [2]:
# Mount Google Drive at /content/drive; a later cell copies kaggle.json from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Create the Kaggle config directory; -p makes the cell safe to re-run when
# the directory already exists.
! mkdir -p ~/.kaggle

In [4]:
# Copy the Kaggle API token from the mounted Drive folder into ~/.kaggle.
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

In [5]:
# Restrict the token file to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
# Download the competition archive into the current working directory.
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 848MB/s]


In [7]:
# Unpack the competition archive; -o overwrites existing files without
# prompting, so re-running the cell does not block waiting for input.
! unzip -o walmart-recruiting-store-sales-forecasting

Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [21]:
import pandas as pd
from prophet import Prophet
import numpy as np
import warnings
import joblib # Still useful for saving models if needed
import os
from datetime import datetime, timedelta
import logging
from sklearn.metrics import mean_absolute_error, mean_squared_error # Needed for metrics
import zipfile
# Suppress warnings
warnings.filterwarnings('ignore')
logging.getLogger('prophet').setLevel(logging.WARNING)
logging.getLogger('cmdstanpy').setLevel(logging.WARNING) # Suppress cmdstanpy warnings if they occur
logging.getLogger('cmdstanpy').setLevel(logging.ERROR)


In [28]:
class WalmartProphetPreprocessingPipeline:
    """
    Preprocessing pipeline for Prophet models.

    Loads and merges the competition CSVs, splits them chronologically,
    renames columns to Prophet's 'ds' (Date) / 'y' (Weekly_Sales) format and
    builds the holiday table that is handed to ``Prophet(holidays=...)``.
    """

    # Holiday dates per holiday name, as listed by the original author to align
    # with the dataset's IsHoliday flag. Insertion order fixes the row order of
    # the resulting holiday table.
    HOLIDAY_WEEKS = {
        'SuperBowl': ('2010-02-12', '2011-02-11', '2012-02-10'),
        'LaborDay': ('2010-09-10', '2011-09-09', '2012-09-07'),
        'Thanksgiving': ('2010-11-26', '2011-11-25', '2012-11-23'),
        # Christmas is often the last week of the year in the dataset.
        'Christmas': ('2010-12-31', '2011-12-30', '2012-12-28'),
    }

    def __init__(self):
        self.fitted = False
        self.holidays_df = None

    @staticmethod
    def _read_csv(data_dir, name):
        """Read ``name`` from ``data_dir``, falling back to ``name + '.zip'`` if only the archive exists."""
        path = os.path.join(data_dir, name)
        zipped = path + '.zip'
        if not os.path.exists(path) and os.path.exists(zipped):
            # pandas infers zip compression from the file extension.
            path = zipped
        return pd.read_csv(path)

    def load_and_prepare_data(self, data_dir='.'):
        """
        Load and merge necessary datasets for Prophet.

        Args:
            data_dir (str): Directory holding train.csv, features.csv and
                stores.csv (or their ``.zip`` archives). Defaults to the
                current working directory.

        Returns:
            pd.DataFrame: train rows joined with features and store metadata,
            sorted by Store, Dept, Date.
        """
        print("📊 Loading datasets...")

        train_df = self._read_csv(data_dir, 'train.csv')
        features_df = self._read_csv(data_dir, 'features.csv')
        stores_df = self._read_csv(data_dir, 'stores.csv')

        print(f"   📈 Train data: {train_df.shape}")
        print(f"   📊 Features data: {features_df.shape}")
        print(f"   🏪 Stores data: {stores_df.shape}")

        # Dates must be datetimes on both sides for the merge keys to match.
        train_df['Date'] = pd.to_datetime(train_df['Date'])
        features_df['Date'] = pd.to_datetime(features_df['Date'])

        merged_df = pd.merge(train_df, features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
        train_full = pd.merge(merged_df, stores_df, on=['Store'], how='left')

        # Sort by date within each series for time series consistency.
        train_full = train_full.sort_values(by=['Store', 'Dept', 'Date']).reset_index(drop=True)

        print(f"   ✅ Merged data: {train_full.shape}")
        print(f"   📅 Date range: {train_full['Date'].min()} to {train_full['Date'].max()}")

        return train_full

    def create_temporal_split(self, df, train_ratio=0.8):
        """
        Split ``df`` chronologically so validation rows are strictly later than training rows.

        Args:
            df (pd.DataFrame): Data with a 'Date' column.
            train_ratio (float): Fraction of the unique dates assigned to training.

        Returns:
            tuple: (train_data, val_data, split_info) where split_info records the
            split date, the sizes and the date range of each part.

        Raises:
            ValueError: If ``df`` contains no dates to split on.
        """
        # Round instead of truncating: int((1 - 0.8) * 100) is 19, not 20.
        train_pct = int(round(train_ratio * 100))
        print(f"📅 Creating temporal split ({train_pct}/{100 - train_pct})...")

        df_sorted = df.sort_values('Date').reset_index(drop=True)

        unique_dates = sorted(df_sorted['Date'].unique())
        total_weeks = len(unique_dates)
        if total_weeks == 0:
            raise ValueError("Cannot create a temporal split: no dates in the input data.")

        # Keep at least one date for training and, when there is more than one
        # date, at least one for validation.
        train_weeks = max(1, min(int(total_weeks * train_ratio), total_weeks - 1))
        split_date = unique_dates[train_weeks - 1]  # Last date of the training part

        train_data = df_sorted[df_sorted['Date'] <= split_date].copy()
        val_data = df_sorted[df_sorted['Date'] > split_date].copy()

        split_info = {
            'split_date': split_date,
            'train_size': len(train_data),
            'val_size': len(val_data),
            'train_date_range': (train_data['Date'].min(), train_data['Date'].max()),
            'val_date_range': (val_data['Date'].min(), val_data['Date'].max())
        }

        print(f"   📊 Split date: {split_date}")
        print(f"   📈 Train: {len(train_data):,} records ({train_data['Date'].min()} to {train_data['Date'].max()})")
        print(f"   📉 Val: {len(val_data):,} records ({val_data['Date'].min()} to {val_data['Date'].max()})")

        return train_data, val_data, split_info

    def fit(self, train_data):
        """
        Fit the preprocessing pipeline (prepare holidays).

        Builds ``self.holidays_df`` with the 'holiday' and 'ds' columns Prophet
        expects. Holidays before the first training date are dropped; holidays
        AFTER the training window are deliberately kept, because Prophet can
        only apply a holiday effect to forecast dates that appear in this table.

        Args:
            train_data (pd.DataFrame): Training rows with a 'Date' column.

        Returns:
            WalmartProphetPreprocessingPipeline: self, to allow chaining.
        """
        print("🔧 Preparing Prophet specific data (holidays)...")

        holidays = pd.DataFrame(
            [
                {'holiday': name, 'ds': day}
                for name, days in self.HOLIDAY_WEEKS.items()
                for day in days
            ]
        )
        holidays['ds'] = pd.to_datetime(holidays['ds'])

        min_date = train_data['Date'].min()
        if pd.notna(min_date):
            holidays = holidays[holidays['ds'] >= min_date]
        self.holidays_df = holidays.reset_index(drop=True)

        print("✅ Pipeline fitted on training data with holiday-aware settings")
        self.fitted = True
        return self

    def transform(self, data, is_validation=False):
        """
        Transform data into Prophet's required format (ds, y).

        Args:
            data (pd.DataFrame): Rows with 'Date' and, optionally, 'Weekly_Sales'.
            is_validation (bool): Only affects the progress message.

        Returns:
            pd.DataFrame: Copy of ``data`` with 'Date' renamed to 'ds' and
            'Weekly_Sales' renamed to 'y'; negative 'y' values are raised to 0
            and missing values stay NaN.

        Raises:
            ValueError: If called before ``fit``.
        """
        if not self.fitted:
            raise ValueError("Pipeline must be fitted before transform!")

        print(f"🔄 Transforming {'validation' if is_validation else 'training'} data...")

        # rename() returns a new frame, so the caller's data is left untouched.
        df = data.rename(columns={'Date': 'ds', 'Weekly_Sales': 'y'})

        # Sales cannot be negative. Frames without a target (e.g. a test set)
        # pass through unchanged instead of raising KeyError.
        if 'y' in df.columns:
            df['y'] = df['y'].clip(lower=0)

        print(f"✅ Transform complete. Shape: {df.shape}")
        return df

    def fit_transform(self, train_data):
        """Fit on ``train_data`` and return it in Prophet format."""
        return self.fit(train_data).transform(train_data, is_validation=False)

    def get_preprocessed_data(self, data_dir='.', train_ratio=0.8):
        """
        Orchestrates preprocessing steps to get model-ready data.

        Runs on this instance, so afterwards ``self.fitted`` is True and
        ``self.holidays_df`` is the table that was returned.

        Args:
            data_dir (str): Directory containing the competition CSVs.
            train_ratio (float): Fraction of unique dates used for training.

        Returns:
            train_data_prophet, val_data_prophet: DataFrames ready for Prophet
            split_info: Information about the temporal split
            holidays_df: DataFrame of holidays for Prophet
        """
        print("🔄 Getting preprocessed data using pipeline...")

        train_full = self.load_and_prepare_data(data_dir)

        train_data_raw, val_data_raw, split_info = self.create_temporal_split(train_full, train_ratio)

        # Fit on the training part only, then apply the same transform to both parts.
        self.fit(train_data_raw)
        train_data_prophet = self.transform(train_data_raw, is_validation=False)
        val_data_prophet = self.transform(val_data_raw, is_validation=True)

        holidays_df = self.holidays_df

        print(f"✅ Data preprocessing complete!")
        print(f"   📊 Training shape: {train_data_prophet.shape}")
        print(f"   📊 Validation shape: {val_data_prophet.shape}")

        return train_data_prophet, val_data_prophet, split_info, holidays_df

In [23]:
def calculate_wmae(y_true, y_pred, is_holiday, holiday_weight=5.0):
    """
    Calculate Weighted Mean Absolute Error (WMAE) as per competition rules.

    Each absolute error is weighted by ``holiday_weight`` where ``is_holiday``
    is truthy and by 1.0 elsewhere; the result is the weighted sum of absolute
    errors divided by the sum of the weights.

    Args:
        y_true: Actual values (array-like).
        y_pred: Predicted values (array-like), positionally matched to ``y_true``.
        is_holiday: Holiday flags (array-like), positionally matched to ``y_true``.
        holiday_weight (float): Weight applied to holiday rows.

    Returns:
        float: The WMAE, or NaN when there is nothing to average.
    """
    # Convert to plain arrays first: lists then support subtraction, and pandas
    # Series are compared by position instead of being aligned on their index.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    weights = np.where(np.asarray(is_holiday, dtype=bool), holiday_weight, 1.0)

    abs_errors = np.abs(y_true - y_pred)
    total_weight = np.sum(weights)
    if abs_errors.size == 0 or total_weight == 0:
        return float('nan')
    return float(np.sum(weights * abs_errors) / total_weight)

In [24]:
def train_prophet_models(train_data_prophet, holidays_df, min_observations=50):
    """
    Trains Prophet models for each unique (Store, Dept) combination.

    Series with fewer than ``min_observations`` rows are skipped. A series whose
    fit raises is counted as failed and left out of the result; the first few
    error messages are printed so failures are not silent.

    Args:
        train_data_prophet (pd.DataFrame): Training data in Prophet format (ds, y)
            with 'Store' and 'Dept' columns.
        holidays_df (pd.DataFrame): DataFrame of holidays for Prophet.
        min_observations (int): Minimum number of observations required to train a model.

    Returns:
        dict: A dictionary of trained Prophet models, keyed by (Store, Dept) tuple.
    """
    print(f"📈 Training Prophet models for each Store-Dept combination...")
    print(f"   ⏰ No time limit - training all combinations")

    # One pass over the frame; sort=False keeps series in order of first appearance.
    series_groups = train_data_prophet.groupby(['Store', 'Dept'], sort=False)
    total_combinations = series_groups.ngroups
    print(f"   📊 Training models for {total_combinations} combinations")
    print(f"   🎯 Training Prophet for all combinations")

    models = {}
    skipped_models_insufficient_data = 0
    failed_models_training_error = 0
    max_error_reports = 5  # Cap error printouts to avoid flooding the output

    for processed, ((store_id, dept_id), series_data) in enumerate(series_groups, start=1):
        if len(series_data) < min_observations:
            skipped_models_insufficient_data += 1
        else:
            try:
                # NOTE(review): the data looks weekly (one row per week per
                # series), in which case weekly_seasonality cannot be estimated
                # from it - confirm whether it should be disabled.
                m = Prophet(
                    yearly_seasonality=True,
                    weekly_seasonality=True,
                    holidays=holidays_df
                )
                # Prophet only needs the timestamp and the target.
                m.fit(series_data[['ds', 'y']])
                models[(store_id, dept_id)] = m
            except Exception as e:
                failed_models_training_error += 1
                if failed_models_training_error <= max_error_reports:
                    print(f"   ⚠️ Failed to train model for Store {store_id}, Dept {dept_id}: {e}")

        # Report after the series has been handled so the counts are accurate,
        # every 200 series and once more at the very end.
        if processed % 200 == 0 or processed == total_combinations:
            print(
                f"   ✅ Processed {processed}/{total_combinations} series "
                f"({len(models)} successful, {skipped_models_insufficient_data} skipped, "
                f"{failed_models_training_error} failed)"
            )

    successful_models = len(models)
    # Guard the percentage against an empty input frame.
    coverage = successful_models / total_combinations * 100 if total_combinations else 0.0

    print(f"✅ Prophet training complete!")
    print(f"   🎯 Successful models: {successful_models}")
    print(f"   ⏭️ Skipped (fewer than {min_observations} observations): {skipped_models_insufficient_data}")
    print(f"   ❌ Failed models: {failed_models_training_error}")
    print(f"   📊 Coverage: {successful_models}/{total_combinations} ({coverage:.1f}%)")

    return models

In [25]:
def make_prophet_predictions(models, val_data_prophet, train_data=None):
    """
    Makes predictions using trained Prophet models for the validation period.

    Every validation row gets a prediction. Rows of a series that has no model,
    or whose model raises during prediction, are predicted as 0; both cases are
    counted and reported.

    Args:
        models (dict): Dictionary of trained Prophet models keyed by (Store, Dept).
        val_data_prophet (pd.DataFrame): Validation data in Prophet format (ds, y)
            with 'Store', 'Dept' and 'IsHoliday' columns.
        train_data (pd.DataFrame): Not used; kept for signature consistency.

    Returns:
        tuple: (y_pred, y_true, is_holiday_flags) as NumPy arrays, ordered series
        by series in order of first appearance in ``val_data_prophet``.
    """
    print("📈 Making Prophet predictions (zero fallback when no model is available)...")

    predictions = []
    actuals = []
    holidays_flags = []

    successful_predictions_count = 0
    skipped_predictions_no_model = 0
    failed_predictions_error = 0
    failed_series = 0
    max_error_reports = 5  # Cap error printouts to avoid flooding the output

    # One pass over the frame; sort=False keeps series in order of first appearance.
    for (store_id, dept_id), series in val_data_prophet.groupby(['Store', 'Dept'], sort=False):
        n_rows = len(series)
        model = models.get((store_id, dept_id))
        yhat = None

        if model is None:
            skipped_predictions_no_model += n_rows
        else:
            try:
                # Forecast exactly the dates present in the validation set.
                forecast = model.predict(series[['ds']])
                # np.clip builds a new array, so negative forecasts are floored
                # without writing into the forecast's own buffer.
                yhat = np.clip(forecast['yhat'].to_numpy(), 0, None)
                successful_predictions_count += len(yhat)
            except Exception as e:
                failed_predictions_error += n_rows
                failed_series += 1
                if failed_series <= max_error_reports:
                    print(f"   ⚠️ Prediction failed for Store {store_id}, Dept {dept_id}: {e}")

        if yhat is None:
            yhat = np.zeros(n_rows)

        predictions.extend(yhat)
        actuals.extend(series['y'].values)
        holidays_flags.extend(series['IsHoliday'].values)

    print(f"✅ Predictions complete!")
    print(f"   🎯 Prophet predictions: {successful_predictions_count}")
    print(f"   ⏭️ Skipped (no model): {skipped_predictions_no_model}")
    print(f"   ❌ Failed (prediction error): {failed_predictions_error}")

    return np.array(predictions), np.array(actuals), np.array(holidays_flags).astype(bool)


In [30]:
def _collect_training_fit(models, train_data_prophet, max_error_reports=5):
    """Predict every modelled series over its own training dates and return (y_pred, y_true, is_holiday)."""
    fitted = []
    observed = []
    holiday_flags = []
    failed_series = 0

    # One pass over the frame instead of re-masking it for every series.
    for key, series in train_data_prophet.groupby(['Store', 'Dept'], sort=False):
        model = models.get(key)
        if model is None:
            # Series without a model are excluded from the training metric.
            continue
        try:
            # Negative forecasts are floored at zero, as for validation.
            yhat = np.clip(model.predict(series[['ds']])['yhat'].to_numpy(), 0, None)
        except Exception as e:
            # Zero-fill so the series still counts towards the metric, but
            # report the failure instead of hiding it.
            failed_series += 1
            if failed_series <= max_error_reports:
                print(f"   ⚠️ Failed to predict on training data for Store {key[0]}, Dept {key[1]}: {e}")
            yhat = np.zeros(len(series))

        fitted.extend(yhat)
        observed.extend(series['y'].values)
        holiday_flags.extend(series['IsHoliday'].values)

    if failed_series:
        print(f"   ⚠️ Training-fit prediction failed for {failed_series} series (zero-filled).")

    return np.array(fitted), np.array(observed), np.array(holiday_flags).astype(bool)


def main():
    """
    Main experiment execution for Prophet model.

    Preprocesses the data, trains one Prophet model per Store-Dept series,
    predicts the validation period, then prints training WMAE, validation
    WMAE/MAE/RMSE and a holiday / non-holiday MAE breakdown. Any exception is
    reported and re-raised.
    """
    print("🚀 Starting Experiment Prophet: Prophet Models for Walmart Sales Forecasting")
    print("=" * 80)

    try:
        # Step 1: Get preprocessed data
        print("\n📊 Step 1: Data preprocessing...")
        train_data_prophet, val_data_prophet, split_info, holidays_df = WalmartProphetPreprocessingPipeline().get_preprocessed_data()

        # Step 2: Train Prophet models
        print("\n📈 Step 2: Training Prophet models...")
        models = train_prophet_models(train_data_prophet, holidays_df)

        # Step 3: Make predictions on validation set
        print("\n📈 Step 3: Making predictions...")
        y_pred_val, y_true_val, is_holiday_val = make_prophet_predictions(models, val_data_prophet)

        # Step 3.5: Training performance. Prophet exposes no fitted values, so
        # each model predicts over its own training dates.
        print("\n📊 Step 3.5: Training performance...")
        print("📊 Calculating training WMAE on fitted values...")

        train_pred, train_true, train_is_holiday = _collect_training_fit(models, train_data_prophet)

        if len(train_true) > 0:
            train_wmae = calculate_wmae(train_true, train_pred, train_is_holiday)
            print(f"   📈 Training WMAE: ${train_wmae:,.2f}")
        else:
            train_wmae = None
            print("   ⚠️ No training data points available for WMAE calculation.")

        # Step 4: Calculate validation metrics
        print("\n📊 Step 4: Calculating validation metrics...")

        if len(y_true_val) > 0:
            val_mae = mean_absolute_error(y_true_val, y_pred_val)
            val_rmse = np.sqrt(mean_squared_error(y_true_val, y_pred_val))
            val_wmae = calculate_wmae(y_true_val, y_pred_val, is_holiday_val)
        else:
            # NaN rather than 0: a zero error would read as a perfect score.
            val_mae = val_rmse = val_wmae = float('nan')
            print("   ⚠️ Warning: No data points for evaluation. Metrics reported as NaN.")

        # Holiday breakdown for validation
        holiday_mask_val = is_holiday_val.astype(bool)
        holiday_mae_val = mean_absolute_error(y_true_val[holiday_mask_val], y_pred_val[holiday_mask_val]) if holiday_mask_val.any() else 0
        non_holiday_mae_val = mean_absolute_error(y_true_val[~holiday_mask_val], y_pred_val[~holiday_mask_val]) if (~holiday_mask_val).any() else 0

        # Print results
        print("\n" + "=" * 60)
        print("🎯 EXPERIMENT PROPHET RESULTS SUMMARY")
        print("=" * 60)

        print("📊 Training Metrics:")
        if train_wmae is not None:
            print(f"   Training WMAE: ${train_wmae:,.2f}")
        else:
            print("   Training WMAE: Not calculated (no data points).")
        print()

        print("📊 Validation Metrics:")
        print(f"   WMAE (Competition Metric): ${val_wmae:,.2f}")
        print(f"   MAE: ${val_mae:,.2f}")
        print(f"   RMSE: ${val_rmse:,.2f}")

        print("\n📊 Holiday Breakdown:")
        print(f"   Holiday MAE: ${holiday_mae_val:,.2f} ({int(holiday_mask_val.sum())} samples)")
        print(f"   Non-Holiday MAE: ${non_holiday_mae_val:,.2f} ({int((~holiday_mask_val).sum())} samples)")

        n_combinations = len(train_data_prophet[['Store', 'Dept']].drop_duplicates())
        print("\n📊 Model Statistics:")
        print(f"   Successful models trained: {len(models):,}")
        print(f"   Store-Dept combinations: {n_combinations:,}")
        print(f"   Combinations without a model: {n_combinations - len(models):,}")

        print("\n🎉 Experiment Prophet: Individual Prophet Models - COMPLETE!")

    except Exception as e:
        print(f"❌ Experiment failed: {e}")
        raise


In [17]:
if __name__ == "__main__":
    # Unpack the training archive into the working directory and load the CSV.
    with zipfile.ZipFile('train.csv.zip', mode='r') as zip_ref:
        zip_ref.extractall(path='.')
    train = pd.read_csv('train.csv')

    # Same for the features archive.
    with zipfile.ZipFile('features.csv.zip', mode='r') as zip_ref:
        zip_ref.extractall(path='.')
    features = pd.read_csv('features.csv')


In [31]:
main()

🚀 Starting Experiment Prophet: Prophet Models for Walmart Sales Forecasting

📊 Step 1: Data preprocessing...
🔄 Getting preprocessed data using pipeline...
📊 Loading datasets...
   📈 Train data: (421570, 5)
   📊 Features data: (8190, 12)
   🏪 Stores data: (45, 3)
   ✅ Merged data: (421570, 16)
   📅 Date range: 2010-02-05 00:00:00 to 2012-10-26 00:00:00
📅 Creating temporal split (80/19)...
   📊 Split date: 2012-04-06 00:00:00
   📈 Train: 335,761 records (2010-02-05 00:00:00 to 2012-04-06 00:00:00)
   📉 Val: 85,809 records (2012-04-13 00:00:00 to 2012-10-26 00:00:00)
🔧 Preparing Prophet specific data (holidays)...
✅ Pipeline fitted on training data with holiday-aware settings
🔄 Transforming training data...
✅ Transform complete. Shape: (335761, 16)
🔄 Transforming validation data...
✅ Transform complete. Shape: (85809, 16)
✅ Data preprocessing complete!
   📊 Training shape: (335761, 16)
   📊 Validation shape: (85809, 16)

📈 Step 2: Training Prophet models...
📈 Training Prophet models for e