<a href="https://colab.research.google.com/github/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting/blob/lodia/model_exp_prophet_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# experiment_7_.ipynb


In [2]:
!pip install kaggle dagshub mlflow wandb onnx -Uq
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [4]:
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting

walmart-recruiting-store-sales-forecasting.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
! unzip walmart-recruiting-store-sales-forecasting.zip

Archive:  walmart-recruiting-store-sales-forecasting.zip
replace features.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')
from tqdm.auto import tqdm # For progress bars
# Install required packages (Prophet specific)
import subprocess
import sys

In [9]:
try:
    import prophet
except ImportError:
    print("Prophet not found. Installing fbprophet...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "fbprophet"])
    import prophet
from prophet import Prophet # Explicitly import Prophet

# MLflow and DagsHub
import mlflow
import mlflow.sklearn # Retained for consistency, though Prophet isn't sklearn
import dagshub

# Scikit-learn (for metrics)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# For suppressing Prophet's verbose output
import logging
import os
from io import StringIO

print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


In [10]:
# Define the minimum number of observations required for a series to be trained by Prophet.
MIN_OBSERVATIONS_FOR_PROPHET = 50

# Configure CmdStanPy temporary directory and suppress its logging globally
colab_tmp_dir = '/tmp/cmdstanpy_tmp_colab' # Use /tmp or a specific path
os.makedirs(colab_tmp_dir, exist_ok=True)
os.environ['CMDSTANPY_TEMP'] = colab_tmp_dir
logging.getLogger('cmdstanpy').setLevel(logging.CRITICAL) # Set logging level to CRITICAL
warnings.filterwarnings('ignore', category=UserWarning, module='prophet')
warnings.filterwarnings('ignore', category=FutureWarning, module='prophet')
import logging
print(logging.getLogger('cmdstanpy').level)
# This should print 50, which corresponds to CRITICAL

50


In [11]:
class WalmartPreprocessingPipeline:
    """
    Complete preprocessing pipeline for Walmart sales data
    Supports fit/transform pattern for proper train/validation handling
    """

    def __init__(self):
        self.fitted = False
        self.outlier_thresholds = None
        self.feature_columns = None # To store actual features used for Prophet regressors

    def load_and_prepare_data(self):
        """Load and merge train.csv, stores.csv, features.csv datasets"""
        print("📊 Loading datasets...")

        # Load datasets
        train_df = pd.read_csv('train.csv')
        stores_df = pd.read_csv('stores.csv')
        features_df = pd.read_csv('features.csv')

        print(f"   📈 Train data: {train_df.shape}")
        print(f"   🏪 Stores data: {stores_df.shape}")
        print(f"   🎯 Features data: {features_df.shape}")

        # Convert Date column to datetime
        train_df['Date'] = pd.to_datetime(train_df['Date'])
        features_df['Date'] = pd.to_datetime(features_df['Date'])

        # Merge datasets
        train_stores = train_df.merge(stores_df, on='Store', how='left')
        train_full = train_stores.merge(features_df, on=['Store', 'Date'], how='left')

        print(f"   ✅ Merged data: {train_full.shape}")
        print(f"   📅 Date range: {train_full['Date'].min()} to {train_full['Date'].max()}")

        return train_full

    def clean_merged_data(self, train_full):
        """Clean merged data by handling duplicate IsHoliday columns"""
        print("🧹 Cleaning merged data...")

        initial_shape = train_full.shape

        # Handle duplicate IsHoliday columns if they exist
        if 'IsHoliday_x' in train_full.columns and 'IsHoliday_y' in train_full.columns:
            print("   🔄 Resolving duplicate IsHoliday columns...")
            train_full['IsHoliday'] = train_full['IsHoliday_x'] | train_full['IsHoliday_y']
            train_full = train_full.drop(['IsHoliday_x', 'IsHoliday_y'], axis=1)
        elif 'IsHoliday_x' in train_full.columns: # If only _x exists, rename it
            train_full = train_full.rename(columns={'IsHoliday_x': 'IsHoliday'})
        elif 'IsHoliday_y' in train_full.columns: # If only _y exists, rename it
            train_full = train_full.rename(columns={'IsHoliday_y': 'IsHoliday'})

        print(f"   ✅ Cleaned data: {train_full.shape} (was {initial_shape})")
        return train_full

    def create_temporal_split(self, df, train_ratio=0.8):
        """Create temporal split to prevent data leakage"""
        print(f"📅 Creating temporal split ({int(train_ratio*100)}/{int((1-train_ratio)*100)})...")

        # Sort by date to ensure temporal order
        df_sorted = df.sort_values('Date').reset_index(drop=True)

        # Find split point
        split_idx = int(len(df_sorted) * train_ratio)
        split_date = df_sorted.iloc[split_idx]['Date']

        # Create splits
        train_data = df_sorted.iloc[:split_idx].copy()
        val_data = df_sorted.iloc[split_idx:].copy()

        # Create split info dictionary
        split_info = {
            'split_date': split_date,
            'train_size': len(train_data),
            'val_size': len(val_data),
            'train_date_range': (train_data['Date'].min(), train_data['Date'].max()),
            'val_date_range': (val_data['Date'].min(), val_data['Date'].max())
        }

        print(f"   📊 Split date: {split_date}")
        print(f"   📈 Train: {len(train_data):,} records ({train_data['Date'].min()} to {train_data['Date'].max()})")
        print(f"   📉 Val: {len(val_data):,} records ({val_data['Date'].min()} to {val_data['Date'].max()})")

        return train_data, val_data, split_info

    def fit(self, train_data):
        """Fit the preprocessing pipeline on training data"""
        print("🔧 Fitting preprocessing pipeline on training data...")

        # Fit outlier removal thresholds on training data only
        # These are fixed for this pipeline, but fit concept remains
        self.outlier_thresholds = {
            'A': {'lower': -1000, 'upper': 50000},  # Type A stores
            'B': {'lower': -500, 'upper': 25000},   # Type B stores
            'C': {'lower': -200, 'upper': 15000}    # Type C stores
        }

        print("✅ Pipeline fitted on training data")
        self.fitted = True
        return self

    def transform(self, data, is_validation=False):
        """Transform data using fitted pipeline"""
        if not self.fitted:
            raise ValueError("Pipeline must be fitted before transform!")

        print(f"🔄 Transforming {'validation' if is_validation else 'training'} data...")

        df = data.copy()

        # Step 1: Create date features
        df = self._create_date_features(df)

        # Step 2: Create holiday features
        df = self._create_holiday_features(df)

        # Step 3: Encode categorical features (BEFORE outlier removal!)
        df = self._encode_categorical_features(df)

        # Step 4: Lag features are DISABLED as per original code's intent
        df = self._create_lag_features_training(df) # Still called to ensure it does nothing

        # Step 5: Remove outliers (only on training data)
        # Note: Outlier removal is applied only to the training set BEFORE the temporal split
        # and then passed here. So this only applies if `is_validation` is False.
        if not is_validation:
            df = self._remove_outliers(df)

        # Step 6: Remove markdown features
        df = self._remove_markdown_features(df)

        # Step 7: Remove redundant features (except Store, Dept, Date, Weekly_Sales)
        df = self._remove_redundant_features(df)

        # Step 8: Rename for Prophet and identify features
        df = df.rename(columns={'Date': 'ds', 'Weekly_Sales': 'y'})

        # Identify features that will be used as regressors
        # These are all columns EXCEPT 'ds', 'y', 'Store', 'Dept'
        self.feature_columns = [col for col in df.columns if col not in ['ds', 'y', 'Store', 'Dept']]

        print(f"✅ Transform complete. Shape: {df.shape}")
        return df

    def fit_transform(self, train_data):
        """Fit and transform training data in one step"""
        return self.fit(train_data).transform(train_data, is_validation=False)

    def _create_date_features(self, df):
        """Create date features"""
        df = df.copy()
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month
        df['Day'] = df['Date'].dt.day
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df['WeekOfYear'] = df['Date'].dt.isocalendar().week.astype(int) # Ensure int for some operations
        df['Quarter'] = df['Date'].dt.quarter
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)
        df['IsMonthStart'] = df['Date'].dt.is_month_start.astype(int)
        df['IsMonthEnd'] = df['Date'].dt.is_month_end.astype(int)
        df['IsQuarterStart'] = df['Date'].dt.is_quarter_start.astype(int)
        df['IsQuarterEnd'] = df['Date'].dt.is_quarter_end.astype(int)
        start_date = df['Date'].min()
        df['DaysFromStart'] = (df['Date'] - start_date).dt.days
        df['WeeksFromStart'] = df['DaysFromStart'] // 7
        return df

    def _create_holiday_features(self, df):
        """Create holiday features"""
        df = df.copy()
        # Convert date strings to datetime objects for accurate comparison
        super_bowl_dates = pd.to_datetime(['2010-02-12', '2011-02-11', '2012-02-10'])
        labor_day_dates = pd.to_datetime(['2010-09-10', '2011-09-09', '2012-09-07'])
        thanksgiving_dates = pd.to_datetime(['2010-11-26', '2011-11-25', '2012-11-23'])
        christmas_dates = pd.to_datetime(['2010-12-31', '2011-12-30', '2012-12-28'])

        df['IsSuperBowlWeek'] = df['Date'].isin(super_bowl_dates).astype(int)
        df['IsLaborDayWeek'] = df['Date'].isin(labor_day_dates).astype(int)
        df['IsThanksgivingWeek'] = df['Date'].isin(thanksgiving_dates).astype(int)
        df['IsChristmasWeek'] = df['Date'].isin(christmas_dates).astype(int)
        df['IsMajorHoliday'] = (df['IsSuperBowlWeek'] | df['IsLaborDayWeek'] |
                               df['IsThanksgivingWeek'] | df['IsChristmasWeek']).astype(int)
        df['IsHolidayMonth'] = df['Month'].isin([11, 12]).astype(int)
        df['IsBackToSchool'] = df['Month'].isin([8, 9]).astype(int)
        return df

    def _create_lag_features_training(self, df):
        """Create lag features for training data - DISABLED to reduce overfitting"""
        # Lag features removed to prevent overfitting
        return df

    def _create_lag_features_validation(self, df):
        """Create lag features for validation data - DISABLED to reduce overfitting"""
        # Lag features removed to prevent overfitting
        return df

    def _remove_outliers(self, df):
        """Remove outliers from training data only"""
        initial_len = len(df)
        df_clean = df.copy()

        # Before outlier removal, ensure 'Type' column exists and is used to mask
        # If 'Type' has been one-hot encoded, we need to recreate it for masking
        if 'Type_A' in df_clean.columns and 'Type_B' in df_clean.columns and 'Type_C' in df_clean.columns:
            # Recreate a temporary 'Type' column for filtering
            # Initialize with an empty string or 'Unknown' to ensure string dtype
            df_clean['Temp_Type'] = ''
            df_clean.loc[df_clean['Type_A'] == 1, 'Temp_Type'] = 'A'
            df_clean.loc[df_clean['Type_B'] == 1, 'Temp_Type'] = 'B'
            df_clean.loc[df_clean['Type_C'] == 1, 'Temp_Type'] = 'C'
            # If there are rows where none of A, B, C are 1 (e.g., original Type was NaN),
            # they will remain '', which is fine for type consistency.
        elif 'Type' in df_clean.columns:
            df_clean['Temp_Type'] = df_clean['Type']
        else:
            print("   ⚠️ Warning: 'Type' or 'Type_X' columns not found for outlier removal by type. Skipping type-specific outlier removal.")
            return df

        for store_type, thresholds in self.outlier_thresholds.items():
            type_mask = df_clean['Temp_Type'] == store_type
            outlier_mask = (
                (df_clean['Weekly_Sales'] < thresholds['lower']) |
                (df_clean['Weekly_Sales'] > thresholds['upper'])
            )
            df_clean = df_clean[~(type_mask & outlier_mask)]

        # Drop the temporary type column
        if 'Temp_Type' in df_clean.columns:
            df_clean = df_clean.drop('Temp_Type', axis=1)

        removed = initial_len - len(df_clean)
        print(f"   🗑️ Removed {removed:,} outliers from training data")
        return df_clean

    def _remove_markdown_features(self, df):
        """Remove markdown columns"""
        markdown_cols = [col for col in df.columns if 'MarkDown' in col]
        if markdown_cols:
            df = df.drop(markdown_cols, axis=1)
        return df

    def _remove_redundant_features(self, df):
        """Remove redundant features for Prophet"""
        # Keep 'Store', 'Dept', 'Date' (ds), 'Weekly_Sales' (y) for per-series processing
        # 'Year' and 'WeekOfYear' are often implicitly handled by Prophet's seasonality
        # or can cause collinearity issues with 'ds' for simple linear trends.
        # However, for consistency with original pipeline, remove some.
        redundant_cols = ['Year', 'Quarter', 'Day', # WeekOfYear is kept for now as it's often a good regressor
                         'IsQuarterStart', 'IsQuarterEnd']
        existing_redundant = [col for col in redundant_cols if col in df.columns]
        if existing_redundant:
            df = df.drop(existing_redundant, axis=1)
        return df

    def _encode_categorical_features(self, df):
        """Encode categorical features using both one-hot and label encoding"""
        df = df.copy()

        if 'Type' in df.columns:
            print(f"   🔧 Encoding Type column using both one-hot and label encoding...")

            # One-hot encoding (existing approach) - Prophet can use these as regressors
            type_dummies = pd.get_dummies(df['Type'], prefix='Type', dtype=int)

            # Label encoding (experiment_2 approach) - Prophet can also use this as a regressor
            # A=0, B=1, C=2
            type_mapping = {'A': 0, 'B': 1, 'C': 2}
            df['Type_Encoded'] = df['Type'].map(type_mapping)

            # Add one-hot columns
            for col in type_dummies.columns:
                df[col] = type_dummies[col]

            # Remove original Type column
            df = df.drop('Type', axis=1)

            print(f"   ✅ Added both Type_Encoded and {list(type_dummies.columns)}")

        return df

In [12]:
def setup_mlflow_prophet():
    """Setup MLflow and DagsHub tracking for Prophet"""
    print("🔧 Setting up MLflow and DagsHub for Prophet...")

    # End any active runs first
    try:
        mlflow.end_run()
    except:
        pass

    # Initialize DagsHub
    try:
        dagshub.init(
            repo_owner='konstantine25b',
            repo_name='Walmart-Recruiting---Store-Sales-Forecasting',
            mlflow=True
        )
        print("✅ DagsHub initialized successfully!")
    except Exception as e:
        print(f"⚠️ DagsHub init warning: {e}")

    # Set MLflow tracking URI
    mlflow.set_tracking_uri("https://dagshub.com/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting.mlflow")

    # Create unique experiment name with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_name = f"Experiment_Prophet_{timestamp}" # Changed experiment name for Prophet

    try:
        experiment_id = mlflow.create_experiment(experiment_name)
        print(f"✅ Created new experiment: {experiment_name}")
    except mlflow.exceptions.MlflowException as e:
        if "already exists" in str(e):
            experiment = mlflow.get_experiment_by_name(experiment_name)
            experiment_id = experiment.experiment_id
            print(f"✅ Using existing experiment: {experiment_name}")
        else:
            # Fallback to default experiment
            experiment_name = "Default"
            mlflow.set_experiment(experiment_name)
            print(f"⚠️ Using default experiment due to: {e}")

    mlflow.set_experiment(experiment_name)

    print(f"✅ MLflow setup complete!")
    print(f"🔗 Tracking URI: {mlflow.get_tracking_uri()}")
    print(f"📊 Experiment: {experiment_name}")

    return experiment_name

In [13]:
def get_preprocessed_data_prophet():
    """
    Use preprocessing pipeline to get model-ready data for Prophet

    Returns:
        train_df_prophet: DataFrame for training (ds, y, Store, Dept, regressors)
        val_df_prophet: DataFrame for validation (ds, y, Store, Dept, regressors)
        train_holidays, val_holidays: Holiday indicators for WMAE
        split_info: Information about the temporal split
        regressor_features: List of column names used as Prophet regressors
    """
    print("🔄 Getting preprocessed data using pipeline for Prophet...")

    # Create the preprocessing pipeline
    pipeline = WalmartPreprocessingPipeline()

    # Load raw data
    train_full = pipeline.load_and_prepare_data()
    train_full = pipeline.clean_merged_data(train_full)

    # Create temporal split
    # train_data will be used to fit the pipeline and its outlier removal
    # val_data will be transformed using the fitted pipeline
    train_data_raw, val_data_raw, split_info = pipeline.create_temporal_split(train_full)

    # Fit and transform training data (outliers removed from train_data_raw)
    train_df_prophet = pipeline.fit_transform(train_data_raw) # This handles outlier removal for train

    # Transform validation data
    val_df_prophet = pipeline.transform(val_data_raw, is_validation=True) # No outlier removal for val

    # Extract holiday information AFTER preprocessing for WMAE calculation
    # Ensure 'IsHoliday' column is handled consistently, it should be 'IsHoliday' in the output df
    train_holidays = train_df_prophet['IsHoliday'].values.astype(bool)
    val_holidays = val_df_prophet['IsHoliday'].values.astype(bool)

    # Get the list of regressors from the pipeline after transformation
    regressor_features = pipeline.feature_columns

    print(f"✅ Data ready for Prophet:")
    print(f"   Train DF: {train_df_prophet.shape} (ds, y, Store, Dept, {len(regressor_features)} regressors)")
    print(f"   Val DF:   {val_df_prophet.shape} (ds, y, Store, Dept, {len(regressor_features)} regressors)")
    print(f"   Regressor Features: {regressor_features}")
    print(f"   train_holidays: {train_holidays.shape} ({train_holidays.sum()} holidays)")
    print(f"   val_holidays: {val_holidays.shape} ({val_holidays.sum()} holidays)")

    return train_df_prophet, val_df_prophet, train_holidays, val_holidays, split_info, regressor_features


In [14]:
def calculate_metrics(y_true, y_pred, is_holiday=None):
    """Calculate evaluation metrics including WMAE with correct Walmart formula"""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    # Calculate WMAE (Weighted Mean Absolute Error) - Walmart competition formula
    # w_i = 5 if holiday week, 1 otherwise
    if is_holiday is not None and len(is_holiday) == len(y_true):
        weights = np.where(is_holiday, 5, 1)  # 5 for holidays, 1 for regular weeks
    else:
        weights = np.ones(len(y_true))  # Default to all 1s if no holiday info or mismatch

    wmae = np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

    return {
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'wmae': wmae
    }

In [15]:
def log_prophet_results_to_mlflow(train_metrics, val_metrics, num_train_series, num_val_series, params, feature_categories, regressor_features):
    """Log Prophet training results to MLflow after training is complete"""
    print("\n📊 Logging results to MLflow...")

    try:
        # Setup MLflow
        experiment_name = setup_mlflow_prophet()

        with mlflow.start_run(run_name="Prophet_Walmart_Sales"):
            # Log parameters (Prophet's general parameters + regressor count)
            mlflow.log_params(params)
            mlflow.log_param("num_regressors", len(regressor_features))
            mlflow.log_param("regressors_list", str(regressor_features)) # Log the list of regressors

            # Log metrics
            for metric_name, value in train_metrics.items():
                mlflow.log_metric(f"train_{metric_name}", value)

            for metric_name, value in val_metrics.items():
                mlflow.log_metric(f"val_{metric_name}", value)

            # Log feature categories (relevant for understanding regressors)
            for category, features in feature_categories.items():
                if features:
                    mlflow.log_param(f"features_cat_{category}", str(features))

            # Log data split info
            mlflow.log_params({
                'num_train_series': num_train_series,
                'num_val_series': num_val_series
            })

            # Note: We are not logging individual Prophet models due to their large number
            mlflow.log_artifact(__file__, "prophet_script.py") # Log the script itself

            run_id = mlflow.active_run().info.run_id
            print(f"✅ Results logged to MLflow run: {run_id}")

    except Exception as e:
        print(f"⚠️ MLflow logging failed: {e}")
        print("   Training results are still valid, just not logged to MLflow")


In [16]:
def train_prophet_models(train_df_prophet, val_df_prophet, train_holidays=None, val_holidays=None, regressor_features=None):
    """Train Prophet models per Store-Dept series"""
    print("🚀 Training Prophet models...")

    if regressor_features is None:
        raise ValueError("regressor_features must be provided for Prophet models.")

    print(f"   📋 Total Regressors: {len(regressor_features)}")
    print(f"   📋 Regressor List: {regressor_features}")

    # Categorize features for better understanding (similar to XGBoost for context)
    feature_categories = {
        'Store_Info': [f for f in regressor_features if f in ['Size']],
        'Economic': [f for f in regressor_features if f in ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']],
        'Date_Features': [f for f in regressor_features if f in ['Month', 'DayOfWeek', 'WeeksFromStart', 'WeekOfYear']],
        'Holiday_Features': [f for f in regressor_features if 'Holiday' in f or 'BackToSchool' in f],
        'Type_OneHot': [f for f in regressor_features if f.startswith('Type_') and f != 'Type_Encoded'],
        'Type_Label': [f for f in regressor_features if f == 'Type_Encoded'],
        'Boolean_Features': [f for f in regressor_features if f in ['IsWeekend', 'IsMonthStart', 'IsMonthEnd']],
    }

    print(f"   📊 Regressor Categories:")
    for category, features in feature_categories.items():
        if features:
            print(f"      {category}: {features}")

    # Prophet general parameters (from good practices)
    prophet_params = {
        'yearly_seasonality': True,
        'weekly_seasonality': True,
        'daily_seasonality': False, # Daily not relevant for weekly data
        'seasonality_mode': 'multiplicative', # Good for sales data
        'growth': 'linear'
    }
    print(f"   📋 Prophet Parameters: {prophet_params}")
    print(f"   🔄 Training individual Prophet models per Store-Dept series...")

    # Identify all unique Store-Dept combinations
    unique_series = train_df_prophet[['Store', 'Dept']].drop_duplicates()
    total_series_count = len(unique_series)
    print(f"   📊 Found {total_series_count} unique Store-Dept series.")

    all_train_preds = []
    all_train_actuals = []
    all_train_weights = []

    all_val_preds = []
    all_val_actuals = []
    all_val_weights = []

    # Keep track of holidays for WMAE (already extracted to separate arrays)
    # We will need to map these back to the predictions for WMAE correctly

    series_processed_count = 0
    skipped_short_series_count = 0
    failed_fit_series_count = 0
    failed_predict_series_count = 0

    # Collect actuals and holidays mapped by (Store, Dept, ds) for easier lookup
    # This is crucial for correctly applying WMAE weights later
    train_actuals_map = train_df_prophet.set_index(['Store', 'Dept', 'ds'])[['y', 'IsHoliday']].to_dict('index')
    val_actuals_map = val_df_prophet.set_index(['Store', 'Dept', 'ds'])[['y', 'IsHoliday']].to_dict('index')


    for index, row in tqdm(unique_series.iterrows(), total=total_series_count, desc="Processing series"):
        store_id = row['Store']
        dept_id = row['Dept']

        # Filter data for current series
        series_train_df = train_df_prophet[(train_df_prophet['Store'] == store_id) & (train_df_prophet['Dept'] == dept_id)].copy()
        series_val_df = val_df_prophet[(val_df_prophet['Store'] == store_id) & (val_df_prophet['Dept'] == dept_id)].copy()

        # Check minimum observations for Prophet
        if len(series_train_df) < MIN_OBSERVATIONS_FOR_PROPHET:
            skipped_short_series_count += 1
            # For skipped series, predict 0 for all validation periods
            if not series_val_df.empty:
                for _, val_row in series_val_df.iterrows():
                    all_val_preds.append(0.0)
                    actual_data = val_actuals_map.get((val_row['Store'], val_row['Dept'], val_row['ds']))
                    if actual_data:
                        all_val_actuals.append(actual_data['y'])
                        all_val_weights.append(5 if actual_data['IsHoliday'] else 1)
                    else: # Fallback if somehow not in map
                        all_val_actuals.append(np.nan)
                        all_val_weights.append(1)
            continue

        series_processed_count += 1

        # Initialize Prophet model
        m = Prophet(**prophet_params)

        # Add regressors
        for regressor in regressor_features:
            if regressor in series_train_df.columns: # Ensure regressor exists for this series
                m.add_regressor(regressor)

        # --- AGGRESSIVE OUTPUT SUPPRESSION FOR MODEL FIT ---
        original_stdout = sys.stdout
        original_stderr = sys.stderr
        sys.stdout = StringIO()
        sys.stderr = StringIO()
        try:
            m.fit(series_train_df[['ds', 'y'] + [f for f in regressor_features if f in series_train_df.columns]])
        except Exception as e:
            failed_fit_series_count += 1
            if not series_val_df.empty:
                 for _, val_row in series_val_df.iterrows():
                    all_val_preds.append(0.0) # Default to 0 for failed fit
                    actual_data = val_actuals_map.get((val_row['Store'], val_row['Dept'], val_row['ds']))
                    if actual_data:
                        all_val_actuals.append(actual_data['y'])
                        all_val_weights.append(5 if actual_data['IsHoliday'] else 1)
                    else:
                        all_val_actuals.append(np.nan)
                        all_val_weights.append(1)
            continue # Skip to the next series if fit fails
        finally:
            sys.stdout = original_stdout
            sys.stderr = original_stderr
        # --- END AGGRESSIVE OUTPUT SUPPRESSION FOR MODEL FIT ---

        # Generate IN-SAMPLE predictions for training metrics
        train_future_df = series_train_df[['ds'] + [f for f in regressor_features if f in series_train_df.columns]]
        if not train_future_df.empty:
            sys.stdout = StringIO()
            sys.stderr = StringIO()
            try:
                train_forecast = m.predict(train_future_df)
                train_preds_series = train_forecast['yhat'].clip(lower=0).values
                all_train_preds.extend(train_preds_series)

                for _, train_row in series_train_df.iterrows():
                    actual_data = train_actuals_map.get((train_row['Store'], train_row['Dept'], train_row['ds']))
                    if actual_data:
                        all_train_actuals.append(actual_data['y'])
                        all_train_weights.append(5 if actual_data['IsHoliday'] else 1)
                    else:
                        all_train_actuals.append(np.nan)
                        all_train_weights.append(1)

            except Exception as e:
                # If in-sample prediction fails, just pass (not critical for overall metrics)
                pass
            finally:
                sys.stdout = original_stdout
                sys.stderr = original_stderr

        # Generate OUT-OF-SAMPLE predictions for validation metrics
        if not series_val_df.empty:
            val_future_df = series_val_df[['ds'] + [f for f in regressor_features if f in series_val_df.columns]]
            sys.stdout = StringIO()
            sys.stderr = StringIO()
            try:
                val_forecast = m.predict(val_future_df)
                val_preds_series = val_forecast['yhat'].clip(lower=0).values
                all_val_preds.extend(val_preds_series)

                for _, val_row in series_val_df.iterrows():
                    actual_data = val_actuals_map.get((val_row['Store'], val_row['Dept'], val_row['ds']))
                    if actual_data:
                        all_val_actuals.append(actual_data['y'])
                        all_val_weights.append(5 if actual_data['IsHoliday'] else 1)
                    else:
                        all_val_actuals.append(np.nan)
                        all_val_weights.append(1)

            except Exception as e:
                failed_predict_series_count += 1
                # If prediction fails, default to 0 for this series in validation
                for _, val_row in series_val_df.iterrows():
                    all_val_preds.append(0.0) # Default to 0
                    actual_data = val_actuals_map.get((val_row['Store'], val_row['Dept'], val_row['ds']))
                    if actual_data:
                        all_val_actuals.append(actual_data['y'])
                        all_val_weights.append(5 if actual_data['IsHoliday'] else 1)
                    else:
                        all_val_actuals.append(np.nan)
                        all_val_weights.append(1)
            finally:
                sys.stdout = original_stdout
                sys.stderr = original_stderr


    print(f"\n   Summary of series processing:")
    print(f"      Total series identified: {total_series_count}")
    print(f"      Series trained: {series_processed_count}")
    print(f"      Series skipped (too short): {skipped_short_series_count}")
    print(f"      Series failed to fit: {failed_fit_series_count}")
    print(f"      Series failed to predict (val): {failed_predict_series_count}")

    # Convert collected lists to numpy arrays for aggregated metric calculation
    # Filter out NaNs where actuals might be missing (shouldn't happen if maps are complete)
    y_train_pred = np.array(all_train_preds)
    y_train_actual = np.array(all_train_actuals)
    train_weights = np.array(all_train_weights)

    y_val_pred = np.array(all_val_preds)
    y_val_actual = np.array(all_val_actuals)
    val_weights = np.array(all_val_weights)

    # Filter out NaNs if any values were missing
    valid_train_indices = ~np.isnan(y_train_pred) & ~np.isnan(y_train_actual)
    y_train_pred = y_train_pred[valid_train_indices]
    y_train_actual = y_train_actual[valid_train_indices]
    train_weights = train_weights[valid_train_indices]

    valid_val_indices = ~np.isnan(y_val_pred) & ~np.isnan(y_val_actual)
    y_val_pred = y_val_pred[valid_val_indices]
    y_val_actual = y_val_actual[valid_val_indices]
    val_weights = val_weights[valid_val_indices]

    # Calculate metrics
    train_metrics = calculate_metrics(y_train_actual, y_train_pred, is_holiday=(train_weights == 5))
    val_metrics = calculate_metrics(y_val_actual, y_val_pred, is_holiday=(val_weights == 5))

    return {
        'train_metrics': train_metrics,
        'val_metrics': val_metrics,
        'params': prophet_params, # Prophet's general parameters
        'feature_categories': feature_categories,
        'regressor_features': regressor_features,
        'num_train_series': series_processed_count, # Actual trained series
        'num_val_series': total_series_count # All series attempted for validation prediction
    }


In [17]:
def main():
    """Main experiment pipeline for Prophet"""
    print("🎯 EXPERIMENT : Prophet with Preprocessing Pipeline")
    print("=" * 60)

    # Get preprocessed data for Prophet
    train_df_prophet, val_df_prophet, train_holidays, val_holidays, split_info, regressor_features = get_preprocessed_data_prophet()

    # Train Prophet models
    model_info = train_prophet_models(
        train_df_prophet, val_df_prophet, train_holidays, val_holidays, regressor_features
    )

    # Display comprehensive results
    print(f"\n✅ Prophet Training Complete!")
    print(f"=" * 50)
    print(f"📊 TRAINING METRICS (Aggregated In-Sample):")
    print(f"   WMAE: {model_info['train_metrics']['wmae']:.2f}")
    print(f"   RMSE: {model_info['train_metrics']['rmse']:.2f}")
    print(f"   MAE: {model_info['train_metrics']['mae']:.2f}")
    print(f"   R²: {model_info['train_metrics']['r2']:.4f}")

    print(f"\n📊 VALIDATION METRICS (Aggregated Out-of-Sample):")
    print(f"   WMAE: {model_info['val_metrics']['wmae']:.2f} ⭐")
    print(f"   RMSE: {model_info['val_metrics']['rmse']:.2f}")
    print(f"   MAE: {model_info['val_metrics']['mae']:.2f}")
    print(f"   R²: {model_info['val_metrics']['r2']:.4f}")

    # Holiday weight analysis
    # These counts reflect the data frames after processing and potential outlier removal
    print(f"\n🎄 HOLIDAY ANALYSIS:")
    print(f"   Training holiday weeks: {(train_holidays.sum() / len(train_holidays)) * 100:.1f}%")
    print(f"   Validation holiday weeks: {(val_holidays.sum() / len(val_holidays)) * 100:.1f}%")
    print(f"   Holiday weight multiplier: 5x")

    # Calculate overfitting metrics
    wmae_ratio = model_info['val_metrics']['wmae'] / model_info['train_metrics']['wmae']
    r2_diff = model_info['train_metrics']['r2'] - model_info['val_metrics']['r2']

    print(f"\n🔍 OVERFITTING ANALYSIS:")
    print(f"   WMAE Ratio (val/train): {wmae_ratio:.2f}")
    print(f"   R² Difference (train-val): {r2_diff:.4f}")
    if wmae_ratio > 2.0:
        print("   ⚠️ High overfitting detected (WMAE ratio > 2.0)")
    elif wmae_ratio > 1.5:
        print("   ⚠️ Moderate overfitting detected (WMAE ratio > 1.5)")
    else:
        print("   ✅ Reasonable generalization")

    # Feature importance explanation for Prophet
    print(f"\n🎯 FEATURE IMPORTANCE (Prophet):")
    print(f"   Prophet does not provide a single global feature importance score like tree models.")
    print(f"   The influence of individual regressors can be observed through their coefficients in each trained model,")
    print(f"   or by analyzing component plots for specific series. The regressors used were:")
    for i, feature in enumerate(model_info['regressor_features']):
        print(f"      - {feature}")

    print(f"\n📈 MODEL INFO:")
    print(f"   Training series: {model_info['num_train_series']:,}")
    print(f"   Validation series: {model_info['num_val_series']:,}")
    print(f"   Number of regressors: {len(model_info['regressor_features'])}")
    print(f"   Prophet Parameters: {model_info['params']}")

    # Log results to MLflow after training is complete
    log_prophet_results_to_mlflow(
        model_info['train_metrics'], model_info['val_metrics'],
        model_info['num_train_series'], model_info['num_val_series'],
        model_info['params'], model_info['feature_categories'], model_info['regressor_features']
    )

    print(f"\n🎉 EXPERIMENT  COMPLETED!")
    print(f"=" * 60)
    print(f"🏆 Final Validation WMAE (Aggregated): {model_info['val_metrics']['wmae']:.2f}")
    print(f"📊 Final Validation R² (Aggregated): {model_info['val_metrics']['r2']:.4f}")
    print(f"🎯 This experiment uses the correct Walmart WMAE formula:")
    print(f"   • Holiday weeks weighted 5x")
    print(f"   • Regular weeks weighted 1x")
    print(f"   • Per-series Prophet training with date and holiday features")
    print(f"   • Lag features explicitly disabled to prevent overfitting")


In [18]:
if __name__ == "__main__":
    # Ensure raw data is unzipped before running main
    with zipfile.ZipFile('train.csv.zip', 'r') as zip_ref:
        zip_ref.extractall('.')
    train = pd.read_csv('train.csv')
    with zipfile.ZipFile('features.csv.zip', 'r') as zip_ref:
        zip_ref.extractall('.')
    features = pd.read_csv('features.csv')
    main()

🎯 EXPERIMENT : Prophet with Preprocessing Pipeline
🔄 Getting preprocessed data using pipeline for Prophet...
📊 Loading datasets...
   📈 Train data: (421570, 5)
   🏪 Stores data: (45, 3)
   🎯 Features data: (8190, 12)
   ✅ Merged data: (421570, 17)
   📅 Date range: 2010-02-05 00:00:00 to 2012-10-26 00:00:00
🧹 Cleaning merged data...
   🔄 Resolving duplicate IsHoliday columns...
   ✅ Cleaned data: (421570, 16) (was (421570, 17))
📅 Creating temporal split (80/19)...
   📊 Split date: 2012-04-13 00:00:00
   📈 Train: 337,256 records (2010-02-05 00:00:00 to 2012-04-13 00:00:00)
   📉 Val: 84,314 records (2012-04-13 00:00:00 to 2012-10-26 00:00:00)
🔧 Fitting preprocessing pipeline on training data...
✅ Pipeline fitted on training data
🔄 Transforming training data...
   🔧 Encoding Type column using both one-hot and label encoding...
   ✅ Added both Type_Encoded and ['Type_A', 'Type_B', 'Type_C']
   🗑️ Removed 45,193 outliers from training data
✅ Transform complete. Shape: (292063, 29)
🔄 Transfor

Processing series:   0%|          | 0/3082 [00:00<?, ?it/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO:cmdstanpy:Chain [1] start processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmpvdhmqjyf/5kl7mq9k.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpvdhmqjyf/atv1jkyk.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cmdstanpy:CmdStan args: ['/usr/local/lib/python3.11/dist-packages/prophet/stan_model/prophet_model.bin', 'random', 'seed=76306', 'data', 'file=/tmp/tmpvdhmqjyf/5kl7mq9k.json', 'init=/tmp/tmpvdhmqjyf/atv1jkyk.json', 'output', 'file=/tmp/tmpvdhmqjyf/prophet_modelqrgj7e03/prophet_model-20250706131238.csv', 'method=optimize', 'algorithm=lbfgs', 'iter=10000']
INFO:cmdstanpy:Chain [1] start processing
INFO:cmdstanpy:Chain [1] done processing
DEBUG:cmdstanpy:input tempfile: /tmp/tmpvdhmqjyf/1ktzvn9_.json
DEBUG:cmdstanpy:input tempfile: /tmp/tmpvdhmqjyf/njlb6f58.json
DEBUG:cmdstanpy:idx 0
DEBUG:cmdstanpy:running CmdStan, num_threads: None
DEBUG:cm


   Summary of series processing:
      Total series identified: 3082
      Series trained: 2615
      Series skipped (too short): 467
      Series failed to fit: 0
      Series failed to predict (val): 0

✅ Prophet Training Complete!
📊 TRAINING METRICS (Aggregated In-Sample):
   WMAE: 1439.43
   RMSE: 2831.60
   MAE: 1388.21
   R²: 0.9246

📊 VALIDATION METRICS (Aggregated Out-of-Sample):
   WMAE: 6178.43 ⭐
   RMSE: 19636.79
   MAE: 6141.23
   R²: -1.1195

🎄 HOLIDAY ANALYSIS:
   Training holiday weeks: 7.9%
   Validation holiday weeks: 3.5%
   Holiday weight multiplier: 5x

🔍 OVERFITTING ANALYSIS:
   WMAE Ratio (val/train): 4.29
   R² Difference (train-val): 2.0441
   ⚠️ High overfitting detected (WMAE ratio > 2.0)

🎯 FEATURE IMPORTANCE (Prophet):
   Prophet does not provide a single global feature importance score like tree models.
   The influence of individual regressors can be observed through their coefficients in each trained model,
   or by analyzing component plots for specific

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=5c40b940-9903-4d83-95f8-ebc25511640e&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=b75f6bd4c9f251abcc7a19cdc87178f960a3f7731bd161aafd951ffede27ea1d




✅ DagsHub initialized successfully!


2025/07/06 13:20:47 INFO mlflow.tracking.fluent: Experiment with name 'Default' does not exist. Creating a new experiment.


⚠️ MLflow logging failed: API request to endpoint /api/2.0/mlflow/experiments/create failed with error code 403 != 200. Response body: ''
   Training results are still valid, just not logged to MLflow

🎉 EXPERIMENT  COMPLETED!
🏆 Final Validation WMAE (Aggregated): 6178.43
📊 Final Validation R² (Aggregated): -1.1195
🎯 This experiment uses the correct Walmart WMAE formula:
   • Holiday weeks weighted 5x
   • Regular weeks weighted 1x
   • Per-series Prophet training with date and holiday features
   • Lag features explicitly disabled to prevent overfitting
