<a href="https://colab.research.google.com/github/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting/blob/main/experiment_7_xgboost_Markdowns.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# experiment_7_xgboost.ipynb


In [2]:
!pip install kaggle wandb onnx -Uq
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
! mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [4]:
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting

walmart-recruiting-store-sales-forecasting.zip: Skipping, found more recently modified local copy (use --force to force download)


In [7]:
! unzip walmart-recruiting-store-sales-forecasting.zip

Archive:  walmart-recruiting-store-sales-forecasting.zip
replace features.csv.zip? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Install required packages
import subprocess
import sys

In [9]:
!pip install prophet plotly mlflow dagshub xgboost -q

In [10]:
import mlflow
import mlflow.sklearn
import dagshub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb

print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


In [11]:
import pandas as pd
import numpy as np
import mlflow
import mlflow.xgboost
import dagshub
import xgboost as xgb
import joblib
from datetime import datetime
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [12]:
class WalmartPreprocessingPipeline:
    """
    Complete preprocessing pipeline for Walmart sales data.
    Supports fit/transform pattern for proper train/validation handling.
    Updated to allow control over MarkDown feature inclusion.
    """

    def __init__(self, remove_outliers=True, remove_markdowns=False, enable_lag_features=False):
        # Configuration parameters
        self.remove_outliers = remove_outliers
        # Flag to control MarkDown removal (False means keep MarkDowns)
        self.remove_markdowns = remove_markdowns
        self.enable_lag_features = enable_lag_features

        # Internal state variables
        self.fitted = False
        self.outlier_thresholds = None
        self.feature_columns = None
        self.train_data_for_lags = None

        # Define outlier thresholds for Weekly_Sales based on store type
        self.weekly_sales_outlier_thresholds = {
            'A': {'lower': -1000, 'upper': 50000},  # Type A stores
            'B': {'lower': -500, 'upper': 25000},   # Type B stores
            'C': {'lower': -200, 'upper': 15000}    # Type C stores
        }

    def load_and_prepare_data(self):
        """Load and merge train.csv, stores.csv, features.csv datasets"""
        print("📊 Loading datasets...")

        # Assuming data files are available in the environment
        try:
            train_df = pd.read_csv('train.csv')
            stores_df = pd.read_csv('stores.csv')
            features_df = pd.read_csv('features.csv')
        except FileNotFoundError as e:
            print(f"Error loading data files: {e}")
            return None

        print(f"   📈 Train data: {train_df.shape}")
        print(f"   🏪 Stores data: {stores_df.shape}")
        print(f"   🎯 Features data: {features_df.shape}")

        # Convert Date column to datetime
        train_df['Date'] = pd.to_datetime(train_df['Date'])
        features_df['Date'] = pd.to_datetime(features_df['Date'])

        # Merge datasets
        train_stores = train_df.merge(stores_df, on='Store', how='left')
        train_full = train_stores.merge(features_df, on=['Store', 'Date'], how='left')

        print(f"   ✅ Merged data: {train_full.shape}")
        print(f"   📅 Date range: {train_full['Date'].min()} to {train_full['Date'].max()}")

        return train_full

    def clean_merged_data(self, train_full):
        """Clean merged data by handling duplicate IsHoliday columns"""
        print("🧹 Cleaning merged data...")

        initial_shape = train_full.shape

        # Handle duplicate IsHoliday columns if they exist
        if 'IsHoliday_x' in train_full.columns and 'IsHoliday_y' in train_full.columns:
            print("   🔄 Resolving duplicate IsHoliday columns...")
            # Combine the boolean holidays using OR logic
            train_full['IsHoliday'] = train_full['IsHoliday_x'] | train_full['IsHoliday_y']
            train_full = train_full.drop(['IsHoliday_x', 'IsHoliday_y'], axis=1)

        print(f"   ✅ Cleaned data: {train_full.shape} (was {initial_shape})")
        return train_full

    def create_temporal_split(self, df, train_ratio=0.8):
        """Create temporal split to prevent data leakage"""
        print(f"📅 Creating temporal split ({int(train_ratio*100)}/{int((1-train_ratio)*100)})...")

        # Sort by date to ensure temporal order
        df_sorted = df.sort_values('Date').reset_index(drop=True)

        # Find split point
        split_idx = int(len(df_sorted) * train_ratio)
        split_date = df_sorted.iloc[split_idx]['Date']

        # Create splits
        train_data = df_sorted.iloc[:split_idx].copy()
        val_data = df_sorted.iloc[split_idx:].copy()

        # Create split info dictionary
        split_info = {
            'split_date': split_date,
            'train_size': len(train_data),
            'val_size': len(val_data),
            'train_date_range': (train_data['Date'].min(), train_data['Date'].max()),
            'val_date_range': (val_data['Date'].min(), val_data['Date'].max())
        }

        print(f"   📊 Split date: {split_date}")
        print(f"   📈 Train: {len(train_data):,} records ({train_data['Date'].min()} to {train_data['Date'].max()})")
        print(f"   📉 Val: {len(val_data):,} records ({val_data['Date'].min()} to {val_data['Date'].max()})")

        return train_data, val_data, split_info

    def fit(self, train_data):
        """Fit the preprocessing pipeline on training data"""
        print("🔧 Fitting preprocessing pipeline on training data...")

        # Store training data for lag feature creation
        self.train_data_for_lags = train_data.copy()

        # Fit outlier removal thresholds on training data only
        # We use the predefined thresholds from __init__
        self.outlier_thresholds = self.weekly_sales_outlier_thresholds

        print("✅ Pipeline fitted on training data")
        self.fitted = True
        return self

    def transform(self, data, is_validation=False):
        """Transform data using fitted pipeline"""
        if not self.fitted:
            raise ValueError("Pipeline must be fitted before transform!")

        print(f"🔄 Transforming {'validation' if is_validation else 'training'} data...")

        df = data.copy()

        # Step 1: Create date features
        df = self._create_date_features(df)

        # Step 2: Create holiday features
        df = self._create_holiday_features(df)

        # Step 3: Handle missing values (Crucial for MarkDowns if used)
        df = self._handle_missing_values(df)

        # Step 4: Encode categorical features (BEFORE outlier removal!)
        df = self._encode_categorical_features(df)

        # Step 5: Create lag features (different for train vs validation, conditional)
        if self.enable_lag_features:
            if is_validation:
                df = self._create_lag_features_validation(df)
            else:
                df = self._create_lag_features_training(df)

        # Step 6: Remove outliers (only on training data, conditional on self.remove_outliers)
        if not is_validation and self.remove_outliers:
            df = self._remove_outliers(df)

        # Step 7: Remove markdown features (conditional on self.remove_markdowns)
        # If self.remove_markdowns is False, MarkDown features will be kept.
        if self.remove_markdowns:
            df = self._remove_markdown_features(df)
        else:
            print("   📁 Keeping MarkDown features.")

        # Step 8: Remove redundant features
        df = self._remove_redundant_features(df)

        print(f"✅ Transform complete. Shape: {df.shape}")
        return df

    def fit_transform(self, train_data):
        """Fit and transform training data in one step"""
        return self.fit(train_data).transform(train_data, is_validation=False)

    def _create_date_features(self, df):
        """Create date features"""
        df = df.copy()
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month
        df['Day'] = df['Date'].dt.day
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df['WeekOfYear'] = df['Date'].dt.isocalendar().week.astype(int)
        df['Quarter'] = df['Date'].dt.quarter
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(int)
        df['IsMonthStart'] = df['Date'].dt.is_month_start.astype(int)
        df['IsMonthEnd'] = df['Date'].dt.is_month_end.astype(int)
        df['IsQuarterStart'] = df['Date'].dt.is_quarter_start.astype(int)
        df['IsQuarterEnd'] = df['Date'].dt.is_quarter_end.astype(int)
        start_date = df['Date'].min()
        df['DaysFromStart'] = (df['Date'] - start_date).dt.days
        df['WeeksFromStart'] = df['DaysFromStart'] // 7
        return df

    def _create_holiday_features(self, df):
        """Create holiday features"""
        df = df.copy()
        super_bowl_dates = ['2010-02-12', '2011-02-11', '2012-02-10']
        labor_day_dates = ['2010-09-10', '2011-09-09', '2012-09-07']
        thanksgiving_dates = ['2010-11-26', '2011-11-25', '2012-11-23']
        christmas_dates = ['2010-12-31', '2011-12-30', '2012-12-28']

        df['IsSuperBowlWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(super_bowl_dates).astype(int)
        df['IsLaborDayWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(labor_day_dates).astype(int)
        df['IsThanksgivingWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(thanksgiving_dates).astype(int)
        df['IsChristmasWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(christmas_dates).astype(int)
        df['IsMajorHoliday'] = (df['IsSuperBowlWeek'] | df['IsLaborDayWeek'] |
                               df['IsThanksgivingWeek'] | df['IsChristmasWeek']).astype(int)
        df['IsHolidayMonth'] = df['Month'].isin([11, 12]).astype(int)
        df['IsBackToSchool'] = df['Month'].isin([8, 9]).astype(int)
        return df

    def _create_lag_features_training(self, df):
        """Create lag features for training data - DISABLED by default"""
        # Lag features removed to prevent overfitting
        return df

    def _create_lag_features_validation(self, df):
        """Create lag features for validation data - DISABLED by default"""
        # Lag features removed to prevent overfitting
        return df

    def _handle_missing_values(self, df):
        """
        Fills missing numerical values, particularly for MarkDown columns, with 0.
        MarkDown values are NaN when a markdown is not active.
        """
        # Identify MarkDown columns
        markdown_cols = [col for col in df.columns if 'MarkDown' in col]

        # Identify other numerical columns that might have NaNs (e.g., CPI, Unemployment)
        # For simplicity and correctness with MarkDowns, we fill MarkDowns with 0.

        # Fill NaNs in MarkDown columns with 0
        if markdown_cols:
            df[markdown_cols] = df[markdown_cols].fillna(0)
            print("   ✅ Handled missing values (filled MarkDown NaNs with 0).")

        return df

    def _remove_outliers(self, df):
        """Remove outliers from training data only"""
        initial_len = len(df)
        df_clean = df.copy()

        for store_type, thresholds in self.outlier_thresholds.items():
            # Check if the one-hot encoded column exists (it should, after _encode_categorical_features)
            if f'Type_{store_type}' in df_clean.columns:
                type_mask = df_clean[f'Type_{store_type}'] == 1
                outlier_mask = (
                    (df_clean['Weekly_Sales'] < thresholds['lower']) |
                    (df_clean['Weekly_Sales'] > thresholds['upper'])
                )
                # Remove rows that match both the store type AND the outlier condition
                df_clean = df_clean[~(type_mask & outlier_mask)]

        removed = initial_len - len(df_clean)
        print(f"   🗑️ Removed {removed:,} outliers from training data")
        return df_clean

    def _remove_markdown_features(self, df):
        """Remove markdown columns (called only if self.remove_markdowns is True)"""
        markdown_cols = [col for col in df.columns if 'MarkDown' in col]
        if markdown_cols:
            df = df.drop(markdown_cols, axis=1)
        return df

    def _remove_redundant_features(self, df):
        """Remove redundant features"""
        redundant_cols = ['Year', 'Quarter', 'Day', 'WeekOfYear', 'DaysFromStart',
                         'IsQuarterStart', 'IsQuarterEnd']
        existing_redundant = [col for col in redundant_cols if col in df.columns]
        if existing_redundant:
            df = df.drop(existing_redundant, axis=1)
        return df

    def _remove_id_columns(self, df):
        """Remove high-cardinality ID columns that cause overfitting"""
        id_cols = ['Store', 'Dept']
        existing_id_cols = [col for col in id_cols if col in df.columns]
        if existing_id_cols:
            print(f"   🗑️ Removing ID columns to prevent overfitting: {existing_id_cols}")
            df = df.drop(existing_id_cols, axis=1)
        return df

    def _encode_categorical_features(self, df):
        """Encode categorical features using both one-hot and label encoding"""
        df = df.copy()

        if 'Type' in df.columns:
            print(f"   🔧 Encoding Type column using both one-hot and label encoding...")

            # One-hot encoding
            type_dummies = pd.get_dummies(df['Type'], prefix='Type', dtype=int)

            # Label encoding (A=0, B=1, C=2)
            type_mapping = {'A': 0, 'B': 1, 'C': 2}
            df['Type_Encoded'] = df['Type'].map(type_mapping)

            # Add one-hot columns
            for col in type_dummies.columns:
                df[col] = type_dummies[col]

            # Remove original Type column
            df = df.drop('Type', axis=1)

            print(f"   ✅ Added both Type_Encoded and {list(type_dummies.columns)}")

        return df


In [13]:
def setup_mlflow():
    """Setup MLflow and DagsHub tracking"""
    print("🔧 Setting up MLflow and DagsHub...")

    # End any active runs first
    try:
        mlflow.end_run()
    except:
        pass

    # Initialize DagsHub
    try:
        dagshub.init(
            repo_owner='konstantine25b',
            repo_name='Walmart-Recruiting---Store-Sales-Forecasting',
            mlflow=True
        )
        print("✅ DagsHub initialized successfully!")
    except Exception as e:
        print(f"⚠️ DagsHub init warning: {e}")

    # Set MLflow tracking URI
    mlflow.set_tracking_uri("https://dagshub.com/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting.mlflow")

    # Create unique experiment name with timestamp
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    experiment_name = f"Experiment_7_XGBoost_{timestamp}"

    try:
        experiment_id = mlflow.create_experiment(experiment_name)
        print(f"✅ Created new experiment: {experiment_name}")
    except mlflow.exceptions.MlflowException as e:
        if "already exists" in str(e):
            experiment = mlflow.get_experiment_by_name(experiment_name)
            experiment_id = experiment.experiment_id
            print(f"✅ Using existing experiment: {experiment_name}")
        else:
            # Fallback to default experiment
            experiment_name = "Default"
            mlflow.set_experiment(experiment_name)
            print(f"⚠️ Using default experiment due to: {e}")

    mlflow.set_experiment(experiment_name)

    print(f"✅ MLflow setup complete!")
    print(f"🔗 Tracking URI: {mlflow.get_tracking_uri()}")
    print(f"📊 Experiment: {experiment_name}")

    return experiment_name



In [14]:
def get_preprocessed_data():
    """
    Use preprocessing pipeline to get model-ready data,
    configured to include MarkDown features.

    Returns:
        X_train, y_train, X_val, y_val: Model-ready datasets
        train_holidays, val_holidays: Holiday indicators for WMAE
        split_info: Information about the temporal split
    """
    print("🔄 Getting preprocessed data using pipeline...")

    # Create the preprocessing pipeline, setting remove_markdowns=False to ensure they are used
    pipeline = WalmartPreprocessingPipeline(remove_markdowns=False)

    # Load raw data
    train_full = pipeline.load_and_prepare_data()
    if train_full is None:
        # Handle case where data loading failed (e.g., files missing)
        return None, None, None, None, None, None, None

    train_full = pipeline.clean_merged_data(train_full)

    # Create temporal split
    train_data, val_data, split_info = pipeline.create_temporal_split(train_full)

    # Extract holiday information before preprocessing
    val_holidays = val_data['IsHoliday'].values.astype(bool)

    # Separate validation target (realistic test scenario)
    y_val = val_data['Weekly_Sales'].copy()
    val_data_no_target = val_data.drop('Weekly_Sales', axis=1).copy()

    # Fit and transform data using pipeline
    pipeline.fit(train_data)
    train_processed = pipeline.transform(train_data, is_validation=False)
    val_processed = pipeline.transform(val_data_no_target, is_validation=True)

    # Extract holiday information AFTER training data processing (after outlier removal)
    train_holidays = train_processed['IsHoliday'].values.astype(bool)

    # Prepare model data
    X_train = train_processed.drop(['Weekly_Sales', 'Date'], axis=1)
    y_train = train_processed['Weekly_Sales']
    X_val = val_processed.drop('Date', axis=1)

    # Verify that MarkDown columns exist in the final datasets
    markdown_cols_train = [col for col in X_train.columns if 'MarkDown' in col]
    markdown_cols_val = [col for col in X_val.columns if 'MarkDown' in col]

    print("\nMarkDown columns in X_train:", markdown_cols_train)
    print("MarkDown columns in X_val:", markdown_cols_val)
    print(f"✅ Data ready:")
    print(f"   X_train: {X_train.shape}")
    print(f"   y_train: {y_train.shape}")
    print(f"   X_val: {X_val.shape}")
    print(f"   y_val: {y_val.shape}")
    print(f"   train_holidays: {train_holidays.shape} ({train_holidays.sum()} holidays)")
    print(f"   val_holidays: {val_holidays.shape} ({val_holidays.sum()} holidays)")

    return X_train, y_train, X_val, y_val, train_holidays, val_holidays, split_info

In [15]:
def calculate_metrics(y_true, y_pred, is_holiday=None):
    """Calculate evaluation metrics including WMAE with correct Walmart formula"""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)

    # Calculate WMAE (Weighted Mean Absolute Error) - Walmart competition formula
    # w_i = 5 if holiday week, 1 otherwise
    if is_holiday is not None:
        weights = np.where(is_holiday, 5, 1)  # 5 for holidays, 1 for regular weeks
    else:
        weights = np.ones(len(y_true))  # Default to all 1s if no holiday info

    wmae = np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

    return {
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'wmae': wmae
    }



In [16]:
def log_results_to_mlflow(model, train_metrics, val_metrics, X_train, X_val, params, feature_list, feature_categories):
    """Log training results to MLflow after training is complete"""
    print("\n📊 Logging results to MLflow...")

    try:
        # Setup MLflow
        experiment_name = setup_mlflow()

        with mlflow.start_run(run_name="XGBoost_Walmart_Sales"):
            # Log parameters
            mlflow.log_params(params)

            # Log metrics
            for metric_name, value in train_metrics.items():
                mlflow.log_metric(f"train_{metric_name}", value)

            for metric_name, value in val_metrics.items():
                mlflow.log_metric(f"val_{metric_name}", value)

            # Log feature importance
            if hasattr(model, 'feature_importances_'):
                importance_df = pd.DataFrame({
                    'feature': feature_list,
                    'importance': model.feature_importances_
                }).sort_values('importance', ascending=False)

                # Log top 10 features
                for i, row in importance_df.head(10).iterrows():
                    mlflow.log_metric(f"importance_{row['feature']}", row['importance'])

            # Log model
            mlflow.xgboost.log_model(model, "xgboost_model")

            # Log split info
            split_info = {
                'train_size': len(X_train),
                'val_size': len(X_val),
                'n_features': X_train.shape[1]
            }
            mlflow.log_params(split_info)

            # Log feature categories
            mlflow.log_params(feature_categories)

            run_id = mlflow.active_run().info.run_id
            print(f"✅ Results logged to MLflow run: {run_id}")

    except Exception as e:
        print(f"⚠️ MLflow logging failed: {e}")
        print("   Training results are still valid, just not logged to MLflow")


In [17]:

def train_xgboost_model(X_train, y_train, X_val, y_val, train_holidays=None, val_holidays=None):
    """Train XGBoost model with comprehensive feature logging"""
    print("🚀 Training XGBoost model...")

    # Log feature information
    feature_list = list(X_train.columns)
    print(f"   📋 Total Features: {len(feature_list)}")
    print(f"   📋 Feature List: {feature_list}")

    # Categorize features for better understanding
    feature_categories = {
        'ID_Features': [f for f in feature_list if f in ['Store', 'Dept']],
        'Store_Info': [f for f in feature_list if f in ['Size']],
        'Economic': [f for f in feature_list if f in ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']],
        'Date_Features': [f for f in feature_list if f in ['Month', 'DayOfWeek', 'WeeksFromStart']],
        'Holiday_Features': [f for f in feature_list if 'Holiday' in f or 'BackToSchool' in f],
        'Type_OneHot': [f for f in feature_list if f.startswith('Type_') and f != 'Type_Encoded'],
        'Type_Label': [f for f in feature_list if f == 'Type_Encoded'],
        'Boolean_Features': [f for f in feature_list if f in ['IsWeekend', 'IsMonthStart', 'IsMonthEnd']],
        'Other': [f for f in feature_list if f not in
                 ['Store', 'Dept', 'Size', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
                  'Month', 'DayOfWeek', 'WeeksFromStart', 'IsWeekend', 'IsMonthStart', 'IsMonthEnd',
                  'Type_A', 'Type_B', 'Type_C', 'Type_Encoded'] and
                  'Holiday' not in f and 'BackToSchool' not in f]
    }

    print(f"   📊 Feature Categories:")
    for category, features in feature_categories.items():
        if features:
            print(f"      {category}: {features}")

    # XGBoost parameters matching experiment_2's successful configuration
    params = {
        'n_estimators': 200,          # Same as experiment_2
        'max_depth': 8,               # Same as experiment_2
        'learning_rate': 0.05,        # Same as experiment_2
        'subsample': 0.8,             # Same as experiment_2
        'colsample_bytree': 0.8,      # Same as experiment_2
        'colsample_bylevel': 0.8,     # Same as experiment_2
        'min_child_weight': 3,        # Same as experiment_2
        'gamma': 0.1,                 # Same as experiment_2
        'reg_alpha': 0.1,             # Same as experiment_2
        'reg_lambda': 1.0,            # Same as experiment_2
        'random_state': 42,
        'n_jobs': -1
    }

    print(f"   📋 Parameters: {params}")
    print(f"   🔄 Training XGBoost with regularization...")

    # Train model
    model = xgb.XGBRegressor(**params)
    model.fit(X_train, y_train)

    print(f"   📊 Making predictions...")

    # Make predictions
    train_pred = model.predict(X_train)
    val_pred = model.predict(X_val)

    # Calculate metrics
    train_metrics = calculate_metrics(y_train, train_pred, train_holidays)
    val_metrics = calculate_metrics(y_val, val_pred, val_holidays)

    # Feature importance analysis
    feature_importance = model.feature_importances_
    importance_df = pd.DataFrame({
        'feature': feature_list,
        'importance': feature_importance
    }).sort_values('importance', ascending=False)

    return {
        'model': model,
        'train_metrics': train_metrics,
        'val_metrics': val_metrics,
        'feature_importance': importance_df,
        'params': params,
        'feature_list': feature_list,
        'feature_categories': feature_categories
    }



In [18]:
def main():
    """Main experiment pipeline"""
    print("🎯 EXPERIMENT 7: XGBoost with Preprocessing Pipeline")
    print("=" * 60)

    # Get preprocessed data
    X_train, y_train, X_val, y_val, train_holidays, val_holidays, split_info = get_preprocessed_data()

    # Train XGBoost model with holiday information for correct WMAE
    model_info = train_xgboost_model(
        X_train, y_train, X_val, y_val, train_holidays, val_holidays
    )

    # Display comprehensive results
    print(f"\n✅ XGBoost Training Complete!")
    print(f"=" * 50)
    print(f"📊 TRAINING METRICS:")
    print(f"   WMAE: {model_info['train_metrics']['wmae']:.2f}")
    print(f"   RMSE: {model_info['train_metrics']['rmse']:.2f}")
    print(f"   MAE: {model_info['train_metrics']['mae']:.2f}")
    print(f"   R²: {model_info['train_metrics']['r2']:.4f}")

    print(f"\n📊 VALIDATION METRICS:")
    print(f"   WMAE: {model_info['val_metrics']['wmae']:.2f} ⭐")
    print(f"   RMSE: {model_info['val_metrics']['rmse']:.2f}")
    print(f"   MAE: {model_info['val_metrics']['mae']:.2f}")
    print(f"   R²: {model_info['val_metrics']['r2']:.4f}")

    # Holiday weight analysis
    if train_holidays is not None and val_holidays is not None:
        train_holiday_pct = (train_holidays.sum() / len(train_holidays)) * 100
        val_holiday_pct = (val_holidays.sum() / len(val_holidays)) * 100

        print(f"\n🎄 HOLIDAY ANALYSIS:")
        print(f"   Training holiday weeks: {train_holiday_pct:.1f}%")
        print(f"   Validation holiday weeks: {val_holiday_pct:.1f}%")
        print(f"   Holiday weight multiplier: 5x")

    # Calculate overfitting metrics
    wmae_ratio = model_info['val_metrics']['wmae'] / model_info['train_metrics']['wmae']
    r2_diff = model_info['train_metrics']['r2'] - model_info['val_metrics']['r2']

    print(f"\n🔍 OVERFITTING ANALYSIS:")
    print(f"   WMAE Ratio (val/train): {wmae_ratio:.2f}")
    print(f"   R² Difference (train-val): {r2_diff:.4f}")
    if wmae_ratio > 2.0:
        print("   ⚠️ High overfitting detected (WMAE ratio > 2.0)")
    elif wmae_ratio > 1.5:
        print("   ⚠️ Moderate overfitting detected (WMAE ratio > 1.5)")
    else:
        print("   ✅ Reasonable generalization")

    # Feature importance
    print(f"\n🔝 TOP 10 MOST IMPORTANT FEATURES:")
    for i, (_, row) in enumerate(model_info['feature_importance'].head(10).iterrows()):
        print(f"   {i+1:2d}. {row['feature']:<25} {row['importance']:.4f}")

    print(f"\n📈 MODEL INFO:")
    print(f"   Training samples: {len(X_train):,}")
    print(f"   Validation samples: {len(X_val):,}")
    print(f"   Features: {X_train.shape[1]}")
    print(f"   Estimators: {model_info['params']['n_estimators']}")

    # Log results to MLflow after training is complete
    log_results_to_mlflow(model_info['model'], model_info['train_metrics'], model_info['val_metrics'],
                          X_train, X_val, model_info['params'], model_info['feature_list'], model_info['feature_categories'])

    print(f"\n🎉 EXPERIMENT 7 COMPLETED!")
    print(f"=" * 60)
    print(f"🏆 Final Validation WMAE: {model_info['val_metrics']['wmae']:.2f}")
    print(f"📊 Final Validation R²: {model_info['val_metrics']['r2']:.4f}")
    print(f"🎯 This experiment uses the correct Walmart WMAE formula:")
    print(f"   • Holiday weeks weighted 5x")
    print(f"   • Regular weeks weighted 1x")
    print(f"   • Preprocessing pipeline with date and holiday features (no lag features)")
    print(f"   • Lag features removed to prevent overfitting")


In [19]:
if __name__ == "__main__":
    with zipfile.ZipFile('train.csv.zip', 'r') as zip_ref:
        zip_ref.extractall('.')
    train = pd.read_csv('train.csv')
    with zipfile.ZipFile('features.csv.zip', 'r') as zip_ref:
        zip_ref.extractall('.')
    features = pd.read_csv('features.csv')
    main()


🎯 EXPERIMENT 7: XGBoost with Preprocessing Pipeline
🔄 Getting preprocessed data using pipeline...
📊 Loading datasets...
   📈 Train data: (421570, 5)
   🏪 Stores data: (45, 3)
   🎯 Features data: (8190, 12)
   ✅ Merged data: (421570, 17)
   📅 Date range: 2010-02-05 00:00:00 to 2012-10-26 00:00:00
🧹 Cleaning merged data...
   🔄 Resolving duplicate IsHoliday columns...
   ✅ Cleaned data: (421570, 16) (was (421570, 17))
📅 Creating temporal split (80/19)...
   📊 Split date: 2012-04-13 00:00:00
   📈 Train: 337,256 records (2010-02-05 00:00:00 to 2012-04-13 00:00:00)
   📉 Val: 84,314 records (2012-04-13 00:00:00 to 2012-10-26 00:00:00)
🔧 Fitting preprocessing pipeline on training data...
✅ Pipeline fitted on training data
🔄 Transforming training data...
   ✅ Handled missing values (filled MarkDown NaNs with 0).
   🔧 Encoding Type column using both one-hot and label encoding...
   ✅ Added both Type_Encoded and ['Type_A', 'Type_B', 'Type_C']
   🗑️ Removed 45,193 outliers from training data
   📁

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=d8f48e42-ff8b-4899-b71a-5e74f0d994a2&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=bcf3e2ee1f6f1b87c925dc3432052dc0597194df7990bb44322052711579cbd1




KeyboardInterrupt: 