# experiment_4_k

In [2]:
!pip install kaggle wandb onnx -Uq
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! mkdir ~/.kaggle

In [4]:
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 618MB/s]


In [7]:
! unzip walmart-recruiting-store-sales-forecasting.zip

Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [17]:
# ================================================================================
# EXPERIMENT 4: FIXING DATA LEAKAGE - SPLIT FIRST, THEN PREPROCESS
# ================================================================================

# Step 1: Setup and MLflow/DagsHub Configuration
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Install required packages
!pip install prophet plotly mlflow dagshub xgboost -q

# Setup MLflow and DagsHub
import mlflow
import dagshub

# DagsHub setup
dagshub.init(repo_owner='konstantine25b',
             repo_name='Walmart-Recruiting---Store-Sales-Forecasting',
             mlflow=True)

# Set tracking URI
mlflow.set_tracking_uri("https://dagshub.com/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting.mlflow")
mlflow.set_experiment("Experiment_4_Fixed_Data_Leakage")

<Experiment: artifact_location='mlflow-artifacts:/43ce0a29767b4be4b47a7e6d431382c2', creation_time=1750844432684, experiment_id='3', last_update_time=1750844432684, lifecycle_stage='active', name='Experiment_4_Fixed_Data_Leakage', tags={}>

In [48]:
# Step 2: Load and explore data
with mlflow.start_run(run_name="Data_Loading_Experiment_4") as run:

    print(f"📁 Starting data loading: {run.info.run_id}")

    # Load datasets
    with zipfile.ZipFile('train.csv.zip', 'r') as zip_ref:
        zip_ref.extractall()

    train = pd.read_csv('train.csv')
    stores = pd.read_csv('stores.csv')

    # Convert Date column
    train['Date'] = pd.to_datetime(train['Date'])

    # Log basic info
    mlflow.log_param("train_shape", f"{train.shape[0]}x{train.shape[1]}")
    mlflow.log_param("date_range", f"{train['Date'].min()} to {train['Date'].max()}")
    mlflow.log_param("stores_count", train['Store'].nunique())
    mlflow.log_param("departments_count", train['Dept'].nunique())

    # Merge with stores data
    train_merged = train.merge(stores, on='Store', how='left')

    print(f"✅ Data loaded successfully!")
    print(f"📊 Training data shape: {train.shape}")
    print(f"🏪 Stores data shape: {stores.shape}")
    print(f"📊 Merged data shape: {train_merged.shape}")
    print(f"📅 Date range: {train['Date'].min()} to {train['Date'].max()}")
    print(f"🏬 Stores: {train['Store'].nunique()}, Departments: {train['Dept'].nunique()}")

📁 Starting data loading: a2ab45443a7e43ccb95a5028edc7ebba
✅ Data loaded successfully!
📊 Training data shape: (421570, 5)
🏪 Stores data shape: (45, 3)
📊 Merged data shape: (421570, 7)
📅 Date range: 2010-02-05 00:00:00 to 2012-10-26 00:00:00
🏬 Stores: 45, Departments: 81
🏃 View run Data_Loading_Experiment_4 at: https://dagshub.com/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/3/runs/a2ab45443a7e43ccb95a5028edc7ebba
🧪 View experiment at: https://dagshub.com/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/3


In [49]:
# ================================================================================
# STEP 3: CRITICAL FIX - TEMPORAL SPLIT FIRST (BEFORE PREPROCESSING)
# ================================================================================

with mlflow.start_run(run_name="Temporal_Split_BEFORE_Preprocessing") as run:

    print(f"⚠️ FIXING DATA LEAKAGE: Temporal split BEFORE preprocessing")
    print(f"🔄 Starting temporal split run: {run.info.run_id}")

    mlflow.log_param("split_method", "temporal_80_20_before_preprocessing")
    mlflow.log_param("fix_applied", "split_before_outlier_detection")

    # Use merged data (only basic merge, no feature engineering yet)
    data_for_split = train_merged.copy()

    # Sort by date to ensure proper temporal order
    data_for_split = data_for_split.sort_values(['Store', 'Dept', 'Date']).reset_index(drop=True)

    # Calculate split point (80% for training)
    min_date = data_for_split['Date'].min()
    max_date = data_for_split['Date'].max()
    total_days = (max_date - min_date).days
    split_days = int(total_days * 0.8)
    split_date = min_date + timedelta(days=split_days)

    # Ensure split_date falls on a week boundary (since data is weekly)
    # Find the closest Friday (assuming data is weekly ending on Friday)
    while split_date.weekday() != 4:  # 4 = Friday
        split_date += timedelta(days=1)

    # Create temporal split
    train_raw = data_for_split[data_for_split['Date'] < split_date].copy()
    val_raw = data_for_split[data_for_split['Date'] >= split_date].copy()

    # Log split information
    mlflow.log_param("split_date", split_date.strftime("%Y-%m-%d"))
    mlflow.log_param("train_records", len(train_raw))
    mlflow.log_param("val_records", len(val_raw))
    mlflow.log_param("train_date_range", f"{train_raw['Date'].min()} to {train_raw['Date'].max()}")
    mlflow.log_param("val_date_range", f"{val_raw['Date'].min()} to {val_raw['Date'].max()}")

    # Verify split quality
    train_stores = train_raw['Store'].nunique()
    val_stores = val_raw['Store'].nunique()
    train_depts = train_raw['Dept'].nunique()
    val_depts = val_raw['Dept'].nunique()

    print(f"✅ Temporal split completed BEFORE preprocessing!")
    print(f"📅 Split date: {split_date}")
    print(f"🚂 Training: {len(train_raw):,} records ({len(train_raw)/len(data_for_split)*100:.1f}%)")
    print(f"🔮 Validation: {len(val_raw):,} records ({len(val_raw)/len(data_for_split)*100:.1f}%)")
    print(f"🏪 Store coverage: Train={train_stores}, Val={val_stores}")
    print(f"🏬 Dept coverage: Train={train_depts}, Val={val_depts}")

    # Holiday distribution check
    train_holidays = train_raw['IsHoliday'].sum()
    val_holidays = val_raw['IsHoliday'].sum()
    print(f"🎄 Holiday weeks: Train={train_holidays} ({train_holidays/len(train_raw)*100:.1f}%), Val={val_holidays} ({val_holidays/len(val_raw)*100:.1f}%)")

    mlflow.log_metric("train_holiday_percentage", train_holidays/len(train_raw)*100)
    mlflow.log_metric("val_holiday_percentage", val_holidays/len(val_raw)*100)

# ================================================================================
# STEP 3: TEMPORAL SPLIT (AND NOW OUTLIER REMOVAL)
# ================================================================================

with mlflow.start_run(run_name="Temporal_Split_And_Outlier_Removal") as run:

    print(f"🔪 Starting temporal split: {run.info.run_id}")

    # Define the cutoff date for the validation set
    cutoff_date = train_merged['Date'].max() - timedelta(weeks=40)
    mlflow.log_param("validation_cutoff_date", cutoff_date.strftime('%Y-%m-%d'))
    mlflow.log_param("validation_duration_weeks", 40)

    # Split the data into training and validation sets
    train_raw = train_merged[train_merged['Date'] <= cutoff_date].copy()
    val_raw = train_merged[train_merged['Date'] > cutoff_date].copy()

    # ========================================================================
    # ✅ NEW: OUTLIER DETECTION (APPLIED TO TRAINING DATA ONLY)
    # This code is now in the correct place to prevent data leakage.
    # ========================================================================
    print("\n🔍 Starting outlier detection on the TRAINING set only...")

    # Calculate thresholds ONLY on the training data
    sales_by_store_dept_train = train_raw.groupby(['Store', 'Dept'])['Weekly_Sales']
    Q1_train = sales_by_store_dept_train.transform(lambda x: x.quantile(0.25))
    Q3_train = sales_by_store_dept_train.transform(lambda x: x.quantile(0.75))
    IQR_train = Q3_train - Q1_train

    lower_bound_train = Q1_train - 1.5 * IQR_train
    upper_bound_train = Q3_train + 1.5 * IQR_train

    # Find outliers in the training set
    train_outliers = (train_raw['Weekly_Sales'] < lower_bound_train) | (train_raw['Weekly_Sales'] > upper_bound_train)

    # Log and remove outliers from the training set
    num_outliers_train = train_outliers.sum()
    print(f"   Found {num_outliers_train} outliers in the training set ({num_outliers_train / len(train_raw):.2%})")
    mlflow.log_metric("outliers_found_in_train", num_outliers_train)

    # Remove outliers from train_raw
    train_raw = train_raw[~train_outliers]
    print(f"   Removed outliers. New training set shape: {train_raw.shape}")
    mlflow.log_metric("train_records_after_outlier_removal", len(train_raw))
    # ========================================================================

    print(f"\n✅ Temporal split and outlier removal completed!")
    print(f"   Training data shape: {train_raw.shape}")
    print(f"   Validation data shape: {val_raw.shape}")
    print(f"   Training date range: {train_raw['Date'].min()} to {train_raw['Date'].max()}")
    print(f"   Validation date range: {val_raw['Date'].min()} to {val_raw['Date'].max()}")

    mlflow.log_metric("train_records_after_split", len(train_raw))
    mlflow.log_metric("val_records_after_split", len(val_raw))


⚠️ FIXING DATA LEAKAGE: Temporal split BEFORE preprocessing
🔄 Starting temporal split run: 3a3f85f7bb0e41619cb94e59100c6c3c
✅ Temporal split completed BEFORE preprocessing!
📅 Split date: 2012-04-13 00:00:00
🚂 Training: 335,761 records (79.6%)
🔮 Validation: 85,809 records (20.4%)
🏪 Store coverage: Train=45, Val=45
🏬 Dept coverage: Train=81, Val=81
🎄 Holiday weeks: Train=26695 (8.0%), Val=2966 (3.5%)
🏃 View run Temporal_Split_BEFORE_Preprocessing at: https://dagshub.com/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/3/runs/3a3f85f7bb0e41619cb94e59100c6c3c
🧪 View experiment at: https://dagshub.com/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/3
🔪 Starting temporal split: 04640c568cd6465da7788c8b154a66b4

🔍 Starting outlier detection on the TRAINING set only...
   Found 14796 outliers in the training set (4.88%)
   Removed outliers. New training set shape: (288234, 7)

✅ Temporal split and outlier removal completed!
   Tr

In [50]:
# ================================================================================
# STEP 4: FEATURE ENGINEERING (APPLIED TO TRAIN AND VAL SEPARATELY)
# ================================================================================

with mlflow.start_run(run_name="Feature_Engineering_After_Split") as run:

    print(f"🔧 Starting feature engineering AFTER split: {run.info.run_id}")

    mlflow.log_param("feature_engineering_method", "applied_after_temporal_split")
    mlflow.log_param("train_records_input", len(train_raw))
    mlflow.log_param("val_records_input", len(val_raw))

    def create_comprehensive_date_features(df):
        """Create comprehensive date features"""
        df = df.copy()

        # Basic date features
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month
        df['Day'] = df['Date'].dt.day
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df['Week'] = df['Date'].dt.isocalendar().week
        df['Quarter'] = df['Date'].dt.quarter

        # Cyclical features
        df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
        df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)
        df['Week_sin'] = np.sin(2 * np.pi * df['Week'] / 52)
        df['Week_cos'] = np.cos(2 * np.pi * df['Week'] / 52)
        df['DayOfWeek_sin'] = np.sin(2 * np.pi * df['DayOfWeek'] / 7)
        df['DayOfWeek_cos'] = np.cos(2 * np.pi * df['DayOfWeek'] / 7)

        # Time since reference
        reference_date = pd.Timestamp('2010-02-05')
        df['DaysFromStart'] = (df['Date'] - reference_date).dt.days
        df['WeeksFromStart'] = df['DaysFromStart'] // 7
        df['MonthsFromStart'] = ((df['Date'].dt.year - reference_date.year) * 12 +
                                df['Date'].dt.month - reference_date.month)

        # Holiday features
        super_bowl_dates = ['2010-02-12', '2011-02-11', '2012-02-10']
        labor_day_dates = ['2010-09-10', '2011-09-09', '2012-09-07']
        thanksgiving_dates = ['2010-11-26', '2011-11-25', '2012-11-23']
        christmas_dates = ['2010-12-31', '2011-12-30', '2012-12-28']

        df['IsSuperBowlWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(super_bowl_dates).astype(int)
        df['IsLaborDayWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(labor_day_dates).astype(int)
        df['IsThanksgivingWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(thanksgiving_dates).astype(int)
        df['IsChristmasWeek'] = df['Date'].dt.strftime('%Y-%m-%d').isin(christmas_dates).astype(int)
        df['IsMajorHoliday'] = (df['IsSuperBowlWeek'] | df['IsLaborDayWeek'] |
                               df['IsThanksgivingWeek'] | df['IsChristmasWeek']).astype(int)

        # Retail calendar features
        df['IsHolidayMonth'] = df['Month'].isin([11, 12]).astype(int)
        df['IsBackToSchool'] = df['Month'].isin([8, 9]).astype(int)
        df['IsSummerSeason'] = df['Month'].isin([6, 7, 8]).astype(int)
        df['IsSpringSeaso'] = df['Month'].isin([3, 4, 5]).astype(int)

        # Week patterns
        df['IsFirstWeekOfMonth'] = (df['Day'] <= 7).astype(int)
        df['IsLastWeekOfMonth'] = (df['Date'].dt.days_in_month - df['Day'] < 7).astype(int)

        return df

    def create_lagging_features_time_aware(df, target_col='Weekly_Sales'):
        """
        Create time-aware lagging features that prevent data leakage
        """
        df = df.copy()
        df = df.sort_values(['Store', 'Dept', 'Date']).reset_index(drop=True)

        # ONLY simple lag features
        lags = [1, 2, 4, 8, 12]
        for lag in lags:
            lag_col = f'{target_col}_lag_{lag}'
            df[lag_col] = df.groupby(['Store', 'Dept'])[target_col].shift(lag).fillna(0)

        # ONLY basic difference features
        df[f'{target_col}_diff_1'] = df.groupby(['Store', 'Dept'])[target_col].diff(1).fillna(0)
        df[f'{target_col}_diff_4'] = df.groupby(['Store', 'Dept'])[target_col].diff(4).fillna(0)

        return df

    # Apply feature engineering to training set
    print("🚂 Creating features for training set...")
    train_with_features = create_comprehensive_date_features(train_raw)
    train_final = create_lagging_features_time_aware(train_with_features)

    # Apply feature engineering to validation set
    print("🔮 Creating features for validation set...")
    val_with_features = create_comprehensive_date_features(val_raw)
    val_final = create_lagging_features_time_aware(val_with_features)

    # Ensure both datasets have the same columns
    train_cols = set(train_final.columns)
    val_cols = set(val_final.columns)
    common_cols = list(train_cols.intersection(val_cols))

    # Keep only common columns in the same order
    train_final = train_final[common_cols]
    val_final = val_final[common_cols]

    # Check for any remaining NaNs
    train_nans = train_final.isnull().sum().sum()
    val_nans = val_final.isnull().sum().sum()

    if train_nans > 0 or val_nans > 0:
        print(f"⚠️ Warning: NaNs found - Train: {train_nans}, Val: {val_nans}")
        # Fill any remaining NaNs with 0
        train_final = train_final.fillna(0)
        val_final = val_final.fillna(0)
        print(f"✅ NaNs filled with 0")

    print(f"✅ Feature engineering completed!")
    print(f"🚂 Train shape: {train_final.shape}")
    print(f"🔮 Val shape: {val_final.shape}")
    print(f"📊 Features created: {len(train_final.columns)}")
    print(f"🔧 NaNs in training: {train_final.isnull().sum().sum()}")
    print(f"🔧 NaNs in validation: {val_final.isnull().sum().sum()}")

    # Log metrics
    mlflow.log_metric("train_features_shape", len(train_final.columns))
    mlflow.log_metric("train_records_after_features", len(train_final))
    mlflow.log_metric("val_records_after_features", len(val_final))
    mlflow.log_metric("train_nans_final", train_final.isnull().sum().sum())
    mlflow.log_metric("val_nans_final", val_final.isnull().sum().sum())

    # Show feature categories
    feature_categories = {
        'Date Features': [col for col in train_final.columns if any(x in col for x in ['Year', 'Month', 'Day', 'Week', 'Quarter', 'sin', 'cos'])],
        'Holiday Features': [col for col in train_final.columns if any(x in col for x in ['Holiday', 'SuperBowl', 'Labor', 'Thanksgiving', 'Christmas'])],
        'Lag Features': [col for col in train_final.columns if 'lag_' in col],
        'Rolling Features': [col for col in train_final.columns if 'rolling_' in col],
        'EWM Features': [col for col in train_final.columns if 'ewm_' in col],
        'Diff Features': [col for col in train_final.columns if 'diff' in col]
    }

    print(f"\n📋 Feature Categories:")
    for category, features in feature_categories.items():
        count = len(features)
        print(f"   {category}: {count} features")
        mlflow.log_metric(f"{category.lower().replace(' ', '_')}_count", count)
        if count > 0 and count <= 3:
            print(f"     Examples: {features}")
        elif count > 3:
            print(f"     Examples: {features[:3]}")

🔧 Starting feature engineering AFTER split: 8b4bae263a114247bc992331da4946b7
🚂 Creating features for training set...
🔮 Creating features for validation set...
✅ Feature engineering completed!
🚂 Train shape: (288234, 40)
🔮 Val shape: (118540, 40)
📊 Features created: 40
🔧 NaNs in training: 0
🔧 NaNs in validation: 0

📋 Feature Categories:
   Date Features: 30 features
     Examples: ['Weekly_Sales_lag_8', 'IsChristmasWeek', 'Weekly_Sales_lag_4']
   Holiday Features: 7 features
     Examples: ['IsChristmasWeek', 'IsLaborDayWeek', 'IsHoliday']
   Lag Features: 5 features
     Examples: ['Weekly_Sales_lag_8', 'Weekly_Sales_lag_4', 'Weekly_Sales_lag_12']
   Rolling Features: 0 features
   EWM Features: 0 features
   Diff Features: 2 features
     Examples: ['Weekly_Sales_diff_4', 'Weekly_Sales_diff_1']
🏃 View run Feature_Engineering_After_Split at: https://dagshub.com/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting.mlflow/#/experiments/3/runs/8b4bae263a114247bc992331da4946b7
🧪 Vie

In [51]:
# ================================================================================
# STEP 5: DATA TYPE CHECK AND CATEGORICAL ENCODING
# ================================================================================

with mlflow.start_run(run_name="Data_Type_Check_And_Encoding") as run:

    print(f"🔍 Checking data types and encoding categoricals: {run.info.run_id}")

    mlflow.log_param("encoding_method", "one_hot_encoding_for_categoricals")

    # Check data types in both datasets
    print(f"📊 Checking data types...")

    train_dtypes = train_final.dtypes.value_counts()
    val_dtypes = val_final.dtypes.value_counts()

    print(f"🚂 Training data types: {dict(train_dtypes)}")
    print(f"🔮 Validation data types: {dict(val_dtypes)}")

    # Identify categorical columns (object, category, or specific known categoricals)
    categorical_columns = []

    # Check for object/category columns
    for col in train_final.columns:
        if train_final[col].dtype == 'object' or train_final[col].dtype.name == 'category':
            categorical_columns.append(col)

    # Add known categorical columns that might be numeric but should be treated as categorical
    known_categoricals = ['Store', 'Dept']  # These should be categorical
    for col in known_categoricals:
        if col in train_final.columns and col not in categorical_columns:
            categorical_columns.append(col)

    print(f"🏷️ Categorical columns found: {categorical_columns}")

    if len(categorical_columns) > 0:
        print(f"🔧 Applying one-hot encoding to {len(categorical_columns)} categorical columns...")

        # Apply one-hot encoding
        from sklearn.preprocessing import OneHotEncoder
        import pandas as pd

        # Create copies for encoding
        train_encoded = train_final.copy()
        val_encoded = val_final.copy()

        for col in categorical_columns:
            print(f"   Encoding {col}...")

            # Check unique values
            train_unique = train_encoded[col].nunique()
            val_unique = val_encoded[col].nunique()
            combined_unique = pd.concat([train_encoded[col], val_encoded[col]]).nunique()

            print(f"     Train unique: {train_unique}, Val unique: {val_unique}, Combined: {combined_unique}")

            # Use pandas get_dummies for one-hot encoding
            # Fit on combined data to ensure same columns
            combined_data = pd.concat([
                train_encoded[col].reset_index(drop=True),
                val_encoded[col].reset_index(drop=True)
            ])

            # Get dummies
            dummies = pd.get_dummies(combined_data, prefix=f'{col}', dummy_na=False)

            # Split back into train and val
            train_dummies = dummies.iloc[:len(train_encoded)]
            val_dummies = dummies.iloc[len(train_encoded):]

            # Reset indices
            train_dummies.index = train_encoded.index
            val_dummies.index = val_encoded.index

            # Add dummy columns to datasets
            train_encoded = pd.concat([train_encoded, train_dummies], axis=1)
            val_encoded = pd.concat([val_encoded, val_dummies], axis=1)

            # Remove original categorical column
            train_encoded = train_encoded.drop(columns=[col])
            val_encoded = val_encoded.drop(columns=[col])

            print(f"     Created {len(dummies.columns)} dummy variables")
            mlflow.log_metric(f"{col}_dummy_count", len(dummies.columns))

        # Update the final datasets
        train_final_encoded = train_encoded
        val_final_encoded = val_encoded

        print(f"✅ One-hot encoding completed!")

    else:
        print(f"✅ No categorical columns found - using original datasets")
        train_final_encoded = train_final.copy()
        val_final_encoded = val_final.copy()

    # Final data type check and conversion
    print(f"🔧 Converting all features to numeric types...")

    # Convert boolean to int
    bool_columns = []
    for col in train_final_encoded.columns:
        if train_final_encoded[col].dtype == 'bool':
            bool_columns.append(col)
            train_final_encoded[col] = train_final_encoded[col].astype(int)
            val_final_encoded[col] = val_final_encoded[col].astype(int)

    if bool_columns:
        print(f"   Converted {len(bool_columns)} boolean columns to int")

    # Convert any remaining object columns to numeric
    object_columns = []
    for col in train_final_encoded.columns:
        if train_final_encoded[col].dtype == 'object':
            object_columns.append(col)
            train_final_encoded[col] = pd.to_numeric(train_final_encoded[col], errors='coerce')
            val_final_encoded[col] = pd.to_numeric(val_final_encoded[col], errors='coerce')

    if object_columns:
        print(f"   Converted {len(object_columns)} object columns to numeric")

    # Fill any NaNs introduced during conversion
    train_nans_after = train_final_encoded.isnull().sum().sum()
    val_nans_after = val_final_encoded.isnull().sum().sum()

    if train_nans_after > 0 or val_nans_after > 0:
        print(f"⚠️ NaNs introduced during conversion - Train: {train_nans_after}, Val: {val_nans_after}")
        train_final_encoded = train_final_encoded.fillna(0)
        val_final_encoded = val_final_encoded.fillna(0)
        print(f"✅ NaNs filled with 0")

    # Final data type summary
    final_train_dtypes = train_final_encoded.dtypes.value_counts()
    final_val_dtypes = val_final_encoded.dtypes.value_counts()

    print(f"\n📊 Final Data Types:")
    print(f"🚂 Training: {dict(final_train_dtypes)}")
    print(f"🔮 Validation: {dict(final_val_dtypes)}")

    print(f"\n📊 Final Dataset Shapes:")
    print(f"🚂 Training: {train_final_encoded.shape}")
    print(f"🔮 Validation: {val_final_encoded.shape}")

    # Verify all columns are numeric
    non_numeric_train = [col for col in train_final_encoded.columns
                        if train_final_encoded[col].dtype not in ['int64', 'float64', 'int32', 'float32']]
    non_numeric_val = [col for col in val_final_encoded.columns
                      if val_final_encoded[col].dtype not in ['int64', 'float64', 'int32', 'float32']]

    if non_numeric_train or non_numeric_val:
        print(f"⚠️ Warning: Non-numeric columns still exist:")
        print(f"   Train: {non_numeric_train}")
        print(f"   Val: {non_numeric_val}")
    else:
        print(f"✅ All columns are numeric!")

    # Log final metrics
    mlflow.log_metric("final_train_shape_rows", train_final_encoded.shape[0])
    mlflow.log_metric("final_train_shape_cols", train_final_encoded.shape[1])
    mlflow.log_metric("final_val_shape_rows", val_final_encoded.shape[0])
    mlflow.log_metric("final_val_shape_cols", val_final_encoded.shape[1])
    mlflow.log_metric("categorical_columns_encoded", len(categorical_columns))

    # Update variable names for next steps
    train_final = train_final_encoded
    val_final = val_final_encoded

    print(f"🎯 Data is ready for outlier detection!")

🔍 Checking data types and encoding categoricals: d0acc360432743fe87c8785e80caef88
📊 Checking data types...
🚂 Training data types: {dtype('int64'): np.int64(16), dtype('float64'): np.int64(12), dtype('int32'): np.int64(6), Float64Dtype(): np.int64(2), dtype('bool'): np.int64(1), dtype('<M8[ns]'): np.int64(1), dtype('O'): np.int64(1), UInt32Dtype(): np.int64(1)}
🔮 Validation data types: {dtype('int64'): np.int64(16), dtype('float64'): np.int64(12), dtype('int32'): np.int64(6), Float64Dtype(): np.int64(2), dtype('bool'): np.int64(1), dtype('<M8[ns]'): np.int64(1), dtype('O'): np.int64(1), UInt32Dtype(): np.int64(1)}
🏷️ Categorical columns found: ['Type', 'Store', 'Dept']
🔧 Applying one-hot encoding to 3 categorical columns...
   Encoding Type...
     Train unique: 3, Val unique: 3, Combined: 3
     Created 3 dummy variables
   Encoding Store...
     Train unique: 45, Val unique: 45, Combined: 45
     Created 45 dummy variables
   Encoding Dept...
     Train unique: 81, Val unique: 81, Com

In [52]:
# ================================================================================
# STEP 7: XGBOOST TRAINING (FIXED VERSION)
# ================================================================================

with mlflow.start_run(run_name="XGBoost_Training_Fixed_Data_Leakage") as run:

    print(f"🚀 Starting XGBoost training with FIXED data leakage: {run.info.run_id}")

    import xgboost as xgb
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    import joblib

    mlflow.log_param("model_type", "xgboost_regressor")
    mlflow.log_param("evaluation_metric", "WMAE_weighted_mean_absolute_error")
    mlflow.log_param("data_leakage_status", "FIXED")
    mlflow.log_param("train_size", len(train_final))
    mlflow.log_param("val_size", len(val_final))

    # 1. Prepare features and target
    print("🔧 Preparing features and target variables...")

    target_column = 'Weekly_Sales'
    exclude_columns = ['Weekly_Sales', 'Date']

    # Get feature columns
    feature_columns = [col for col in train_final.columns if col not in exclude_columns]

    print(f"   Total features available: {len(feature_columns)}")

    # Prepare training data
    X_train = train_final[feature_columns].copy()
    y_train = train_final[target_column].copy()
    X_val = val_final[feature_columns].copy()
    y_val = val_final[target_column].copy()

    # Get holiday flags for WMAE calculation
    train_is_holiday = train_final['IsHoliday'].copy()
    val_is_holiday = val_final['IsHoliday'].copy()

    print(f"📊 Final data shapes:")
    print(f"   X_train: {X_train.shape}")
    print(f"   X_val: {X_val.shape}")

    # 2. Define WMAE function
    def calculate_wmae(y_true, y_pred, is_holiday, holiday_weight=5.0):
        """Calculate Weighted Mean Absolute Error"""
        abs_errors = np.abs(y_true - y_pred)
        weights = np.where(is_holiday, holiday_weight, 1.0)
        wmae = np.sum(weights * abs_errors) / np.sum(weights)
        return wmae

    # 3. Train XGBoost model (Fixed)
    print(f"🤖 Training XGBoost model...")

    # Model parameters
    xgb_params = {
        'n_estimators': 1000,
        'max_depth': 8,
        'learning_rate': 0.1,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'reg_alpha': 0.1,
        'reg_lambda': 1.0,
        'random_state': 42,
        'n_jobs': -1,
        'objective': 'reg:squarederror'
    }

    # Log parameters
    for param, value in xgb_params.items():
        mlflow.log_param(f"xgb_{param}", value)

    # Initialize model
    xgb_model = xgb.XGBRegressor(**xgb_params)

    # Train model (simplified without eval_metric in fit)
    print("   Fitting model...")
    xgb_model.fit(X_train, y_train)

    print(f"✅ Model training completed!")

    # 4. Make predictions
    print(f"🔮 Making predictions...")

    train_pred = xgb_model.predict(X_train)
    val_pred = xgb_model.predict(X_val)

    # 5. Calculate metrics
    print(f"📊 Calculating metrics...")

    # Training metrics
    train_mae = mean_absolute_error(y_train, train_pred)
    train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
    train_r2 = r2_score(y_train, train_pred)
    train_wmae = calculate_wmae(y_train, train_pred, train_is_holiday)

    # Validation metrics
    val_mae = mean_absolute_error(y_val, val_pred)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    val_r2 = r2_score(y_val, val_pred)
    val_wmae = calculate_wmae(y_val, val_pred, val_is_holiday)

    # Holiday breakdown
    train_holiday_mask = train_is_holiday == True
    train_non_holiday_mask = train_is_holiday == False
    val_holiday_mask = val_is_holiday == True
    val_non_holiday_mask = val_is_holiday == False

    # Holiday MAE
    train_holiday_mae = mean_absolute_error(y_train[train_holiday_mask], train_pred[train_holiday_mask]) if train_holiday_mask.sum() > 0 else 0
    train_non_holiday_mae = mean_absolute_error(y_train[train_non_holiday_mask], train_pred[train_non_holiday_mask]) if train_non_holiday_mask.sum() > 0 else 0

    val_holiday_mae = mean_absolute_error(y_val[val_holiday_mask], val_pred[val_holiday_mask]) if val_holiday_mask.sum() > 0 else 0
    val_non_holiday_mae = mean_absolute_error(y_val[val_non_holiday_mask], val_pred[val_non_holiday_mask]) if val_non_holiday_mask.sum() > 0 else 0

    # 6. Display results
    print(f"\n" + "="*80)
    print(f"📊 EXPERIMENT 4 RESULTS (DATA LEAKAGE FIXED)")
    print(f"="*80)

    print(f"\n🚂 Training Metrics:")
    print(f"   WMAE: ${train_wmae:,.2f}")
    print(f"   MAE: ${train_mae:,.2f}")
    print(f"   RMSE: ${train_rmse:,.2f}")
    print(f"   R²: {train_r2:.4f}")
    print(f"   Holiday MAE: ${train_holiday_mae:,.2f}")
    print(f"   Non-Holiday MAE: ${train_non_holiday_mae:,.2f}")

    print(f"\n🔮 Validation Metrics:")
    print(f"   WMAE: ${val_wmae:,.2f}")
    print(f"   MAE: ${val_mae:,.2f}")
    print(f"   RMSE: ${val_rmse:,.2f}")
    print(f"   R²: {val_r2:.4f}")
    print(f"   Holiday MAE: ${val_holiday_mae:,.2f}")
    print(f"   Non-Holiday MAE: ${val_non_holiday_mae:,.2f}")

    # 7. Log metrics to MLflow
    metrics_to_log = {
        "train_wmae": train_wmae,
        "train_mae": train_mae,
        "train_rmse": train_rmse,
        "train_r2": train_r2,
        "train_holiday_mae": train_holiday_mae,
        "train_non_holiday_mae": train_non_holiday_mae,
        "val_wmae": val_wmae,
        "val_mae": val_mae,
        "val_rmse": val_rmse,
        "val_r2": val_r2,
        "val_holiday_mae": val_holiday_mae,
        "val_non_holiday_mae": val_non_holiday_mae,
        "total_features": len(feature_columns),
        "train_holiday_count": train_holiday_mask.sum(),
        "val_holiday_count": val_holiday_mask.sum()
    }

    for metric_name, value in metrics_to_log.items():
        mlflow.log_metric(metric_name, value)

    # 8. Feature importance
    print(f"\n🎯 Top 15 Feature Importance:")

    feature_importance = pd.DataFrame({
        'feature': X_train.columns,
        'importance': xgb_model.feature_importances_
    }).sort_values('importance', ascending=False)

    for i, (_, row) in enumerate(feature_importance.head(15).iterrows()):
        print(f"   {i+1:2d}. {row['feature']:35s}: {row['importance']:.4f}")

    # Log top features
    for i, (_, row) in enumerate(feature_importance.head(10).iterrows()):
        mlflow.log_metric(f"feature_importance_rank_{i+1}", row['importance'])
        mlflow.log_param(f"top_feature_{i+1}", row['feature'])

    # 9. Model performance analysis
    print(f"\n📈 Model Performance Analysis:")
    print(f"   Training vs Validation WMAE: ${train_wmae:,.2f} vs ${val_wmae:,.2f}")
    print(f"   Overfitting check: {(train_wmae - val_wmae)/val_wmae*100:.1f}% difference")
    print(f"   Holiday vs Non-Holiday performance:")
    print(f"     Holiday MAE: ${val_holiday_mae:,.2f} ({val_holiday_mask.sum()} records)")
    print(f"     Non-Holiday MAE: ${val_non_holiday_mae:,.2f} ({val_non_holiday_mask.sum()} records)")

    # 10. Save model
    print(f"\n💾 Saving model...")
    model_path = "xgboost_walmart_experiment4_fixed.pkl"
    joblib.dump(xgb_model, model_path)
    mlflow.log_artifact(model_path)

    # Save feature importance
    feature_importance.to_csv("feature_importance_experiment4.csv", index=False)
    mlflow.log_artifact("feature_importance_experiment4.csv")

    print(f"\n" + "="*80)
    print(f"🎉 EXPERIMENT 4 COMPLETED!")
    print(f"✅ Data Leakage: FIXED")
    print(f"🎯 Main Metric (Validation WMAE): ${val_wmae:,.2f}")
    print(f"📊 Validation R²: {val_r2:.4f}")
    print(f"🔧 Features Used: {len(feature_columns)}")
    print(f"="*80)

    # Final status
    mlflow.log_param("experiment_status", "COMPLETED")
    mlflow.log_param("data_leakage_fixed", "YES")
    mlflow.log_param("main_wmae", f"${val_wmae:.2f}")

🚀 Starting XGBoost training with FIXED data leakage: 1d16409318a342c89f4aa06c0f46766b
🔧 Preparing features and target variables...
   Total features available: 164
📊 Final data shapes:
   X_train: (321582, 164)
   X_val: (82260, 164)
🤖 Training XGBoost model...
   Fitting model...
✅ Model training completed!
🔮 Making predictions...
📊 Calculating metrics...

📊 EXPERIMENT 4 RESULTS (DATA LEAKAGE FIXED)

🚂 Training Metrics:
   WMAE: $95.21
   MAE: $91.72
   RMSE: $176.55
   R²: 0.9999
   Holiday MAE: $106.04
   Non-Holiday MAE: $90.46

🔮 Validation Metrics:
   WMAE: $506.65
   MAE: $557.08
   RMSE: $3,029.55
   R²: 0.9564
   Holiday MAE: $146.44
   Non-Holiday MAE: $571.98

🎯 Top 15 Feature Importance:
    1. Weekly_Sales_lag_1                 : 0.6252
    2. Weekly_Sales_lag_4                 : 0.0314
    3. Weekly_Sales_diff_1                : 0.0231
    4. IsChristmasWeek                    : 0.0153
    5. Weekly_Sales_lag_2                 : 0.0111
    6. Dept_38                      