<a href="https://colab.research.google.com/github/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting/blob/lodia/model_exp_LightGBM_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install/upgrade (-U) the CLI and tracking packages quietly (-q).
# NOTE(review): versions are not pinned, so a re-run may install different releases.
!pip install kaggle dagshub mlflow wandb onnx -Uq
# Mount Google Drive at /content/drive; a later cell copies kaggle.json from it.
from google.colab import drive
drive.mount('/content/drive')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.7/24.7 MB[0m [31m90.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m76.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m100.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m106.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.7/242.7 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Create the Kaggle config directory; -p makes the cell safe to re-run when it already exists.
! mkdir -p ~/.kaggle

In [3]:
# Copy the Kaggle API credentials file from the mounted Drive into ~/.kaggle.
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

In [4]:
# Restrict the credentials file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Download the competition archive (walmart-recruiting-store-sales-forecasting) with the kaggle CLI.
!kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 849MB/s]


In [6]:
# Unpack the competition archive; -o overwrites existing files without prompting,
# so re-running the cell does not stall on unzip's interactive "replace?" question.
! unzip -o walmart-recruiting-store-sales-forecasting.zip

Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [15]:
import logging
import os
import sys
import warnings
import zipfile
from contextlib import redirect_stderr, redirect_stdout
from datetime import datetime
from io import StringIO

import dagshub
import lightgbm as lgb
import mlflow
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [8]:
# --- Warning filters ---
# Hide UserWarning and FutureWarning messages for the rest of the session.
for _ignored_category in (UserWarning, FutureWarning):
    warnings.filterwarnings('ignore', category=_ignored_category)

# --- Configuration ---
# Scratch directory exported through the CMDSTANPY_TEMP environment variable.
# NOTE(review): nothing visible in this notebook imports cmdstanpy/Prophet; this
# looks like a leftover from another experiment - confirm before removing.
colab_tmp_dir = '/tmp/cmdstanpy_tmp_colab'
os.makedirs(colab_tmp_dir, exist_ok=True)
os.environ.update({'CMDSTANPY_TEMP': colab_tmp_dir})

# --- Logging verbosity ---
# Only CRITICAL records from the 'cmdstanpy' logger are let through, and the
# root logger is raised to WARNING so INFO/DEBUG chatter is dropped.
_cmdstanpy_logger = logging.getLogger('cmdstanpy')
_cmdstanpy_logger.setLevel(logging.CRITICAL)
_root_logger = logging.getLogger()
_root_logger.setLevel(logging.WARNING)


In [9]:
# Collapse row-level sales into one row per Date (total sales + holiday flag).
def aggregate_results(df):
    """Summarise sales per date.

    Args:
        df: DataFrame with at least the columns 'Date', 'Weekly_Sales' and
            'IsHoliday'.

    Returns:
        DataFrame with one row per distinct 'Date' and the columns
        'Date', 'Weekly_Sales' (sum over all rows of that date) and
        'IsHoliday' (maximum over all rows of that date, i.e. truthy as soon
        as any row of that date is flagged).
    """
    per_date = df.groupby('Date')
    totals = per_date.agg(
        Weekly_Sales=pd.NamedAgg(column='Weekly_Sales', aggfunc='sum'),
        IsHoliday=pd.NamedAgg(column='IsHoliday', aggfunc='max'),
    )
    # Move 'Date' back from the index into a regular column.
    return totals.reset_index()



In [10]:
# Calculate Weighted Mean Absolute Error (WMAE)
def calculate_wmae(y_true, y_pred, is_holiday):
    """Weighted mean absolute error with holiday rows weighted 5x.

    Computes ``sum(w * |y_true - y_pred|) / sum(w)`` where ``w`` is 5 for rows
    whose ``is_holiday`` value is truthy and 1 otherwise.

    All three inputs are converted to NumPy arrays first, so they are combined
    strictly by position. This avoids pandas index alignment, which would
    otherwise pair up the wrong rows (or fail) when e.g. ``y_true`` comes from
    a filtered frame and ``y_pred`` carries a fresh index.

    Args:
        y_true: Array-like of actual values.
        y_pred: Array-like of predicted values, in the same order as ``y_true``.
        is_holiday: Array-like of booleans (or truthy values), same order.

    Returns:
        The WMAE as a float; NaN when the inputs are empty.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    weights = np.where(np.asarray(is_holiday, dtype=bool), 5, 1)

    total_weight = np.sum(weights)
    if total_weight == 0:
        # Every weight is >= 1, so this only happens for empty input.
        return float('nan')
    return np.sum(weights * np.abs(actual - predicted)) / total_weight

In [11]:
# Custom Preprocessing Pipeline
class WalmartPreprocessingPipeline:
    """Merge, clean and feature-engineer the Walmart sales tables.

    The pipeline joins the sales, store and feature tables, derives calendar
    and holiday indicators, encodes the store type, optionally filters
    Weekly_Sales outliers per store type, optionally drops the MarkDown
    columns, fills numeric NaNs with 0 and finally drops a configurable set
    of columns.

    Attributes:
        train_date_split: Timestamp of the train/validation cut-off. It is
            only stored here; the split itself is performed by the caller.
        remove_outliers: Whether `fit_transform` applies `_remove_outliers`.
        remove_markdowns: Whether `fit_transform` drops the MarkDown columns.
        enable_lag_features: When False, `_remove_lag_features` is invoked
            (currently a no-op placeholder).
        redundant_features: Column names dropped as the last step.
        weekly_sales_outlier_thresholds: Inclusive lower/upper Weekly_Sales
            bounds per store type ('A', 'B', 'C').
        outliers_removed_count: Rows removed by the last `_remove_outliers` call.
    """

    def __init__(self, train_date_split='2012-04-13', remove_outliers=True, remove_markdowns=True,
                 enable_lag_features=False, redundant_features=None):
        """Store the configuration.

        Args:
            train_date_split: Anything `pd.to_datetime` accepts.
            remove_outliers: Enable per-store-type outlier filtering.
            remove_markdowns: Drop every column whose name contains 'MarkDown'.
            enable_lag_features: See class docstring.
            redundant_features: Iterable of column names to drop at the end of
                `fit_transform`. Defaults to ('Year', 'Day', 'Size'), which is
                what this pipeline has always dropped.
        """
        self.train_date_split = pd.to_datetime(train_date_split)
        self.remove_outliers = remove_outliers
        self.remove_markdowns = remove_markdowns
        self.enable_lag_features = enable_lag_features
        # NOTE(review): 'Size' has always been in the drop list although the
        # original inline comment said it should be kept as a model feature.
        # The default preserves the existing behaviour; pass
        # redundant_features=('Year', 'Day') to keep 'Size'.
        if redundant_features is None:
            self.redundant_features = ('Year', 'Day', 'Size')
        else:
            self.redundant_features = tuple(redundant_features)
        self.weekly_sales_outlier_thresholds = {
            'A': {'lower': -20000, 'upper': 75000},
            'B': {'lower': -10000, 'upper': 40000},
            'C': {'lower': -5000, 'upper': 20000}
        }
        self.outliers_removed_count = 0

    def fit_transform(self, train_df, features_df, stores_df):
        """Run every preprocessing step and return the resulting DataFrame.

        Args:
            train_df: Sales rows; merged with `stores_df` on 'Store'.
            features_df: Extra features; merged on ['Store', 'Date'].
            stores_df: Store metadata (must provide 'Type').

        Returns:
            The processed DataFrame. 'Date' and 'Weekly_Sales' are still
            present so the caller can split and pick the target.
        """
        df = self._merge_data(train_df, features_df, stores_df)
        df = self._clean_data(df)
        df = self._feature_engineer_dates(df)
        df = self._feature_engineer_holidays(df)
        # The Type_* dummy columns must exist before outlier removal, which
        # selects rows through them.
        df = self._feature_engineer_store_type(df)
        if self.remove_outliers:
            df = self._remove_outliers(df)
        if self.remove_markdowns:
            df = self._remove_markdowns(df)
        if not self.enable_lag_features:
            df = self._remove_lag_features(df)  # currently a no-op placeholder
        df = self._handle_missing_values(df)
        df = self._remove_redundant_features(df)
        return df

    def _merge_data(self, train_df, features_df, stores_df):
        """Left-join store metadata on 'Store', then features on ['Store', 'Date']."""
        df = pd.merge(train_df, stores_df, on='Store', how='left')
        df = pd.merge(df, features_df, on=['Store', 'Date'], how='left')
        return df

    def _clean_data(self, df):
        """Collapse duplicated IsHoliday columns and cast the flag to bool."""
        # Handle duplicate IsHoliday columns after merge (e.g., IsHoliday_x, IsHoliday_y)
        if 'IsHoliday_x' in df.columns and 'IsHoliday_y' in df.columns:
            df['IsHoliday'] = df['IsHoliday_x'] | df['IsHoliday_y']
            df = df.drop(columns=['IsHoliday_x', 'IsHoliday_y'])
        # Ensure IsHoliday is boolean
        df['IsHoliday'] = df['IsHoliday'].astype(bool)
        return df

    def _feature_engineer_dates(self, df):
        """Parse 'Date' and add calendar features (modifies and returns `df`)."""
        df['Date'] = pd.to_datetime(df['Date'])
        df['Year'] = df['Date'].dt.year
        df['Month'] = df['Date'].dt.month
        df['Day'] = df['Date'].dt.day
        df['DayOfWeek'] = df['Date'].dt.dayofweek
        df['WeekOfYear'] = df['Date'].dt.isocalendar().week.astype(int)  # ISO week number
        df['IsWeekend'] = df['DayOfWeek'].isin([5, 6]).astype(int)
        df['IsMonthStart'] = (df['Date'].dt.day == 1).astype(int)
        df['IsMonthEnd'] = (df['Date'].dt.is_month_end).astype(int)

        # Elapsed time relative to the earliest date present in this frame.
        min_date = df['Date'].min()
        df['DaysFromStart'] = (df['Date'] - min_date).dt.days
        df['WeeksFromStart'] = (df['DaysFromStart'] // 7).astype(int)
        return df

    def _feature_engineer_holidays(self, df):
        """Add binary holiday-window flags plus broader seasonal indicators.

        A flag is 1 when the row's 'Date' equals one of the listed calendar
        days. The day lists only cover the years 2010-2012, so dates outside
        that range never match.
        """
        # Approximate date windows around each holiday, enumerated per year.
        holidays = {
            'SuperBowl': [datetime(y, 2, d) for y in range(2010, 2013) for d in [7, 8, 9, 10, 11, 12, 13]], # Approx 2nd week Feb
            'LaborDay': [datetime(y, 9, d) for y in range(2010, 2013) for d in [1, 2, 3, 4, 5, 6, 7]], # Approx 1st week Sep
            'Thanksgiving': [datetime(y, 11, d) for y in range(2010, 2013) for d in [22, 23, 24, 25, 26, 27, 28]], # Approx 4th week Nov
            'Christmas': [datetime(y, 12, d) for y in range(2010, 2013) for d in [24, 25, 26, 27, 28, 29, 30, 31]] # Approx last week Dec
        }

        # Create binary flags for holiday weeks
        df['IsSuperBowlWeek'] = df['Date'].isin(holidays['SuperBowl']).astype(int)
        df['IsLaborDayWeek'] = df['Date'].isin(holidays['LaborDay']).astype(int)
        df['IsThanksgivingWeek'] = df['Date'].isin(holidays['Thanksgiving']).astype(int)
        df['IsChristmasWeek'] = df['Date'].isin(holidays['Christmas']).astype(int)

        # Broader holiday indicators
        df['IsMajorHoliday'] = (df['IsSuperBowlWeek'] | df['IsLaborDayWeek'] | df['IsThanksgivingWeek'] | df['IsChristmasWeek']).astype(int)
        df['IsHolidayMonth'] = df['Month'].isin([11, 12, 1, 2, 9]).astype(int) # Nov, Dec, Jan, Feb, Sep
        df['IsBackToSchool'] = df['Month'].isin([8]).astype(int) # August

        return df

    def _feature_engineer_store_type(self, df):
        """Add an integer 'Type_Encoded' code and one-hot 'Type_<value>' columns.

        The original 'Type' column is consumed by `pd.get_dummies`.
        """
        df['Type_Encoded'] = df['Type'].astype('category').cat.codes
        df = pd.get_dummies(df, columns=['Type'], prefix='Type', drop_first=False)
        return df

    def _remove_outliers(self, df):
        """Keep only rows whose Weekly_Sales lies within their store type's bounds.

        For every store type in `weekly_sales_outlier_thresholds` the rows
        flagged by the matching 'Type_<type>' dummy column are kept when
        lower <= Weekly_Sales <= upper. Rows that belong to none of the
        configured types are not kept. A store type whose dummy column is
        missing (no such store in the data) is skipped instead of raising
        KeyError.

        The kept subsets are concatenated once, in threshold-dictionary order.
        `outliers_removed_count` is updated with the number of dropped rows.
        """
        initial_rows = len(df)
        kept_parts = []
        for store_type, bounds in self.weekly_sales_outlier_thresholds.items():
            dummy_col = f'Type_{store_type}'
            if dummy_col not in df.columns:
                continue
            in_type = df[dummy_col] == 1
            in_range = (df['Weekly_Sales'] >= bounds['lower']) & (df['Weekly_Sales'] <= bounds['upper'])
            kept_parts.append(df[in_type & in_range])

        # A single concat avoids growing a frame inside the loop and avoids
        # concatenating with a column-less empty DataFrame.
        if kept_parts:
            df_cleaned = pd.concat(kept_parts)
        else:
            df_cleaned = df.iloc[0:0].copy()

        self.outliers_removed_count = initial_rows - len(df_cleaned)
        if self.outliers_removed_count > 0:
            print(f"Removed {self.outliers_removed_count} outliers based on Weekly_Sales thresholds.")
        return df_cleaned

    def _remove_markdowns(self, df):
        """Drop every column whose name contains 'MarkDown'."""
        markdown_cols = [col for col in df.columns if 'MarkDown' in col]
        return df.drop(columns=markdown_cols, errors='ignore')

    def _remove_lag_features(self, df):
        """Placeholder hook for removing lag features; returns `df` unchanged.

        No lag features are created by this pipeline, so there is nothing to
        remove yet.
        """
        return df

    def _handle_missing_values(self, df):
        """Fill NaNs in all numeric columns with 0 (modifies and returns `df`)."""
        numerical_cols = df.select_dtypes(include=np.number).columns
        df[numerical_cols] = df[numerical_cols].fillna(0) # Simple imputation
        return df

    def _remove_redundant_features(self, df):
        """Drop the columns listed in `redundant_features` that are present.

        'Date' is intentionally not part of the default list: the caller still
        needs it to split the data.
        """
        present = [col for col in self.redundant_features if col in df.columns]
        return df.drop(columns=present, errors='ignore')


In [12]:
# --- MLflow Setup Function ---
def setup_mlflow_lgbm(repo_owner, repo_name):
    """Setup MLflow and DagsHub tracking for LightGBM.

    Ends any MLflow run that is still active, initialises DagsHub for the
    given repository, points MLflow at that repository's tracking server and
    selects a new, timestamp-named experiment.

    Args:
        repo_owner: DagsHub user or organisation that owns the repository.
        repo_name: Name of the DagsHub repository.

    Returns:
        None. Progress and problems are reported via `print`; a failing
        `dagshub.init` or `mlflow.end_run` is reported but not raised.
    """
    print("🔧 Setting up MLflow and DagsHub for LightGBM...")

    # End any active runs first (best effort, but do not hide why it failed).
    try:
        mlflow.end_run()
    except Exception as e:
        print(f"⚠️ Could not end active MLflow run: {e}")

    # Initialize DagsHub for the repository passed in by the caller, so the
    # initialised repo always matches the tracking URI set below.
    try:
        dagshub.init(
            repo_owner=repo_owner,
            repo_name=repo_name,
            mlflow=True
        )
        print("✅ DagsHub initialized successfully!")
    except Exception as e:
        print(f"⚠️ DagsHub init warning: {e}")
        print("   (Ensure DAGSHUB_USER_TOKEN environment variable is set or you have write access)")


    # Set MLflow tracking URI
    mlflow.set_tracking_uri(f"https://dagshub.com/{repo_owner}/{repo_name}.mlflow")
    print(f"MLflow tracking URI set to: {mlflow.get_tracking_uri()}")

    # Create a new experiment for this run (name is unique per second).
    experiment_name = f"LightGBM_Walmart_Sales_{datetime.now().strftime('%Y%m%d%H%M%S')}"
    mlflow.set_experiment(experiment_name)
    print(f"MLflow Experiment set to: '{experiment_name}'")

In [20]:
# --- Main Execution ---
def main():
    """Train and evaluate a single LightGBM model on the Walmart sales data.

    Steps: read 'train.csv', 'features.csv' and 'stores.csv' from the working
    directory, preprocess them with `WalmartPreprocessingPipeline`, split by
    date at `pipeline.train_date_split`, fit an `LGBMRegressor` with early
    stopping on the validation split, and print WMAE / RMSE / MAE / R² for
    both splits together with feature importances.

    Returns:
        None. All results are printed. If training raises, the error is
        printed and the function returns early.
    """
    print("🚀 Starting LightGBM Walmart Sales Forecasting Experiment...")

    # Load data
    print("Loading data...")
    train_raw = pd.read_csv('train.csv')
    features_raw = pd.read_csv('features.csv')
    stores_raw = pd.read_csv('stores.csv')
    print("Data loaded.")

    # Preprocess data
    print("Preprocessing data...")
    pipeline = WalmartPreprocessingPipeline(
        train_date_split='2012-04-13',
        remove_outliers=True,
        remove_markdowns=True,
        enable_lag_features=False
    )
    processed_df = pipeline.fit_transform(train_raw, features_raw, stores_raw)
    print("Data preprocessing complete.")
    print(f"Total rows after preprocessing: {len(processed_df)}")
    print(f"Outliers removed during preprocessing: {pipeline.outliers_removed_count}")

    # Define features and target
    # 'Date' is used for splitting but not as a feature for LightGBM.
    # 'Weekly_Sales' is the target.
    # All other columns in processed_df are considered potential features.
    all_potential_features = [col for col in processed_df.columns if col not in ['Date', 'Weekly_Sales']]
    target = 'Weekly_Sales'

    # Columns LightGBM should treat as categorical (includes 'Store' and 'Dept').
    requested_categoricals = [
        'Store', 'Dept', 'Month', 'DayOfWeek', 'WeekOfYear',
        'Type_Encoded', 'Type_A', 'Type_B', 'Type_C',
        'IsHoliday', 'IsWeekend', 'IsMonthStart', 'IsMonthEnd',
        'IsSuperBowlWeek', 'IsLaborDayWeek', 'IsThanksgivingWeek',
        'IsChristmasWeek', 'IsMajorHoliday', 'IsHolidayMonth', 'IsBackToSchool'
    ]

    # Convert the columns that exist to 'category' dtype; keep only those in
    # the final list so LightGBM is never handed a missing column name.
    categorical_features_for_lgbm = []
    for col in requested_categoricals:
        if col in processed_df.columns:
            processed_df[col] = processed_df[col].astype('category')
            categorical_features_for_lgbm.append(col)
        else:
            print(f"Warning: Categorical feature '{col}' not found in processed data. Skipping conversion.")


    # Temporal Split - slice the processed_df first
    print(f"Splitting data into training and validation sets at {pipeline.train_date_split}...")

    train_df_sliced = processed_df[processed_df['Date'] <= pipeline.train_date_split].copy()
    val_df_sliced = processed_df[processed_df['Date'] > pipeline.train_date_split].copy()

    # Now, define X_train, y_train, etc. from the sliced dataframes
    X_train = train_df_sliced[all_potential_features]
    y_train = train_df_sliced[target]
    # 'IsHoliday' was cast to 'category' above; the metric needs a plain
    # boolean mask, so cast it back explicitly.
    is_holiday_train = train_df_sliced['IsHoliday'].astype(bool)

    X_val = val_df_sliced[all_potential_features]
    y_val = val_df_sliced[target]
    is_holiday_val = val_df_sliced['IsHoliday'].astype(bool)

    print(f"Training set size: {len(X_train)} records")
    print(f"Validation set size: {len(X_val)} records")

    # --- LightGBM Model Training ---
    print("\n🚀 Training LightGBM model...")

    lgbm_params = {
        'objective': 'regression_l1', # MAE objective
        'metric': 'mae',
        'n_estimators': 1000,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'lambda_l1': 0.1,
        'lambda_l2': 0.1,
        'num_leaves': 31,
        'verbose': -1, # Suppress verbose output during training
        'n_jobs': -1, # Use all available cores
        'seed': 42,
        'boosting_type': 'gbdt',
        # Early stopping is configured through the fit() callback below.
    }

    model = lgb.LGBMRegressor(**lgbm_params)

    # Silence anything written to Python's stdout/stderr during fit. The
    # context managers restore the real streams on every exit path, and the
    # error is reported only after that, so the message is actually visible.
    captured_stdout = StringIO()
    captured_stderr = StringIO()
    training_error = None
    with redirect_stdout(captured_stdout), redirect_stderr(captured_stderr):
        try:
            # NOTE: the validation split drives early stopping and is also the
            # split the validation metrics below are reported on.
            model.fit(X_train, y_train,
                      eval_set=[(X_val, y_val)],
                      eval_metric='mae', # Evaluate on MAE for early stopping
                      callbacks=[lgb.early_stopping(50, verbose=False)], # Use callback for early stopping
                      categorical_feature=[col for col in categorical_features_for_lgbm if col in X_train.columns]
                     )
        except Exception as e:
            training_error = e

    if training_error is not None:
        print(f"⛔ LightGBM training failed: {training_error}")
        return # Exit if training fails

    print("✅ LightGBM Training Complete!")

    # --- Evaluation ---
    print("\n📊 Evaluating model performance...")

    # Predictions
    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    # Clamp negative predictions to zero before scoring.
    y_train_pred[y_train_pred < 0] = 0
    y_val_pred[y_val_pred < 0] = 0

    # TRAINING METRICS
    train_wmae = calculate_wmae(y_train, y_train_pred, is_holiday_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
    train_mae = mean_absolute_error(y_train, y_train_pred)
    train_r2 = r2_score(y_train, y_train_pred)

    print("\n==================================================")
    print("📊 TRAINING METRICS (Aggregated In-Sample):")
    print(f"   WMAE: {train_wmae:.2f}")
    print(f"   RMSE: {train_rmse:.2f}")
    print(f"   MAE: {train_mae:.2f}")
    print(f"   R²: {train_r2:.4f}")

    # VALIDATION METRICS
    val_wmae = calculate_wmae(y_val, y_val_pred, is_holiday_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    val_mae = mean_absolute_error(y_val, y_val_pred)
    val_r2 = r2_score(y_val, y_val_pred)

    print("\n📊 VALIDATION METRICS (Aggregated Out-of-Sample):")
    print(f"   WMAE: {val_wmae:.2f} ⭐")
    print(f"   RMSE: {val_rmse:.2f}")
    print(f"   MAE: {val_mae:.2f}")
    print(f"   R²: {val_r2:.4f}")

    # Overfitting Analysis: compare validation error to training error.
    print("\n🔍 OVERFITTING ANALYSIS:")
    wmae_ratio = val_wmae / train_wmae if train_wmae != 0 else float('inf')
    r2_diff = train_r2 - val_r2
    print(f"   WMAE Ratio (val/train): {wmae_ratio:.2f}")
    if wmae_ratio > 2.0:
        print("   ⚠️ High overfitting detected (WMAE ratio > 2.0)")
    elif wmae_ratio > 1.2:
        print("   ❗ Moderate overfitting detected (WMAE ratio > 1.2)")
    else:
        print("   ✅ Low overfitting detected")
    print(f"   R² Difference (train-val): {r2_diff:.4f}")

    # Feature Importance
    print("\n🎯 FEATURE IMPORTANCE (LightGBM):")
    # Only build the table when there is exactly one importance per column.
    if len(X_train.columns) == len(model.feature_importances_):
        feature_importances = pd.DataFrame({
            'feature': X_train.columns,
            'importance': model.feature_importances_
        }).sort_values(by='importance', ascending=False)
        print(feature_importances.head(15)) # Print top 15 features
    else:
        print("Error: Feature importance column count mismatch. Cannot display.")


    # Model Info
    print("\n📈 MODEL INFO:")
    print(f"   Training records: {len(X_train)}")
    print(f"   Validation records: {len(X_val)}")
    print(f"   Number of features: {len(X_train.columns)}")
    print(f"   LightGBM Parameters: {lgbm_params}")



    print("\n🎉 EXPERIMENT COMPLETED!")
    print("============================================================")
    print(f"🏆 Final Validation WMAE (Aggregated): {val_wmae:.2f}")
    print(f"📊 Final Validation R² (Aggregated): {val_r2:.4f}")
    print("🎯 This experiment uses the correct Walmart WMAE formula:")
    print("   • Holiday weeks weighted 5x")
    print("   • Regular weeks weighted 1x")
    print("   • Global LightGBM model trained on all Store-Dept data")
    print("   • Extensive date and holiday features, plus Store/Dept as categorical features")


In [21]:
if __name__ == "__main__":
    # Extract the nested train/features archives into the working directory so
    # the plain CSV files exist before main() reads them.
    with zipfile.ZipFile('train.csv.zip', 'r') as zip_ref:
        zip_ref.extractall('.')
    # NOTE(review): `train` and `features` are not referenced by main(), which
    # reads 'train.csv' and 'features.csv' itself - confirm whether other cells
    # rely on these module-level frames.
    train = pd.read_csv('train.csv')
    with zipfile.ZipFile('features.csv.zip', 'r') as zip_ref:
        zip_ref.extractall('.')
    features = pd.read_csv('features.csv')
    # Run the full experiment (load, preprocess, train, evaluate, print).
    main()

🚀 Starting LightGBM Walmart Sales Forecasting Experiment...
Loading data...
Data loaded.
Preprocessing data...
Removed 28279 outliers based on Weekly_Sales thresholds.
Data preprocessing complete.
Total rows after preprocessing: 393291
Outliers removed during preprocessing: 28279
Splitting data into training and validation sets at 2012-04-13 00:00:00...
Training set size: 315896 records
Validation set size: 77395 records

🚀 Training LightGBM model...
✅ LightGBM Training Complete!

📊 Evaluating model performance...

📊 TRAINING METRICS (Aggregated In-Sample):
   WMAE: 1554.33
   RMSE: 3251.47
   MAE: 1464.41
   R²: 0.9481

📊 VALIDATION METRICS (Aggregated Out-of-Sample):
   WMAE: 1663.04 ⭐
   RMSE: 3359.20
   MAE: 1654.74
   R²: 0.9449

🔍 OVERFITTING ANALYSIS:
   WMAE Ratio (val/train): 1.07
   ✅ Low overfitting detected
   R² Difference (train-val): 0.0031

🎯 FEATURE IMPORTANCE (LightGBM):
           feature  importance
1             Dept       10765
0            Store        8761
9    