In [5]:
# === Colab setup
!pip install --quiet xgboost lightgbm catboost optuna shap plotly scikit-learn optuna-integration[lightgbm] optuna-integration[xgboost] optuna-integration[sklearn]


from google.colab import drive
drive.mount('/content/drive', force_remount=False)

import os
DRIVE_DIR = '/content/drive/MyDrive/BigMart'
if not os.path.exists(DRIVE_DIR):
    os.makedirs(DRIVE_DIR, exist_ok=True)
print('Drive directory:', DRIVE_DIR)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Drive directory: /content/drive/MyDrive/BigMart


In [8]:
# -*- coding: utf-8 -*-
"""
BigMart Sales Prediction Pipeline
"""

import os
import numpy as np
import pandas as pd
from typing import Tuple, List
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import Ridge
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    INPUT_DIR = '/content/drive/MyDrive/BigMart'
    OUTPUT_DIR = '/content/drive/MyDrive/BigMart/Output'
    RANDOM_STATE = 3
    N_SPLITS = 5
    LGBM_ROUNDS = 20000
    XGB_ROUNDS = 15000
    EARLY_STOPPING_ROUNDS = 300
    REFERENCE_YEAR = 2025


# ============================================================================
# FEATURE ENGINEERING
# ============================================================================

class FeatureEngineer:
    """Handles all feature engineering operations"""

    def __init__(self):
        self.label_encoders = {}
        self.aggregated_features = {}

    def create_item_category(self, df: pd.DataFrame) -> pd.DataFrame:
        """Extract item category from identifier"""
        if 'Item_Category' not in df.columns:
            df['Item_Category'] = df['Item_Identifier'].astype(str).str[:2]
        return df

    def fix_fat_content(self, df: pd.DataFrame) -> pd.DataFrame:
        """Standardize fat content labels"""
        df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({
            'low fat': 'Low Fat', 'LF': 'Low Fat', 'reg': 'Regular'
        })
        return df

    def handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """Impute missing values"""
        item_weight_median = df.groupby('Item_Identifier')['Item_Weight'].transform('median')
        global_weight_median = df['Item_Weight'].median()
        df['Item_Weight'] = df['Item_Weight'].fillna(item_weight_median).fillna(global_weight_median)

        df.loc[df['Item_Visibility'] == 0, 'Item_Visibility'] = np.nan
        visibility_median = df.groupby('Item_Type')['Item_Visibility'].transform('median')
        df['Item_Visibility'] = df['Item_Visibility'].fillna(visibility_median)

        outlet_size_mode = df.groupby('Outlet_Type')['Outlet_Size'].transform(
            lambda x: x.mode().iloc[0] if not x.mode().empty else "Medium"
        )
        df['Outlet_Size'] = df['Outlet_Size'].fillna(outlet_size_mode).fillna("Medium")

        return df

    def create_basic_features(self, df: pd.DataFrame, reference_year: int) -> pd.DataFrame:
        """Create basic engineered features with fixed reference year"""
        df['Outlet_Age'] = reference_year - df['Outlet_Establishment_Year']

        # Outlet age bins
        df['Outlet_Age_Bin'] = pd.cut(
            df['Outlet_Age'],
            bins=[0, 5, 10, 15, 100],
            labels=['Very_New', 'New', 'Established', 'Old']
        )

        df['Item_Visibility_Bin'] = pd.qcut(
            df['Item_Visibility'],
            q=3,
            labels=['Low_Vis', 'Medium_Vis', 'High_Vis'],
            duplicates='drop'
        )

        df['Item_MRP_Bin'] = pd.cut(
            df['Item_MRP'],
            bins=[0, 70, 140, 200, 3000],
            labels=['Low_Price', 'Medium_Price', 'High_Price', 'Premium_Price']
        )

        return df

    def create_interaction_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create interaction features"""
        df['MRP_Weight'] = df['Item_MRP'] * df['Item_Weight']
        df['MRP_Visibility'] = df['Item_MRP'] * df['Item_Visibility']

        df['Weight_Visibility'] = df['Item_Weight'] * df['Item_Visibility']

        return df

    def create_aggregated_features_train(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create and store aggregated features from training data only"""
        # Outlet-level aggregations
        outlet_aggs = df.groupby('Outlet_Identifier').agg({
            'Item_MRP': ['mean', 'std', 'median'],
            'Item_Weight': ['mean', 'std'],
            'Item_Visibility': ['mean', 'std']
        }).reset_index()
        outlet_aggs.columns = ['Outlet_Identifier'] + [
            f'Outlet_{col[0]}_{col[1]}' for col in outlet_aggs.columns[1:]
        ]
        self.aggregated_features['outlet'] = outlet_aggs
        df = df.merge(outlet_aggs, on='Outlet_Identifier', how='left')


        # Unique item ratio feature
        outlet_item_counts = df.groupby('Outlet_Identifier').agg({
            'Item_Identifier': ['nunique', 'count']
        }).reset_index()
        outlet_item_counts.columns = ['Outlet_Identifier', 'Unique_Items', 'Total_Items']
        outlet_item_counts['Unique_Item_Ratio'] = outlet_item_counts['Unique_Items'] / outlet_item_counts['Total_Items']

        df = df.merge(outlet_item_counts[['Outlet_Identifier', 'Unique_Item_Ratio', 'Unique_Items', 'Total_Items']],
                      on='Outlet_Identifier', how='left')

        self.aggregated_features['outlet_items'] = outlet_item_counts[['Outlet_Identifier', 'Unique_Item_Ratio', 'Unique_Items', 'Total_Items']]

        # Item-type aggregations
        item_type_aggs = df.groupby('Item_Type').agg({
            'Item_MRP': ['mean', 'std'],
            'Item_Visibility': ['mean', 'std'],
            'Item_Weight': ['mean']
        }).reset_index()
        item_type_aggs.columns = ['Item_Type'] + [
            f'ItemType_{col[0]}_{col[1]}' for col in item_type_aggs.columns[1:]
        ]
        self.aggregated_features['item_type'] = item_type_aggs
        df = df.merge(item_type_aggs, on='Item_Type', how='left')

        return df

    def apply_aggregated_features_test(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply stored aggregated features to test data"""
        df = df.merge(self.aggregated_features['outlet'], on='Outlet_Identifier', how='left')
        df = df.merge(self.aggregated_features['item_type'], on='Item_Type', how='left')
        return df

    def label_encode(self, df: pd.DataFrame, cols: List[str], fit: bool = False) -> pd.DataFrame:
        """Label encode categorical variables"""
        df = df.copy()

        for col in cols:
            if col in df.columns:
                if fit:
                    self.label_encoders[col] = LabelEncoder()
                    df[col] = self.label_encoders[col].fit_transform(df[col].astype(str))
                else:
                    # Handle unseen categories
                    df[col] = df[col].astype(str).map(
                        lambda x: self.label_encoders[col].transform([x])[0]
                        if x in self.label_encoders[col].classes_
                        else -1
                    )

        return df

    def encode_categorical(self, df: pd.DataFrame, fit: bool = False) -> pd.DataFrame:
        """Encode categorical variables"""

        label_encode_cols = ['Item_Identifier', 'Outlet_Identifier']
        df = self.label_encode(df, label_encode_cols, fit=fit)

        if 'Item_Fat_Content' in df.columns:
            df['Item_Fat_Content'] = df['Item_Fat_Content'].map({
                'Low Fat': 0, 'Regular': 1, 'Non-Edible': 2
            })

        if 'Outlet_Size' in df.columns:
            df['Outlet_Size'] = df['Outlet_Size'].map({
                'Small': 0, 'Medium': 1, 'High': 2
            })

        categorical_cols = ['Item_Type', 'Outlet_Type', 'Outlet_Location_Type',
                           'Outlet_Age_Bin', 'Item_Visibility_Bin',
                           'Item_MRP_Bin', 'Item_Category']

        existing_cols = [col for col in categorical_cols if col in df.columns]

        df = pd.get_dummies(df, columns=existing_cols,
                           prefix=existing_cols, drop_first=False)

        return df


# ============================================================================
# PREPROCESSING PIPELINE
# ============================================================================

class Preprocessor:
    """Main preprocessing pipeline"""

    def __init__(self, config: Config):
        self.config = config
        self.feature_engineer = FeatureEngineer()

    def preprocess_train(self, df: pd.DataFrame) -> pd.DataFrame:
        """Full preprocessing pipeline for training data"""
        print("Starting preprocessing...")
        df = df.copy()

        # Feature engineering
        df = self.feature_engineer.create_item_category(df)
        df = self.feature_engineer.fix_fat_content(df)
        df = self.feature_engineer.handle_missing_values(df)
        df = self.feature_engineer.create_basic_features(df, self.config.REFERENCE_YEAR)
        df = self.feature_engineer.create_interaction_features(df)
        df = self.feature_engineer.create_aggregated_features_train(df)  # Compute on train only
        df = self.feature_engineer.encode_categorical(df, fit=True)

        print(f"Preprocessing complete. Shape: {df.shape}")
        return df

    def preprocess_test(self, df: pd.DataFrame) -> pd.DataFrame:
        """Preprocessing pipeline for test data"""
        print("Preprocessing test data...")
        df = df.copy()

        df = self.feature_engineer.create_item_category(df)
        df = self.feature_engineer.fix_fat_content(df)
        df = self.feature_engineer.handle_missing_values(df)
        df = self.feature_engineer.create_basic_features(df, self.config.REFERENCE_YEAR)
        df = self.feature_engineer.create_interaction_features(df)
        df = self.feature_engineer.apply_aggregated_features_test(df)
        df = self.feature_engineer.encode_categorical(df, fit=False)

        print(f"Test preprocessing complete. Shape: {df.shape}")
        return df


# ============================================================================
# MODEL TRAINING
# ============================================================================

class ModelTrainer:
    """Handles model training with GroupKFold cross-validation"""

    def __init__(self, config: Config):
        self.config = config

    def train_lightgbm(self, X: pd.DataFrame, y: pd.Series, groups: pd.Series,
                       X_test: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Train LightGBM with GroupKFold CV"""
        print("\n" + "="*80)
        print("Training LightGBM with GroupKFold")
        print("="*80)

        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'num_leaves': 116,
            'learning_rate': 0.09450119316286584,
            'max_depth': 4,
            'feature_fraction': 0.9320909184684667,
            'bagging_fraction': 0.6797788057232733,
            'bagging_freq': 5,
            'min_child_samples': 60,
            'min_child_weight': 0.0386883317530884,
            'min_gain_to_split': 0.0549201598381078,
            'reg_alpha': 9.477209577153645e-08,
            'reg_lambda': 0.11698784340860989,
            'verbose': -1
        }

        gkf = GroupKFold(n_splits=self.config.N_SPLITS)

        oof_preds = np.zeros(len(X))
        test_preds = np.zeros(len(X_test))
        rmses = []

        for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups), 1):
            print(f"\nFold {fold}/{self.config.N_SPLITS}")

            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            train_data = lgb.Dataset(X_train, label=y_train)
            val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

            model = lgb.train(
                params,
                train_data,
                num_boost_round=self.config.LGBM_ROUNDS,
                valid_sets=[train_data, val_data],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=self.config.EARLY_STOPPING_ROUNDS),
                    lgb.log_evaluation(period=500)
                ]
            )

            oof_preds[val_idx] = model.predict(X_val)
            test_preds += model.predict(X_test) / self.config.N_SPLITS

            rmse = np.sqrt(mean_squared_error(y_val, oof_preds[val_idx]))
            rmses.append(rmse)
            print(f"Fold {fold} RMSE: {rmse:.4f}")

        print(f"\nLightGBM CV RMSE: {np.mean(rmses):.4f} (+/- {np.std(rmses):.4f})")
        return oof_preds, test_preds

    def train_xgboost(self, X: pd.DataFrame, y: pd.Series, groups: pd.Series,
                      X_test: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Train XGBoost with GroupKFold CV"""
        print("\n" + "="*80)
        print("Training XGBoost with GroupKFold")
        print("="*80)

        params = {
            'objective': 'reg:squarederror',
            'eval_metric': 'rmse',
            'seed': self.config.RANDOM_STATE,
            'verbosity': 0,
            'tree_method': 'hist',
            'max_depth': 3,
            'learning_rate': 0.011789839446789884,
            'subsample': 0.8195852322660473,
            'colsample_bytree': 0.9380241079482217,
            'colsample_bylevel': 0.5556572065430505,
            'min_child_weight': 26,
            'gamma': 4.265164162900841,
            'reg_alpha': 0.35322223713188117,
            'reg_lambda': 0.0028308377883415736
        }

        gkf = GroupKFold(n_splits=self.config.N_SPLITS)

        oof_preds = np.zeros(len(X))
        test_preds = np.zeros(len(X_test))
        rmses = []

        for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups), 1):
            print(f"\nFold {fold}/{self.config.N_SPLITS}")

            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            dtrain = xgb.DMatrix(X_train, label=y_train)
            dval = xgb.DMatrix(X_val, label=y_val)

            model = xgb.train(
                params,
                dtrain,
                num_boost_round=self.config.XGB_ROUNDS,
                evals=[(dtrain, 'train'), (dval, 'eval')],
                early_stopping_rounds=self.config.EARLY_STOPPING_ROUNDS,
                verbose_eval=500
            )

            oof_preds[val_idx] = model.predict(dval)
            test_preds += model.predict(xgb.DMatrix(X_test)) / self.config.N_SPLITS

            rmse = np.sqrt(mean_squared_error(y_val, oof_preds[val_idx]))
            rmses.append(rmse)
            print(f"Fold {fold} RMSE: {rmse:.4f}")

        print(f"\nXGBoost CV RMSE: {np.mean(rmses):.4f} (+/- {np.std(rmses):.4f})")
        return oof_preds, test_preds

    def train_random_forest(self, X: pd.DataFrame, y: pd.Series, groups: pd.Series,
                           X_test: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray]:
        """Train Random Forest with GroupKFold CV"""
        print("\n" + "="*80)
        print("Training Random Forest with GroupKFold")
        print("="*80)

        params = {
            "n_estimators": 500,
            "max_depth": 10,
            "min_samples_split": 3,
            "min_samples_leaf": 5,
            "max_features": "sqrt",
            "random_state": self.config.RANDOM_STATE,
            "n_jobs": -1,
            "verbose": 0
        }

        gkf = GroupKFold(n_splits=self.config.N_SPLITS)

        oof_preds = np.zeros(len(X))
        test_preds = np.zeros(len(X_test))
        rmses = []

        for fold, (train_idx, val_idx) in enumerate(gkf.split(X, y, groups), 1):
            print(f"\nFold {fold}/{self.config.N_SPLITS}")

            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

            model = RandomForestRegressor(**params)
            model.fit(X_train, y_train)

            oof_preds[val_idx] = model.predict(X_val)
            test_preds += model.predict(X_test) / self.config.N_SPLITS

            rmse = np.sqrt(mean_squared_error(y_val, oof_preds[val_idx]))
            rmses.append(rmse)
            print(f"Fold {fold} RMSE: {rmse:.4f}")

        print(f"\nRandom Forest CV RMSE: {np.mean(rmses):.4f} (+/- {np.std(rmses):.4f})")
        return oof_preds, test_preds

    def train_meta_model(self, oof_preds_dict: dict, y_train: pd.Series,
                        test_preds_dict: dict) -> Tuple[np.ndarray, np.ndarray]:
        """Train meta-model (Ridge) on out-of-fold predictions"""
        print("\n" + "="*80)
        print("Training Meta-Model (Ridge)")
        print("="*80)

        X_meta = np.column_stack([oof_preds_dict[name] for name in sorted(oof_preds_dict.keys())])
        X_test_meta = np.column_stack([test_preds_dict[name] for name in sorted(test_preds_dict.keys())])

        meta_model = Ridge(alpha=10.0, random_state=self.config.RANDOM_STATE)
        meta_model.fit(X_meta, y_train)

        oof_meta = meta_model.predict(X_meta)
        test_meta = meta_model.predict(X_test_meta)

        rmse = np.sqrt(mean_squared_error(y_train, oof_meta))
        print(f"Meta-Model RMSE: {rmse:.4f}")
        print(f"Weights: {dict(zip(sorted(oof_preds_dict.keys()), meta_model.coef_))}")

        return oof_meta, test_meta


# ============================================================================
# MAIN PIPELINE
# ============================================================================

def main():
    """Main execution pipeline"""
    print("="*80)
    print("BigMart Sales Prediction - Fixed Pipeline")
    print("="*80)

    config = Config()

    # Load data
    print("\nLoading data...")
    train_df = pd.read_csv(os.path.join(config.INPUT_DIR, 'train.csv'))
    test_df = pd.read_csv(os.path.join(config.INPUT_DIR, 'test.csv'))
    print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

    # Store test identifiers
    test_ids = test_df[['Item_Identifier', 'Outlet_Identifier']].copy()

    # Store outlet groups for GroupKFold BEFORE encoding
    outlet_groups = train_df['Outlet_Identifier'].copy()

    # Preprocess
    preprocessor = Preprocessor(config)
    train_processed = preprocessor.preprocess_train(train_df)
    test_processed = preprocessor.preprocess_test(test_df)

    # Prepare features
    drop_cols = ['Item_Outlet_Sales', 'Outlet_Establishment_Year']
    X_train = train_processed.drop(columns=drop_cols, errors='ignore')
    y_train = train_processed['Item_Outlet_Sales']
    X_test = test_processed.drop(columns=['Outlet_Establishment_Year'], errors='ignore')

    # Align columns
    missing_cols = set(X_train.columns) - set(X_test.columns)
    for col in missing_cols:
        X_test[col] = 0
    X_test = X_test[X_train.columns]

    print(f"\nFinal shapes - Train: {X_train.shape}, Test: {X_test.shape}")

    # Train models
    trainer = ModelTrainer(config)
    oof_lgbm, test_lgbm = trainer.train_lightgbm(X_train, y_train, outlet_groups, X_test)
    oof_xgb, test_xgb = trainer.train_xgboost(X_train, y_train, outlet_groups, X_test)
    oof_rf, test_rf = trainer.train_random_forest(X_train, y_train, outlet_groups, X_test)

    # Ensemble
    oof_preds = {'lgbm': oof_lgbm, 'xgb': oof_xgb, 'rf': oof_rf}
    test_preds = {'lgbm': test_lgbm, 'xgb': test_xgb, 'rf': test_rf}
    oof_meta, test_meta = trainer.train_meta_model(oof_preds, y_train, test_preds)

    # Save
    os.makedirs(config.OUTPUT_DIR, exist_ok=True)


    #Meta-model predictions (stacked ensemble)
    output_meta = test_ids.copy()
    output_meta['Item_Outlet_Sales'] = np.maximum(test_meta, 0)
    output_meta.to_csv(os.path.join(config.OUTPUT_DIR, 'predictions_meta_stacked.csv'), index=False)
    print("\nSaved meta-model predictions: predictions_meta_stacked.csv")



    print("\n" + "="*80)
    print("Pipeline completed successfully!")
    print("predictions_meta_stacked.csv ")
    print("="*80)


if __name__ == "__main__":
    main()

BigMart Sales Prediction - Fixed Pipeline

Loading data...
Train shape: (8523, 12), Test shape: (5681, 11)
Starting preprocessing...
Preprocessing complete. Shape: (8523, 65)
Preprocessing test data...
Test preprocessing complete. Shape: (5681, 61)

Final shapes - Train: (8523, 63), Test: (5681, 63)

Training LightGBM with GroupKFold

Fold 1/5
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[85]	training's rmse: 1009.33	valid_1's rmse: 1992.87
Fold 1 RMSE: 1992.8652

Fold 2/5
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[11]	training's rmse: 1200.28	valid_1's rmse: 1499.05
Fold 2 RMSE: 1499.0546

Fold 3/5
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[21]	training's rmse: 1068.74	valid_1's rmse: 1125.61
Fold 3 RMSE: 1125.6107

Fold 4/5
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[30]	