In [3]:
!pip install -q pandas

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, StackingRegressor, HistGradientBoostingRegressor
from sklearn.linear_model import RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import warnings

warnings.filterwarnings('ignore')

# --- 1. Data Loading and Preprocessing ---

def load_data(train_file, test_file):
    """
    Loads and combines the training and testing datasets for consistent preprocessing.
    """
    try:
        train_df = pd.read_csv(train_file)
        test_df = pd.read_csv(test_file)
    except FileNotFoundError as e:
        print(f"Error: {e}. Please ensure the files exist.")
        return None, None
    
    train_df['source'] = 'train'
    test_df['source'] = 'test'
    combined_df = pd.concat([train_df, test_df], ignore_index=True, sort=False)
    
    # Store the original Item_Identifier and Outlet_Identifier for the submission file
    test_ids = test_df[['Item_Identifier', 'Outlet_Identifier']]
    return combined_df, test_ids

def preprocess_features(df):
    """
    Performs advanced feature engineering, cleaning, and imputation.
    """
    # Impute missing Item_Weight using the median per Item_Identifier
    # Using median is more robust to outliers than mean.
    df['Item_Weight'] = df.groupby('Item_Identifier')['Item_Weight'].transform(
        lambda x: x.fillna(x.median())
    )
    
    # Impute missing Outlet_Size based on the mode of the Outlet_Type
    outlet_sizes_mode = df.groupby('Outlet_Type')['Outlet_Size'].apply(lambda x: x.mode().iloc[0])
    df['Outlet_Size'] = df.apply(
        lambda row: outlet_sizes_mode[row['Outlet_Type']] if pd.isna(row['Outlet_Size']) else row['Outlet_Size'],
        axis=1
    )
    
    # Handle categorical feature inconsistencies
    df['Item_Fat_Content'] = df['Item_Fat_Content'].replace({'low fat':'Low Fat', 'LF':'Low Fat', 'reg':'Regular'})
    
    # --- Advanced Feature Engineering ---
    
    # 1. New feature for Outlet Age
    df['Outlet_Years'] = 2013 - df['Outlet_Establishment_Year']
    
    # 2. Correcting and creating new visibility features
    # First, replace 0 visibility with the mean visibility of that item across all outlets
    visibility_means = df.groupby('Item_Identifier')['Item_Visibility'].transform(
        lambda x: x.replace(0, x[x != 0].mean())
    )
    # If an item had 0 visibility across all its appearances, fill with the global average visibility
    df['Item_Visibility'] = visibility_means.fillna(df['Item_Visibility'].mean())
    
    # Create a new feature: visibility per outlet type
    df['Item_Visibility_per_Outlet_Type'] = df['Item_Visibility'] / df.groupby('Outlet_Type')['Item_Visibility'].transform('mean')

    # 3. Item_MRP Bins
    # Binning the Item_MRP into categories can help the model capture non-linear relationships.
    df['Item_MRP_Bins'] = pd.cut(df['Item_MRP'], bins=[0, 70, 140, 210, 280], labels=['Low', 'Medium', 'High', 'Very_High'])
    
    # 4. Item_Type Grouping
    df['Item_Type_Combined'] = df['Item_Identifier'].apply(lambda x: x[:2])
    df['Item_Type_Combined'] = df['Item_Type_Combined'].replace({'DR': 'Drinks', 'FD': 'Food', 'NC': 'Non-Consumables'})
    
    return df

def separate_and_encode(df):
    """
    Separates the combined data and applies a single OneHotEncoder.
    """
    # Define features to be dropped before encoding
    drop_cols = ['Item_Identifier', 'Outlet_Establishment_Year']
    
    # Separate data back into train and test sets
    train_processed = df[df['source'] == 'train'].drop('source', axis=1)
    test_processed = df[df['source'] == 'test'].drop('source', axis=1)

    X_train = train_processed.drop(['Item_Outlet_Sales'] + drop_cols, axis=1)
    y_train = train_processed['Item_Outlet_Sales']
    
    X_test = test_processed.drop(drop_cols, axis=1)

    # Use OneHotEncoder within a ColumnTransformer
    categorical_cols = [
        'Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size',
        'Outlet_Location_Type', 'Outlet_Type', 'Item_Type_Combined', 'Item_MRP_Bins'
    ]
    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
        ],
        remainder='passthrough'
    )

    X_train_encoded = preprocessor.fit_transform(X_train)
    X_test_encoded = preprocessor.transform(X_test)
    
    return X_train_encoded, y_train, X_test_encoded

# --- 2. Ensemble Model Building and Training ---

def build_stacked_model():
    """
    Builds a Stacking Regressor model with a more diverse set of base estimators.
    """
    # Optimized hyperparameters for the base models to reduce overfitting
    # Added 'tree_method' for XGBoost and 'device' for LightGBM to enable GPU usage.
    estimators = [
        ('xgb', xgb.XGBRegressor(
            objective='reg:squarederror', n_estimators=100, learning_rate=0.03,
            max_depth=4, subsample=0.7, colsample_bytree=0.7, reg_alpha=0.005, 
            random_state=42, n_jobs=-1,
            tree_method='gpu_hist' # Enables GPU for faster training
        )),
        ('gbr', GradientBoostingRegressor(
            n_estimators=1000, learning_rate=0.03, max_depth=3,
            min_samples_leaf=20, max_features='sqrt', random_state=42
        )),
        ('lgbm', lgb.LGBMRegressor(
            objective='regression', n_estimators=1000, learning_rate=0.03,
            num_leaves=31, min_child_samples=20, subsample=0.7, 
            colsample_bytree=0.7, reg_alpha=0.001, random_state=42, n_jobs=-1,
            device='gpu' # Enables GPU for faster training
        )),
        ('hgbm', HistGradientBoostingRegressor(
            max_iter=1000, learning_rate=0.03, max_leaf_nodes=31, 
            min_samples_leaf=20, random_state=42
        ))
    ]

    # Use a more flexible meta-regressor like LassoCV
    # The 'normalize' parameter was removed in recent scikit-learn versions, so it's been removed here.
    meta_regressor = LassoCV(
        eps=1e-5, n_alphas=100, random_state=42
    )
    
    # Create the StackingRegressor with 10-fold cross-validation
    stacked_model = StackingRegressor(
        estimators=estimators,
        final_estimator=meta_regressor,
        cv=5, # Increased folds for more robust cross-validation
        n_jobs=-1
    )
    
    return stacked_model

# --- 3. Cross-Validation and Prediction ---

def train_and_predict(model, X_train, y_train, X_test):
    """
    Trains the model using K-Fold cross-validation and makes predictions.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    oof_predictions = np.zeros(X_train.shape[0])
    test_predictions = np.zeros(X_test.shape[0])
    
    print("Starting K-Fold Cross-Validation...")
    for fold, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
        print(f"--- Fold {fold+1}/{kf.n_splits} ---")
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        
        model.fit(X_train_fold, y_train_fold)
        
        oof_preds = model.predict(X_val_fold)
        oof_predictions[val_index] = oof_preds
        
        fold_rmse = np.sqrt(mean_squared_error(y_val_fold, oof_preds))
        print(f"Fold {fold+1} RMSE: {fold_rmse:.4f}")
        
        test_predictions += model.predict(X_test) / kf.n_splits
    
    overall_rmse = np.sqrt(mean_squared_error(y_train, oof_predictions))
    print(f"\nOverall Cross-Validation RMSE: {overall_rmse:.4f}")
    
    return test_predictions

# --- 4. Main Execution Block ---

def main():
    """
    Main function to run the entire prediction pipeline.
    """
    print("Loading and preprocessing data...")
    combined_df, test_ids = load_data('/kaggle/input/big-mart/train.csv', '/kaggle/input/big-mart/test.csv')

    if combined_df is None:
        return

    combined_df = preprocess_features(combined_df.copy())
    
    X_train_encoded, y_train, X_test_encoded = separate_and_encode(combined_df)

    print("Building the advanced ensemble model...")
    stacked_model = build_stacked_model()
    
    test_predictions = train_and_predict(stacked_model, X_train_encoded, y_train, X_test_encoded)
    
    # Create the submission DataFrame
    submission_df = test_ids.copy()
    submission_df['Item_Outlet_Sales'] = test_predictions
    
    # Post-process predictions to be non-negative
    submission_df['Item_Outlet_Sales'] = np.maximum(0, submission_df['Item_Outlet_Sales'])
    
    submission_df.to_csv('submission.csv', index=False)
    print("\nPrediction complete. The output file 'submission.csv' has been generated.")
    print("\nHead of the submission file:")
    print(submission_df.head().to_markdown(index=False, numalign="left", stralign="left"))


if __name__ == "__main__":
    main()


Loading and preprocessing data...
Building the advanced ensemble model...
Starting K-Fold Cross-Validation...
--- Fold 1/5 ---



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 5454, number of used features: 50
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1097
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 5454, number of used features: 50
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1097
[LightGBM] [Info] Number of data points in the train set: 5454, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NV


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 5454, number of used features: 50
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1098
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1097
[LightGBM] [Info] Number of data points in the train set: 5454, number of used features: 50
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 5454, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1098
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 12 dense feature groups (0.06 MB) transferred to GPU in 0.004543 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 2196.369724
Fold 2 RMSE: 1076.5898
--- Fold 3/5 ---



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1101
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Number of data points in the train set: 5454, number of used features: 50
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1098
[LightGBM] [Info] Number of data points in the train set: 5454, number of used features: 50
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 5454, number of used features: 50
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50



    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 12 dense feature groups (0.06 MB) transferred to GPU in 0.017546 secs. 1 sparse feature groups
[LightGBM] [Info] Size o


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1097
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1097
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NV


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1098
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50
[LightGBM] [Info] Total Bins 1096
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 5455, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NV


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"



[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 5456, number of used features: 50
[LightGBM] [Info] Using GPU Device: Tesla T4, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 12 dense feature groups (0.06 MB) transferred to GPU in 0.002810 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 2170.457236
Fold 5 RMSE: 1115.4590

Overall Cross-Validation RMSE: 1078.7625

Prediction complete. The output file 'submission.csv' has been generated.

Head of the submission file:
| Item_Identifier   | Outlet_Identifier   | Item_Outlet_Sales   |
|:------------------|:--------------------|:--------------------|
| FDW58             | OUT049              | 1674.41             |
| FDW14             | OUT017              | 1371.82             |