In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/sampleSubmission.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/stores.csv
/kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip
/kaggle/input/walmart-recruiting-store-sales-forecasting/test.csv.zip


In [2]:
! pip install pytorch-forecasting pytorch-lightning pandas numpy scikit-learn

Collecting pytorch-forecasting
  Downloading pytorch_forecasting-1.4.0-py3-none-any.whl.metadata (14 kB)
Collecting lightning<3.0.0,>=2.0.0 (from pytorch-forecasting)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch!=2.0.1,<3.0.0,>=2.0.0->pytorch-forecasting)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch!=2.0.1,<3.0.0,>=2.0.0->pytorch-forecasting)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch!=2.0.1,<3.0.0,>=2.0.0->pytorch-forecasting)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch!=2.0.1,<3.0.0,>=2.0.0->pytorch-forecasting)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.m

In [3]:
!pip install  neuralforecast --quiet

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m285.8/285.8 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import warnings
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from neuralforecast import NeuralForecast
from neuralforecast.models import TFT
from statsmodels.tools.sm_exceptions import ValueWarning # Ensure this is handled if it's still an issue
import zipfile
import os

# Suppress warnings for cleaner output in Kaggle notebooks
warnings.filterwarnings("ignore", category=ValueWarning)
warnings.filterwarnings("ignore")
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", None) # Display all columns


In [5]:
KAGGLE_DATA_PATH = "/kaggle/input/walmart-recruiting-store-sales-forecasting/"

In [6]:
def calculate_wmae(y_true, y_pred, is_holiday_flag, holiday_weight=5.0):
    
    abs_errors = np.abs(y_true - y_pred)
    weights = np.where(is_holiday_flag.astype(bool), holiday_weight, 1.0)
    wmae = np.sum(weights * abs_errors) / np.sum(weights)
    return wmae

In [7]:
class DateFeatureCreator(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        if "Date" not in X.columns:
            raise ValueError("DateFeatureCreator requires 'Date' column in input X.")
            
        # Ensure 'Date' is datetime type before operations
        if not pd.api.types.is_datetime64_any_dtype(X['Date']):
            X['Date'] = pd.to_datetime(X['Date'])

        # Using to_period('W') and then converting to integer week number
        # rank(method="dense") ensures consecutive integers for weeks
        X["week"] = (X["Date"].dt.to_period("W").rank(method="dense").astype(int) - 1)
        
        # Cyclical features for different periodicities
        X["sin_13"] = np.sin(2 * np.pi * X["week"] / 13) # Roughly quarterly seasonality
        X["cos_13"] = np.cos(2 * np.pi * X["week"] / 13)
        X["sin_23"] = np.sin(2 * np.pi * X["week"] / 23) # A different, less common periodicity
        X["cos_23"] = np.cos(2 * np.pi * X["week"] / 23)
        
        # Drop the original 'Date' column as its information is now in cyclical features
        X = X.drop(columns=["Date"], errors='ignore')
        return X


In [8]:
class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        return X.drop(columns=self.columns, errors="ignore")


In [9]:
class ColumnTransformerWithNames(ColumnTransformer):
    """
    A wrapper around ColumnTransformer to retain column names and return a DataFrame.
    Handles OneHotEncoder output specifically.
    """
    def __init__(self, transformers, remainder='drop'): 
        super().__init__(transformers=transformers, remainder=remainder) 
        self.output_columns_ = None

    def fit(self, X, y=None):
        super().fit(X, y)
        self.output_columns_ = self._get_feature_names_out_internal(X)
        return self

    def _get_feature_names_out_internal(self, X):
        column_names = []
        for name, transformer, columns in self.transformers_:
            if transformer == 'drop':
                continue
            elif transformer == 'passthrough':
                # Ensure passthrough columns are correctly identified from original X
                if isinstance(columns, str):
                    column_names.append(columns)
                else:
                    column_names.extend(list(columns))
            else:
                if hasattr(transformer, 'get_feature_names_out'):
                    if isinstance(columns, str): 
                        col_names = [columns]
                    else:
                        col_names = list(columns)
                    column_names.extend(list(transformer.get_feature_names_out(col_names)))
                else:
                    if isinstance(columns, str): # Fallback for transformers without get_feature_names_out
                        column_names.append(columns)
                    else:
                        column_names.extend(list(columns))
        return column_names

    def transform(self, X):
        transformed_array = super().transform(X)
        if self.output_columns_ is None:
             raise RuntimeError("ColumnTransformerWithNames must be fitted before transform.")
        
        # Convert to dense array if it's a sparse matrix (older sklearn default for OHE)
        if hasattr(transformed_array, 'toarray'):
            transformed_array = transformed_array.toarray()

        # Ensure that the index is preserved from the input X
        # This is CRITICAL for maintaining alignment with y
        return pd.DataFrame(transformed_array, index=X.index, columns=self.output_columns_)

    def fit_transform(self, X, y=None):
        transformed_array = super().fit_transform(X, y)
        self.output_columns_ = self._get_feature_names_out_internal(X)
        
        # Convert to dense array if it's a sparse matrix (older sklearn default for OHE)
        if hasattr(transformed_array, 'toarray'):
            transformed_array = transformed_array.toarray()

        # Ensure that the index is preserved from the input X
        # This is CRITICAL for maintaining alignment with y
        return pd.DataFrame(transformed_array, index=X.index, columns=self.output_columns_)


In [10]:
class MultiIndexKeeper(BaseEstimator, TransformerMixin):
    def __init__(self, index_cols=["Date", "Store", "Dept"]):
        self.index_cols = index_cols

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        
        if 'Date' in X.columns and not pd.api.types.is_datetime64_any_dtype(X['Date']):
            X['Date'] = pd.to_datetime(X['Date'])
            
        missing_cols = [col for col in self.index_cols if col not in X.columns]
        if missing_cols:
            raise ValueError(f"MultiIndexKeeper: Missing columns in input X: {missing_cols}")
            
        # IMPORTANT: When setting index, ensure 'Date', 'Store', 'Dept' columns
        # are not dropped, as they are needed later by NeuralForecast as covariates.
        # This is already handled by drop=False.
        X.set_index(self.index_cols, drop=False, inplace=True)
        return X

In [11]:
class TFTRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, input_chunk_length=52, output_chunk_length=39, epochs=25, batch_size=32, random_seed=42):
        self.input_chunk_length = input_chunk_length
        self.output_chunk_length = output_chunk_length
        self.epochs = epochs
        self.batch_size = batch_size
        self.random_seed = random_seed
        self.nf_ = None
        self.model_ = None
        self.trained_df_ = None # Store the DataFrame used for training

    def fit(self, X, y):
        # Ensure y has a name for proper merging if it doesn't already
        if y.name is None:
            y.name = 'y' 

        y_multiindexed = pd.Series(y.values, index=X.index, name='y')

        df = X.copy() 
        df['y'] = y_multiindexed 

        if not pd.api.types.is_datetime64_any_dtype(df.index.get_level_values('Date')):
            raise ValueError("MultiIndex 'Date' level is not datetime type. Ensure MultiIndexKeeper makes it datetime.")
        df['ds'] = df.index.get_level_values('Date') 
        
        df["unique_id"] = df["Store"].astype(str) + "_" + df["Dept"].astype(str) 

        # Store the prepared DataFrame for use in predict
        self.trained_df_ = df.copy() # Store the full df (including y and features)

        # --- DEBUGGING STEP: Check for NaNs immediately before NeuralForecast fit ---
        nan_check = df.isnull().sum()
        cols_with_nans = nan_check[nan_check > 0].index.tolist()
        if cols_with_nans:
            print(f"DEBUG: Found NaNs in the following columns before NeuralForecast fit: {cols_with_nans}")
            # Print head including index for better context
            print(df.loc[df[cols_with_nans[0]].isnull(), cols_with_nans].head())
            raise ValueError(f"Found missing values in {cols_with_nans}.")
        # --- END DEBUGGING STEP ---

        self.model_ = TFT(
            h=self.output_chunk_length,
            input_size=self.input_chunk_length,
            batch_size=self.batch_size,
            random_seed=self.random_seed,
        )

        self.nf_ = NeuralForecast(models=[self.model_], freq="W-FRI")

        self.nf_.fit(df=df)
        return self

    def predict(self, X):
        # X here is the transformed X_val from the pipeline, with MultiIndex ('Date', 'Store', 'Dept')
        
        # 1. Prepare future covariates from X (which is X_val after preprocessing)
        df_future_covariates_raw = X.copy() # This X contains all features for the validation period
        
        if not pd.api.types.is_datetime64_any_dtype(df_future_covariates_raw.index.get_level_values('Date')):
            raise ValueError("MultiIndex 'Date' level is not datetime type in predict. Ensure MultiIndexKeeper makes it datetime.")
        
        df_future_covariates_raw['ds'] = df_future_covariates_raw.index.get_level_values('Date')
        df_future_covariates_raw["unique_id"] = df_future_covariates_raw["Store"].astype(str) + "_" + df_future_covariates_raw["Dept"].astype(str)

        # Identify all covariate columns (all columns in self.trained_df_ except 'ds', 'unique_id', 'y')
        covariate_cols = [col for col in self.trained_df_.columns if col not in ['ds', 'unique_id', 'y']]

        # Select only the relevant future covariate columns and the required 'ds', 'unique_id'
        df_future_covariates_selected = df_future_covariates_raw[['ds', 'unique_id'] + covariate_cols].copy()
        
        # 2. Generate the full expected future dataframe for all series for the forecast horizon
        expected_future_df_template = self.nf_.make_future_dataframe(self.trained_df_)
        
        # 3. Merge the generated template with our actual future covariates (X_val)
        futr_df_complete = pd.merge(
            expected_future_df_template,
            df_future_covariates_selected,
            on=['unique_id', 'ds'],
            how='left'
        )
        
        nan_check_futr = futr_df_complete.isnull().sum()
        cols_with_nans_futr = nan_check_futr[nan_check_futr > 0].index.tolist()
        if cols_with_nans_futr:
            print(f"DEBUG: Found NaNs in futr_df_complete for columns: {cols_with_nans_futr}. Filling with 0.")
            futr_df_complete[cols_with_nans_futr] = futr_df_complete[cols_with_nans_futr].fillna(0)

        # 4. Perform the prediction
        # Ensure that `df` has all required unique_ids and `futr_df` aligns.
        # This is the point where the model generates predictions.
        forecast_df = self.nf_.predict(df=self.trained_df_, futr_df=futr_df_complete) 
        
        forecast_df = forecast_df.rename(columns={'TFT': 'yhat'})

        if 'unique_id' in forecast_df.columns:
            forecast_df[['Store', 'Dept']] = forecast_df['unique_id'].str.split('_', expand=True)
            forecast_df['Store'] = forecast_df['Store'].astype(float).astype(int)
            forecast_df['Dept'] = forecast_df['Dept'].astype(float).astype(int)

        forecast_df['Date'] = pd.to_datetime(forecast_df['ds']) 
        
        # Ensure the index columns are correct before setting the index
        # Also, make sure that 'Store' and 'Dept' are properly integer type before setting multi-index
        forecast_df_indexed = forecast_df.set_index(['Date', 'Store', 'Dept'])[['yhat']]

        # This is where NaNs can be introduced if forecast_df_indexed doesn't cover all X.index
        final_predictions = forecast_df_indexed.reindex(X.index)

        y_pred = final_predictions['yhat'].values.flatten() 

        # FIX: Fill any NaNs in the final predictions array with 0 before evaluation
        if np.isnan(y_pred).any():
            print("DEBUG: Found NaNs in final y_pred after reindex. Filling with 0.")
            y_pred = np.nan_to_num(y_pred, nan=0.0)

        y_pred[y_pred < 0] = 0

        return y_pred

In [12]:
def main():
    print("🚀 Starting Walmart Sales Forecasting with Temporal Fusion Transformer on Kaggle")
    print("=" * 80)

    try:
        # --- 1. Data Loading (Kaggle specific path) ---
        print("📊 Loading datasets from Kaggle input path...")
        
        train_zip_path = os.path.join(KAGGLE_DATA_PATH, 'train.csv.zip')
        features_zip_path = os.path.join(KAGGLE_DATA_PATH, 'features.csv.zip')
        stores_csv_path = os.path.join(KAGGLE_DATA_PATH, 'stores.csv') 

        # --- UNZIP THE FILES ---
        print("   📂 Unzipping necessary data files...")
        with zipfile.ZipFile(train_zip_path, 'r') as zip_ref:
            zip_ref.extractall('.') 
        print(f"      - Extracted: {train_zip_path}")
        
        with zipfile.ZipFile(features_zip_path, 'r') as zip_ref:
            zip_ref.extractall('.') 
        print(f"      - Extracted: {features_zip_path}")

        # Now, load the unzipped CSVs (they will be in the current directory, '/kaggle/working/')
        train_df = pd.read_csv('train.csv')
        features_df = pd.read_csv('features.csv')
        stores_df = pd.read_csv(stores_csv_path) 

        # Convert Date columns to datetime early for consistency
        train_df['Date'] = pd.to_datetime(train_df['Date'])
        features_df['Date'] = pd.to_datetime(features_df['Date'])

        print(f"   📈 Train data: {train_df.shape}")
        print(f"   📊 Features data: {features_df.shape}")
        print(f"   🏪 Stores data: {stores_df.shape}")

        # --- 2. Data Merging and Initial Cleaning ---
        print("\n🧹 Merging data and initial cleaning...")
        merged_df = pd.merge(train_df, features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
        train_full = pd.merge(merged_df, stores_df, on=['Store'], how='left')

        # Fill NaN in MarkDown columns with 0, assuming no markdown if not specified
        markdown_cols = [f'MarkDown{i}' for i in range(1, 6)]
        for col in markdown_cols:
            if col in train_full.columns:
                train_full[col] = train_full[col].fillna(0)

        # Remove rows with negative Weekly_Sales
        initial_rows = len(train_full)
        train_full = train_full[train_full['Weekly_Sales'] > 0]
        print(f"   🗑️ Removed {initial_rows - len(train_full)} rows with negative Weekly_Sales.")

        # --- Define column lists here, BEFORE their use ---
        numerical_cols = ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment'] + [f'MarkDown{i}' for i in range(1, 6)]
        categorical_ohe_cols = ["Type", "IsHoliday"] 
        passthrough_cols = ["Store", "Dept"] 
        # --- End of column definitions ---


        # --- NEW FIX: Ensure continuous series and fill NaNs for NeuralForecast ---
        print("   Filling missing dates and sales for time series continuity...")
        
        # Create a full set of (Store, Dept, Date) unique combinations
        unique_store_dept_dates = train_full[['Store', 'Dept', 'Date']].drop_duplicates()

        # Generate all expected dates for each (Store, Dept)
        df_list = []
        for (store, dept), group in unique_store_dept_dates.groupby(['Store', 'Dept']):
            series_min_date = group['Date'].min()
            series_max_date = group['Date'].max()
            full_series_dates = pd.date_range(start=series_min_date, end=series_max_date, freq='W-FRI')
            
            temp_df = pd.DataFrame({
                'Store': store,
                'Dept': dept,
                'Date': full_series_dates
            })
            df_list.append(temp_df)
        
        complete_series_df = pd.concat(df_list, ignore_index=True)
        
        # Merge the complete series dates with the original train_full data
        train_full_cleaned = pd.merge(
            complete_series_df,
            train_full,
            on=['Store', 'Dept', 'Date'],
            how='left'
        )
        
        # Now, fill NaNs in 'Weekly_Sales' with 0 (as sales cannot be truly missing here)
        nan_sales_before_fill = train_full_cleaned['Weekly_Sales'].isnull().sum()
        train_full_cleaned['Weekly_Sales'] = train_full_cleaned['Weekly_Sales'].fillna(0)
        print(f"   Filled {nan_sales_before_fill} NaN Weekly_Sales values with 0 for series continuity.")

        # Re-merge the original features_df and stores_df to fill in associated data
        train_full_cleaned = pd.merge(train_full_cleaned, features_df.drop(columns=['IsHoliday'], errors='ignore'), on=['Store', 'Date'], how='left', suffixes=('', '_feats'))
        train_full_cleaned = pd.merge(train_full_cleaned, stores_df, on=['Store'], how='left', suffixes=('', '_stores'))

        # Combine IsHoliday if it was duplicated, prioritize original if available, otherwise features_df
        if 'IsHoliday_feats' in train_full_cleaned.columns:
            train_full_cleaned['IsHoliday'] = train_full_cleaned['IsHoliday'].fillna(train_full_cleaned['IsHoliday_feats'])
            train_full_cleaned = train_full_cleaned.drop(columns=['IsHoliday_feats'])
        
        # Now fill NaNs in features columns (numerical and categorical where appropriate)
        # Using groupby transform for numerical, then a general fill for any remaining
        for col in numerical_cols: 
            if col in train_full_cleaned.columns:
                train_full_cleaned[col] = train_full_cleaned.groupby(['Store', 'Dept'])[col].transform(lambda x: x.fillna(x.mean()))
                train_full_cleaned[col] = train_full_cleaned[col].fillna(train_full_cleaned[col].mean()) # Fill any remaining with global mean

        # Ensure no negative sales after any filling process (though filling with 0 should prevent this)
        train_full_cleaned['Weekly_Sales'][train_full_cleaned['Weekly_Sales'] < 0] = 0

        # Sort by date, store, and department for time series consistency (important for NeuralForecast)
        train_full = train_full_cleaned.sort_values(by=['Date', 'Store', 'Dept']).reset_index(drop=True)

        print(f"   ✅ Merged and cleaned data: {train_full.shape}")
        print(f"   📅 Date range: {train_full['Date'].min()} to {train_full['Date'].max()}")
        print(f"   Sanity check: NaNs in Weekly_Sales after cleaning: {train_full['Weekly_Sales'].isnull().sum()}")


        # --- 3. Data Splitting (80/20 Time-based) ---
        print("\n📅 Step 1: Creating temporal split (80/20)...")
        
        df_sorted = train_full.sort_values('Date').reset_index(drop=True)

        unique_dates = sorted(df_sorted['Date'].unique())
        total_weeks = len(unique_dates)
        train_ratio = 0.8 
        train_weeks = int(total_weeks * train_ratio)

        if train_weeks < 1:
            train_weeks = 1
        if train_weeks >= total_weeks:
            train_weeks = total_weeks - 1 

        split_date = unique_dates[train_weeks - 1] 

        X_train = df_sorted[df_sorted['Date'] <= split_date].drop(columns=['Weekly_Sales']).copy()
        y_train = df_sorted[df_sorted['Date'] <= split_date]['Weekly_Sales'].copy()
        X_val = df_sorted[df_sorted['Date'] > split_date].drop(columns=['Weekly_Sales']).copy()
        y_val = df_sorted[df_sorted['Date'] > split_date]['Weekly_Sales'].copy()

        print(f"   📊 Split date: {split_date}")
        print(f"   📈 Train: {len(X_train):,} records ({X_train['Date'].min()} to {X_train['Date'].max()})")
        print(f"   📉 Val: {len(X_val):,} records ({X_val['Date'].min()} to {X_val['Date'].max()})")

        # --- 4. Pipeline Definition ---
        print("\n⚙️ Step 2: Defining preprocessing and TFT pipeline...")

        numerical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean'))
        ])

        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        preprocessor = ColumnTransformerWithNames(transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_ohe_cols),
            ('pass', 'passthrough', passthrough_cols) 
        ], remainder='drop') 

        pipeline = Pipeline([
            ("multi_index_keeper", MultiIndexKeeper(index_cols=["Date", "Store", "Dept"])),
            ("date_feature_creator", DateFeatureCreator()), 
            ("preprocessor", preprocessor), 
            ("tft_regressor", TFTRegressor(input_chunk_length=52, output_chunk_length=39, epochs=25, batch_size=32, random_seed=42))
        ])

        print("   ✅ Pipeline defined.")

        # --- 5. Model Training ---
        print("\n🧠 Step 3: Training TFT model...")
        pipeline.fit(X_train, y_train)
        print("   ✅ TFT Model Training Complete!")

        # --- 6. Model Evaluation ---
        print("\n📊 Step 4: Evaluating model on validation set...")
        y_pred_val = pipeline.predict(X_val)

        y_pred_val[y_pred_val < 0] = 0

        is_holiday_val = X_val['IsHoliday'].values.astype(bool)

        if len(y_val) > 0:
            val_mae = mean_absolute_error(y_val, y_pred_val)
            val_rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
            val_wmae = calculate_wmae(y_val, y_pred_val, is_holiday_val)
        else:
            val_mae, val_rmse, val_wmae = 0, 0, 0
            print("   ⚠️ Warning: No data points for evaluation. Metrics set to 0.")

        holiday_mask_val = is_holiday_val.astype(bool)
        holiday_mae_val = mean_absolute_error(y_val[holiday_mask_val], y_pred_val[holiday_mask_val]) if holiday_mask_val.any() else np.nan
        non_holiday_mae_val = mean_absolute_error(y_val[~holiday_mask_val], y_pred_val[~holiday_mask_val]) if (~holiday_mask_val).any() else np.nan

        print("\n" + "=" * 60)
        print("🎯 EXPERIMENT TFT RESULTS SUMMARY")
        print("=" * 60)

        print("\n📊 Validation Metrics:")
        print(f"   WMAE (Competition Metric): ${val_wmae:,.2f}")
        print(f"   MAE: ${val_mae:,.2f}")
        print(f"   RMSE: ${val_rmse:,.2f}")

        print("\n📊 Holiday Breakdown:")
        print(f"   Holiday MAE: ${holiday_mae_val:,.2f} ({int(holiday_mask_val.sum()):,} samples)")
        print(f"   Non-Holiday MAE: ${non_holiday_mae_val:,.2f} ({int((~holiday_mask_val).sum()):,} samples)")

        print("\n🎉 Experiment TFT: Complete!")

    except Exception as e:
        print(f"❌ Experiment failed: {e}")
        raise

In [13]:
if __name__ == "__main__":
    main()

🚀 Starting Walmart Sales Forecasting with Temporal Fusion Transformer on Kaggle
📊 Loading datasets from Kaggle input path...
   📂 Unzipping necessary data files...
      - Extracted: /kaggle/input/walmart-recruiting-store-sales-forecasting/train.csv.zip
      - Extracted: /kaggle/input/walmart-recruiting-store-sales-forecasting/features.csv.zip
   📈 Train data: (421570, 5)
   📊 Features data: (8190, 12)
   🏪 Stores data: (45, 3)

🧹 Merging data and initial cleaning...
   🗑️ Removed 1358 rows with negative Weekly_Sales.
   Filling missing dates and sales for time series continuity...
   Filled 26507 NaN Weekly_Sales values with 0 for series continuity.
   ✅ Merged and cleaned data: (446719, 27)
   📅 Date range: 2010-02-05 00:00:00 to 2012-10-26 00:00:00
   Sanity check: NaNs in Weekly_Sales after cleaning: 0

📅 Step 1: Creating temporal split (80/20)...
   📊 Split date: 2012-04-06 00:00:00
   📈 Train: 357,562 records (2010-02-05 00:00:00 to 2012-04-06 00:00:00)
   📉 Val: 89,157 records 

INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
INFO: ----------------------------------------------------------------------------------------------------
distributed_backend=nccl
All distributed processes registered. Starting with 2 processes
----------------------------------------------------------------------------------------------------

2025-07-17 16:57:58.839500: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752771479.222730     116 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752771479.346429     116 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO: `Trainer.fit` stopped: `max_steps=1000` reached.


   ✅ TFT Model Training Complete!

📊 Step 4: Evaluating model on validation set...
DEBUG: Found NaNs in futr_df_complete for columns: ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'Type_A', 'Type_B', 'Type_C', 'IsHoliday_False', 'IsHoliday_True', 'Store', 'Dept']. Filling with 0.


INFO: GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs
2025-07-17 17:01:47.064320: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752771707.082298      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752771707.087766      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Predicting: |          | 0/? [00:00<?, ?it/s]

DEBUG: Found NaNs in final y_pred after reindex. Filling with 0.

🎯 EXPERIMENT TFT RESULTS SUMMARY

📊 Validation Metrics:
   WMAE (Competition Metric): $1,421.02
   MAE: $1,585.18
   RMSE: $3,415.59

📊 Holiday Breakdown:
   Holiday MAE: $868.07 (6,617 samples)
   Non-Holiday MAE: $1,642.66 (82,540 samples)

🎉 Experiment TFT: Complete!
