<a href="https://colab.research.google.com/github/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting/blob/lodia/model_exp_RandomForest_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install kaggle



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! mkdir ~/.kaggle

In [4]:
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 1.02GB/s]


In [7]:
! unzip walmart-recruiting-store-sales-forecasting

Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [8]:
import pandas as pd

import zipfile
import os

path = "/content/"


files_to_unzip = [
    "features.csv.zip",
    "test.csv.zip",
    "train.csv.zip",
    "sampleSubmission.csv.zip"
]

print("Checking and unzipping individual files...")
for file_name in files_to_unzip:
    full_path = os.path.join(path, file_name)
    if os.path.exists(full_path):
        try:
            with zipfile.ZipFile(full_path, 'r') as zip_ref:

                zip_ref.extractall(path)
            print(f"Successfully unzipped: {file_name}")

        except zipfile.BadZipFile:
            print(f"Warning: {file_name} is not a valid zip file or already unzipped/corrupted.")
    else:
        print(f"Info: {file_name} not found, likely already unzipped or not present.")

Checking and unzipping individual files...
Successfully unzipped: features.csv.zip
Successfully unzipped: test.csv.zip
Successfully unzipped: train.csv.zip
Successfully unzipped: sampleSubmission.csv.zip


In [9]:
try:
    train_df = pd.read_csv(path + "train.csv")
    test_df = pd.read_csv(path + "test.csv")
    features_df = pd.read_csv(path + "features.csv")
    stores_df = pd.read_csv(path + "stores.csv")
    print("All datasets loaded successfully!")
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please check the 'path' variable.")

All datasets loaded successfully!


In [10]:
train_df['Date'] = pd.to_datetime(train_df['Date'])
test_df['Date'] = pd.to_datetime(test_df['Date'])
features_df['Date'] = pd.to_datetime(features_df['Date'])

In [11]:
train_merged = pd.merge(train_df, features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
train_merged = pd.merge(train_merged, stores_df, on=['Store'], how='left')

In [12]:
test_merged = pd.merge(test_df, features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
test_merged = pd.merge(test_merged, stores_df, on=['Store'], how='left')

In [13]:
print("Shape:", train_merged.shape)
print("\n--- Merged Training Data Sample ---")
print(train_merged.head())
print("\nMissing values in merged train data after initial merge:")
print(train_merged.isnull().sum())

Shape: (421570, 16)

--- Merged Training Data Sample ---
   Store  Dept       Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  \
0      1     1 2010-02-05      24924.50      False        42.31       2.572   
1      1     1 2010-02-12      46039.49       True        38.51       2.548   
2      1     1 2010-02-19      41595.55      False        39.93       2.514   
3      1     1 2010-02-26      19403.54      False        46.63       2.561   
4      1     1 2010-03-05      21827.90      False        46.50       2.625   

   MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5         CPI  \
0        NaN        NaN        NaN        NaN        NaN  211.096358   
1        NaN        NaN        NaN        NaN        NaN  211.242170   
2        NaN        NaN        NaN        NaN        NaN  211.289143   
3        NaN        NaN        NaN        NaN        NaN  211.319643   
4        NaN        NaN        NaN        NaN        NaN  211.350143   

   Unemployment Type    Size  
0   

In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error # For basic MAE
import warnings

# Suppress pandas FutureWarnings related to .loc[] with boolean indexing
warnings.simplefilter(action='ignore', category=FutureWarning)


# --- WMAE Custom Metric Function ---
def weighted_mean_absolute_error(y_true, y_pred, is_holiday_flag):
    weights = np.where(is_holiday_flag, 5, 1) # 5 for holiday, 1 for non-holiday
    y_pred = np.maximum(0, y_pred) # Ensure predictions are non-negative
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)

In [15]:
print("Starting Random Forest Model Improvement Process...")



try:
    if 'train_df' not in locals() or 'features_df' not in locals() or 'stores_df' not in locals():
        print("Raw dataframes not found in current session. Attempting to load from '/' path.")
        path = "/"
        train_df = pd.read_csv(path + "train.csv")
        test_df = pd.read_csv(path + "test.csv")
        features_df = pd.read_csv(path + "features.csv")
        stores_df = pd.read_csv(path + "stores.csv")
        print("Raw datasets reloaded successfully!")

    train_df['Date'] = pd.to_datetime(train_df['Date'])
    features_df['Date'] = pd.to_datetime(features_df['Date'])

    train_merged = pd.merge(train_df, features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
    train_merged = pd.merge(train_merged, stores_df, on=['Store'], how='left')
    print("train_merged created successfully.")

except Exception as e:
    print(f"Error re-loading or merging dataframes: {e}. Please ensure original CSVs are accessible.")
    # Exit or handle appropriately if dataframes can't be loaded


Starting Random Forest Model Improvement Process...
train_merged created successfully.


In [16]:
train_merged = train_merged.sort_values(by=['Store', 'Dept', 'Date']).reset_index(drop=True)


In [17]:
print("\n--- Preprocessing & Feature Engineering (Improved) ---")

# 1. Handle Negative Weekly_Sales in training data
train_merged['Weekly_Sales'] = train_merged['Weekly_Sales'].apply(lambda x: max(0, x))
print("Handled negative Weekly_Sales by setting them to 0 in training data.")



--- Preprocessing & Feature Engineering (Improved) ---
Handled negative Weekly_Sales by setting them to 0 in training data.


In [18]:
# 2. Date Feature Engineering
def create_date_features(df):
    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['Week'] = df['Date'].dt.isocalendar().week.astype(int) # Week of the year
    df['DayOfWeek'] = df['Date'].dt.dayofweek
    df['DayOfYear'] = df['Date'].dt.dayofyear
    df['IsMonthStart'] = df['Date'].dt.is_month_start.astype(int)
    df['IsMonthEnd'] = df['Date'].dt.is_month_end.astype(int)
    df['TimeIdx'] = (df['Date'] - df['Date'].min()).dt.days


    # The 'IsHoliday' flag itself is already a strong feature.

    return df

train_merged = create_date_features(train_merged)
print("Created date-based features.")


Created date-based features.


In [19]:
# 3. Handle Missing Values for MarkDowns
markdown_cols = [f'MarkDown{i}' for i in range(1, 6)]
for col in markdown_cols:
    train_merged[col] = train_merged[col].fillna(0)
print("Filled NaN values in MarkDown columns with 0.")



Filled NaN values in MarkDown columns with 0.


In [20]:
print(train_merged.head())

   Store  Dept       Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  \
0      1     1 2010-02-05      24924.50      False        42.31       2.572   
1      1     1 2010-02-12      46039.49       True        38.51       2.548   
2      1     1 2010-02-19      41595.55      False        39.93       2.514   
3      1     1 2010-02-26      19403.54      False        46.63       2.561   
4      1     1 2010-03-05      21827.90      False        46.50       2.625   

   MarkDown1  MarkDown2  MarkDown3  ...  Type    Size  Year  Month Week  \
0        0.0        0.0        0.0  ...     A  151315  2010      2    5   
1        0.0        0.0        0.0  ...     A  151315  2010      2    6   
2        0.0        0.0        0.0  ...     A  151315  2010      2    7   
3        0.0        0.0        0.0  ...     A  151315  2010      2    8   
4        0.0        0.0        0.0  ...     A  151315  2010      3    9   

   DayOfWeek  DayOfYear  IsMonthStart  IsMonthEnd  TimeIdx  
0          4 

In [21]:
# 4. Handle Missing Values for CPI and Unemployment (Teammate's observation about features.csv complexity)
# Use ffill/bfill within each Store group, then fallback to median for robustness.
print("Handling NaN values for CPI and Unemployment (using ffill/bfill per store, then median fallback)...")
train_merged['CPI'] = train_merged.groupby('Store')['CPI'].ffill().bfill()
train_merged['Unemployment'] = train_merged.groupby('Store')['Unemployment'].ffill().bfill()


Handling NaN values for CPI and Unemployment (using ffill/bfill per store, then median fallback)...


In [22]:
# Fallback to global median for any remaining NaNs (e.g., if an entire store's CPI/Unemployment was NaN)
train_merged['CPI'] = train_merged['CPI'].fillna(train_merged['CPI'].median())
train_merged['Unemployment'] = train_merged['Unemployment'].fillna(train_merged['Unemployment'].median())
print("Filled NaN values in CPI and Unemployment columns.")

Filled NaN values in CPI and Unemployment columns.


In [23]:
# 4. Create Lagged and Rolling Mean Features
print("Creating lagged and rolling mean features (this might take a few moments)...")

# Sort again just to be sure, critical for lags
train_merged = train_merged.sort_values(by=['Store', 'Dept', 'Date'])

# Lag 1: Sales from the previous week for the same Store and Dept
# Ensure to apply on the 'Weekly_Sales' column AFTER negative sales have been handled
train_merged['Lag_Weekly_Sales'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(1)

# Lag 52: Sales from the same week in the previous year (annual seasonality)
train_merged['Lag52_Weekly_Sales'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(52)

# Rolling Mean: Average sales over the last 4 weeks (shifted to avoid data leakage)
train_merged['RollingMean_4W_Sales'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(
    lambda x: x.shift(1).rolling(window=4, min_periods=1).mean()
)

# Rolling Std: Standard deviation of sales over the last 4 weeks (shifted)
train_merged['RollingStd_4W_Sales'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(
    lambda x: x.shift(1).rolling(window=4, min_periods=1).std()
)

Creating lagged and rolling mean features (this might take a few moments)...


In [24]:
# Fill NaNs created by lagging (for first few weeks of each series)
# Use 0 or a reasonable default, depending on how you want the model to interpret "no past sales".
train_merged['Lag_Weekly_Sales'] = train_merged['Lag_Weekly_Sales'].fillna(0)
train_merged['Lag52_Weekly_Sales'] = train_merged['Lag52_Weekly_Sales'].fillna(0)
train_merged['RollingMean_4W_Sales'] = train_merged['RollingMean_4W_Sales'].fillna(0)
train_merged['RollingStd_4W_Sales'] = train_merged['RollingStd_4W_Sales'].fillna(0)
print("Lagged and rolling mean/std features created and NaNs filled.")

Lagged and rolling mean/std features created and NaNs filled.


In [25]:
# 5. Define features and target before splitting
# IMPORTANT: Now include 'IsHoliday' and the new lagged/rolling features in 'numerical_features'
X = train_merged.drop(columns=['Weekly_Sales', 'Date']) # Date is dropped after features are extracted
y = train_merged['Weekly_Sales']
is_holiday_flags = train_merged['IsHoliday'] # Store for WMAE calculation

In [26]:
print(train_merged.head())

   Store  Dept       Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  \
0      1     1 2010-02-05      24924.50      False        42.31       2.572   
1      1     1 2010-02-12      46039.49       True        38.51       2.548   
2      1     1 2010-02-19      41595.55      False        39.93       2.514   
3      1     1 2010-02-26      19403.54      False        46.63       2.561   
4      1     1 2010-03-05      21827.90      False        46.50       2.625   

   MarkDown1  MarkDown2  MarkDown3  ...  Week  DayOfWeek  DayOfYear  \
0        0.0        0.0        0.0  ...     5          4         36   
1        0.0        0.0        0.0  ...     6          4         43   
2        0.0        0.0        0.0  ...     7          4         50   
3        0.0        0.0        0.0  ...     8          4         57   
4        0.0        0.0        0.0  ...     9          4         64   

   IsMonthStart IsMonthEnd  TimeIdx  Lag_Weekly_Sales  Lag52_Weekly_Sales  \
0             0      

In [27]:
# 6. Categorical & Numerical Feature Definitions for Preprocessor
categorical_features = ['Type']
numerical_features = ['Store', 'Dept', 'Size', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment'] + \
                     markdown_cols + \
                     ['Year', 'Month', 'Week', 'DayOfWeek', 'DayOfYear', 'IsMonthStart', 'IsMonthEnd', 'TimeIdx'] + \
                     ['IsHoliday', 'Lag_Weekly_Sales', 'Lag52_Weekly_Sales', 'RollingMean_4W_Sales', 'RollingStd_4W_Sales'] # <-- New features included!


In [28]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ],
    remainder='drop' # Drop any other columns not specified
)
print("Defined preprocessor for categorical encoding and updated numerical features.")


Defined preprocessor for categorical encoding and updated numerical features.


In [29]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ],
    remainder='drop' # Drop any other columns not specified
)
print("Defined preprocessor for categorical encoding and updated numerical features.")


Defined preprocessor for categorical encoding and updated numerical features.


In [30]:
print(train_merged.head())

   Store  Dept       Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  \
0      1     1 2010-02-05      24924.50      False        42.31       2.572   
1      1     1 2010-02-12      46039.49       True        38.51       2.548   
2      1     1 2010-02-19      41595.55      False        39.93       2.514   
3      1     1 2010-02-26      19403.54      False        46.63       2.561   
4      1     1 2010-03-05      21827.90      False        46.50       2.625   

   MarkDown1  MarkDown2  MarkDown3  ...  Week  DayOfWeek  DayOfYear  \
0        0.0        0.0        0.0  ...     5          4         36   
1        0.0        0.0        0.0  ...     6          4         43   
2        0.0        0.0        0.0  ...     7          4         50   
3        0.0        0.0        0.0  ...     8          4         57   
4        0.0        0.0        0.0  ...     9          4         64   

   IsMonthStart IsMonthEnd  TimeIdx  Lag_Weekly_Sales  Lag52_Weekly_Sales  \
0             0      

In [31]:
# 7. Data Splitting (Chronological 60/20/20 split)
total_rows = len(train_merged)
train_end_idx = int(total_rows * 0.6)
val_end_idx = int(total_rows * 0.8) # 60% + 20%

X_train = X.iloc[:train_end_idx]
y_train = y.iloc[:train_end_idx]
is_holiday_train = is_holiday_flags.iloc[:train_end_idx]

X_val = X.iloc[train_end_idx:val_end_idx]
y_val = y.iloc[train_end_idx:val_end_idx]
is_holiday_val = is_holiday_flags.iloc[train_end_idx:val_end_idx]

X_local_test = X.iloc[val_end_idx:]
y_local_test = y.iloc[val_end_idx:]
is_holiday_local_test = is_holiday_flags.iloc[val_end_idx:]


In [32]:
print(f"\nData split into Training ({len(X_train)} rows), Validation ({len(X_val)} rows), Local Test ({len(X_local_test)} rows).")
# Print actual dates to confirm the split ranges
if not train_merged.empty:
    print(f"Train dates: {train_merged['Date'].iloc[0].date()} to {train_merged['Date'].iloc[train_end_idx-1].date()}")
    if val_end_idx > train_end_idx: # Check if validation set is not empty
        print(f"Validation dates: {train_merged['Date'].iloc[train_end_idx].date()} to {train_merged['Date'].iloc[val_end_idx-1].date()}")
    if len(X_local_test) > 0: # Check if local test set is not empty
        print(f"Local Test dates: {train_merged['Date'].iloc[val_end_idx].date()} to {train_merged['Date'].iloc[-1].date()}")
else:
    print("train_merged DataFrame is empty, cannot display date ranges.")



Data split into Training (252942 rows), Validation (84314 rows), Local Test (84314 rows).
Train dates: 2010-02-05 to 2011-04-29
Validation dates: 2011-05-06 to 2010-11-19
Local Test dates: 2010-11-26 to 2012-10-26


In [33]:
# --- RandomForest_Training_&_Evaluation (Conceptual MLflow Run) ---
print("\n--- Model Training & Evaluation (Random Forest - Improved Features) ---")

# Create the Random Forest Regressor model within a pipeline
# For a start, we'll keep n_estimators=100. Consider increasing later.
rf_model = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1))
                          ])

print("Training Random Forest model on training set with improved features...")
rf_model.fit(X_train, y_train)
print("Model training complete.")


--- Model Training & Evaluation (Random Forest - Improved Features) ---
Training Random Forest model on training set with improved features...
Model training complete.


In [34]:
# 1. Make predictions and evaluate on the Validation Set
print("\nEvaluating on Validation Set...")
val_predictions = rf_model.predict(X_val)
val_wmae = weighted_mean_absolute_error(y_val, val_predictions, is_holiday_val)
print(f"Validation WMAE: {val_wmae:.4f}")
val_mae = mean_absolute_error(y_val, val_predictions)
print(f"Validation MAE: {val_mae:.4f}")


Evaluating on Validation Set...
Validation WMAE: 1873.3626
Validation MAE: 1692.2703


In [35]:
# 2. Make predictions and evaluate on the Local Test Set
print("\nEvaluating on Local Test Set...")
local_test_predictions = rf_model.predict(X_local_test)
local_test_wmae = weighted_mean_absolute_error(y_local_test, local_test_predictions, is_holiday_local_test)
print(f"Local Test WMAE: {local_test_wmae:.4f}")
local_test_mae = mean_absolute_error(y_local_test, local_test_predictions)
print(f"Local Test MAE: {local_test_mae:.4f}")



Evaluating on Local Test Set...
Local Test WMAE: 1578.4943
Local Test MAE: 1407.5672
