<a href="https://colab.research.google.com/github/konstantine25b/Walmart-Recruiting---Store-Sales-Forecasting/blob/lodia/model_exp_RandomForest_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install kaggle



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
! mkdir ~/.kaggle

In [4]:
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

In [5]:
! chmod 600 ~/.kaggle/kaggle.json

In [6]:
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 508MB/s]


In [7]:
! unzip walmart-recruiting-store-sales-forecasting

Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [8]:
import pandas as pd

import zipfile
import os

path = "/content/"


files_to_unzip = [
    "features.csv.zip",
    "test.csv.zip",
    "train.csv.zip",
    "sampleSubmission.csv.zip"
]

print("Checking and unzipping individual files...")
for file_name in files_to_unzip:
    full_path = os.path.join(path, file_name)
    if os.path.exists(full_path):
        try:
            with zipfile.ZipFile(full_path, 'r') as zip_ref:

                zip_ref.extractall(path)
            print(f"Successfully unzipped: {file_name}")

        except zipfile.BadZipFile:
            print(f"Warning: {file_name} is not a valid zip file or already unzipped/corrupted.")
    else:
        print(f"Info: {file_name} not found, likely already unzipped or not present.")

Checking and unzipping individual files...
Successfully unzipped: features.csv.zip
Successfully unzipped: test.csv.zip
Successfully unzipped: train.csv.zip
Successfully unzipped: sampleSubmission.csv.zip


In [9]:
try:
    train_df = pd.read_csv(path + "train.csv")
    test_df = pd.read_csv(path + "test.csv")
    features_df = pd.read_csv(path + "features.csv")
    stores_df = pd.read_csv(path + "stores.csv")
    print("All datasets loaded successfully!")
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please check the 'path' variable.")

All datasets loaded successfully!


In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, make_scorer # make_scorer for custom metric
import warnings
from datetime import timedelta

warnings.simplefilter(action='ignore', category=FutureWarning)

In [11]:
def weighted_mean_absolute_error(y_true, y_pred, is_holiday_flag):
    weights = np.where(is_holiday_flag, 5, 1)
    y_pred = np.maximum(0, y_pred) # Ensure predictions are non-negative
    return np.sum(weights * np.abs(y_true - y_pred)) / np.sum(weights)


In [12]:
print("\n--- 1. Data Loading and Merging ---")
try:
    path = "./" # Adjust this path if your files are in a specific directory
    train_df = pd.read_csv(path + "train.csv")
    features_df = pd.read_csv(path + "features.csv")
    stores_df = pd.read_csv(path + "stores.csv")
    print("Raw datasets loaded successfully!")

    # Convert 'Date' columns to datetime objects for time-series operations
    train_df['Date'] = pd.to_datetime(train_df['Date'])
    features_df['Date'] = pd.to_datetime(features_df['Date'])

    # Merge datasets: train data with features (like temperature, fuel price) and store details
    train_merged = pd.merge(train_df, features_df, on=['Store', 'Date', 'IsHoliday'], how='left')
    train_merged = pd.merge(train_merged, stores_df, on=['Store'], how='left')
    print("train_merged created successfully by combining train, features, and stores datasets.")

except Exception as e:
    print(f"Error loading or merging dataframes: {e}. Please ensure original CSVs are accessible and path is correct.")
    exit()



--- 1. Data Loading and Merging ---
Raw datasets loaded successfully!
train_merged created successfully by combining train, features, and stores datasets.


In [13]:
# --- 2. Preprocessing & Feature Engineering ---
print("\n--- 2. Preprocessing & Feature Engineering (V9.0) ---")

# Handle Negative Weekly_Sales: Sales cannot be negative, so set any negative values to 0.
train_merged['Weekly_Sales'] = train_merged['Weekly_Sales'].apply(lambda x: max(0, x))
print("Handled negative Weekly_Sales by setting them to 0.")

# 2.1 Date-based Features (Enhanced with Cyclical Features)
print("Creating date-based features (Year, Month, Week, DayOfWeek, DayOfYear, IsMonthStart, IsMonthEnd, TimeIdx, Month_sin, Month_cos)...")
train_merged['Year'] = train_merged['Date'].dt.year
train_merged['Month'] = train_merged['Date'].dt.month
train_merged['Week'] = train_merged['Date'].dt.isocalendar().week.astype(int)
train_merged['DayOfWeek'] = train_merged['Date'].dt.dayofweek
train_merged['DayOfYear'] = train_merged['Date'].dt.dayofyear
train_merged['IsMonthStart'] = train_merged['Date'].dt.is_month_start.astype(int)
train_merged['IsMonthEnd'] = train_merged['Date'].dt.is_month_end.astype(int)
train_merged['TimeIdx'] = (train_merged['Date'] - train_merged['Date'].min()).dt.days # Days since first record

# Cyclical features for Month (from teammate's code)
train_merged['Month_sin'] = np.sin(2 * np.pi * train_merged['Month'] / 12)
train_merged['Month_cos'] = np.cos(2 * np.pi * train_merged['Month'] / 12)

print("Date-based and cyclical features created.")


--- 2. Preprocessing & Feature Engineering (V9.0) ---
Handled negative Weekly_Sales by setting them to 0.
Creating date-based features (Year, Month, Week, DayOfWeek, DayOfYear, IsMonthStart, IsMonthEnd, TimeIdx, Month_sin, Month_cos)...
Date-based and cyclical features created.


In [14]:
# 2.2 Handle Missing Values
print("Handling missing values (MarkDown, CPI, Unemployment)...")
markdown_cols = [f'MarkDown{i}' for i in range(1, 6)]
for col in markdown_cols:
    train_merged[col] = train_merged[col].fillna(0)
print("Filled NaN values in MarkDown columns with 0.")

Handling missing values (MarkDown, CPI, Unemployment)...
Filled NaN values in MarkDown columns with 0.


In [15]:
# CPI and Unemployment: Fill NaN using forward-fill then backward-fill within each store group,
# then fill any remaining NaNs with the median.
train_merged['CPI'] = train_merged.groupby('Store')['CPI'].ffill().bfill()
train_merged['Unemployment'] = train_merged.groupby('Store')['Unemployment'].ffill().bfill()
train_merged['CPI'] = train_merged['CPI'].fillna(train_merged['CPI'].median())
train_merged['Unemployment'] = train_merged['Unemployment'].fillna(train_merged['Unemployment'].median())
print("Filled NaN values in CPI and Unemployment columns.")

Filled NaN values in CPI and Unemployment columns.


In [16]:
# 2.3 Lagged and Rolling Mean Sales Features (Crucial for Time Series, Streamlined)
print("Creating lagged and rolling mean sales features...")
# Ensure data is sorted by Store, Dept, and Date for correct chronological calculations
train_merged = train_merged.sort_values(by=['Store', 'Dept', 'Date']).reset_index(drop=True)

# Lagged sales from previous week and previous year (52 weeks) - KEPT, as per V2.0 performance
train_merged['Lag_Weekly_Sales'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(1).fillna(0)
train_merged['Lag52_Weekly_Sales'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(52).fillna(0)

# Rolling mean of sales over the past 4 weeks (excluding current week) - KEPT
train_merged['RollingMean_4W_Sales'] = train_merged.groupby(['Store', 'Dept'])['Weekly_Sales'].transform(
    lambda x: x.shift(1).rolling(window=4, min_periods=1).mean()
).fillna(0)

print("Lagged (1 & 52-week) and RollingMean_4W_Sales features created.")


Creating lagged and rolling mean sales features...
Lagged (1 & 52-week) and RollingMean_4W_Sales features created.


In [17]:
# 2.4 Interaction Feature (Temperature * IsHoliday)
print("Creating Temperature_IsHoliday interaction feature...")
train_merged['Temperature_IsHoliday'] = train_merged['Temperature'] * train_merged['IsHoliday']
print("Temperature_IsHoliday created.")


Creating Temperature_IsHoliday interaction feature...
Temperature_IsHoliday created.


In [18]:
# Define features (X) and target (y) for the model
X = train_merged.drop(columns=['Weekly_Sales', 'Date'])
y = train_merged['Weekly_Sales']
is_holiday_flags = train_merged['IsHoliday'] # Keep for WMAE calculation


In [19]:
# --- 3. ColumnTransformer and Pipeline Setup ---
print("\n--- 3. Preprocessor and Pipeline Setup ---")

categorical_features = ['Type']
numerical_features = ['Store', 'Dept', 'Size', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'IsHoliday'] + \
                     markdown_cols + \
                     ['Year', 'Month', 'Week', 'DayOfWeek', 'DayOfYear', 'IsMonthStart', 'IsMonthEnd', 'TimeIdx'] + \
                     ['Month_sin', 'Month_cos'] + \
                     ['Lag_Weekly_Sales', 'Lag52_Weekly_Sales', 'RollingMean_4W_Sales'] + \
                     ['Temperature_IsHoliday']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', 'passthrough', numerical_features)
    ],
    remainder='drop' # Explicitly drop columns not specified
)
print("ColumnTransformer for preprocessing defined.")


--- 3. Preprocessor and Pipeline Setup ---
ColumnTransformer for preprocessing defined.


In [20]:
print("\n--- 4. Data Splitting (Chronological with Friday Alignment) ---")
# Ensure data is sorted by Date before splitting to maintain chronological order for global split
train_merged = train_merged.sort_values(by='Date').reset_index(drop=True)

total_rows = len(train_merged)
min_date = train_merged['Date'].min()
max_date = train_merged['Date'].max()
total_days = (max_date - min_date).days

# Calculate split dates based on total time span
train_split_days = int(total_days * 0.6)
val_split_days = int(total_days * 0.8)

train_split_date = min_date + timedelta(days=train_split_days)
val_split_date = min_date + timedelta(days=val_split_days)

# Align split dates to Friday (weekday() == 4 for Friday)
while train_split_date.weekday() != 4 and train_split_date <= max_date:
    train_split_date += timedelta(days=1)
while val_split_date.weekday() != 4 and val_split_date <= max_date:
    val_split_date += timedelta(days=1)

# Ensure split dates don't exceed max date or overlap incorrectly
if train_split_date >= max_date: # Handle edge case for very small datasets
    train_split_date = min_date + timedelta(days=int(total_days * 0.5))
    if train_split_date.weekday() != 4: train_split_date += timedelta(days=(4 - train_split_date.weekday() + 7) % 7)

if val_split_date <= train_split_date or val_split_date >= max_date:
    val_split_date = train_split_date + timedelta(days=int(total_days * 0.2)) # Recalculate if issue
    if val_split_date.weekday() != 4: val_split_date += timedelta(days=(4 - val_split_date.weekday() + 7) % 7)



--- 4. Data Splitting (Chronological with Friday Alignment) ---


In [21]:
X_train = X[train_merged['Date'] < train_split_date]
y_train = y[train_merged['Date'] < train_split_date]
is_holiday_train = is_holiday_flags[train_merged['Date'] < train_split_date]

X_val = X[(train_merged['Date'] >= train_split_date) & (train_merged['Date'] < val_split_date)]
y_val = y[(train_merged['Date'] >= train_split_date) & (train_merged['Date'] < val_split_date)]
is_holiday_val = is_holiday_flags[(train_merged['Date'] >= train_split_date) & (train_merged['Date'] < val_split_date)]

X_local_test = X[train_merged['Date'] >= val_split_date]
y_local_test = y[train_merged['Date'] >= val_split_date]
is_holiday_local_test = is_holiday_flags[train_merged['Date'] >= val_split_date]


In [22]:
print(f"Data split into Training ({len(X_train)} rows), Validation ({len(X_val)} rows), Local Test ({len(X_local_test)} rows).")
if not train_merged.empty:
    print(f"Train dates: {train_merged['Date'].min().date()} to {train_split_date.date()} (approx.)")
    print(f"Validation dates: {train_split_date.date()} to {val_split_date.date()} (approx.)")
    print(f"Local Test dates: {val_split_date.date()} to {train_merged['Date'].max().date()} (approx.)")




Data split into Training (252413 rows), Validation (83348 rows), Local Test (85809 rows).
Train dates: 2010-02-05 to 2011-09-30 (approx.)
Validation dates: 2011-09-30 to 2012-04-13 (approx.)
Local Test dates: 2012-04-13 to 2012-10-26 (approx.)


In [23]:
# --- 5. Hyperparameter Tuning with RandomizedSearchCV and TimeSeriesSplit ---
print("\n--- 5. Hyperparameter Tuning (RandomizedSearchCV with TimeSeriesSplit) ---")

from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit # Import TimeSeriesSplit

# Define the full pipeline
rf_pipeline_tuning = Pipeline(steps=[('preprocessor', preprocessor),
                                      ('regressor', RandomForestRegressor(random_state=42))])

# Conservative parameter distribution for quick initial tuning
param_distributions = {
    'regressor__n_estimators': [100, 150],      # Fewer estimators for speed
    'regressor__max_depth': [15, 25],           # Controlled depth
    'regressor__min_samples_split': [5],        # Higher values lead to less complex, faster trees
    'regressor__min_samples_leaf': [2],         # Higher values lead to less complex, faster trees
    'regressor__max_features': [0.8],           # Fixed for simplicity and speed
}

# Use TimeSeriesSplit for CV (only on training data)
# n_splits=2 means 2 validation folds will be created on the training data.
# The splits will be chronological.
tscv = TimeSeriesSplit(n_splits=2)

# Use MAE as scoring metric for RandomizedSearchCV
scorer = make_scorer(mean_absolute_error, greater_is_better=False)

random_search = RandomizedSearchCV(
    estimator=rf_pipeline_tuning,
    param_distributions=param_distributions,
    n_iter=3,         # Only 3 iterations for ultra-fast check
    cv=tscv,
    scoring=scorer,
    verbose=2,
    random_state=42,
    n_jobs=-1,        # Use all available CPU cores
    return_train_score=True
)

print(f"Starting Randomized Search for optimal Random Forest hyperparameters (n_iter={random_search.n_iter})...")
random_search.fit(X_train, y_train)


--- 5. Hyperparameter Tuning (RandomizedSearchCV with TimeSeriesSplit) ---
Starting Randomized Search for optimal Random Forest hyperparameters (n_iter=3)...
Fitting 2 folds for each of 3 candidates, totalling 6 fits


In [24]:
# --- 6. Train Final Model with Best Hyperparameters on FULL Training Data ---
print("\n--- 6. Train Final Model with Best Hyperparameters on FULL Training Data ---")
best_params = random_search.best_params_
print(f"Best hyperparameters found after tuning: {best_params}")

# Create the final Random Forest Regressor with best parameters
final_rf_regressor = RandomForestRegressor(random_state=42,
                                           n_estimators=best_params['regressor__n_estimators'],
                                           max_depth=best_params['regressor__max_depth'],
                                           min_samples_split=best_params['regressor__min_samples_split'],
                                           min_samples_leaf=best_params['regressor__min_samples_leaf'],
                                           max_features=best_params['regressor__max_features'],
                                           n_jobs=-1
                                          )

# Create the final pipeline
final_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                  ('regressor', final_rf_regressor)])

print("Training final model on FULL X_train with best hyperparameters...")
final_pipeline.fit(X_train, y_train)


--- 6. Train Final Model with Best Hyperparameters on FULL Training Data ---
Best hyperparameters found after tuning: {'regressor__n_estimators': 100, 'regressor__min_samples_split': 5, 'regressor__min_samples_leaf': 2, 'regressor__max_features': 0.8, 'regressor__max_depth': 15}
Training final model on FULL X_train with best hyperparameters...


In [25]:
# --- 7. Final Model Evaluation ---
print("\n--- 7. Final Model Evaluation ---")

# Evaluate the final model on the Validation Set
print("\nEvaluating on Validation Set with final model...")
val_predictions_final = final_pipeline.predict(X_val)
val_wmae_final = weighted_mean_absolute_error(y_val, val_predictions_final, is_holiday_val)
print(f"Validation WMAE (Final Model): {val_wmae_final:.4f}")
val_mae_final = mean_absolute_error(y_val, val_predictions_final)
print(f"Validation MAE (Final Model): {val_mae_final:.4f}")

# Evaluate the final model on the Local Test Set
print("\nEvaluating on Local Test Set with final model...")
local_test_predictions_final = final_pipeline.predict(X_local_test)
local_test_wmae_final = weighted_mean_absolute_error(y_local_test, local_test_predictions_final, is_holiday_local_test)
print(f"Local Test WMAE (Final Model): {local_test_wmae_final:.4f}")
local_test_mae_final = mean_absolute_error(y_local_test, local_test_predictions_final)
print(f"Local Test MAE (Final Model): {local_test_mae_final:.4f}")

print("\n--- Random Forest Model Version 9.0 Training and Evaluation Complete ---")



--- 7. Final Model Evaluation ---

Evaluating on Validation Set with final model...
Validation WMAE (Final Model): 1805.7305
Validation MAE (Final Model): 1615.0387

Evaluating on Local Test Set with final model...
Local Test WMAE (Final Model): 1490.7268
Local Test MAE (Final Model): 1307.3495

--- Random Forest Model Version 9.0 Training and Evaluation Complete ---
