In [1]:
# Feature Engineering for Multivariate Time-Series Forecasting
"""
This notebook focuses on:
- Creating time-based and calendar features
- Generating lag and rolling statistics
- Preparing datasets for ML and deep learning models
- Ensuring no data leakage in feature creation

"""
print("Feature Engineering Notebook Loaded")


Feature Engineering Notebook Loaded


In [2]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler


In [4]:
## Load Processed Data
data_path = "C:\\Multivariate_TimeSeries_Forecasting_CP2\\data\\raw\\processed\\walmart_merged_cleaned.csv"
df = pd.read_csv(data_path, parse_dates=["Date"], index_col="Date")

df.shape


(421570, 17)

In [6]:
# Create Time-Based Features
df = df.sort_values(["Store", "Dept", "Date"])
df.head()


Unnamed: 0_level_0,Store,Dept,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,Type_x,Size_x,Type_y,Size_y
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2010-02-05,1,1,24924.5,False,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,A,151315,A,151315
2010-02-12,1,1,46039.49,True,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,A,151315,A,151315
2010-02-19,1,1,41595.55,False,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,A,151315,A,151315
2010-02-26,1,1,19403.54,False,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,A,151315,A,151315
2010-03-05,1,1,21827.9,False,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,A,151315,A,151315


In [8]:
# Create Calendar Time Features
df["year"] = df.index.year
df["month"] = df.index.month
df["week"] = df.index.isocalendar().week
df["day"] = df.index.day


In [10]:
# One-Hot Encode Categorical Variables
df = pd.get_dummies(df, columns=["Type_x", "Type_y"], drop_first=True)
df.head()

Unnamed: 0_level_0,Store,Dept,Weekly_Sales,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,...,Size_x,Size_y,year,month,week,day,Type_x_B,Type_x_C,Type_y_B,Type_y_C
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-02-05,1,1,24924.5,False,42.31,2.572,0.0,0.0,0.0,0.0,...,151315,151315,2010,2,5,5,False,False,False,False
2010-02-12,1,1,46039.49,True,38.51,2.548,0.0,0.0,0.0,0.0,...,151315,151315,2010,2,6,12,False,False,False,False
2010-02-19,1,1,41595.55,False,39.93,2.514,0.0,0.0,0.0,0.0,...,151315,151315,2010,2,7,19,False,False,False,False
2010-02-26,1,1,19403.54,False,46.63,2.561,0.0,0.0,0.0,0.0,...,151315,151315,2010,2,8,26,False,False,False,False
2010-03-05,1,1,21827.9,False,46.5,2.625,0.0,0.0,0.0,0.0,...,151315,151315,2010,3,9,5,False,False,False,False


In [11]:
# Define Target Variable
TARGET = "Weekly_Sales"


In [12]:
# Create Lag Features
LAG_WEEKS = [1, 2, 4, 8]

for lag in LAG_WEEKS:
    df[f"lag_{lag}"] = df.groupby(["Store", "Dept"])[TARGET].shift(lag)


In [14]:
# Create Rolling Statistics Features
ROLLING_WINDOWS = [4, 8, 12]

for window in ROLLING_WINDOWS:
    df[f"rolling_mean_{window}"] = (
        df.groupby(["Store", "Dept"])[TARGET]
        .shift(1)
        .rolling(window)
        .mean()
    )
    
    df[f"rolling_std_{window}"] = (
        df.groupby(["Store", "Dept"])[TARGET]
        .shift(1)
        .rolling(window)
        .std()
    )


In [15]:
# Check for Missing Values
df.isna().sum().sort_values(ascending=False).head(10)


rolling_mean_12    38615
rolling_std_12     38615
rolling_mean_8     25966
lag_8              25966
rolling_std_8      25966
rolling_mean_4     13134
lag_4              13134
rolling_std_4      13134
lag_2               6625
lag_1               3331
dtype: int64

In [16]:
# Drop Initial Rows with Insufficient History
df_fe = df.dropna().copy()
df_fe.shape


(382955, 33)

In [17]:
# Prepare Feature Matrix and Target Vector - Feature List Separation
feature_cols = [col for col in df_fe.columns if col != TARGET]

X = df_fe[feature_cols]
y = df_fe[TARGET]


In [18]:
# Scale Numerical Features
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    index=X.index
)


In [21]:
# Save Feature-Engineered Dataset for ML Models
ml_path = "C:\\Multivariate_TimeSeries_Forecasting_CP2\\data\\raw\\features.csv"
df_fe.to_csv(ml_path)

print("ML feature dataset saved.")


ML feature dataset saved.


In [22]:
# Prepare Data for Deep Learning Models - Create Sequences
# Prepare Data for LSTM / GRU (Sequence Framing)
def create_sequences(X, y, window_size):
    X_seq, y_seq = [], []
    for i in range(window_size, len(X)):
        X_seq.append(X.iloc[i-window_size:i].values)
        y_seq.append(y.iloc[i])
    return np.array(X_seq), np.array(y_seq)

WINDOW_SIZE = 8
X_seq, y_seq = create_sequences(X_scaled, y, WINDOW_SIZE)

X_seq.shape, y_seq.shape


((382947, 8, 32), (382947,))

In [24]:
import os

# Create directory if it doesn't exist
output_dir = "C:\\Multivariate_TimeSeries_Forecasting_CP2\\data\\processed"
os.makedirs(output_dir, exist_ok=True)

# Save with absolute path
np.save(os.path.join(output_dir, "X_lstm.npy"), X_seq)
np.save(os.path.join(output_dir, "y_lstm.npy"), y_seq)

print("DL sequence data saved.")


DL sequence data saved.


### Feature Engineering Summary

- Calendar features capture seasonal effects.
- Lag features model temporal dependencies.
- Rolling statistics represent trend and volatility.
- Feature scaling ensures stable deep learning training.
- Sequence framing enables LSTM/GRU models.

This engineered dataset supports both classical ML
and deep learning-based forecasting.
