Running Optuna over the hyperparameters below (the search was taking too long, so only 2 runs of it were done):
{'window': 14, 'n_layers': 4, 'lstm_units': 128, 'dropout_rate': 0.2, 'static_dense': 48, 'learning_rate': 0.0018917900209603875, 'batch_size': 128}

In [None]:
# ───────────────────────────────────────────────────────────────────────────────
# 1.  Loading libraries and datasets, and set up data
# ───────────────────────────────────────────────────────────────────────────────
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math, random, warnings

from sklearn.metrics         import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing   import LabelEncoder, MinMaxScaler

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers    import (
    Input, LSTM, Embedding, Flatten, Dense,
    Concatenate, SpatialDropout1D,
    BatchNormalization, Dropout
)
from tensorflow.keras.models    import Model

# ───────────────────────────────────────────────────────────────────────────────
# 2.  (Unchanged) Read & clean
# ───────────────────────────────────────────────────────────────────────────────
# Load the raw dataset.
df = pd.read_csv(r" ... csv")

# Column groups referenced by the encoding, scaling and windowing steps.
static_categorical_cols = [
    'Store_No',
    'State',
    'CODE (subcluster 1)',
    'CODE FY26 1 (subcluster 2)',
    'CODE FY26 2 (subcluster 3)',
]
time_varying_categorical_cols = [
    'Rain?', 'Name', 'Puasa', 'Public Holiday', 'Day', 'Month',
]
numeric_cols = [
    'Net_Amount', 'TC', 'Days_after_Opening', 'Average Daily Temperature (°C)',
]
categorical_cols = time_varying_categorical_cols + static_categorical_cols

# Missing values and empty strings are both replaced by a per-column
# placeholder: a label for the text columns, 0 for the flag columns.
for _column, _placeholder in (
    ('CODE (subcluster 1)', 'blank'),
    ('Name', 'no PH'),
    ('Puasa', 0),
    ('Public Holiday', 0),
):
    df[_column] = df[_column].fillna(_placeholder).replace('', _placeholder)

# ───────────────────────────────────────────────────────────────────────────────
# 3.  (Unchanged) Encode categoricals
# ───────────────────────────────────────────────────────────────────────────────
# Columns that receive an integer code in a companion '<col>_enc' column:
# every static categorical column, plus any other categorical column with at
# least 7 distinct values.
embed_cols = [
    col for col in categorical_cols
    if col in static_categorical_cols or df[col].nunique() >= 7
]

# One fitted LabelEncoder per encoded column, kept in `encoders` so the
# integer codes can be mapped back to the original labels.
encoders = {}
for col in embed_cols:
    le = LabelEncoder()
    df[col + '_enc'] = le.fit_transform(df[col])
    encoders[col] = le

# Rain flag -> 1/0. Surrounding whitespace is stripped before the lookup so
# that 'No' and ' No' (and 'Yes' / 'Yes ') are all recognised; previously only
# the exact strings 'Yes' and ' No' were mapped and every other spelling
# silently became NaN. Values that are neither Yes nor No still become NaN.
df['Rain?'] = df['Rain?'].astype(str).str.strip().map({'Yes': 1, 'No': 0})

# ───────────────────────────────────────────────────────────────────────────────
# 4.  (Unchanged) Scale continuous features
# ───────────────────────────────────────────────────────────────────────────────
# Min-max scale the numeric columns in place, keeping the fitted scaler.
# NOTE(review): the scaler is fitted on every row of df, i.e. also on rows
# that later land in the validation split, and numeric_cols includes the
# target columns 'Net_Amount' and 'TC', so targets and losses are in scaled
# units.
scaler = MinMaxScaler().fit(df[numeric_cols])
df[numeric_cols] = scaler.transform(df[numeric_cols])

# ───────────────────────────────────────────────────────────────────────────────
# 5.  (New) Time-aware train/val split helper for Optuna
# ───────────────────────────────────────────────────────────────────────────────
def time_aware_train_val_split(df, window,
        time_numeric_cols, static_cols_enc, val_frac=0.2):
    """
    Build per-store sliding-window samples and split them chronologically.

    Rows are grouped by 'Store_No' and ordered by 'Date'. For a store with T
    rows there are ``T - window`` samples; sample ``i`` consists of

    * the rows ``i .. i + window - 1`` of `time_numeric_cols`, 'Name_enc',
      'Day_enc' and 'Month_enc' (the input sequences),
    * row ``i + window`` of `static_cols_enc` (the static vector), and
    * row ``i + window`` of ['Net_Amount', 'TC'] (the target).

    Within each store the first ``int((1 - val_frac) * n_samples)`` samples
    go to the training set and the remaining ones to the validation set.
    Stores with ``T <= window`` contribute no samples.

    Parameters
    ----------
    df : DataFrame holding 'Store_No', 'Date', 'Name_enc', 'Day_enc',
        'Month_enc', 'Net_Amount', 'TC' and the columns named below.
    window : number of consecutive rows in each input sequence.
    time_numeric_cols : list of column names forming the numeric sequence.
    static_cols_enc : list of column names forming the static vector.
    val_frac : fraction of each store's samples assigned to validation.

    Returns
    -------
    A tuple of twelve arrays, as (train, val) pairs in this order:
    X_num_tr, X_num_va, X_name_tr, X_name_va, X_day_tr, X_day_va,
    X_month_tr, X_month_va, X_stat_tr, X_stat_va, y_tr, y_va.
    A split that received no samples is returned as an empty array with the
    same trailing dimensions as a populated one (e.g. ``(0, window, F)`` for
    the numeric sequences), so that column indexing such as ``X_stat[:, i]``
    keeps working.
    """
    target_cols = ['Net_Amount', 'TC']
    seq_cols = {'name': 'Name_enc', 'day': 'Day_enc', 'month': 'Month_enc'}
    fields = ('num', 'name', 'day', 'month', 'stat', 'y')

    train = {field: [] for field in fields}
    val = {field: [] for field in fields}

    for _, grp in df.groupby('Store_No'):
        grp = grp.sort_values('Date')
        n_windows = len(grp) - window
        if n_windows <= 0:
            continue  # not enough history for even one sample

        split_i = int((1 - val_frac) * n_windows)

        # Arrays that are sliced into length-`window` sequences.
        sequences = {'num': grp[time_numeric_cols].values}
        for field, col in seq_cols.items():
            sequences[field] = grp[col].values
        # Arrays read at the row right after each window (already shifted
        # by `window`, so index i belongs to sample i).
        at_target = {
            'stat': grp[static_cols_enc].values[window:],
            'y': grp[target_cols].values[window:],
        }

        for i in range(n_windows):
            bucket = train if i < split_i else val
            for field, arr in sequences.items():
                bucket[field].append(arr[i:i + window])
            for field, arr in at_target.items():
                bucket[field].append(arr[i])

    # Source columns and leading shape used to build a correctly shaped
    # empty array when a split received no samples.
    empty_spec = {
        'num': (time_numeric_cols, (0, window)),
        'name': ('Name_enc', (0, window)),
        'day': ('Day_enc', (0, window)),
        'month': ('Month_enc', (0, window)),
        'stat': (static_cols_enc, (0,)),
        'y': (target_cols, (0,)),
    }

    def _stack(field, samples):
        if samples:
            return np.stack(samples)
        cols, lead = empty_spec[field]
        # A zero-row slice yields the trailing shape and dtype without
        # copying any data.
        proto = df[cols].iloc[:0].values
        return np.empty(lead + proto.shape[1:], dtype=proto.dtype)

    return tuple(
        _stack(field, bucket[field])
        for field in fields
        for bucket in (train, val)
    )

# ───────────────────────────────────────────────────────────────────────────────
# 6.  (Modified) Build Sales LSTM with variable depth
# ───────────────────────────────────────────────────────────────────────────────
def build_sales_lstm(
        W, F,
        time_cardinalities, static_cardinalities,
        lstm_units   = 48,
        dropout_rate = 0.2,
        static_dense = 64,
        learning_rate= 1e-3,
        n_layers     = 1
    ):
    """
    Build and compile the two-branch sales model.

    Sequence branch: a (W, F) numeric input is concatenated with embeddings
    of three length-W integer sequences (name, day, month) and passed through
    `n_layers` stacked LSTM layers, each followed by batch normalisation and
    dropout. Only the last LSTM returns a single vector instead of a sequence.

    Static branch: one single-integer input per entry of
    `static_cardinalities`, each embedded, flattened, batch-normalised and
    dropped out; the results are concatenated and passed through one ReLU
    dense layer of `static_dense` units, batch normalisation and dropout.

    Both branches are concatenated and mapped to 2 linear outputs.

    Parameters
    ----------
    W, F : sequence length and number of numeric features per step.
    time_cardinalities : dict with keys 'name', 'day', 'month' giving the
        vocabulary size of each sequence embedding.
    static_cardinalities : iterable of (base_name, vocab_size) pairs; each
        pair creates an input named '<base_name>_in'.
    lstm_units, dropout_rate, static_dense, learning_rate, n_layers :
        layer width, dropout probability, static dense width, Adam learning
        rate and LSTM depth.

    Returns
    -------
    A compiled Keras Model (Adam optimiser, MSE loss, MAE metric) whose
    inputs are [num_in, name_seq_in, day_seq_in, month_seq_in] followed by
    the static inputs in `static_cardinalities` order.
    """
    def _embedding_width(vocab_size):
        # Half the vocabulary size plus one, capped at 50.
        return min(50, vocab_size // 2 + 1)

    def _embedded_sequence(seq_input, vocab_size):
        # The dropout layer is instantiated before the embedding layer,
        # matching the construction order of a nested call expression.
        spatial_dropout = SpatialDropout1D(dropout_rate)
        embedded = Embedding(vocab_size, _embedding_width(vocab_size))(seq_input)
        return spatial_dropout(embedded)

    # ── sequence inputs ──
    num_in   = Input((W, F), name='num_in')
    name_in  = Input((W,), dtype='int32', name='name_seq_in')
    day_in   = Input((W,), dtype='int32', name='day_seq_in')
    month_in = Input((W,), dtype='int32', name='month_seq_in')

    sequence_features = [num_in]
    for seq_input, key in ((name_in, 'name'), (day_in, 'day'), (month_in, 'month')):
        sequence_features.append(
            _embedded_sequence(seq_input, time_cardinalities[key])
        )
    x = Concatenate(axis=-1)(sequence_features)

    # ── stacked LSTM: every layer but the last emits the full sequence ──
    last_depth = n_layers - 1
    for depth in range(n_layers):
        recurrent = LSTM(
            lstm_units,
            dropout=0.0,
            return_sequences=(depth < last_depth),
        )
        x = recurrent(x)
        x = BatchNormalization()(x)
        x = Dropout(dropout_rate)(x)

    # ── static branch: one embedded input per static column ──
    static_inputs = []
    static_vecs = []
    for base, vocab in static_cardinalities:
        static_in = Input((1,), dtype='int32', name=f"{base}_in")
        static_vec = Embedding(vocab, _embedding_width(vocab))(static_in)
        static_vec = Flatten()(static_vec)
        static_vec = BatchNormalization()(static_vec)
        static_vec = Dropout(dropout_rate)(static_vec)
        static_inputs.append(static_in)
        static_vecs.append(static_vec)

    static_branch = Concatenate()(static_vecs)
    static_branch = Dense(static_dense, activation='relu')(static_branch)
    static_branch = BatchNormalization()(static_branch)
    static_branch = Dropout(dropout_rate)(static_branch)

    # ── merge both branches into the 2-unit linear head ──
    head = Dense(2, activation='linear', name='sales_out')
    out = head(Concatenate()([x, static_branch]))

    all_inputs = [num_in, name_in, day_in, month_in] + static_inputs
    model = Model(all_inputs, out)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss='mse',
        metrics=['mae'],
    )
    return model

# ───────────────────────────────────────────────────────────────────────────────
# 7.  Prepare cardinalities & static cols list (unchanged)
# ───────────────────────────────────────────────────────────────────────────────
# Distinct-value count of each encoded time-varying column, keyed by the
# short name the model builder looks up.
time_cardinalities = {
    key: df[enc_col].nunique()
    for key, enc_col in (
        ('name', 'Name_enc'),
        ('day', 'Day_enc'),
        ('month', 'Month_enc'),
    )
}

# Encoded companions of the static categorical columns that were encoded.
static_cols_enc = [f'{c}_enc' for c in static_categorical_cols if c in embed_cols]

# (base column name, distinct-value count) per static encoded column, in the
# same order as static_cols_enc.
static_cardinalities = []
for col in static_cols_enc:
    static_cardinalities.append((col.replace('_enc', ''), df[col].nunique()))

# Per-time-step numeric features fed to the sequence branch.
time_numeric_cols = [
    'Net_Amount',
    'TC',
    'Days_after_Opening',
    'Average Daily Temperature (°C)',
    'Rain?',
    'Puasa',
    'Public Holiday',
]

# ───────────────────────────────────────────────────────────────────────────────
# 8.  Optuna hyperparameter search
#       tuning: window • n_layers • lstm_units • dropout_rate • static_dense • learning_rate
# ───────────────────────────────────────────────────────────────────────────────
import optuna

# Suppress every Python warning from here on (not only TensorFlow's) so the
# tuning output stays readable.
warnings.simplefilter("ignore")
# Have Optuna log at INFO level.
optuna.logging.set_verbosity(optuna.logging.INFO)

def make_input_dict(X_num, X_name, X_day, X_month, X_stat):
    """
    Pack the split arrays into a dict keyed by model input name.

    The numeric sequences are cast to float32 under 'num_in'; the name, day
    and month sequences are cast to int32 under 'name_seq_in', 'day_seq_in'
    and 'month_seq_in'. Column ``i`` of `X_stat` is reshaped to (-1, 1), cast
    to int32 and stored under '<base>_in', where ``base`` is the first item
    of the i-th entry of the module-level `static_cardinalities`.
    """
    inputs = {'num_in': X_num.astype('float32')}

    categorical_sequences = (
        ('name_seq_in', X_name),
        ('day_seq_in', X_day),
        ('month_seq_in', X_month),
    )
    for input_name, sequence in categorical_sequences:
        inputs[input_name] = sequence.astype('int32')

    for column, (base, _vocab) in enumerate(static_cardinalities):
        single_column = X_stat[:, column].reshape(-1, 1)
        inputs[f'{base}_in'] = single_column.astype('int32')

    return inputs

def objective(trial):
    """
    Optuna objective: train one model and return its best validation loss.

    Samples window, n_layers, lstm_units, dropout_rate, static_dense,
    learning_rate and batch_size from `trial`, rebuilds the train/validation
    windows for the sampled window length, trains for up to 50 epochs with
    early stopping on 'val_loss' (patience 3) and returns the minimum
    'val_loss' seen.

    After every epoch the validation loss is reported to the trial, and
    `optuna.TrialPruned` is raised when the study's pruner asks for the trial
    to be stopped. With a pruner that never prunes this has no effect.
    """
    # Release Keras state left behind by models built in earlier trials;
    # otherwise it accumulates over the whole study.
    tf.keras.backend.clear_session()

    # ─ sample hyperparameters ─
    window        = trial.suggest_int("window",         7, 28, step=7)
    n_layers      = trial.suggest_int("n_layers",       1, 4)
    lstm_units    = trial.suggest_int("lstm_units",    32, 256, step=32)
    dropout_rate  = trial.suggest_float("dropout_rate",  0.0, 0.5, step=0.1)
    static_dense  = trial.suggest_int("static_dense",   16, 128, step=16)
    # Log-uniform sampling; replaces the deprecated suggest_loguniform and
    # keeps the same parameter name and range.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    batch_size    = trial.suggest_categorical("batch_size", [32, 64, 128])

    # ─ re-split train/val for the sampled window length ─
    (X_num_tr, X_num_va,
     X_name_tr, X_name_va,
     X_day_tr, X_day_va,
     X_month_tr, X_month_va,
     X_stat_tr, X_stat_va,
     y_tr,     y_va) = time_aware_train_val_split(
                        df, window,
                        time_numeric_cols, static_cols_enc,
                        val_frac=0.2
                      )

    train_inputs = make_input_dict(
        X_num_tr, X_name_tr, X_day_tr, X_month_tr, X_stat_tr)
    val_inputs   = make_input_dict(
        X_num_va, X_name_va, X_day_va, X_month_va, X_stat_va)

    # ─ build & compile model ─
    model = build_sales_lstm(
        window, X_num_tr.shape[-1],
        time_cardinalities, static_cardinalities,
        lstm_units    = lstm_units,
        dropout_rate  = dropout_rate,
        static_dense  = static_dense,
        learning_rate = learning_rate,
        n_layers      = n_layers
    )

    # ─ callbacks ─
    es = EarlyStopping("val_loss", patience=3, restore_best_weights=True)

    def _report_and_maybe_prune(epoch, logs=None):
        # Give the pruner an intermediate value to compare against other
        # trials; without this a pruner configured on the study never acts.
        val_loss = (logs or {}).get("val_loss")
        if val_loss is None:
            return
        trial.report(float(val_loss), step=epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

    pruning_cb = tf.keras.callbacks.LambdaCallback(
        on_epoch_end=_report_and_maybe_prune)

    # ─ train silently ─
    history = model.fit(
        train_inputs, y_tr,
        validation_data=(val_inputs, y_va),
        epochs=50, batch_size=batch_size,
        callbacks=[es, pruning_cb], verbose=0
    )

    return min(history.history["val_loss"])


# Create a study that minimises the objective's return value, with a
# MedianPruner attached, and run 20 trials with a progress bar.
median_pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction="minimize", pruner=median_pruner)
study.optimize(objective, n_trials=20, show_progress_bar=True)

# Print the best trial's parameters, one per line, names left-aligned.
print("\n✅ Best hyperparameters:")
for param_name, param_value in study.best_params.items():
    print(f"  {param_name:<14s}: {param_value}")

# ───────────────────────────────────────────────────────────────────────────────
# 9.  (Optional) Retrain & evaluate final model with best params
# ───────────────────────────────────────────────────────────────────────────────
best = study.best_params
window = best['window']

# time_aware_train_val_split returns twelve arrays as (train, val) pairs, so
# exactly twelve names are unpacked, in that order. The previous 18-name
# unpacking with a trailing `*_` could not work: `_` is undefined in a script
# (and arbitrary in a notebook), and the (tr, va, te) target order did not
# match the helper's (tr, va) pair order.
# TODO: the helper produces no test split; extend it (or hold out the tail of
# the validation windows) before adding *_te arrays here.
(X_num_tr, X_num_va,
 X_name_tr, X_name_va,
 X_day_tr, X_day_va,
 X_month_tr, X_month_va,
 X_stat_tr, X_stat_va,
 y_tr, y_va) = time_aware_train_val_split(
    df, window, time_numeric_cols, static_cols_enc, val_frac=0.15
)

# (Rebuild train/val/test dicts, retrain, and run your final evaluation code here…)
