In [1]:
import pandas as pd
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

import optuna

file_path = "../feature_engineered_data.csv"
TARGET_COL = "thai_price"

df = pd.read_csv(file_path)

drop_cols = [TARGET_COL]
for c in ["year_month", "date", "Year", "Month", "world_price_thb_kg"]:
    if c in df.columns:
        drop_cols.append(c)


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df = df.dropna(subset=[TARGET_COL])

y = df[TARGET_COL]

X_raw = df.drop(columns=drop_cols).select_dtypes(include=["number"])

print("Before imputation:")
print("X_raw shape:", X_raw.shape)
print("NaN per column (top 10):")
print(X_raw.isna().sum().sort_values(ascending=False).head(10))

imputer = SimpleImputer(strategy="median")
X = pd.DataFrame(imputer.fit_transform(X_raw), columns=X_raw.columns)

print("\nAfter imputation:")
print("Any NaN left?", X.isna().any().any())

feature_names = list(X.columns)
print("\nTotal candidate features:", len(feature_names))


Before imputation:
X_raw shape: (222, 104)
NaN per column (top 10):
production_growth          1
thai_production            0
world_price_thb_kg_lag6    0
thai_production_lag1       0
china_pmi_lag12            0
china_pmi_lag6             0
china_pmi_lag3             0
china_pmi_lag1             0
oil_price_thb_lag12        0
oil_price_thb_lag6         0
dtype: int64

After imputation:
Any NaN left? False

Total candidate features: 104


In [7]:
WINDOW = 12

values_X = X.values
values_y = y.values

n_samples = len(X) - WINDOW
n_features = X.shape[1]

X_seq = np.zeros((n_samples, WINDOW, n_features))
y_seq = np.zeros(n_samples)

for i in range(n_samples):
    X_seq[i] = values_X[i:i + WINDOW, :]
    y_seq[i] = values_y[i + WINDOW - 1]

print("Sequence shapes:")
print("X_seq:", X_seq.shape)
print("y_seq:", y_seq.shape)


Sequence shapes:
X_seq: (210, 12, 104)
y_seq: (210,)


In [8]:
factor_names = feature_names
factor_to_index = {f: i for i, f in enumerate(factor_names)}

def select_features_from_seq(X_seq, selected_factors):
    idx = [factor_to_index[f] for f in selected_factors]
    return X_seq[:, :, idx]

n_timesteps = WINDOW
print("\nMapped features:", factor_to_index)



Mapped features: {'thai_production': 0, 'domestic_consumption': 1, 'import_volume': 2, 'export_volume': 3, 'stock': 4, 'exchange_rate_usd': 5, 'china_pmi': 6, 'oil_price_thb': 7, 'month': 8, 'quarter': 9, 'year': 10, 'is_season': 11, 'thai_price_ma3': 12, 'thai_price_std3': 13, 'thai_price_ma6': 14, 'thai_price_std6': 15, 'thai_price_ma12': 16, 'thai_price_std12': 17, 'world_price_thb_kg_ma3': 18, 'world_price_thb_kg_std3': 19, 'world_price_thb_kg_ma6': 20, 'world_price_thb_kg_std6': 21, 'world_price_thb_kg_ma12': 22, 'world_price_thb_kg_std12': 23, 'oil_price_thb_ma3': 24, 'oil_price_thb_std3': 25, 'oil_price_thb_ma6': 26, 'oil_price_thb_std6': 27, 'oil_price_thb_ma12': 28, 'oil_price_thb_std12': 29, 'china_pmi_ma3': 30, 'china_pmi_std3': 31, 'china_pmi_ma6': 32, 'china_pmi_std6': 33, 'china_pmi_ma12': 34, 'china_pmi_std12': 35, 'thai_price_pct1': 36, 'thai_price_pct3': 37, 'thai_price_pct6': 38, 'thai_price_pct12': 39, 'world_price_thb_kg_pct1': 40, 'world_price_thb_kg_pct3': 41, 'w

In [9]:
def build_lstm_model(n_timesteps, n_features):
    model = Sequential([
        LSTM(32, activation='tanh', input_shape=(n_timesteps, n_features)),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

In [16]:

import tensorflow as tf

def objective_lstm(trial):
    # เคลียร์ session เก่าของ TF เพื่อลดโอกาสเจอ error เรื่อง memory / graph ซ้อนกัน
    tf.keras.backend.clear_session()

    selected = []

    # เลือก feature 0/1
    for f in factor_names:
        use = trial.suggest_categorical(f"include_" + f, [0, 1])
        if use == 1:
            selected.append(f)

    # ถ้าไม่เลือกอะไรเลย → ให้ค่า loss ใหญ่ ๆ
    if len(selected) == 0:
        return 1e9

    try:
        # ตัด sequence ให้เหลือเฉพาะ feature ที่เลือก
        Xtr = select_features_from_seq(X_train_seq, selected)
        Xte = select_features_from_seq(X_test_seq, selected)

        # กันกรณี shape แปลก ๆ
        if Xtr.shape[1] != n_timesteps:
            return 1e9
        if Xtr.shape[2] == 0:
            return 1e9

        # สร้างโมเดล
        model = build_lstm_model(n_timesteps, Xtr.shape[2])

        es = EarlyStopping(
            monitor="val_loss",
            patience=3,
            restore_best_weights=True
        )

        # เทรน
        model.fit(
            Xtr, y_train_seq,
            validation_split=0.2,
            epochs=20,
            batch_size=32,
            verbose=0,
            callbacks=[es]
        )

        # พยากรณ์
        pred = model.predict(Xte, verbose=0).ravel()
        rmse = mean_squared_error(y_test_seq, pred, squared=False)

    except Exception as e:
        # ถ้า trial นี้พัง (shape / numerical / convergence ฯลฯ) ให้ penalty หนัก ๆ ไปเลย
        rmse = 1e9

    return rmse


In [17]:
study_lstm_feat = optuna.create_study(direction="minimize")
study_lstm_feat.optimize(objective_lstm, n_trials=20)



[I 2025-11-27 22:37:04,154] A new study created in memory with name: no-name-3343eed5-8fa5-4f49-8009-bcd84210b130





[I 2025-11-27 22:37:04,558] Trial 0 finished with value: 1000000000.0 and parameters: {'include_thai_production': 1, 'include_domestic_consumption': 1, 'include_import_volume': 0, 'include_export_volume': 0, 'include_stock': 0, 'include_exchange_rate_usd': 0, 'include_china_pmi': 0, 'include_oil_price_thb': 1, 'include_month': 1, 'include_quarter': 0, 'include_year': 0, 'include_is_season': 0, 'include_thai_price_ma3': 1, 'include_thai_price_std3': 0, 'include_thai_price_ma6': 1, 'include_thai_price_std6': 0, 'include_thai_price_ma12': 0, 'include_thai_price_std12': 1, 'include_world_price_thb_kg_ma3': 1, 'include_world_price_thb_kg_std3': 0, 'include_world_price_thb_kg_ma6': 0, 'include_world_price_thb_kg_std6': 0, 'include_world_price_thb_kg_ma12': 1, 'include_world_price_thb_kg_std12': 1, 'include_oil_price_thb_ma3': 1, 'include_oil_price_thb_std3': 1, 'include_oil_price_thb_ma6': 1, 'include_oil_price_thb_std6': 0, 'include_oil_price_thb_ma12': 1, 'include_oil_price_thb_std12': 0, 

In [22]:
best_params = study_lstm_feat.best_params

selected_features = [
    f for f in factor_names
    if best_params.get("include_" + f, 0) == 1
]

print("\n=== LSTM Selected Features ===\n")
for f in selected_features:
    print(f"'{f}',")

print(f"\nTotal selected: {len(selected_features)} / {len(factor_names)}")



=== LSTM Selected Features ===

'thai_production',
'domestic_consumption',
'oil_price_thb',
'month',
'thai_price_ma3',
'thai_price_ma6',
'thai_price_std12',
'world_price_thb_kg_ma3',
'world_price_thb_kg_ma12',
'world_price_thb_kg_std12',
'oil_price_thb_ma3',
'oil_price_thb_std3',
'oil_price_thb_ma6',
'oil_price_thb_ma12',
'china_pmi_ma6',
'thai_price_pct6',
'thai_price_pct12',
'world_price_thb_kg_pct1',
'world_price_thb_kg_pct3',
'world_price_thb_kg_pct6',
'world_price_thb_kg_pct12',
'oil_price_thb_pct3',
'oil_price_thb_pct6',
'exchange_rate_usd_pct3',
'exchange_rate_usd_pct12',
'thai_production_pct12',
'net_export',
'world_price_thb_kg_lag1',
'oil_price_thb_lag12',
'china_pmi_lag6',
'china_pmi_lag12',
'thai_production_lag3',
'thai_production_lag6',
'stock_lag3',
'stock_lag6',
'exchange_rate_usd_lag6',
'exchange_rate_usd_lag12',
'net_export_lag1',
'net_export_lag3',
'supply_demand_balance_lag1',
'stock_consumption_ratio_lag1',
'stock_consumption_ratio_lag3',
'stock_consumption_ratio_l