In [None]:
import pandas as pd

file_path = "../feature_engineered_data.csv"
TARGET_COL = "thai_price"

df = pd.read_csv(file_path)
drop_cols = [TARGET_COL]

for c in ["year_month", "date", "Year", "Month","world_price_thb_kg"]:
    if c in df.columns:
        drop_cols.append(c)

df = df.dropna(subset=[TARGET_COL])

y = df[TARGET_COL]
X = df.drop(columns=drop_cols)

X = X.select_dtypes(include=["number"])

print("X shape:", X.shape)
print("y shape:", y.shape)
print("Feature names:", list(X.columns))


X shape: (222, 104)
y shape: (222,)
Feature names: ['thai_production', 'domestic_consumption', 'import_volume', 'export_volume', 'stock', 'exchange_rate_usd', 'china_pmi', 'oil_price_thb', 'month', 'quarter', 'year', 'is_season', 'thai_price_ma3', 'thai_price_std3', 'thai_price_ma6', 'thai_price_std6', 'thai_price_ma12', 'thai_price_std12', 'world_price_thb_kg_ma3', 'world_price_thb_kg_std3', 'world_price_thb_kg_ma6', 'world_price_thb_kg_std6', 'world_price_thb_kg_ma12', 'world_price_thb_kg_std12', 'oil_price_thb_ma3', 'oil_price_thb_std3', 'oil_price_thb_ma6', 'oil_price_thb_std6', 'oil_price_thb_ma12', 'oil_price_thb_std12', 'china_pmi_ma3', 'china_pmi_std3', 'china_pmi_ma6', 'china_pmi_std6', 'china_pmi_ma12', 'china_pmi_std12', 'thai_price_pct1', 'thai_price_pct3', 'thai_price_pct6', 'thai_price_pct12', 'world_price_thb_kg_pct1', 'world_price_thb_kg_pct3', 'world_price_thb_kg_pct6', 'world_price_thb_kg_pct12', 'oil_price_thb_pct1', 'oil_price_thb_pct3', 'oil_price_thb_pct6', 'oil_p

In [None]:
from xgboost import XGBRegressor
from sklearn.feature_selection import SelectFromModel

xgb_base = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=32,
    importance_type="gain"
)

xgb_base.fit(X, y)

selector = SelectFromModel(
    xgb_base,
    threshold="median",
    prefit=True
)

X_sel = selector.transform(X)

selected_mask = selector.get_support()
selected_features = X.columns[selected_mask]

print("Selected features:")
for f in selected_features:
    print("-", f)

print(f"\nSelected {len(selected_features)} / {X.shape[1]} features")


Selected features:
- domestic_consumption
- stock
- exchange_rate_usd
- china_pmi
- oil_price_thb
- year
- thai_price_ma3
- thai_price_std3
- thai_price_ma6
- thai_price_std6
- thai_price_ma12
- world_price_thb_kg_ma3
- world_price_thb_kg_std3
- world_price_thb_kg_ma6
- world_price_thb_kg_std6
- world_price_thb_kg_ma12
- oil_price_thb_std3
- oil_price_thb_ma6
- oil_price_thb_std12
- china_pmi_ma3
- china_pmi_std6
- china_pmi_ma12
- thai_price_pct1
- thai_price_pct3
- thai_price_pct6
- thai_price_pct12
- world_price_thb_kg_pct1
- world_price_thb_kg_pct3
- world_price_thb_kg_pct6
- oil_price_thb_pct3
- oil_price_thb_pct6
- exchange_rate_usd_pct6
- exchange_rate_usd_pct12
- net_export
- stock_consumption_ratio
- thai_price_lag1
- thai_price_lag3
- thai_price_lag6
- thai_price_lag12
- world_price_thb_kg_lag1
- world_price_thb_kg_lag3
- world_price_thb_kg_lag6
- oil_price_thb_lag1
- oil_price_thb_lag6
- china_pmi_lag12
- stock_lag1
- stock_lag3
- stock_lag6
- exchange_rate_usd_lag1
- exchan



In [12]:
from sklearn.model_selection import TimeSeriesSplit, cross_val_score

xgb_final = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

tscv = TimeSeriesSplit(n_splits=5)

scores = cross_val_score(
    xgb_final,
    X_sel, y,
    cv=tscv,
    scoring="neg_root_mean_squared_error"
)

print("CV RMSE (after feature selection):", -scores.mean())


CV RMSE (after feature selection): 11.372645133114279


In [13]:
xgb_final = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="reg:squarederror",
    random_state=42
)

tscv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(
    xgb_final, X_sel, y,
    cv=tscv,
    scoring="neg_root_mean_squared_error"
)

print("CV RMSE (after feature selection):", -scores.mean())


CV RMSE (after feature selection): 11.372645133114279


In [14]:
selected_mask = selector.get_support()
selected_features = X.columns[selected_mask]

print("Selected features in 'feature', format:\n")
for f in selected_features:
    print(f"'{f}',")


Selected features in 'feature', format:

'domestic_consumption',
'stock',
'exchange_rate_usd',
'china_pmi',
'oil_price_thb',
'year',
'thai_price_ma3',
'thai_price_std3',
'thai_price_ma6',
'thai_price_std6',
'thai_price_ma12',
'world_price_thb_kg_ma3',
'world_price_thb_kg_std3',
'world_price_thb_kg_ma6',
'world_price_thb_kg_std6',
'world_price_thb_kg_ma12',
'oil_price_thb_std3',
'oil_price_thb_ma6',
'oil_price_thb_std12',
'china_pmi_ma3',
'china_pmi_std6',
'china_pmi_ma12',
'thai_price_pct1',
'thai_price_pct3',
'thai_price_pct6',
'thai_price_pct12',
'world_price_thb_kg_pct1',
'world_price_thb_kg_pct3',
'world_price_thb_kg_pct6',
'oil_price_thb_pct3',
'oil_price_thb_pct6',
'exchange_rate_usd_pct6',
'exchange_rate_usd_pct12',
'net_export',
'stock_consumption_ratio',
'thai_price_lag1',
'thai_price_lag3',
'thai_price_lag6',
'thai_price_lag12',
'world_price_thb_kg_lag1',
'world_price_thb_kg_lag3',
'world_price_thb_kg_lag6',
'oil_price_thb_lag1',
'oil_price_thb_lag6',
'china_pmi_lag12',
'stoc