# XGBoost Grid Search 

นำ XGBoost มาทำ Grid Search โดยใช้วิธี Rolling Window CV ซึ่งแต่ละ Fold (รอบการประเมิน) จะถูกเทรนด้วยวิธี multi-step direct forecasting

## ตั้งค่า และ นำเข้า Libraries

In [1]:
# 1. Configuration


# --- System ---
SEED = 42     # seeds NumPy's global RNG and each XGBRegressor (random_state)
N_JOBS = 50   # worker count handed to joblib.Parallel for the grid search

# --- Data split ---
VALID_SIZE = 155   # length of each validation window, also the forecast horizon
N_SPLITS = 5       # number of rolling-window folds requested
N_TRAIN = 618      # leading observations reserved for the grid search

# --- Hyperparameter search space ---
# Every combination of these values is forwarded to XGBRegressor as keyword
# arguments. Key order is kept, so the enumeration order of the grid is stable.
PARAM_GRID = dict(
    max_depth=[6, 12],
    learning_rate=[0.01, 0.05],
    min_child_weight=[1, 3],
    gamma=[0.05, 0.1],
    reg_lambda=[0.1, 0.2],
    n_estimators=[50, 100],
)

# Candidate values for the forecaster's `lags` argument.
LAGS_GRID = [40, 80]

In [2]:
# 2. Import Libraries

import pandas as pd
import numpy as np
import itertools
from xgboost import XGBRegressor
from skforecast.direct import ForecasterDirect
from sklearn.metrics import mean_squared_error
from joblib import Parallel, delayed
from tqdm import tqdm

# Seed NumPy's global RNG. The XGBoost models are seeded separately through
# their own random_state=SEED argument.
np.random.seed(seed=SEED)
print("Libraries Loaded.")

Libraries Loaded.


## โหลดข้อมูล

In [3]:
# 3. Load Data

# CSV export endpoint of the Google Sheet that holds the series.
sheet_id = "1-hzX_qRFjS7TIhWkTsrWPx7M_cFATCQxvWlRmo97Wac"
sheet_gid = "0"
csv_url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/export?format=csv&gid={sheet_gid}"

try:
    data = pd.read_csv(csv_url)
except Exception:
    # Nothing below can run without the data. Report the failure and re-raise
    # the real cause instead of continuing into a NameError on `data`.
    print("Error loading data via URL.")
    raise

# Parse the dates when the column arrived as text. The check covers the legacy
# `object` dtype as well as dedicated string dtypes, so the sort below is
# chronological rather than lexicographic.
date_col = data['Date']
if date_col.dtype == 'object' or pd.api.types.is_string_dtype(date_col):
    data['Date'] = pd.to_datetime(date_col)
data = data.sort_values('Date').reset_index(drop=True)

# Use only training set for Grid Search
y = data['Y']
y_train = y.iloc[:N_TRAIN]

print(f"Training Data Size: {len(y_train)}")

Training Data Size: 618


## สร้างฟังก์ชันสำหรับกระบวนการ Rolling Window CV

In [4]:
# 4. Rolling CV Function


def make_cv_splits(n_obs, valid_size, n_splits, train_frac=0.75):
    """
    Create rolling-window CV splits over positional observation indices.

    Every fold uses a training window of fixed length
    ``int((n_obs - valid_size) * train_frac)`` immediately followed by a
    validation window of ``valid_size`` observations. Fold start positions
    begin at 0 and advance by a constant step chosen so that the requested
    number of folds spreads over the available range.

    Parameters
    ----------
    n_obs : int
        Total number of observations available.
    valid_size : int
        Number of observations in each validation window.
    n_splits : int
        Number of folds requested.
    train_frac : float, default 0.75
        Fraction of the non-validation observations used as training window.

    Returns
    -------
    list of (np.ndarray, np.ndarray)
        ``(train_idx, valid_idx)`` pairs. The list is empty when no fold with
        a non-empty training and validation window fits, and may contain
        fewer than ``n_splits`` folds when there are not enough distinct
        start positions.
    """
    train_size = int((n_obs - valid_size) * train_frac)

    # Largest start position for which the whole fold still fits in the data.
    start_max = n_obs - (train_size + valid_size)

    # A fold needs at least one training and one validation observation.
    if train_size <= 0 or valid_size <= 0 or start_max < 0:
        return []

    step = 0 if n_splits == 1 else max(1, start_max // (n_splits - 1))

    splits = []
    for k in range(n_splits):
        start = k * step
        if start > start_max:
            # The minimum step of 1 can push later folds past the end of the
            # data; stop instead of emitting out-of-range indices.
            break
        valid_start = start + train_size
        tr_idx = np.arange(start, valid_start)
        va_idx = np.arange(valid_start, valid_start + valid_size)
        splits.append((tr_idx, va_idx))
    return splits

def evaluate_one_config(y_series, params, lags):
    """
    Evaluate one (params, lags) combination across the rolling CV folds.

    For every fold a fresh XGBRegressor is wrapped in a ForecasterDirect,
    fitted on the fold's training window and asked to predict VALID_SIZE
    steps ahead. The fold score is the RMSE between those predictions and
    the validation window.

    Parameters
    ----------
    y_series : pandas.Series
        Series to cross-validate on; sliced positionally with ``iloc``.
    params : dict
        Keyword arguments forwarded to ``XGBRegressor``.
    lags : int
        Value passed to the forecaster's ``lags`` argument.

    Returns
    -------
    float
        Mean RMSE over all folds.

    Raises
    ------
    ValueError
        If the series is too short to build any fold. Without this check the
        mean of an empty list would silently be returned as NaN.
    """
    splits = make_cv_splits(len(y_series), VALID_SIZE, N_SPLITS)
    if not splits:
        raise ValueError(
            f"Cannot build any CV fold from {len(y_series)} observations "
            f"with VALID_SIZE={VALID_SIZE}."
        )

    rmses = []

    for tr_idx, va_idx in splits:
        # Reset the index so every fold starts from a clean 0..n-1 index.
        y_fold_train = y_series.iloc[tr_idx].reset_index(drop=True)
        y_fold_valid = y_series.iloc[va_idx].reset_index(drop=True)

        # Setup Model
        regressor = XGBRegressor(
            **params,
            verbosity=0,
            n_jobs=1,  # Single core per model; parallelism is across configs
            random_state=SEED
        )

        # Setup Forecaster (Direct Strategy) with a horizon equal to the
        # validation window length.
        forecaster = ForecasterDirect(
            regressor=regressor,
            lags=lags,
            steps=VALID_SIZE,
            differentiation=1,
            n_jobs=1
        )

        # Fit & Predict
        forecaster.fit(y=y_fold_train)
        preds = forecaster.predict(steps=VALID_SIZE)

        # Metrics
        mse = mean_squared_error(y_fold_valid, preds)
        rmses.append(np.sqrt(mse))

    return np.mean(rmses)

## ดำเนินการ Grid Search

In [5]:
# 5. Grid Search Process


# Create all combinations
# Keys and values are read directly from the dict, so an empty PARAM_GRID
# yields the single default configuration {} instead of failing to unpack.
keys = tuple(PARAM_GRID)
values = tuple(PARAM_GRID.values())
param_dicts = [dict(zip(keys, combo)) for combo in itertools.product(*values)]

# Pair every hyper-parameter dict with every candidate lag setting.
full_combinations = list(itertools.product(param_dicts, LAGS_GRID))

print(f"Total Combinations: {len(full_combinations)}")

# Worker wrapper: a failing configuration must not abort the parallel run
def safe_evaluate(params, lags):
    """
    Score one configuration on y_train, capturing any failure.

    Returns a 4-tuple ``(score, params, lags, error)``. On success ``error``
    is None; on failure ``score`` is None and ``error`` holds the exception
    message as a string.
    """
    try:
        score = evaluate_one_config(y_train, params, lags)
    except Exception as exc:
        return (None, params, lags, str(exc))
    return (score, params, lags, None)

# Run
# Dispatch every configuration to the worker pool. The generator return mode
# hands results back as they are consumed, which lets tqdm show progress.
results_gen = Parallel(n_jobs=N_JOBS, return_as="generator")(
    delayed(safe_evaluate)(params, lags) for params, lags in full_combinations
)

# Drain the generator into a list while updating the progress bar.
results_list = list(
    tqdm(results_gen, total=len(full_combinations), desc="Progress")
)

Total Combinations: 128


Progress: 100%|██████████| 128/128 [12:56<00:00,  6.07s/it]


## ผลลัพธ์การ Grid Search

In [6]:
# 6. Results

# Separate successful scores from captured failures.
valid_results = [(s, p, lags) for s, p, lags, e in results_list if s is not None]
failed_results = [(p, lags, e) for s, p, lags, e in results_list if s is None]

if failed_results:
    # Failures come back as error strings; report them instead of dropping
    # them silently.
    print(f"{len(failed_results)} configuration(s) failed. "
          f"First error: {failed_results[0][2]}")

if valid_results:
    # One row per configuration: hyper-parameters, lags and the CV score.
    data_res = []
    for score, params, lags in valid_results:
        row = params.copy()
        row['lags'] = lags
        row['RMSE'] = float(score)
        data_res.append(row)

    df_results = pd.DataFrame(data_res)
    df_results = df_results.sort_values(by='RMSE', ascending=True)

    print("\n Top 5 Best Configurations:")
    print(df_results.head(5))

    # Best
    best_row = df_results.iloc[0]
    # A row Series upcasts integer hyper-parameters to float (max_depth 12.0).
    # Sorting keeps the original row labels, so the first label points back
    # to the untouched dict with the original value types.
    best_params = data_res[df_results.index[0]]
    print("\n Best Params:")
    print(best_params)
else:
    print("No valid results found")


 Top 5 Best Configurations:
    max_depth  learning_rate  min_child_weight  gamma  reg_lambda  \
72         12           0.01                 1   0.10         0.1   
64         12           0.01                 1   0.05         0.1   
68         12           0.01                 1   0.05         0.2   
76         12           0.01                 1   0.10         0.2   
88         12           0.01                 3   0.10         0.1   

    n_estimators  lags       RMSE  
72            50    40  92.356971  
64            50    40  92.374763  
68            50    40  93.784437  
76            50    40  93.800911  
88            50    40  97.488171  

 Best Params:
{'max_depth': 12.0, 'learning_rate': 0.01, 'min_child_weight': 1.0, 'gamma': 0.1, 'reg_lambda': 0.1, 'n_estimators': 50.0, 'lags': 40.0, 'RMSE': 92.35697107454187}
