## Initial Code

In [None]:
# Importing necessary libraries for data analysis and manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# For handling warnings
import warnings
# Blanket filter: every warning category is silenced for the rest of the
# notebook, so deprecation and shape-mismatch notices raised by libraries in
# later cells are never displayed.
warnings.filterwarnings('ignore')

In [None]:
# Colab-only step: mount Google Drive at /content/drive/ so files stored under
# MyDrive can be read by path.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# Load AAPL.csv from the mounted Drive folder; the later cells read its
# Date, Open, High, Low, Close, Adj Close and Volume columns.
df_aapl = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/stocks/AAPL.csv')

In [None]:
import numpy as np
from scipy.stats import boxcox

# Three candidate skew-reducing transforms of the closing price.
# The log and Box-Cox variants are computed on Close + 1; the square root uses
# the raw value. The lambda fitted by Box-Cox is discarded.
close_prices = df_aapl['Close']
close_shifted = close_prices + 1

df_aapl['Close_log'] = np.log(close_shifted)
df_aapl['Close_sqrt'] = np.sqrt(close_prices)
close_boxcox_values, _ = boxcox(close_shifted)
df_aapl['Close_boxcox'] = close_boxcox_values


This code calculates the skewness of the 'Close' column in the `df_aapl` DataFrame before and after applying various transformations:

1. **Original Skewness**: Calculates the skewness of the original 'Close' data.
2. **Log Transformation Skewness**: Calculates the skewness of the 'Close_log' column after applying the log transformation.
3. **Square Root Transformation Skewness**: Calculates the skewness of the 'Close_sqrt' column after applying the square root transformation.
4. **Box-Cox Transformation Skewness**: Calculates the skewness of the 'Close_boxcox' column after applying the Box-Cox transformation.

The printed results help assess how each transformation affects the distribution's symmetry and the success of skewness correction.







In [None]:

# Skewness of the raw closing price and of each transformed variant.
close_frame = df_aapl
skew_original = close_frame['Close'].skew()
skew_log = close_frame['Close_log'].skew()
skew_sqrt = close_frame['Close_sqrt'].skew()
skew_boxcox = close_frame['Close_boxcox'].skew()

# One line per variant, in the same order as the transforms were created.
for report_line in (
    f"Original Skewness: {skew_original}",
    f"Log Transformation Skewness: {skew_log}",
    f"Square Root Transformation Skewness: {skew_sqrt}",
    f"Box-Cox Transformation Skewness: {skew_boxcox}",
):
    print(report_line)


Original Skewness: 2.5045276102319933
Log Transformation Skewness: 0.8535555176510303
Square Root Transformation Skewness: 1.6211545809555206
Box-Cox Transformation Skewness: 0.43527466713563334


In [None]:

from scipy.stats import boxcox

price_columns = ['Open', 'High', 'Low', 'Adj Close']
price_and_volume_columns = price_columns + ['Volume']

# Natural log of every raw column (no +1 shift is applied here).
for column in price_and_volume_columns:
    df_aapl[column + '_log'] = np.log(df_aapl[column])

# Square root of every raw column.
for column in price_and_volume_columns:
    df_aapl[column + '_sqrt'] = np.sqrt(df_aapl[column])

# Box-Cox on the four price columns only (Volume is left out); the fitted
# lambda returned alongside each transformed array is discarded.
for column in price_columns:
    df_aapl[column + '_boxcox'], _ = boxcox(df_aapl[column])

This helps compare how the transformations reduce skewness in the data, aiming for a more normal distribution.

In [None]:

raw_columns = ['Open', 'High', 'Low', 'Adj Close', 'Volume']
log_columns = ['Open_log', 'High_log', 'Low_log', 'Adj Close_log']
sqrt_columns = ['Open_sqrt', 'High_sqrt', 'Low_sqrt', 'Adj Close_sqrt', 'Volume_sqrt']
boxcox_columns = ['Open_boxcox', 'High_boxcox', 'Low_boxcox', 'Adj Close_boxcox']

skewness_before = df_aapl[raw_columns].skew()
# 'Volume_log' exists in df_aapl but is not part of this comparison.
skewness_after = df_aapl[log_columns + sqrt_columns + boxcox_columns].skew()

print("Skewness Before Transformation:\n", skewness_before)
print("\nSkewness After Transformation:\n", skewness_after)


Skewness Before Transformation:
 Open         2.504632
High         2.502208
Low          2.506714
Adj Close    2.550677
Volume       3.565699
dtype: float64

Skewness After Transformation:
 Open_log            0.482872
High_log            0.481997
Low_log             0.484246
Adj Close_log       0.494009
Open_sqrt           1.620771
High_sqrt           1.621456
Low_sqrt            1.620661
Adj Close_sqrt      1.679402
Volume_sqrt         1.299776
Open_boxcox         0.181226
High_boxcox         0.179749
Low_boxcox          0.182882
Adj Close_boxcox    0.180085
dtype: float64


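- Re-applied the Box-Cox transformation to the 'Open', 'High', 'Low', 'Adj Close', and 'Close' columns, this time after shifting each column by +1; the resulting columns overwrite the earlier Box-Cox columns.
- Recalculated skewness after the transformation to check how much the skew was reduced before modeling.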

In [None]:
from scipy import stats

# Refit Box-Cox on each price column shifted by +1, overwriting the *_boxcox
# columns created earlier. Each transform is fitted on the full column and the
# fitted lambda is discarded.
shifted_boxcox_sources = ['Open', 'High', 'Low', 'Adj Close', 'Close']
for source_column in shifted_boxcox_sources:
    transformed_values, _ = stats.boxcox(df_aapl[source_column] + 1)
    df_aapl[source_column + '_boxcox'] = transformed_values

boxcox_result_columns = [name + '_boxcox' for name in shifted_boxcox_sources]
skewness_after_boxcox = df_aapl[boxcox_result_columns].skew()

print("Skewness After Box-Cox Transformation:")
print(skewness_after_boxcox)


Skewness After Box-Cox Transformation:
Open_boxcox         0.435237
High_boxcox         0.433381
Low_boxcox          0.437331
Adj Close_boxcox    0.458762
Close_boxcox        0.435275
dtype: float64


### Feature Selection

In [None]:

# Keep the date, the raw OHLCV columns and the shifted Box-Cox price columns;
# the log and square-root variants stay behind in df_aapl.
raw_feature_columns = ['Date', 'Open', 'High', 'Low', 'Adj Close', 'Close', 'Volume']
boxcox_feature_columns = ['Open_boxcox', 'High_boxcox', 'Low_boxcox',
                          'Adj Close_boxcox', 'Close_boxcox']
df_aapl_cleaned = df_aapl[raw_feature_columns + boxcox_feature_columns]

print(df_aapl_cleaned.head())


         Date      Open      High       Low  Adj Close     Close     Volume  \
0  1980-12-12  0.128348  0.128906  0.128348   0.098943  0.128348  469033600   
1  1980-12-15  0.122210  0.122210  0.121652   0.093781  0.121652  175884800   
2  1980-12-16  0.113281  0.113281  0.112723   0.086898  0.112723  105728000   
3  1980-12-17  0.115513  0.116071  0.115513   0.089049  0.115513   86441600   
4  1980-12-18  0.118862  0.119420  0.118862   0.091630  0.118862   73449600   

   Open_boxcox  High_boxcox  Low_boxcox  Adj Close_boxcox  Close_boxcox  
0     0.117689     0.118173    0.117674          0.092374      0.117689  
1     0.112503     0.112516    0.112016          0.087857      0.112030  
2     0.104886     0.104897    0.104395          0.081785      0.104407  
3     0.106798     0.107287    0.106786          0.083688      0.106798  
4     0.109657     0.110145    0.109644          0.085966      0.109657  


### Train Validation Test Split

The code splits the data into training, validation, and test sets. The features `X` and target `Y` are split as follows:

- 70% for training (`X_train`, `Y_train`)
- 15% for validation (`X_val`, `Y_val`)
- 15% for testing (`X_test`, `Y_test`)

The split is done in two steps without shuffling, because this is time-series data: first the last 30% of the rows is held out, then that held-out 30% is split in half into the validation and test sets.

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the Box-Cox open/high/low of a row; the target is the Box-Cox
# close of that same row.
feature_columns = ['Open_boxcox', 'High_boxcox', 'Low_boxcox']
X = df_aapl_cleaned[feature_columns]
Y = df_aapl_cleaned['Close_boxcox']

# Chronological 70/15/15 split: shuffle=False keeps row order, so the first
# 70% of rows train and the held-out 30% is halved into validation and test.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X, Y, test_size=0.3, shuffle=False
)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp, Y_temp, test_size=0.5, shuffle=False
)

print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")


Training set: (7736, 3), Validation set: (1658, 3), Test set: (1658, 3)


## GPU Activation

In [None]:
import torch

# Pick the compute device once and report which one was selected.
gpu_available = torch.cuda.is_available()
device = torch.device("cuda" if gpu_available else "cpu")
if gpu_available:
    print("GPU is enabled:", torch.cuda.get_device_name(0))
else:
    print("No GPU found, using CPU.")


GPU is enabled: Tesla T4


## GRU - xgb

### xgb initial

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define GRU Model
class GRUModel(nn.Module):
    """Stacked GRU followed by a linear layer that emits one value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each GRU layer's hidden state.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Map a batch-first input (batch, seq_len, input_size) to a (batch, 1) output."""
        batch_size = x.size(0)
        # Explicit all-zero initial hidden state, created on the input's device.
        initial_hidden = torch.zeros(
            self.num_layers, batch_size, self.hidden_size, device=x.device
        )
        sequence_output, _ = self.gru(x, initial_hidden)
        # Only the final time step feeds the linear head.
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

# Set Parameters
input_size = 3
hidden_size = 64
num_layers_list = [2, 3, 5]  # Different GRU layers
learning_rate = 0.001
num_epochs = 100

# MinMax Scaling (helps GRU). The scaler is fitted on the training targets only
# and reused for validation/test, so those splits may fall outside [0, 1].
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors.
# Features: (N, features) -> (N, 1, features), i.e. a single time step per
# sample for the batch-first GRU.
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)

# Targets: the scaled arrays are already (N, 1), which is exactly the shape the
# model returns, so no extra axis is added. An additional unsqueeze would make
# them (N, 1, 1); MSELoss would then broadcast (N, 1) against (N, 1, 1) into an
# (N, N, 1) grid and push every prediction towards the mean target.
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).to(device)

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    """Return (MAE, MSE, RMSE, R², MAPE in percent) for targets and predictions."""
    abs_error = mean_absolute_error(y_true, y_pred)
    squared_error = mean_squared_error(y_true, y_pred)
    root_squared_error = np.sqrt(squared_error)
    determination = r2_score(y_true, y_pred)
    # Floor the denominator at 1e-8 so zero-valued targets do not divide by zero.
    safe_denominator = np.maximum(np.abs(y_true), 1e-8)
    relative_error = np.abs((y_true - y_pred) / safe_denominator)
    percentage_error = np.mean(relative_error) * 100
    return abs_error, squared_error, root_squared_error, determination, percentage_error

# Results table: one row per (model, dataset split) with the five metrics
# returned by compute_metrics.
columns = ["Layers", "Dataset", "MAE", "MSE", "RMSE", "R²", "MAPE"]
results_df = pd.DataFrame(columns=columns)

# Train one GRU per depth in num_layers_list.
gru_outputs = {}  # Per-depth GRU predictions (inverse-scaled scalars, not hidden states) reused as the XGBoost input

for num_layers in num_layers_list:
    print(f"\nTraining GRU with {num_layers} layers...")

    # Initialize model, loss function, and optimizer
    model = GRUModel(input_size, hidden_size, num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop: full-batch, so each epoch is a single optimizer step over
    # the whole training tensor.
    for epoch in range(num_epochs):
        model.train()
        outputs = model(X_train_torch)
        # NOTE(review): `outputs` is (N, 1); Y_train_torch must have exactly the
        # same shape, otherwise MSELoss broadcasts the two against each other.
        loss = criterion(outputs, Y_train_torch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Evaluation: predictions for all three splits, in scaled target units.
    model.eval()
    with torch.no_grad():
        train_pred = model(X_train_torch).cpu().numpy()
        val_pred = model(X_val_torch).cpu().numpy()
        test_pred = model(X_test_torch).cpu().numpy()

    # Inverse transform predictions back to the units of Y_train / Y_val / Y_test.
    train_pred_actual = scaler.inverse_transform(train_pred.reshape(-1, 1))
    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))
    test_pred_actual = scaler.inverse_transform(test_pred.reshape(-1, 1))

    # Compute metrics for each dataset
    metrics_train = compute_metrics(Y_train.values.flatten(), train_pred_actual.flatten())
    metrics_val = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())
    metrics_test = compute_metrics(Y_test.values.flatten(), test_pred_actual.flatten())

    # Append results to DataFrame (three rows per depth).
    results_df = pd.concat([
        results_df,
        pd.DataFrame([
            [num_layers, "Train", *metrics_train],
            [num_layers, "Validation", *metrics_val],
            [num_layers, "Test", *metrics_test]
        ], columns=columns)
    ], ignore_index=True)

    # Keep this depth's (N, 1) prediction arrays for the XGBoost stage.
    gru_outputs[num_layers] = {
        "train": train_pred_actual,
        "val": val_pred_actual,
        "test": test_pred_actual
    }

# Find the best GRU model (Lowest Validation MAPE)
best_model = results_df[results_df["Dataset"] == "Validation"].sort_values("MAPE").iloc[0]
best_layers = int(best_model["Layers"])

# Display GRU Results
print("\nGRU Model Performance Comparison (2, 3, and 5 Layers)\n")
print(results_df.to_string(index=False))

print(f"\nBest GRU Model: {best_layers} Layers (Based on Lowest Validation MAPE)\n")

# ---------- XGBoost on the best GRU's predictions ----------
print("\nTraining XGBoost on Best GRU Embeddings...")

# The only XGBoost input feature is the best GRU's predicted value per row.
X_train_xgb = gru_outputs[best_layers]["train"]
X_val_xgb = gru_outputs[best_layers]["val"]
X_test_xgb = gru_outputs[best_layers]["test"]

# XGBoost Model
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.05)
xgb_model.fit(X_train_xgb, Y_train.values)

# Predictions
train_pred_xgb = xgb_model.predict(X_train_xgb)
val_pred_xgb = xgb_model.predict(X_val_xgb)
test_pred_xgb = xgb_model.predict(X_test_xgb)

# Compute metrics for XGBoost
metrics_train_xgb = compute_metrics(Y_train.values.flatten(), train_pred_xgb.flatten())
metrics_val_xgb = compute_metrics(Y_val.values.flatten(), val_pred_xgb.flatten())
metrics_test_xgb = compute_metrics(Y_test.values.flatten(), test_pred_xgb.flatten())

# Append XGBoost results to DataFrame; the "Layers" column holds a text label
# for these rows instead of an integer depth.
results_df = pd.concat([
    results_df,
    pd.DataFrame([
        [f"GRU({best_layers}) + XGBoost", "Train", *metrics_train_xgb],
        [f"GRU({best_layers}) + XGBoost", "Validation", *metrics_val_xgb],
        [f"GRU({best_layers}) + XGBoost", "Test", *metrics_test_xgb]
    ], columns=columns)
], ignore_index=True)

# Display Final Results
print("\nFinal Model Performance (GRU vs GRU + XGBoost)\n")
print(results_df.to_string(index=False))

# Best Model Selection across the plain GRUs and the stacked model.
best_overall = results_df[results_df["Dataset"] == "Validation"].sort_values("MAPE").iloc[0]
print(f"\nBest Overall Model: {best_overall['Layers']} (Based on Lowest Validation MAPE)")



Training GRU with 2 layers...
Epoch [10/100], Loss: 0.0734
Epoch [20/100], Loss: 0.0733
Epoch [30/100], Loss: 0.0715
Epoch [40/100], Loss: 0.0717
Epoch [50/100], Loss: 0.0715
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training GRU with 3 layers...
Epoch [10/100], Loss: 0.1090
Epoch [20/100], Loss: 0.0729
Epoch [30/100], Loss: 0.0734
Epoch [40/100], Loss: 0.0724
Epoch [50/100], Loss: 0.0716
Epoch [60/100], Loss: 0.0716
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training GRU with 5 layers...
Epoch [10/100], Loss: 0.0731
Epoch [20/100], Loss: 0.0729
Epoch [30/100], Loss: 0.0717
Epoch [40/100], Loss: 0.0715
Epoch [50/100], Loss: 0.0715
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

GRU Model Performance Comparison 

In [None]:
!pip install xgboost



### xgb optuna

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import xgboost as xgb
import pandas as pd
import optuna
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define GRU Model
class GRUModel(nn.Module):
    """GRU stack with a single-output linear head on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each GRU layer's hidden state.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Map a batch-first input (batch, seq_len, input_size) to a (batch, 1) output."""
        # Zero initial hidden state, allocated directly on the input's device.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        initial_hidden = torch.zeros(*state_shape, device=x.device)
        sequence_output, _ = self.gru(x, initial_hidden)
        final_step = sequence_output[:, -1, :]
        return self.fc(final_step)

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    """Return (MAE, MSE, RMSE, R², MAPE in percent) for targets and predictions."""
    mae_value = mean_absolute_error(y_true, y_pred)
    mse_value = mean_squared_error(y_true, y_pred)
    rmse_value = np.sqrt(mse_value)
    r2_value = r2_score(y_true, y_pred)
    # The denominator is floored at 1e-8 to avoid dividing by a zero target.
    floored_targets = np.maximum(np.abs(y_true), 1e-8)
    mape_value = np.mean(np.abs((y_true - y_pred) / floored_targets)) * 100
    return mae_value, mse_value, rmse_value, r2_value, mape_value

# MinMax Scaling
# The scaler is fitted on the training targets only and reused for the
# validation/test targets.
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors.
# Features: (N, features) -> (N, 1, features), a single time step per sample.
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)

# Targets stay (N, 1) to match the model output. With an extra axis they would
# be (N, 1, 1) and MSELoss would broadcast (N, 1) against (N, 1, 1) into an
# (N, N, 1) grid, which trains the network towards the mean target.
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).to(device)

# Optuna GRU Optimization
def objective_gru(trial):
    """Optuna objective: train a GRU with sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample ``num_layers``, ``hidden_size`` and
            ``learning_rate``.

    Returns:
        Validation MAPE (percent) of the inverse-scaled predictions; the study
        minimizes this value.
    """
    num_layers = trial.suggest_int("num_layers", 2, 5)
    hidden_size = trial.suggest_int("hidden_size", 32, 128)
    # suggest_float(log=True) is the supported replacement for the deprecated
    # suggest_loguniform; parameter name and range are unchanged.
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    num_epochs = 100

    # Train GRU (full-batch: one optimizer step per epoch).
    model = GRUModel(input_size=3, hidden_size=hidden_size, num_layers=num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    model.train()
    for epoch in range(num_epochs):
        outputs = model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate GRU on the validation split.
    model.eval()
    with torch.no_grad():
        val_pred = model(X_val_torch).cpu().numpy()

    # Undo the target scaling so MAPE is measured in the units of Y_val.
    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))
    mape = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())[-1]  # MAPE

    return mape  # Minimize Validation MAPE

# Run Optuna for GRU
study_gru = optuna.create_study(direction="minimize")
study_gru.optimize(objective_gru, n_trials=20)
best_gru_params = study_gru.best_params

# Train Best GRU Model: a new network is built from the winning
# hyperparameters and trained again for 100 full-batch epochs.
best_gru_model = GRUModel(
    input_size=3,
    hidden_size=best_gru_params["hidden_size"],
    num_layers=best_gru_params["num_layers"],
).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_gru_model.parameters(), lr=best_gru_params["learning_rate"])

best_gru_model.train()
for epoch in range(100):
    outputs = best_gru_model(X_train_torch)
    loss = criterion(outputs, Y_train_torch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# The retrained GRU's scalar predictions (still in scaled target units) become
# the single input feature for XGBoost.
best_gru_model.eval()
with torch.no_grad():
    X_train_xgb, X_val_xgb, X_test_xgb = (
        best_gru_model(split_inputs).cpu().numpy()
        for split_inputs in (X_train_torch, X_val_torch, X_test_torch)
    )

# Optuna XGBoost Optimization
def objective_xgb(trial):
    """Optuna objective: fit XGBoost on the GRU predictions and score it.

    Args:
        trial: Optuna trial used to sample the XGBoost hyperparameters.

    Returns:
        Validation MAPE (percent); the study minimizes this value.
    """
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform
    # helpers; names and ranges are unchanged.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "objective": "reg:squarederror"
    }

    xgb_model = xgb.XGBRegressor(**params)
    xgb_model.fit(X_train_xgb, Y_train.values)

    val_pred_xgb = xgb_model.predict(X_val_xgb)
    mape = compute_metrics(Y_val.values.flatten(), val_pred_xgb.flatten())[-1]  # MAPE

    return mape  # Minimize Validation MAPE

# Run Optuna for XGBoost
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=20)
best_xgb_params = study_xgb.best_params

# Refit XGBoost with the winning sampled parameters on the GRU predictions.
best_xgb_model = xgb.XGBRegressor(**best_xgb_params)
best_xgb_model.fit(X_train_xgb, Y_train.values)

# Predictions for every split.
train_pred_xgb = best_xgb_model.predict(X_train_xgb)
val_pred_xgb = best_xgb_model.predict(X_val_xgb)
test_pred_xgb = best_xgb_model.predict(X_test_xgb)

# Final metrics: (MAE, MSE, RMSE, R², MAPE) per split.
metrics_train_xgb = compute_metrics(Y_train.values.flatten(), train_pred_xgb.flatten())
metrics_val_xgb = compute_metrics(Y_val.values.flatten(), val_pred_xgb.flatten())
metrics_test_xgb = compute_metrics(Y_test.values.flatten(), test_pred_xgb.flatten())

# Report: the labels carry their own padding so the metric columns line up.
print("\nFinal Model Performance (GRU + XGBoost):\n")
for split_label, split_metrics in (
    ("Training:    ", metrics_train_xgb),
    ("Validation:  ", metrics_val_xgb),
    ("Test:        ", metrics_test_xgb),
):
    print(f"{split_label}MAE={split_metrics[0]:.4f}, MSE={split_metrics[1]:.4f}, RMSE={split_metrics[2]:.4f}, R²={split_metrics[3]:.4f}, MAPE={split_metrics[4]:.2f}%")

print("\nBest GRU Parameters:", best_gru_params)
print("Best XGBoost Parameters:", best_xgb_params)


[I 2025-02-13 05:22:01,086] A new study created in memory with name: no-name-931e6b07-adda-4472-833b-4dd74f3518b5
[I 2025-02-13 05:22:05,075] Trial 0 finished with value: 72.33646716364942 and parameters: {'num_layers': 3, 'hidden_size': 99, 'learning_rate': 0.00024761085506235436}. Best is trial 0 with value: 72.33646716364942.
[I 2025-02-13 05:22:07,771] Trial 1 finished with value: 74.14669973103528 and parameters: {'num_layers': 2, 'hidden_size': 105, 'learning_rate': 0.0014432661317700867}. Best is trial 0 with value: 72.33646716364942.
[I 2025-02-13 05:22:13,804] Trial 2 finished with value: 73.98521400889024 and parameters: {'num_layers': 3, 'hidden_size': 128, 'learning_rate': 0.0056635588935976125}. Best is trial 0 with value: 72.33646716364942.
[I 2025-02-13 05:22:17,059] Trial 3 finished with value: 73.94752857490033 and parameters: {'num_layers': 4, 'hidden_size': 81, 'learning_rate': 0.004318020227720887}. Best is trial 0 with value: 72.33646716364942.
[I 2025-02-13 05:22:


Final Model Performance (GRU + XGBoost):

Training:    MAE=0.0035, MSE=0.0000, RMSE=0.0054, R²=0.9998, MAPE=1.02%
Validation:  MAE=0.1570, MSE=0.0305, RMSE=0.1747, R²=-4.1882, MAPE=8.81%
Test:        MAE=0.4260, MSE=0.1875, RMSE=0.4330, R²=-30.0683, MAPE=21.01%

Best GRU Parameters: {'num_layers': 4, 'hidden_size': 56, 'learning_rate': 0.00020713388255328886}
Best XGBoost Parameters: {'n_estimators': 219, 'learning_rate': 0.09826901295532306, 'max_depth': 5, 'subsample': 0.5042389444049517, 'colsample_bytree': 0.8291903158037685}


### xgb bohb

In [None]:
!pip install hpbandster ConfigSpace

Collecting hpbandster
  Downloading hpbandster-0.7.4.tar.gz (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ConfigSpace
  Downloading configspace-1.2.1.tar.gz (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.0/131.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting Pyro4 (from hpbandster)
  Downloading Pyro4-4.82-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting serpent (from hpbandster)
  Downloading serpent-1.41-py3-none-any.whl.metadata (5.8 kB)
Collecting netifaces (from hpbandster)
  Downloading netifaces-0.11.0.t

In [None]:
import numpy as np
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import hpbandster.core.nameserver as hpns
from hpbandster.optimizers import BOHB
from hpbandster.core.worker import Worker
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Define GRU Model
class GRUModel(nn.Module):
    """GRU stack whose last time step feeds a linear output layer.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Width of each GRU layer's hidden state.
        num_layers: Number of stacked GRU layers.
        output_dim: Number of values produced per sequence.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Map a batch-first input (batch, seq_len, input_dim) to (batch, output_dim)."""
        # No initial state is passed, so the GRU uses its default one.
        sequence_output, _ = self.gru(x)
        last_step = sequence_output[:, -1, :]
        return self.fc(last_step)

# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Return (MAE, MSE, RMSE, R², MAPE in percent) for targets and predictions."""
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        squared_error,
        np.sqrt(squared_error),
        r2_score(y_true, y_pred),
        # scikit-learn returns MAPE as a fraction; scale it to percent.
        mean_absolute_percentage_error(y_true, y_pred) * 100,
    )

# Convert datasets to PyTorch tensors (kept on the CPU in this cell).
# Targets are unscaled and become (N, 1), matching the model output shape.
Y_train_torch, Y_val_torch, Y_test_torch = (
    torch.tensor(target.values, dtype=torch.float32).unsqueeze(1)
    for target in (Y_train, Y_val, Y_test)
)

# Features become (N, 1, features): a single time step per sample.
X_train_torch, X_val_torch, X_test_torch = (
    torch.tensor(frame.values, dtype=torch.float32).unsqueeze(1)
    for frame in (X_train, X_val, X_test)
)

# GRU Configurations (2, 3, and 5 layers)
gru_layers = [2, 3, 5]
hidden_dim = 64
output_dim = 1
input_dim = X_train.shape[1]

# Per-depth GRU outputs, stored as (train, val, test) arrays of predictions.
gru_features = {}

for num_layers in gru_layers:
    print(f"Training GRU with {num_layers} layers...")

    gru_model = GRUModel(input_dim, hidden_dim, num_layers, output_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
    num_epochs = 100

    # Full-batch training: one optimizer step per epoch.
    gru_model.train()
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = gru_model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)
        loss.backward()
        optimizer.step()

    # The model's final outputs for each split are what XGBoost consumes later.
    gru_model.eval()
    with torch.no_grad():
        train_features, val_features, test_features = (
            gru_model(split_inputs).numpy()
            for split_inputs in (X_train_torch, X_val_torch, X_test_torch)
        )

    gru_features[num_layers] = (train_features, val_features, test_features)

# Define ConfigSpace for BOHB
def get_config_space():
    """Build the XGBoost hyperparameter search space handed to BOHB."""
    search_space = CS.ConfigurationSpace()
    hyperparameters = (
        CSH.UniformIntegerHyperparameter("n_estimators", 50, 500, default_value=100),
        CSH.UniformFloatHyperparameter("learning_rate", 0.01, 0.3, default_value=0.1),
        CSH.UniformIntegerHyperparameter("max_depth", 3, 10, default_value=6),
        CSH.UniformFloatHyperparameter("subsample", 0.5, 1.0, default_value=0.8),
        CSH.UniformFloatHyperparameter("colsample_bytree", 0.5, 1.0, default_value=0.8),
    )
    # Registered one by one, in the order listed above.
    for hyperparameter in hyperparameters:
        search_space.add_hyperparameter(hyperparameter)
    return search_space

# BOHB Worker for XGBoost
class XGBoostWorker(Worker):
    """BOHB worker that scores one XGBoost configuration on fixed feature arrays.

    The regression targets are not passed in; they are read from the
    notebook-level ``Y_train`` and ``Y_val``.
    """

    def __init__(self, train_features, val_features, **kwargs):
        """Store the feature arrays; remaining keyword arguments go to ``Worker``."""
        super().__init__(**kwargs)
        self.train_features = train_features
        self.val_features = val_features

    def compute(self, config, budget, **kwargs):
        """Fit XGBoost with ``config`` and return the validation MAE as the loss.

        ``budget`` is accepted but not used: every budget level trains the
        same model for a given configuration.
        """
        tuned_keys = ("n_estimators", "learning_rate", "max_depth",
                      "subsample", "colsample_bytree")
        regressor = xgb.XGBRegressor(
            random_state=42,
            **{key: config[key] for key in tuned_keys},
        )
        regressor.fit(self.train_features, Y_train)
        validation_pred = regressor.predict(self.val_features)
        validation_mae = mean_absolute_error(Y_val, validation_pred)
        return {"loss": validation_mae, "info": config}

# Run BOHB for each GRU configuration
# Best tuned XGBoost parameters and metrics, keyed by GRU depth.
best_models = {}

for num_layers in gru_layers:
    print(f"\nRunning BOHB for GRU ({num_layers} layers) + XGBoost...")

    train_features, val_features, test_features = gru_features[num_layers]
    run_id = f"gru_{num_layers}_xgb_bohb"

    # Start NameServer; worker and optimizer find each other through it.
    NS = hpns.NameServer(run_id=run_id, host="127.0.0.1", port=None)
    NS.start()
    try:
        worker = XGBoostWorker(
            train_features=train_features,
            val_features=val_features,
            nameserver="127.0.0.1",
            run_id=run_id
        )
        worker.run(background=True)

        bohb = BOHB(
            configspace=get_config_space(),
            run_id=run_id,
            nameserver="127.0.0.1",
            min_budget=1,
            max_budget=3
        )
        try:
            res = bohb.run(n_iterations=50)
        finally:
            # shutdown_workers=True also stops the background worker started
            # above; the default leaves it running after every iteration.
            bohb.shutdown(shutdown_workers=True)
    finally:
        # Always release the nameserver, even if the search raised.
        NS.shutdown()

    # Retrieve Best Configuration
    best_config = res.get_incumbent_id()
    best_params = res.get_id2config_mapping()[best_config]["config"]

    # Train Best XGB Model on GRU Features
    best_xgb_model = xgb.XGBRegressor(
        n_estimators=best_params["n_estimators"],
        learning_rate=best_params["learning_rate"],
        max_depth=best_params["max_depth"],
        subsample=best_params["subsample"],
        colsample_bytree=best_params["colsample_bytree"],
        random_state=42
    )

    best_xgb_model.fit(train_features, Y_train)

    # Make Predictions
    Y_train_pred = best_xgb_model.predict(train_features)
    Y_val_pred = best_xgb_model.predict(val_features)
    Y_test_pred = best_xgb_model.predict(test_features)

    # Calculate Metrics: (MAE, MSE, RMSE, R², MAPE) per split.
    train_metrics = calculate_metrics(Y_train, Y_train_pred)
    val_metrics = calculate_metrics(Y_val, Y_val_pred)
    test_metrics = calculate_metrics(Y_test, Y_test_pred)

    # Store best parameters and metrics for this depth.
    best_models[num_layers] = {
        "params": best_params,
        "train_metrics": train_metrics,
        "val_metrics": val_metrics,
        "test_metrics": test_metrics
    }

    # Print Results
    print(f"\nBest Parameters for GRU ({num_layers} layers) + XGBoost:")
    print(best_params)

    print("\nTraining set metrics:")
    print(f"MAE: {train_metrics[0]:.4f}, MSE: {train_metrics[1]:.4f}, RMSE: {train_metrics[2]:.4f}, R²: {train_metrics[3]:.4f}, MAPE: {train_metrics[4]:.2f}%")

    print("\nValidation set metrics:")
    print(f"MAE: {val_metrics[0]:.4f}, MSE: {val_metrics[1]:.4f}, RMSE: {val_metrics[2]:.4f}, R²: {val_metrics[3]:.4f}, MAPE: {val_metrics[4]:.2f}%")

    print("\nTest set metrics:")
    print(f"MAE: {test_metrics[0]:.4f}, MSE: {test_metrics[1]:.4f}, RMSE: {test_metrics[2]:.4f}, R²: {test_metrics[3]:.4f}, MAPE: {test_metrics[4]:.2f}%")


Training GRU with 2 layers...
Training GRU with 3 layers...
Training GRU with 5 layers...

Running BOHB for GRU (2 layers) + XGBoost...

Best Parameters for GRU (2 layers) + XGBoost:
{'colsample_bytree': 0.616514951443, 'learning_rate': 0.2351215327059, 'max_depth': 9, 'n_estimators': 268, 'subsample': 0.5158514159354}

Training set metrics:
MAE: 0.0036, MSE: 0.0000, RMSE: 0.0055, R²: 0.9998, MAPE: 1.03%

Validation set metrics:
MAE: 0.1569, MSE: 0.0305, RMSE: 0.1746, R²: -4.1838, MAPE: 8.81%

Test set metrics:
MAE: 0.4259, MSE: 0.1874, RMSE: 0.4329, R²: -30.0568, MAPE: 21.01%

Running BOHB for GRU (3 layers) + XGBoost...

Best Parameters for GRU (3 layers) + XGBoost:
{'colsample_bytree': 0.5077275834783, 'learning_rate': 0.2281085151797, 'max_depth': 10, 'n_estimators': 97, 'subsample': 0.5234115229348}

Training set metrics:
MAE: 0.0036, MSE: 0.0000, RMSE: 0.0055, R²: 0.9998, MAPE: 1.03%

Validation set metrics:
MAE: 0.1567, MSE: 0.0304, RMSE: 0.1744, R²: -4.1705, MAPE: 8.79%

Test s

## GRU - catboost

### initial

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration: use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define GRU Model
class GRUModel(nn.Module):
    """Stacked GRU followed by a linear head that emits one value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden state of every GRU layer.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Map a ``(batch, seq_len, input_size)`` tensor to a ``(batch, 1)`` tensor."""
        # nn.GRU starts from an all-zero hidden state when none is passed, created
        # with the input's dtype and device. This replaces a float32 CPU tensor that
        # was built and copied to the device on every call.
        out, _ = self.gru(x)
        # Only the last time step feeds the regression head.
        return self.fc(out[:, -1, :])

# Set Parameters
input_size = X_train.shape[1]  # features per time step, taken from the data
hidden_size = 64
num_layers_list = [2, 3, 5]
learning_rate = 0.001
num_epochs = 100

# MinMax Scaling of the target; the scaler is fitted on the training split only
# and then reused for validation/test.
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors.
# Features gain a length-1 sequence axis: (N, F) -> (N, 1, F).
# Targets are already (N, 1) after scaling and must stay that way to match the
# model output. An extra unsqueeze would make them (N, 1, 1), and nn.MSELoss would
# then broadcast prediction vs. target to (N, N, 1), which trains the network
# towards the target mean instead of the per-sample value.
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).to(device)

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    """Return ``(mae, mse, rmse, r2, mape)`` for a set of predictions.

    Args:
        y_true: Ground-truth values (any array-like).
        y_pred: Predicted values with the same number of elements as ``y_true``.

    Returns:
        Tuple of MAE, MSE, RMSE, R² and MAPE; MAPE is expressed in percent.
    """
    # Flatten both inputs so that (N,) vs. (N, 1) cannot silently broadcast to an
    # (N, N) matrix inside the hand-written MAPE below.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    # Clamp the denominator so that zero targets do not divide by zero.
    denominator = np.maximum(np.abs(y_true), 1e-8)
    mape = np.mean(np.abs((y_true - y_pred) / denominator)) * 100
    return mae, mse, rmse, r2, mape

# Result rows are collected in a list and turned into a DataFrame once, after the
# loop. Concatenating onto an empty DataFrame on every iteration is deprecated in
# recent pandas and leaves the metric columns with object dtype.
columns = ["Layers", "Dataset", "MAE", "MSE", "RMSE", "R²", "MAPE"]
result_rows = []

# Per-depth GRU outputs on the original target scale, keyed by number of layers.
# These are the (N, 1) predictions of the linear head, not hidden-state embeddings.
gru_outputs = {}

for num_layers in num_layers_list:
    print(f"\nTraining GRU with {num_layers} layers...")

    model = GRUModel(input_size, hidden_size, num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop: one full-batch gradient step per epoch.
    for epoch in range(num_epochs):
        model.train()
        outputs = model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Evaluation on all three splits without tracking gradients.
    model.eval()
    with torch.no_grad():
        train_pred = model(X_train_torch).cpu().numpy()
        val_pred = model(X_val_torch).cpu().numpy()
        test_pred = model(X_test_torch).cpu().numpy()

    # Undo the MinMax scaling so metrics are computed on the original target scale.
    train_pred_actual = scaler.inverse_transform(train_pred.reshape(-1, 1))
    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))
    test_pred_actual = scaler.inverse_transform(test_pred.reshape(-1, 1))

    # Compute metrics for each dataset
    metrics_train = compute_metrics(Y_train.values.flatten(), train_pred_actual.flatten())
    metrics_val = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())
    metrics_test = compute_metrics(Y_test.values.flatten(), test_pred_actual.flatten())

    result_rows.extend([
        [num_layers, "Train", *metrics_train],
        [num_layers, "Validation", *metrics_val],
        [num_layers, "Test", *metrics_test],
    ])

    # Keep this depth's predictions; the best depth feeds the boosting stage.
    gru_outputs[num_layers] = {
        "train": train_pred_actual,
        "val": val_pred_actual,
        "test": test_pred_actual
    }

results_df = pd.DataFrame(result_rows, columns=columns)

# Find the best GRU model (Lowest Validation MAPE)
best_model = results_df[results_df["Dataset"] == "Validation"].sort_values("MAPE").iloc[0]
best_layers = int(best_model["Layers"])

# Display GRU Results
print("\nGRU Model Performance Comparison (2, 3, and 5 Layers)\n")
print(results_df.to_string(index=False))

print(f"\nBest GRU Model: {best_layers} Layers (Based on Lowest Validation MAPE)\n")

# ---------- CatBoost on Best GRU Embeddings ----------
print("\nTraining CatBoost on Best GRU Embeddings...")

# The selected GRU's outputs for each split become CatBoost's input features.
best_gru_preds = gru_outputs[best_layers]
X_train_cat, X_val_cat, X_test_cat = (
    best_gru_preds[split] for split in ("train", "val", "test")
)

# CatBoost regressor; the validation split drives early stopping.
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function="RMSE",
    verbose=100,
)
cat_model.fit(
    X_train_cat,
    Y_train.values,
    eval_set=(X_val_cat, Y_val.values),
    early_stopping_rounds=100,
)

# Predict every split with the fitted booster.
train_pred_cat, val_pred_cat, test_pred_cat = (
    cat_model.predict(features) for features in (X_train_cat, X_val_cat, X_test_cat)
)

# Score the stacked model with the same metric helper used for the GRUs.
metrics_train_cat, metrics_val_cat, metrics_test_cat = (
    compute_metrics(actual.values.flatten(), predicted.flatten())
    for actual, predicted in (
        (Y_train, train_pred_cat),
        (Y_val, val_pred_cat),
        (Y_test, test_pred_cat),
    )
)

# Add the three CatBoost rows below the GRU rows.
cat_label = f"GRU({best_layers}) + CatBoost"
cat_rows = pd.DataFrame(
    [
        [cat_label, "Train", *metrics_train_cat],
        [cat_label, "Validation", *metrics_val_cat],
        [cat_label, "Test", *metrics_test_cat],
    ],
    columns=columns,
)
results_df = pd.concat([results_df, cat_rows], ignore_index=True)

# Display Final Results
print("\nFinal Model Performance (GRU vs GRU + CatBoost)\n")
print(results_df.to_string(index=False))

# The overall winner is the row with the lowest validation MAPE.
validation_rows = results_df[results_df["Dataset"] == "Validation"]
best_overall = validation_rows.sort_values("MAPE").iloc[0]
print(f"\nBest Overall Model: {best_overall['Layers']} (Based on Lowest Validation MAPE)")



Training GRU with 2 layers...
Epoch [10/100], Loss: 0.0899
Epoch [20/100], Loss: 0.0746
Epoch [30/100], Loss: 0.0731
Epoch [40/100], Loss: 0.0719
Epoch [50/100], Loss: 0.0716
Epoch [60/100], Loss: 0.0716
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training GRU with 3 layers...
Epoch [10/100], Loss: 0.0729
Epoch [20/100], Loss: 0.0732
Epoch [30/100], Loss: 0.0716
Epoch [40/100], Loss: 0.0716
Epoch [50/100], Loss: 0.0715
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training GRU with 5 layers...
Epoch [10/100], Loss: 0.0770
Epoch [20/100], Loss: 0.0742
Epoch [30/100], Loss: 0.0716
Epoch [40/100], Loss: 0.0717
Epoch [50/100], Loss: 0.0715
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

GRU Model Performance Comparison 

### optuna

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration: run on CUDA when available, else on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define GRU Model
class GRUModel(nn.Module):
    """Multi-layer GRU with a linear output layer producing one value per sample.

    Args:
        input_size: Feature count of each time step.
        hidden_size: Hidden-state size shared by all GRU layers.
        num_layers: How many GRU layers are stacked.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the ``(batch, 1)`` prediction for a ``(batch, seq_len, input_size)`` input."""
        # No explicit h0: nn.GRU defaults to zeros matching x's dtype and device,
        # so nothing is allocated on the CPU and copied on each call.
        out, _ = self.gru(x)
        # The head reads the final time step only.
        return self.fc(out[:, -1, :])

# MinMax Scaling of the target: fit on train, apply unchanged to validation/test.
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors.
# X: (N, F) -> (N, 1, F), i.e. every row is a sequence of length one.
# Y: kept at (N, 1), the shape of the model output. Unsqueezing the targets to
# (N, 1, 1) makes nn.MSELoss broadcast to (N, N, 1) and fit the target mean.
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).to(device)

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    """Compute regression metrics for one split.

    Args:
        y_true: Observed target values.
        y_pred: Model predictions, one per observed value.

    Returns:
        ``(mae, mse, rmse, r2, mape)`` where ``mape`` is a percentage.
    """
    # Work on flat arrays: an (N,) target against an (N, 1) prediction would
    # otherwise broadcast to (N, N) in the manual MAPE and give a wrong value.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    # The 1e-8 floor protects against division by zero for zero-valued targets.
    safe_abs_true = np.maximum(np.abs(y_true), 1e-8)
    mape = np.mean(np.abs((y_true - y_pred) / safe_abs_true)) * 100
    return mae, mse, rmse, r2, mape

# Optuna objective function
def objective(trial):
    """Train one GRU configuration and return its validation MAPE (in percent).

    Args:
        trial: Optuna trial used to sample ``hidden_size``, ``num_layers`` and
            ``learning_rate``.

    Returns:
        Validation MAPE computed on the original (inverse-scaled) target values;
        the study minimizes this number.
    """
    hidden_size = trial.suggest_categorical("hidden_size", [32, 64, 128])
    num_layers = trial.suggest_int("num_layers", 2, 5)
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform and samples from the same distribution.
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)

    model = GRUModel(input_size=3, hidden_size=hidden_size, num_layers=num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(50):  # Reduce epochs for faster tuning
        model.train()
        outputs = model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        val_pred = model(X_val_torch).cpu().numpy()

    # Score on the original scale so the objective matches the reported metrics.
    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))
    _, _, _, _, val_mape = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())

    return val_mape

# Search 20 GRU configurations, minimizing the objective's validation MAPE.
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

# Winning hyperparameters of the study.
best_params = study.best_params
print(f"\nBest GRU Hyperparameters: {best_params}")

# Rebuild the winning architecture and train it from scratch for 100 epochs.
best_gru = GRUModel(
    input_size=3,
    hidden_size=best_params["hidden_size"],
    num_layers=best_params["num_layers"],
).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_gru.parameters(), lr=best_params["learning_rate"])

# Nothing inside the loop leaves training mode, so it is set once up front.
best_gru.train()
for epoch in range(100):
    optimizer.zero_grad()
    outputs = best_gru(X_train_torch)
    loss = criterion(outputs, Y_train_torch)
    loss.backward()
    optimizer.step()

# Predict all three splits in inference mode.
best_gru.eval()
with torch.no_grad():
    train_pred, val_pred, test_pred = (
        best_gru(features).cpu().numpy()
        for features in (X_train_torch, X_val_torch, X_test_torch)
    )

# Map the scaled predictions back to the original target scale.
train_pred_actual, val_pred_actual, test_pred_actual = (
    scaler.inverse_transform(scaled.reshape(-1, 1))
    for scaled in (train_pred, val_pred, test_pred)
)

# Metrics per split.
metrics_train = compute_metrics(Y_train.values.flatten(), train_pred_actual.flatten())
metrics_val = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())
metrics_test = compute_metrics(Y_test.values.flatten(), test_pred_actual.flatten())

# GRU outputs per split, consumed by the boosting stage.
gru_outputs = dict(train=train_pred_actual, val=val_pred_actual, test=test_pred_actual)

# Display GRU Results
print("\nGRU Model Performance (Best Hyperparameters):\n")
print(f"Train: {metrics_train}\nValidation: {metrics_val}\nTest: {metrics_test}")

# ---------- CatBoost on Best GRU Embeddings ----------
print("\nTraining CatBoost on Best GRU Embeddings...")

# The tuned GRU's per-split outputs are the only features CatBoost sees.
X_train_cat, X_val_cat, X_test_cat = (
    gru_outputs[split] for split in ("train", "val", "test")
)

# CatBoost regressor with early stopping monitored on the validation split.
cat_model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function="RMSE",
    verbose=100,
)
cat_model.fit(
    X_train_cat,
    Y_train.values,
    eval_set=(X_val_cat, Y_val.values),
    early_stopping_rounds=100,
)

# Predictions for every split.
train_pred_cat, val_pred_cat, test_pred_cat = (
    cat_model.predict(features) for features in (X_train_cat, X_val_cat, X_test_cat)
)

# Metrics of the stacked model.
metrics_train_cat, metrics_val_cat, metrics_test_cat = (
    compute_metrics(actual.values.flatten(), predicted.flatten())
    for actual, predicted in (
        (Y_train, train_pred_cat),
        (Y_val, val_pred_cat),
        (Y_test, test_pred_cat),
    )
)

# Display Final Results
print("\nFinal Model Performance (GRU vs GRU + CatBoost)\n")
print(f"Train: {metrics_train_cat}\nValidation: {metrics_val_cat}\nTest: {metrics_test_cat}")

# Index 4 of a metrics tuple is MAPE; the stacked model wins only if strictly lower.
if metrics_val_cat[4] < metrics_val[4]:
    best_model = "GRU + CatBoost"
else:
    best_model = "GRU Only"
print(f"\nBest Overall Model: {best_model} (Based on Lowest Validation MAPE)")


[I 2025-02-13 05:39:00,191] A new study created in memory with name: no-name-705da87e-27e6-4ac4-98cb-c90333f0609c
[I 2025-02-13 05:39:03,948] Trial 0 finished with value: 75.25414628370612 and parameters: {'hidden_size': 128, 'num_layers': 4, 'learning_rate': 0.00043776742528689987}. Best is trial 0 with value: 75.25414628370612.
[I 2025-02-13 05:39:04,379] Trial 1 finished with value: 71.58392829532345 and parameters: {'hidden_size': 32, 'num_layers': 2, 'learning_rate': 0.0003291705614879124}. Best is trial 1 with value: 71.58392829532345.
[I 2025-02-13 05:39:05,868] Trial 2 finished with value: 73.45893080145923 and parameters: {'hidden_size': 64, 'num_layers': 5, 'learning_rate': 0.0009869375397354883}. Best is trial 1 with value: 71.58392829532345.
[I 2025-02-13 05:39:06,422] Trial 3 finished with value: 73.42811734123924 and parameters: {'hidden_size': 32, 'num_layers': 4, 'learning_rate': 0.0008073318783337196}. Best is trial 1 with value: 71.58392829532345.
[I 2025-02-13 05:39:


Best GRU Hyperparameters: {'hidden_size': 128, 'num_layers': 4, 'learning_rate': 0.00022159920806409506}

GRU Model Performance (Best Hyperparameters):

Train: (0.3253094109411231, 0.17286601109499716, 0.4157715852424227, 0.01231278033712535, 124.0138336851776)
Validation: (1.2859226419819814, 1.6594217853177897, 1.2881854623142546, -281.1405305423759, 73.57781479240901)
Test: (1.5537205315550298, 2.4200290598228467, 1.555644258763181, -399.9943385967086, 77.05267364610829)

Training CatBoost on Best GRU Embeddings...
0:	learn: 0.3980547	test: 1.2474612	best: 1.2474612 (0)	total: 1.65ms	remaining: 1.64s
100:	learn: 0.0093264	test: 0.1989873	best: 0.1989873 (100)	total: 124ms	remaining: 1.1s
200:	learn: 0.0073850	test: 0.1775091	best: 0.1775091 (200)	total: 247ms	remaining: 981ms
300:	learn: 0.0069603	test: 0.1762935	best: 0.1762873 (299)	total: 367ms	remaining: 851ms
400:	learn: 0.0067976	test: 0.1757961	best: 0.1757961 (400)	total: 491ms	remaining: 733ms
500:	learn: 0.0066976	test: 0

### bohb


In [None]:
import numpy as np
import catboost as cb
import torch
import torch.nn as nn
import torch.optim as optim
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import hpbandster.core.nameserver as hpns
from hpbandster.optimizers import BOHB
from hpbandster.core.worker import Worker
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# GRU regressor used as the first stage of the GRU + CatBoost pipeline
class GRUModel(nn.Module):
    """Stacked GRU whose last time step is projected to ``output_dim`` values.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Hidden-state size of each GRU layer.
        num_layers: Number of stacked GRU layers.
        output_dim: Size of the linear head's output.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return a ``(batch, output_dim)`` tensor for a batch-first input sequence."""
        sequence_out, _final_hidden = self.gru(x)
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# Regression metrics shared by every split
def calculate_metrics(y_true, y_pred):
    """Return ``(mae, mse, rmse, r2, mape)``; ``mape`` is scaled to percent."""
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        r2_score(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred) * 100,
    )

# Convert datasets to PyTorch tensors.
# Targets are reshaped to (N, 1) explicitly so they match the model output whether
# Y_* is a Series (values are 1-D) or a single-column DataFrame (values are 2-D).
# unsqueeze(1) on 2-D values would give (N, 1, 1), and nn.MSELoss would then
# broadcast prediction vs. target to (N, N, 1).
Y_train_torch = torch.tensor(Y_train.values.reshape(-1, 1), dtype=torch.float32)
Y_val_torch = torch.tensor(Y_val.values.reshape(-1, 1), dtype=torch.float32)
Y_test_torch = torch.tensor(Y_test.values.reshape(-1, 1), dtype=torch.float32)

# Features gain a length-1 sequence axis: (N, F) -> (N, 1, F).
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1)

# GRU Configurations (2, 3, and 5 layers)
gru_layers = [2, 3, 5]
hidden_dim = 64
output_dim = 1
input_dim = X_train.shape[1]

# Per-depth GRU outputs as (train, val, test) tuples. With output_dim = 1 these
# are the model's predictions, one column per sample.
gru_features = {}

for num_layers in gru_layers:
    print(f"Training GRU with {num_layers} layers...")

    gru_model = GRUModel(input_dim, hidden_dim, num_layers, output_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
    num_epochs = 100

    # One full-batch gradient step per epoch.
    for epoch in range(num_epochs):
        gru_model.train()
        optimizer.zero_grad()
        outputs = gru_model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)
        loss.backward()
        optimizer.step()

    # Run the trained model over every split without tracking gradients.
    gru_model.eval()
    with torch.no_grad():
        train_features = gru_model(X_train_torch).numpy()
        val_features = gru_model(X_val_torch).numpy()
        test_features = gru_model(X_test_torch).numpy()

    gru_features[num_layers] = (train_features, val_features, test_features)

# CatBoost search space explored by BOHB
def get_config_space():
    """Build the ConfigSpace holding the five tuned CatBoost hyperparameters."""
    search_space = CS.ConfigurationSpace()
    hyperparameters = (
        CSH.UniformIntegerHyperparameter("iterations", 50, 500, default_value=100),
        CSH.UniformFloatHyperparameter("learning_rate", 0.01, 0.3, default_value=0.1),
        CSH.UniformIntegerHyperparameter("depth", 3, 10, default_value=6),
        CSH.UniformFloatHyperparameter("bagging_temperature", 0.0, 1.0, default_value=0.8),
        CSH.UniformFloatHyperparameter("colsample_bylevel", 0.5, 1.0, default_value=0.8),
    )
    for hyperparameter in hyperparameters:
        search_space.add_hyperparameter(hyperparameter)
    return search_space

# HpBandSter worker that scores one CatBoost configuration
class CatBoostWorker(Worker):
    """Fit CatBoost on fixed GRU outputs and report the validation MAE as loss.

    Args:
        train_features: Feature matrix used for fitting.
        val_features: Feature matrix used for scoring.
        **kwargs: Passed through to ``Worker`` unchanged.
    """

    def __init__(self, train_features, val_features, **kwargs):
        super().__init__(**kwargs)
        self.train_features, self.val_features = train_features, val_features

    def compute(self, config, budget, **kwargs):
        """Train with ``config`` and return ``{"loss": validation MAE, "info": config}``.

        Targets come from the notebook-level ``Y_train`` / ``Y_val``.
        NOTE(review): ``budget`` is not used, so every budget level trains the
        same model; confirm whether that is intended.
        """
        tuned_keys = (
            "iterations",
            "learning_rate",
            "depth",
            "bagging_temperature",
            "colsample_bylevel",
        )
        model = cb.CatBoostRegressor(
            random_seed=42,
            verbose=0,
            **{key: config[key] for key in tuned_keys},
        )
        model.fit(self.train_features, Y_train)
        val_mae = mean_absolute_error(Y_val, model.predict(self.val_features))
        return {"loss": val_mae, "info": config}

# Run BOHB for each GRU configuration.
# best_models maps a GRU depth to the tuned CatBoost parameters and its metrics.
best_models = {}

for num_layers in gru_layers:
    print(f"\nRunning BOHB for GRU ({num_layers} layers) + CatBoost...")

    train_features, val_features, test_features = gru_features[num_layers]
    run_id = f"gru_{num_layers}_catboost_bohb"

    # Start NameServer
    NS = hpns.NameServer(run_id=run_id, host="127.0.0.1", port=None)
    NS.start()

    # try/finally guarantees the name server (and, below, the optimizer and its
    # worker) are released even when the search raises, so the next depth or a
    # re-run of the cell does not collide with leftovers.
    try:
        worker = CatBoostWorker(
            train_features=train_features,
            val_features=val_features,
            nameserver="127.0.0.1",
            run_id=run_id
        )
        worker.run(background=True)

        bohb = BOHB(
            configspace=get_config_space(),
            run_id=run_id,
            nameserver="127.0.0.1",
            min_budget=1,
            max_budget=3
        )

        try:
            res = bohb.run(n_iterations=50)
        finally:
            # shutdown_workers=True also stops the background worker started
            # above; without it that worker keeps running after the search.
            bohb.shutdown(shutdown_workers=True)
    finally:
        NS.shutdown()

    # Retrieve Best Configuration
    best_config = res.get_incumbent_id()
    best_params = res.get_id2config_mapping()[best_config]["config"]

    # Train Best CatBoost Model on GRU Features
    best_catboost_model = cb.CatBoostRegressor(
        iterations=best_params["iterations"],
        learning_rate=best_params["learning_rate"],
        depth=best_params["depth"],
        bagging_temperature=best_params["bagging_temperature"],
        colsample_bylevel=best_params["colsample_bylevel"],
        random_seed=42,
        verbose=0
    )

    best_catboost_model.fit(train_features, Y_train)

    # Make Predictions
    Y_train_pred = best_catboost_model.predict(train_features)
    Y_val_pred = best_catboost_model.predict(val_features)
    Y_test_pred = best_catboost_model.predict(test_features)

    # Calculate Metrics
    train_metrics = calculate_metrics(Y_train, Y_train_pred)
    val_metrics = calculate_metrics(Y_val, Y_val_pred)
    test_metrics = calculate_metrics(Y_test, Y_test_pred)

    # Store best model and metrics
    best_models[num_layers] = {
        "params": best_params,
        "train_metrics": train_metrics,
        "val_metrics": val_metrics,
        "test_metrics": test_metrics
    }

    # Print Results
    print(f"\nBest Parameters for GRU ({num_layers} layers) + CatBoost:")
    print(best_params)

    print("\nTraining set metrics:")
    print(f"MAE: {train_metrics[0]:.4f}, MSE: {train_metrics[1]:.4f}, RMSE: {train_metrics[2]:.4f}, R²: {train_metrics[3]:.4f}, MAPE: {train_metrics[4]:.2f}%")

    print("\nValidation set metrics:")
    print(f"MAE: {val_metrics[0]:.4f}, MSE: {val_metrics[1]:.4f}, RMSE: {val_metrics[2]:.4f}, R²: {val_metrics[3]:.4f}, MAPE: {val_metrics[4]:.2f}%")

    print("\nTest set metrics:")
    print(f"MAE: {test_metrics[0]:.4f}, MSE: {test_metrics[1]:.4f}, RMSE: {test_metrics[2]:.4f}, R²: {test_metrics[3]:.4f}, MAPE: {test_metrics[4]:.2f}%")


Training GRU with 2 layers...
Training GRU with 3 layers...
Training GRU with 5 layers...

Running BOHB for GRU (2 layers) + CatBoost...

Best Parameters for GRU (2 layers) + CatBoost:
{'bagging_temperature': 0.0685761918533, 'colsample_bylevel': 0.983520067695, 'depth': 9, 'iterations': 496, 'learning_rate': 0.2875706042572}

Training set metrics:
MAE: 0.0036, MSE: 0.0000, RMSE: 0.0055, R²: 0.9998, MAPE: 1.03%

Validation set metrics:
MAE: 0.1575, MSE: 0.0307, RMSE: 0.1751, R²: -4.2144, MAPE: 8.84%

Test set metrics:
MAE: 0.4265, MSE: 0.1879, RMSE: 0.4335, R²: -30.1376, MAPE: 21.04%

Running BOHB for GRU (3 layers) + CatBoost...

Best Parameters for GRU (3 layers) + CatBoost:
{'bagging_temperature': 0.3787428249561, 'colsample_bylevel': 0.9942424487158, 'depth': 8, 'iterations': 416, 'learning_rate': 0.1835957608924}

Training set metrics:
MAE: 0.0036, MSE: 0.0000, RMSE: 0.0055, R²: 0.9998, MAPE: 1.05%

Validation set metrics:
MAE: 0.1575, MSE: 0.0307, RMSE: 0.1752, R²: -4.2160, MAPE:

## GRU - LightGBM

In [None]:
!pip install lightgbm

### initial

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration: CUDA if PyTorch reports it as available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define GRU Model
class GRUModel(nn.Module):
    """GRU stack with a single-output linear layer on top.

    Args:
        input_size: Features per time step.
        hidden_size: Hidden units in each GRU layer.
        num_layers: Depth of the GRU stack.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return ``(batch, 1)`` predictions for ``x`` of shape ``(batch, seq_len, input_size)``."""
        # The initial hidden state is left to nn.GRU, which uses zeros in the
        # input's dtype and on the input's device. Building a float32 tensor on
        # the CPU and moving it each call is unnecessary and fails for other dtypes.
        out, _ = self.gru(x)
        return self.fc(out[:, -1, :])  # Take last time step output

# Set Parameters
input_size = X_train.shape[1]  # features per time step, taken from the data
hidden_size = 64
num_layers_list = [2, 3, 5]  # Different GRU layers
learning_rate = 0.001
num_epochs = 100

# MinMax Scaling of the target, fitted on the training split only.
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors.
# X gets a sequence axis of length one: (N, F) -> (N, 1, F).
# Y is left at (N, 1) so it lines up with the model output element by element.
# A (N, 1, 1) target would be broadcast against the (N, 1) prediction to
# (N, N, 1) by nn.MSELoss, and the network would learn the target mean.
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).to(device)

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    """Evaluate predictions against ground truth.

    Args:
        y_true: True target values.
        y_pred: Predicted values, same element count as ``y_true``.

    Returns:
        ``(mae, mse, rmse, r2, mape)``; ``mape`` is given in percent.
    """
    # Flattening first keeps the element-wise MAPE from broadcasting a 1-D
    # target against a column-vector prediction into an (N, N) matrix.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-8))) * 100  # Avoid divide-by-zero
    return mae, mse, rmse, r2, mape

# Metric rows are gathered in a list and converted to a DataFrame once the loop
# is done. Growing a DataFrame with pd.concat from an empty frame is deprecated
# in recent pandas and produces object-dtype metric columns.
columns = ["Layers", "Dataset", "MAE", "MSE", "RMSE", "R²", "MAPE"]
result_rows = []

# GRU outputs per depth on the original target scale, used as LightGBM input.
# These are the linear head's (N, 1) predictions rather than hidden-state embeddings.
gru_outputs = {}

for num_layers in num_layers_list:
    print(f"\nTraining GRU with {num_layers} layers...")

    # Initialize model, loss function, and optimizer
    model = GRUModel(input_size, hidden_size, num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop: the whole training set is a single batch.
    for epoch in range(num_epochs):
        model.train()
        outputs = model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Evaluation
    model.eval()
    with torch.no_grad():
        train_pred = model(X_train_torch).cpu().numpy()
        val_pred = model(X_val_torch).cpu().numpy()
        test_pred = model(X_test_torch).cpu().numpy()

    # Inverse transform predictions back to the original target scale.
    train_pred_actual = scaler.inverse_transform(train_pred.reshape(-1, 1))
    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))
    test_pred_actual = scaler.inverse_transform(test_pred.reshape(-1, 1))

    # Compute metrics for each dataset
    metrics_train = compute_metrics(Y_train.values.flatten(), train_pred_actual.flatten())
    metrics_val = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())
    metrics_test = compute_metrics(Y_test.values.flatten(), test_pred_actual.flatten())

    result_rows.extend([
        [num_layers, "Train", *metrics_train],
        [num_layers, "Validation", *metrics_val],
        [num_layers, "Test", *metrics_test],
    ])

    # Keep this depth's predictions for the boosting stage.
    gru_outputs[num_layers] = {
        "train": train_pred_actual,
        "val": val_pred_actual,
        "test": test_pred_actual
    }

results_df = pd.DataFrame(result_rows, columns=columns)

# Find the best GRU model (Lowest Validation MAPE)
best_model = results_df[results_df["Dataset"] == "Validation"].sort_values("MAPE").iloc[0]
best_layers = int(best_model["Layers"])

# Display GRU Results
print("\nGRU Model Performance Comparison (2, 3, and 5 Layers)\n")
print(results_df.to_string(index=False))

print(f"\nBest GRU Model: {best_layers} Layers (Based on Lowest Validation MAPE)\n")

# ---------- LightGBM on Best GRU Embeddings ----------
print("\nTraining LightGBM on Best GRU Embeddings...")

# Outputs of the selected GRU are the only inputs LightGBM receives.
best_gru_preds = gru_outputs[best_layers]
X_train_lgb, X_val_lgb, X_test_lgb = (
    best_gru_preds[split] for split in ("train", "val", "test")
)

# Wrap train/validation data; the validation set references the training
# Dataset, as LightGBM expects for validation data.
lgb_train = lgb.Dataset(X_train_lgb, label=Y_train)
lgb_val = lgb.Dataset(X_val_lgb, label=Y_val, reference=lgb_train)

# Booster settings.
lgb_params = dict(
    objective="regression",
    metric="rmse",
    boosting_type="gbdt",
    learning_rate=0.05,
    num_leaves=31,
)

# 200 boosting rounds, logging the evaluation metric every 50 rounds.
lgb_model = lgb.train(
    lgb_params,
    lgb_train,
    valid_sets=[lgb_train, lgb_val],
    num_boost_round=200,
    callbacks=[lgb.log_evaluation(50)],
)

# Predictions for each split.
train_pred_lgb, val_pred_lgb, test_pred_lgb = (
    lgb_model.predict(features) for features in (X_train_lgb, X_val_lgb, X_test_lgb)
)

# Metrics of the stacked model.
metrics_train_lgb, metrics_val_lgb, metrics_test_lgb = (
    compute_metrics(actual.values.flatten(), predicted.flatten())
    for actual, predicted in (
        (Y_train, train_pred_lgb),
        (Y_val, val_pred_lgb),
        (Y_test, test_pred_lgb),
    )
)

# Add the three LGBM rows after the GRU rows.
lgb_label = f"GRU({best_layers}) + LGBM"
lgb_rows = pd.DataFrame(
    [
        [lgb_label, "Train", *metrics_train_lgb],
        [lgb_label, "Validation", *metrics_val_lgb],
        [lgb_label, "Test", *metrics_test_lgb],
    ],
    columns=columns,
)
results_df = pd.concat([results_df, lgb_rows], ignore_index=True)

# Display Final Results
print("\nFinal Model Performance (GRU vs GRU + LGBM)\n")
print(results_df.to_string(index=False))

# The row with the lowest validation MAPE is reported as the overall winner.
validation_rows = results_df[results_df["Dataset"] == "Validation"]
best_overall = validation_rows.sort_values("MAPE").iloc[0]
print(f"\nBest Overall Model: {best_overall['Layers']} (Based on Lowest Validation MAPE)")



Training GRU with 2 layers...
Epoch [10/100], Loss: 0.1396
Epoch [20/100], Loss: 0.0852
Epoch [30/100], Loss: 0.0803
Epoch [40/100], Loss: 0.0740
Epoch [50/100], Loss: 0.0726
Epoch [60/100], Loss: 0.0716
Epoch [70/100], Loss: 0.0716
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training GRU with 3 layers...
Epoch [10/100], Loss: 0.0831
Epoch [20/100], Loss: 0.0739
Epoch [30/100], Loss: 0.0716
Epoch [40/100], Loss: 0.0720
Epoch [50/100], Loss: 0.0715
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training GRU with 5 layers...
Epoch [10/100], Loss: 0.0738
Epoch [20/100], Loss: 0.0734
Epoch [30/100], Loss: 0.0717
Epoch [40/100], Loss: 0.0716
Epoch [50/100], Loss: 0.0716
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

GRU Model Performance Comparison 

### optuna

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import lightgbm as lgb
import pandas as pd
import optuna
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration: use the GPU when CUDA is available, otherwise fall back to the CPU.
# Models and tensors created below are moved to this device explicitly via `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define GRU Model
class GRUModel(nn.Module):
    """Stacked GRU followed by a linear head that emits one value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of the hidden state of every GRU layer.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        """Map a (batch, seq_len, input_size) tensor to a (batch, 1) prediction."""
        batch_size = x.size(0)
        # Explicit all-zero initial hidden state, created on the input's device.
        initial_hidden = torch.zeros(
            self.num_layers, batch_size, self.hidden_size, device=x.device
        )
        sequence_out, _ = self.gru(x, initial_hidden)
        # Only the last time step feeds the regression head.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    """Compute regression metrics for a single-target problem.

    Args:
        y_true: Ground-truth values (array-like).
        y_pred: Predicted values (array-like) with the same number of elements.

    Returns:
        Tuple ``(mae, mse, rmse, r2, mape)``; ``mape`` is expressed in percent.
    """
    # Flatten both inputs so that a (N,) vs (N, 1) mismatch cannot silently
    # broadcast into an (N, N) matrix in the MAPE expression below.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    # The denominator is floored at 1e-8 to avoid division by zero for zero targets.
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-8))) * 100
    return mae, mse, rmse, r2, mape

# MinMax scaling of the target. The scaler is fitted on the training split only
# and then applied unchanged to the validation and test splits.
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors.
# Features get a singleton sequence axis for the batch_first GRU
# (assumes X_* are 2-D (N, F) frames, giving (N, 1, F) -- TODO confirm).
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)

# Targets are already (N, 1) after scaling and must stay that way to match the
# (N, 1) model output. An extra unsqueeze(1) would make them (N, 1, 1); nn.MSELoss
# would then broadcast predictions against every target into an (N, N, 1) tensor
# (PyTorch only warns about this, and warnings are silenced in this notebook),
# which drives the model toward predicting the target mean.
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).to(device)

# ----------- OPTUNA OPTIMIZATION FUNCTION -----------

def objective(trial):
    """Optuna objective: train a GRU with sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample ``hidden_size``, ``num_layers`` and
            ``learning_rate``.

    Returns:
        Validation MAPE (percent) computed on the original, un-scaled target
        values; lower is better.
    """
    # Sample GRU hyperparameters
    hidden_size = trial.suggest_int("hidden_size", 32, 128, step=16)
    num_layers = trial.suggest_int("num_layers", 2, 5)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)

    # Initialize model, loss function, and optimizer.
    # The feature count comes from the data instead of a hard-coded 3.
    input_size = X_train_torch.shape[-1]
    model = GRUModel(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Force the targets to (N, 1) so they match the model output exactly. With
    # any other layout (e.g. (N, 1, 1)) nn.MSELoss broadcasts every prediction
    # against every target and the model just learns the target mean.
    train_targets = Y_train_torch.reshape(-1, 1)

    # Training loop (full batch)
    num_epochs = 50
    model.train()
    for _ in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_train_torch)
        loss = criterion(outputs, train_targets)
        loss.backward()
        optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        val_pred = model(X_val_torch).cpu().numpy()

    # Inverse transform predictions back to the original target scale
    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))

    # Compute MAPE (minimization objective)
    _, _, _, _, mape = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())

    return mape  # Optuna minimizes MAPE

# Run Optuna for GRU
study_gru = optuna.create_study(direction="minimize")
study_gru.optimize(objective, n_trials=20)

# Best GRU Model Parameters
best_gru_params = study_gru.best_params
print("\nBest GRU Model:", best_gru_params)

# ----------- Train Best GRU and Get Predictions -----------

# The feature count comes from the data instead of a hard-coded 3.
best_gru = GRUModel(
    input_size=X_train_torch.shape[-1],
    hidden_size=best_gru_params["hidden_size"],
    num_layers=best_gru_params["num_layers"],
).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_gru.parameters(), lr=best_gru_params["learning_rate"])

# Targets must be (N, 1) to match the model output; any other layout makes
# nn.MSELoss broadcast predictions against every target.
gru_train_targets = Y_train_torch.reshape(-1, 1)

# Train Best GRU (full batch, same number of epochs as in the tuning objective)
best_gru.train()
for epoch in range(50):
    outputs = best_gru(X_train_torch)
    loss = criterion(outputs, gru_train_targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Get GRU outputs. These are the model's scalar predictions (one value per
# sample), not hidden-state embeddings.
best_gru.eval()
with torch.no_grad():
    train_pred = best_gru(X_train_torch).cpu().numpy()
    val_pred = best_gru(X_val_torch).cpu().numpy()
    test_pred = best_gru(X_test_torch).cpu().numpy()

# Inverse transform predictions back to the original target scale; they become
# the single input feature of the LightGBM stage.
X_train_lgb = scaler.inverse_transform(train_pred.reshape(-1, 1))
X_val_lgb = scaler.inverse_transform(val_pred.reshape(-1, 1))
X_test_lgb = scaler.inverse_transform(test_pred.reshape(-1, 1))

# ----------- OPTUNA OPTIMIZATION FOR LIGHTGBM -----------

def objective_lgb(trial):
    """Optuna objective for the LightGBM stage.

    Trains a booster on ``X_train_lgb`` with sampled tree hyperparameters and
    returns the validation MAPE (percent), which Optuna minimizes.
    """
    # Settings that stay constant across trials.
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
    }
    # Sampled settings, drawn in a fixed order.
    params["num_leaves"] = trial.suggest_int("num_leaves", 20, 50)
    params["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.1, log=True)
    params["max_depth"] = trial.suggest_int("max_depth", 3, 10)
    params["min_data_in_leaf"] = trial.suggest_int("min_data_in_leaf", 5, 30)

    train_set = lgb.Dataset(X_train_lgb, label=Y_train)
    val_set = lgb.Dataset(X_val_lgb, label=Y_val, reference=train_set)

    booster = lgb.train(
        params,
        train_set,
        valid_sets=[train_set, val_set],
        num_boost_round=200,
        callbacks=[lgb.log_evaluation(50)],  # log the eval metric every 50 rounds
    )

    val_predictions = booster.predict(X_val_lgb)
    metrics = compute_metrics(Y_val.values.flatten(), val_predictions.flatten())

    # MAPE is the last entry of the metrics tuple.
    return metrics[4]

# Run Optuna for LGBM
study_lgb = optuna.create_study(direction="minimize")
study_lgb.optimize(objective_lgb, n_trials=20)

# Best LGBM Model Parameters
best_lgb_params = study_lgb.best_params
print("\nBest LightGBM Model:", best_lgb_params)

# study.best_params only contains the *sampled* values. Re-attach the fixed
# settings used inside objective_lgb so the final model is trained and logged
# with the same configuration that was tuned (otherwise LightGBM falls back to
# its defaults, e.g. reporting l2 instead of rmse).
final_lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    **best_lgb_params,
}

# Train Final LightGBM Model
lgb_train = lgb.Dataset(X_train_lgb, label=Y_train)
lgb_val = lgb.Dataset(X_val_lgb, label=Y_val, reference=lgb_train)

final_lgb = lgb.train(final_lgb_params, lgb_train, valid_sets=[lgb_train, lgb_val], num_boost_round=200, callbacks=[lgb.log_evaluation(50)])

# Predictions
train_pred_lgb = final_lgb.predict(X_train_lgb)
val_pred_lgb = final_lgb.predict(X_val_lgb)
test_pred_lgb = final_lgb.predict(X_test_lgb)

# Compute metrics; each tuple is (mae, mse, rmse, r2, mape)
metrics_train_lgb = compute_metrics(Y_train.values.flatten(), train_pred_lgb.flatten())
metrics_val_lgb = compute_metrics(Y_val.values.flatten(), val_pred_lgb.flatten())
metrics_test_lgb = compute_metrics(Y_test.values.flatten(), test_pred_lgb.flatten())

print("\nFinal GRU + LightGBM Performance:")
print("Train:", metrics_train_lgb)
print("Validation:", metrics_val_lgb)
print("Test:", metrics_test_lgb)


[I 2025-02-13 08:00:22,495] A new study created in memory with name: no-name-7e72d292-957d-41ed-9c7a-4903017cb99a
[I 2025-02-13 08:00:58,796] Trial 0 finished with value: 73.88033346772791 and parameters: {'hidden_size': 64, 'num_layers': 4, 'learning_rate': 0.00629976609866542}. Best is trial 0 with value: 73.88033346772791.
[I 2025-02-13 08:01:40,251] Trial 1 finished with value: 74.86668047947921 and parameters: {'hidden_size': 96, 'num_layers': 4, 'learning_rate': 0.0017233513072134636}. Best is trial 0 with value: 73.88033346772791.
[I 2025-02-13 08:02:23,469] Trial 2 finished with value: 73.26963817216257 and parameters: {'hidden_size': 48, 'num_layers': 4, 'learning_rate': 0.0019326251186914116}. Best is trial 2 with value: 73.26963817216257.
[I 2025-02-13 08:02:58,134] Trial 3 finished with value: 73.31971603797183 and parameters: {'hidden_size': 64, 'num_layers': 4, 'learning_rate': 0.0030998377124945363}. Best is trial 2 with value: 73.26963817216257.
[I 2025-02-13 08:03:38,3


Best GRU Model: {'hidden_size': 96, 'num_layers': 2, 'learning_rate': 0.0002974606806431787}


[I 2025-02-13 08:14:17,361] A new study created in memory with name: no-name-a1596d54-627f-4d76-87a0-69410628f2eb
[I 2025-02-13 08:14:17,480] Trial 0 finished with value: 9.284707649008874 and parameters: {'num_leaves': 21, 'learning_rate': 0.024559450904080706, 'max_depth': 5, 'min_data_in_leaf': 29}. Best is trial 0 with value: 9.284707649008874.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000128 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.121061	valid_1's rmse: 0.507117
[100]	training's rmse: 0.0353741	valid_1's rmse: 0.270387
[150]	training's rmse: 0.0114053	valid_1's rmse: 0.201863
[200]	training's rmse: 0.00602047	valid_1's rmse: 0.182164
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.00690072	valid_1's rmse: 0.186297
[100]	training's rmse: 0.00526441

[I 2025-02-13 08:14:17,644] Trial 1 finished with value: 8.774870806580424 and parameters: {'num_leaves': 23, 'learning_rate': 0.08699143020849764, 'max_depth': 5, 'min_data_in_leaf': 29}. Best is trial 1 with value: 8.774870806580424.


[150]	training's rmse: 0.00526166	valid_1's rmse: 0.174084
[200]	training's rmse: 0.0052606	valid_1's rmse: 0.174082
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0202	valid_1's rmse: 0.245083
[100]	training's rmse: 0.0054162	valid_1's rmse: 0.178263


[I 2025-02-13 08:14:17,838] Trial 2 finished with value: 8.783026034208394 and parameters: {'num_leaves': 27, 'learning_rate': 0.06062983439633724, 'max_depth': 3, 'min_data_in_leaf': 17}. Best is trial 1 with value: 8.774870806580424.


[150]	training's rmse: 0.00532204	valid_1's rmse: 0.174435
[200]	training's rmse: 0.00530908	valid_1's rmse: 0.174211
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0292675	valid_1's rmse: 0.273141
[100]	training's rmse: 0.00574755	valid_1's rmse: 0.189846
[150]	training's rmse: 0.00536464	valid_1's rmse: 0.183145


[I 2025-02-13 08:14:18,043] Trial 3 finished with value: 9.31894420880134 and parameters: {'num_leaves': 31, 'learning_rate': 0.05305921810180837, 'max_depth': 3, 'min_data_in_leaf': 30}. Best is trial 1 with value: 8.774870806580424.


[200]	training's rmse: 0.00535433	valid_1's rmse: 0.182707
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000136 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0658092	valid_1's rmse: 0.360832
[100]	training's rmse: 0.0115753	valid_1's rmse: 0.204619
[150]	training's rmse: 0.00550657	valid_1's rmse: 0.179029


[I 2025-02-13 08:14:18,265] Trial 4 finished with value: 8.825320068877827 and parameters: {'num_leaves': 25, 'learning_rate': 0.036509663439804094, 'max_depth': 4, 'min_data_in_leaf': 6}. Best is trial 1 with value: 8.774870806580424.


[200]	training's rmse: 0.00526779	valid_1's rmse: 0.174883
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0178593	valid_1's rmse: 0.217473
[100]	training's rmse: 0.00530577	valid_1's rmse: 0.175843
[150]	training's rmse: 0.00525977	valid_1's rmse: 0.174151


[I 2025-02-13 08:14:18,473] Trial 5 finished with value: 8.774699090367243 and parameters: {'num_leaves': 46, 'learning_rate': 0.06200051037688981, 'max_depth': 10, 'min_data_in_leaf': 10}. Best is trial 5 with value: 8.774699090367243.
[I 2025-02-13 08:14:18,609] Trial 6 finished with value: 8.77626998649586 and parameters: {'num_leaves': 27, 'learning_rate': 0.052658649654771725, 'max_depth': 7, 'min_data_in_leaf': 21}. Best is trial 5 with value: 8.774699090367243.


[200]	training's rmse: 0.00525969	valid_1's rmse: 0.174079
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0285643	valid_1's rmse: 0.249444
[100]	training's rmse: 0.00558834	valid_1's rmse: 0.179103
[150]	training's rmse: 0.00526166	valid_1's rmse: 0.174436
[200]	training's rmse: 0.00525981	valid_1's rmse: 0.174104
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000214 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038


[I 2025-02-13 08:14:18,755] Trial 7 finished with value: 8.85427073294276 and parameters: {'num_leaves': 31, 'learning_rate': 0.033359213030683905, 'max_depth': 10, 'min_data_in_leaf': 5}. Best is trial 5 with value: 8.774699090367243.


[50]	training's rmse: 0.0769913	valid_1's rmse: 0.380441
[100]	training's rmse: 0.0150548	valid_1's rmse: 0.211027
[150]	training's rmse: 0.00586372	valid_1's rmse: 0.180874
[200]	training's rmse: 0.00528153	valid_1's rmse: 0.175342
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.150936	valid_1's rmse: 0.573999
[100]	training's rmse: 0.0546485	valid_1's rmse: 0.315288
[150]	training's rmse: 0.0203081	valid_1's rmse: 0.224024


[I 2025-02-13 08:14:18,922] Trial 8 finished with value: 9.895610287227246 and parameters: {'num_leaves': 46, 'learning_rate': 0.0202004221782512, 'max_depth': 6, 'min_data_in_leaf': 18}. Best is trial 5 with value: 8.774699090367243.


[200]	training's rmse: 0.00881497	valid_1's rmse: 0.19189
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0301913	valid_1's rmse: 0.251642
[100]	training's rmse: 0.00566883	valid_1's rmse: 0.179533
[150]	training's rmse: 0.00526191	valid_1's rmse: 0.174474
[200]	training's rmse: 0.0052597	valid_1's rmse: 0.174105


[I 2025-02-13 08:14:19,089] Trial 9 finished with value: 8.776317955428594 and parameters: {'num_leaves': 37, 'learning_rate': 0.05153823313364949, 'max_depth': 8, 'min_data_in_leaf': 22}. Best is trial 5 with value: 8.774699090367243.
[I 2025-02-13 08:14:19,294] Trial 10 finished with value: 14.240342787674273 and parameters: {'num_leaves': 50, 'learning_rate': 0.012377138790418293, 'max_depth': 10, 'min_data_in_leaf': 11}. Best is trial 5 with value: 8.774699090367243.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.224526	valid_1's rmse: 0.772978
[100]	training's rmse: 0.120559	valid_1's rmse: 0.492069
[150]	training's rmse: 0.0648425	valid_1's rmse: 0.342428
[200]	training's rmse: 0.0350759	valid_1's rmse: 0.263058
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000214 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.00646222	valid_1's rmse: 0.183668
[100]	training's rmse: 0.00525988	v

[I 2025-02-13 08:14:19,537] Trial 11 finished with value: 8.774508473755759 and parameters: {'num_leaves': 40, 'learning_rate': 0.09001021067402358, 'max_depth': 8, 'min_data_in_leaf': 12}. Best is trial 11 with value: 8.774508473755759.
[I 2025-02-13 08:14:19,746] Trial 12 finished with value: 8.774498773612361 and parameters: {'num_leaves': 41, 'learning_rate': 0.09273856181583953, 'max_depth': 9, 'min_data_in_leaf': 12}. Best is trial 12 with value: 8.774498773612361.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000119 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.00617302	valid_1's rmse: 0.182306
[100]	training's rmse: 0.00525979	valid_1's rmse: 0.174147
[150]	training's rmse: 0.00525969	valid_1's rmse: 0.174076
[200]	training's rmse: 0.00525969	valid_1's rmse: 0.174076


[I 2025-02-13 08:14:19,947] Trial 13 finished with value: 8.774514755670513 and parameters: {'num_leaves': 39, 'learning_rate': 0.0804746502143998, 'max_depth': 8, 'min_data_in_leaf': 12}. Best is trial 12 with value: 8.774498773612361.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000272 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.00822244	valid_1's rmse: 0.190218
[100]	training's rmse: 0.00526066	valid_1's rmse: 0.17433
[150]	training's rmse: 0.0052597	valid_1's rmse: 0.17408
[200]	training's rmse: 0.00525969	valid_1's rmse: 0.174076
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038


[I 2025-02-13 08:14:20,167] Trial 14 finished with value: 8.774511737718617 and parameters: {'num_leaves': 41, 'learning_rate': 0.09251837122823287, 'max_depth': 8, 'min_data_in_leaf': 14}. Best is trial 12 with value: 8.774498773612361.


[50]	training's rmse: 0.00619365	valid_1's rmse: 0.182346
[100]	training's rmse: 0.00525982	valid_1's rmse: 0.174147
[150]	training's rmse: 0.0052597	valid_1's rmse: 0.174076
[200]	training's rmse: 0.00525969	valid_1's rmse: 0.174076
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038


[I 2025-02-13 08:14:20,422] Trial 15 finished with value: 8.774497539715709 and parameters: {'num_leaves': 43, 'learning_rate': 0.09866766139024867, 'max_depth': 9, 'min_data_in_leaf': 8}. Best is trial 15 with value: 8.774497539715709.


[50]	training's rmse: 0.00575226	valid_1's rmse: 0.179972
[100]	training's rmse: 0.00525974	valid_1's rmse: 0.174113
[150]	training's rmse: 0.00525969	valid_1's rmse: 0.174076
[200]	training's rmse: 0.00525969	valid_1's rmse: 0.174076


[I 2025-02-13 08:14:20,617] Trial 16 finished with value: 16.48861040800895 and parameters: {'num_leaves': 44, 'learning_rate': 0.01068319956236507, 'max_depth': 9, 'min_data_in_leaf': 8}. Best is trial 15 with value: 8.774497539715709.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000209 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.244607	valid_1's rmse: 0.828217
[100]	training's rmse: 0.143062	valid_1's rmse: 0.553592
[150]	training's rmse: 0.0837438	valid_1's rmse: 0.393357
[200]	training's rmse: 0.0491429	valid_1's rmse: 0.300762
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038


[I 2025-02-13 08:14:20,790] Trial 17 finished with value: 8.79172021843407 and parameters: {'num_leaves': 34, 'learning_rate': 0.040824840574976415, 'max_depth': 9, 'min_data_in_leaf': 15}. Best is trial 15 with value: 8.774497539715709.


[50]	training's rmse: 0.0523915	valid_1's rmse: 0.312149
[100]	training's rmse: 0.00835908	valid_1's rmse: 0.190975
[150]	training's rmse: 0.00532202	valid_1's rmse: 0.176198
[200]	training's rmse: 0.00526069	valid_1's rmse: 0.17435
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0103513	valid_1's rmse: 0.196534


[I 2025-02-13 08:14:21,010] Trial 18 finished with value: 8.774513905713668 and parameters: {'num_leaves': 50, 'learning_rate': 0.07410499478552804, 'max_depth': 7, 'min_data_in_leaf': 8}. Best is trial 15 with value: 8.774497539715709.


[100]	training's rmse: 0.00526324	valid_1's rmse: 0.174557
[150]	training's rmse: 0.0052597	valid_1's rmse: 0.174086
[200]	training's rmse: 0.00525969	valid_1's rmse: 0.174076
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000124 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.177125	valid_1's rmse: 0.646086


[I 2025-02-13 08:14:21,224] Trial 19 finished with value: 10.911566432374446 and parameters: {'num_leaves': 43, 'learning_rate': 0.017056505991077046, 'max_depth': 9, 'min_data_in_leaf': 24}. Best is trial 15 with value: 8.774497539715709.


[100]	training's rmse: 0.075118	valid_1's rmse: 0.370402
[150]	training's rmse: 0.0321484	valid_1's rmse: 0.255734
[200]	training's rmse: 0.0144174	valid_1's rmse: 0.208243

Best LightGBM Model: {'num_leaves': 43, 'learning_rate': 0.09866766139024867, 'max_depth': 9, 'min_data_in_leaf': 8}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's l2: 3.30885e-05	valid_1's l2: 0.0323899
[100]	training's l2: 2.76648e-05	valid_1's l2: 0.0303152
[150]	training's l2: 2.76643e-05	valid_1's l2: 0.0303024
[200]	training's l2: 2.76643e-05	valid_1's l2: 0.0303023

Final GRU + LightGBM Performance:
Train: (0.0034931986437253893, 2.7664292305943547e-05, 0.005259685571015015, 0.9998419373028942, 1.006557120

### Hyperparameter tuning with BOHB (GRU + LightGBM)

In [None]:
!pip install hpbandster ConfigSpace

Collecting hpbandster
  Downloading hpbandster-0.7.4.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ConfigSpace
  Downloading configspace-1.2.1.tar.gz (130 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.0/131.0 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting Pyro4 (from hpbandster)
  Downloading Pyro4-4.82-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting serpent (from hpbandster)
  Downloading serpent-1.41-py3-none-any.whl.metadata (5.8 kB)
Collecting netifaces (from hpbandster)
  Downloading netifaces-0.11.0.tar.gz (30 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading Pyro4-4.82-py2.py3-none-any.whl (89

In [None]:
import numpy as np
import lightgbm as lgb
import torch
import torch.nn as nn
import torch.optim as optim
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import hpbandster.core.nameserver as hpns
from hpbandster.optimizers import BOHB
from hpbandster.core.worker import Worker
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Define GRU Model
class GRUModel(nn.Module):
    """GRU stack with a linear head applied to the final time step.

    NOTE: this definition replaces the earlier ``GRUModel(input_size,
    hidden_size, num_layers)`` from the Optuna cell; the constructor signature
    differs.

    Args:
        input_dim: Number of features per time step.
        hidden_dim: Width of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        output_dim: Number of values produced per sequence.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Run the GRU over ``x`` and project the last time step's output."""
        # No initial state is passed, so nn.GRU starts from its default (zeros).
        sequence_out, _ = self.gru(x)
        return self.fc(sequence_out[:, -1, :])

# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    """Return ``(mae, mse, rmse, r2, mape)`` for the given targets and predictions.

    ``mape`` is scikit-learn's mean absolute percentage error scaled to percent.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        squared_error,
        np.sqrt(squared_error),
        r2_score(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred) * 100,
    )

# Convert datasets to PyTorch tensors.
# Targets are forced to shape (N, 1) to match the GRU output. reshape(-1, 1)
# gives the same result as unsqueeze(1) for a 1-D Series, but also stays (N, 1)
# when the targets are a single-column DataFrame, where unsqueeze(1) would give
# (N, 1, 1) and make nn.MSELoss broadcast predictions against every target.
Y_train_torch = torch.tensor(Y_train.values, dtype=torch.float32).reshape(-1, 1)
Y_val_torch = torch.tensor(Y_val.values, dtype=torch.float32).reshape(-1, 1)
Y_test_torch = torch.tensor(Y_test.values, dtype=torch.float32).reshape(-1, 1)

# Features get a singleton sequence axis for the batch_first GRU
# (assumes X_* are 2-D (N, F) frames, giving (N, 1, F) -- TODO confirm).
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1)

# GRU depths to compare (2, 3 and 5 stacked layers); all share the same width.
gru_layers = [2, 3, 5]
hidden_dim, output_dim = 64, 1
input_dim = X_train.shape[1]

# num_layers -> (train, val, test) outputs of the trained GRU. With
# output_dim = 1 these are the model's scalar predictions, one per sample.
gru_features = {}

for num_layers in gru_layers:
    print(f"Training GRU with {num_layers} layers...")

    num_epochs = 100
    gru_model = GRUModel(input_dim, hidden_dim, num_layers, output_dim)
    optimizer = optim.Adam(gru_model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    # Full-batch training: one optimizer step per epoch on the whole training set.
    gru_model.train()
    for epoch in range(num_epochs):
        outputs = gru_model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Run the trained model on every split without tracking gradients.
    gru_model.eval()
    with torch.no_grad():
        train_features, val_features, test_features = (
            gru_model(split).numpy()
            for split in (X_train_torch, X_val_torch, X_test_torch)
        )

    gru_features[num_layers] = (train_features, val_features, test_features)

# Define ConfigSpace for BOHB (LightGBM)
def get_config_space():
    """Build the LightGBM search space explored by BOHB.

    Returns:
        A ``ConfigurationSpace`` holding ``num_leaves``, ``max_depth``,
        ``learning_rate`` and ``feature_fraction``.
    """
    search_dimensions = (
        CSH.UniformIntegerHyperparameter("num_leaves", 20, 300, default_value=50),
        CSH.UniformIntegerHyperparameter("max_depth", 3, 12, default_value=6),
        CSH.UniformFloatHyperparameter("learning_rate", 0.01, 0.3, default_value=0.1),
        CSH.UniformFloatHyperparameter("feature_fraction", 0.5, 1.0, default_value=0.8),
    )

    space = CS.ConfigurationSpace()
    for dimension in search_dimensions:
        space.add_hyperparameter(dimension)
    return space

# BOHB Worker for LightGBM
class LightGBMWorker(Worker):
    """BOHB worker that scores one LightGBM configuration on fixed feature arrays.

    The worker fits an ``lgb.LGBMRegressor`` on the training features and
    reports the validation mean absolute error as the loss to minimise.

    Args:
        train_features: Feature matrix used to fit each candidate model.
        val_features: Feature matrix used to score each candidate model.
        train_targets: Optional targets aligned with ``train_features``. When
            omitted, the notebook-level ``Y_train`` is used.
        val_targets: Optional targets aligned with ``val_features``. When
            omitted, the notebook-level ``Y_val`` is used.
        **kwargs: Forwarded unchanged to ``Worker.__init__`` (e.g. ``run_id``,
            ``nameserver``).
    """

    def __init__(self, train_features, val_features, *, train_targets=None, val_targets=None, **kwargs):
        super().__init__(**kwargs)
        self.train_features = train_features
        self.val_features = val_features
        # Explicit targets remove the dependency on kernel globals; None keeps
        # the previous behaviour for existing callers.
        self.train_targets = train_targets
        self.val_targets = val_targets

    def compute(self, config, budget, **kwargs):
        """Fit LightGBM with ``config`` and return its validation MAE.

        Args:
            config: Mapping with ``num_leaves``, ``max_depth``,
                ``learning_rate`` and ``feature_fraction``.
            budget: Budget assigned by the optimizer. NOTE(review): it is not
                used here, so every budget level trains an identical model —
                confirm whether it should control e.g. the number of trees.
            **kwargs: Ignored.

        Returns:
            ``{"loss": <validation MAE>, "info": config}``.
        """
        # Resolve the fallback at call time so the globals are read exactly
        # when the original implementation read them.
        train_targets = Y_train if self.train_targets is None else self.train_targets
        val_targets = Y_val if self.val_targets is None else self.val_targets

        model = lgb.LGBMRegressor(
            num_leaves=config["num_leaves"],
            max_depth=config["max_depth"],
            learning_rate=config["learning_rate"],
            feature_fraction=config["feature_fraction"],
            random_state=42
        )
        model.fit(self.train_features, train_targets)
        val_predictions = model.predict(self.val_features)
        mae = mean_absolute_error(val_targets, val_predictions)
        return {"loss": mae, "info": config}

# Run BOHB for each GRU configuration
# Maps GRU depth -> tuned LightGBM params plus train/val/test metric results.
best_models = {}

for num_layers in gru_layers:
    print(f"\nRunning BOHB for GRU ({num_layers} layers) + LightGBM...")

    train_features, val_features, test_features = gru_features[num_layers]

    # One identifier shared by the name server, the worker and the optimizer.
    run_id = f"gru_{num_layers}_lgb_bohb"

    # Start NameServer and keep the address it actually bound to, so the
    # worker and the optimizer are pointed at this exact instance.
    NS = hpns.NameServer(run_id=run_id, host="127.0.0.1", port=None)
    ns_host, ns_port = NS.start()

    try:
        worker = LightGBMWorker(
            train_features=train_features,
            val_features=val_features,
            nameserver=ns_host,
            nameserver_port=ns_port,
            run_id=run_id
        )
        worker.run(background=True)

        bohb = BOHB(
            configspace=get_config_space(),
            run_id=run_id,
            nameserver=ns_host,
            nameserver_port=ns_port,
            min_budget=1,
            max_budget=3
        )

        try:
            res = bohb.run(n_iterations=50)
        finally:
            # shutdown_workers=True also stops the background worker; without
            # it every loop iteration leaves a worker thread behind.
            bohb.shutdown(shutdown_workers=True)
    finally:
        # Always release the name server, otherwise a failed run keeps the
        # port bound and re-running this cell fails.
        NS.shutdown()

    # Retrieve Best Configuration
    best_config = res.get_incumbent_id()
    if best_config is None:
        raise RuntimeError(
            f"BOHB produced no successful run for GRU ({num_layers} layers); "
            "cannot select LightGBM parameters."
        )
    best_params = res.get_id2config_mapping()[best_config]["config"]

    # Train Best LightGBM Model on GRU Features
    best_lgb_model = lgb.LGBMRegressor(
        num_leaves=best_params["num_leaves"],
        max_depth=best_params["max_depth"],
        learning_rate=best_params["learning_rate"],
        feature_fraction=best_params["feature_fraction"],
        random_state=42
    )

    best_lgb_model.fit(train_features, Y_train)

    # Make Predictions
    Y_train_pred = best_lgb_model.predict(train_features)
    Y_val_pred = best_lgb_model.predict(val_features)
    Y_test_pred = best_lgb_model.predict(test_features)

    # Calculate Metrics
    train_metrics = calculate_metrics(Y_train, Y_train_pred)
    val_metrics = calculate_metrics(Y_val, Y_val_pred)
    test_metrics = calculate_metrics(Y_test, Y_test_pred)

    # Store best model and metrics
    best_models[num_layers] = {
        "params": best_params,
        "train_metrics": train_metrics,
        "val_metrics": val_metrics,
        "test_metrics": test_metrics
    }

    # Print Results
    print(f"\nBest Parameters for GRU ({num_layers} layers) + LightGBM:")
    print(best_params)

    # Metrics are indexed as MAE, MSE, RMSE, R², MAPE (positions 0-4).
    for split_label, split_metrics in (
        ("Training", train_metrics),
        ("Validation", val_metrics),
        ("Test", test_metrics),
    ):
        print(f"\n{split_label} set metrics:")
        print(f"MAE: {split_metrics[0]:.4f}, MSE: {split_metrics[1]:.4f}, RMSE: {split_metrics[2]:.4f}, R²: {split_metrics[3]:.4f}, MAPE: {split_metrics[4]:.2f}%")


Training GRU with 2 layers...
Training GRU with 3 layers...
Training GRU with 5 layers...

Running BOHB for GRU (2 layers) + LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038

Best Parameters for GRU (2 layers) + LightGBM:
{'feature_fraction': 0.5162360818571, 'learning_rate': 0.2994566217115, 'max_depth': 11, 'num_leaves': 186}

Training set metrics:
MAE: 0.0037, MSE: 0.0000, RMSE: 0.0055, R²: 0.9998, MAPE: 1.07%

Validation set metrics:
MAE: 0.1565, MSE: 0.0303, RMSE: 0.1742, R²: -4.1586, MAPE: 8.78%

Test set metrics:
MAE: 0.4254, MSE: 0.1870, RMSE: 0.4325, R²: -29.9898, MAPE: 20.99%

Running BOHB for GRU (3 layers) + LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, 