## Initial Code

In [1]:
# Importing necessary libraries for data analysis and manipulation
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


# For handling warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
df_aapl = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/stocks/AAPL.csv')

In [5]:
import numpy as np
from scipy.stats import boxcox

df_aapl['Close_log'] = np.log(df_aapl['Close'] + 1)
df_aapl['Close_sqrt'] = np.sqrt(df_aapl['Close'])
df_aapl['Close_boxcox'], _ = boxcox(df_aapl['Close'] + 1)


This code calculates the skewness of the 'Close' column in the `df_aapl` DataFrame before and after applying various transformations:

1. **Original Skewness**: Calculates the skewness of the original 'Close' data.
2. **Log Transformation Skewness**: Calculates the skewness of the 'Close_log' column after applying the log transformation.
3. **Square Root Transformation Skewness**: Calculates the skewness of the 'Close_sqrt' column after applying the square root transformation.
4. **Box-Cox Transformation Skewness**: Calculates the skewness of the 'Close_boxcox' column after applying the Box-Cox transformation.

The printed results help assess how each transformation affects the distribution's symmetry and the success of skewness correction.







In [6]:

skew_original = df_aapl['Close'].skew()
skew_log = df_aapl['Close_log'].skew()
skew_sqrt = df_aapl['Close_sqrt'].skew()
skew_boxcox = pd.Series(df_aapl['Close_boxcox']).skew()

print(f"Original Skewness: {skew_original}")
print(f"Log Transformation Skewness: {skew_log}")
print(f"Square Root Transformation Skewness: {skew_sqrt}")
print(f"Box-Cox Transformation Skewness: {skew_boxcox}")


Original Skewness: 2.5045276102319933
Log Transformation Skewness: 0.8535555176510308
Square Root Transformation Skewness: 1.6211545809555206
Box-Cox Transformation Skewness: 0.4352746472149233


In [7]:

df_aapl['Open_log'] = np.log(df_aapl['Open'])
df_aapl['High_log'] = np.log(df_aapl['High'])
df_aapl['Low_log'] = np.log(df_aapl['Low'])
df_aapl['Adj Close_log'] = np.log(df_aapl['Adj Close'])
df_aapl['Volume_log'] = np.log(df_aapl['Volume'])


df_aapl['Open_sqrt'] = np.sqrt(df_aapl['Open'])
df_aapl['High_sqrt'] = np.sqrt(df_aapl['High'])
df_aapl['Low_sqrt'] = np.sqrt(df_aapl['Low'])
df_aapl['Adj Close_sqrt'] = np.sqrt(df_aapl['Adj Close'])
df_aapl['Volume_sqrt'] = np.sqrt(df_aapl['Volume'])

from scipy.stats import boxcox
df_aapl['Open_boxcox'], _ = boxcox(df_aapl['Open'])
df_aapl['High_boxcox'], _ = boxcox(df_aapl['High'])
df_aapl['Low_boxcox'], _ = boxcox(df_aapl['Low'])
df_aapl['Adj Close_boxcox'], _ = boxcox(df_aapl['Adj Close'])

This helps compare how the transformations reduce skewness in the data, aiming for a more normal distribution.

In [8]:

skewness_before = df_aapl[['Open', 'High', 'Low', 'Adj Close', 'Volume']].skew()
skewness_after = df_aapl[['Open_log', 'High_log', 'Low_log', 'Adj Close_log',
                          'Open_sqrt', 'High_sqrt', 'Low_sqrt', 'Adj Close_sqrt', 'Volume_sqrt',
                          'Open_boxcox', 'High_boxcox', 'Low_boxcox', 'Adj Close_boxcox']].skew()

print("Skewness Before Transformation:\n", skewness_before)
print("\nSkewness After Transformation:\n", skewness_after)


Skewness Before Transformation:
 Open         2.504632
High         2.502208
Low          2.506714
Adj Close    2.550677
Volume       3.565699
dtype: float64

Skewness After Transformation:
 Open_log            0.482872
High_log            0.481997
Low_log             0.484246
Adj Close_log       0.494009
Open_sqrt           1.620771
High_sqrt           1.621456
Low_sqrt            1.620661
Adj Close_sqrt      1.679402
Volume_sqrt         1.299776
Open_boxcox         0.181226
High_boxcox         0.179749
Low_boxcox          0.182882
Adj Close_boxcox    0.180085
dtype: float64


- Applied Box-Cox transformation to the 'Open', 'High', 'Low', 'Adj Close', and 'Close' columns.
- Recalculated skewness after the transformation to reduce skew and normalize the data for modeling.

In [9]:
from scipy import stats

df_aapl['Open_boxcox'], _ = stats.boxcox(df_aapl['Open'] + 1)
df_aapl['High_boxcox'], _ = stats.boxcox(df_aapl['High'] + 1)
df_aapl['Low_boxcox'], _ = stats.boxcox(df_aapl['Low'] + 1)
df_aapl['Adj Close_boxcox'], _ = stats.boxcox(df_aapl['Adj Close'] + 1)
df_aapl['Close_boxcox'], _ = stats.boxcox(df_aapl['Close'] + 1)

skewness_after_boxcox = df_aapl[['Open_boxcox', 'High_boxcox', 'Low_boxcox', 'Adj Close_boxcox', 'Close_boxcox']].skew()

print("Skewness After Box-Cox Transformation:")
print(skewness_after_boxcox)


Skewness After Box-Cox Transformation:
Open_boxcox         0.435237
High_boxcox         0.433381
Low_boxcox          0.437331
Adj Close_boxcox    0.458762
Close_boxcox        0.435275
dtype: float64


Feature Selection

In [10]:

df_aapl_cleaned = df_aapl[['Date', 'Open', 'High', 'Low', 'Adj Close', 'Close', 'Volume',
                           'Open_boxcox', 'High_boxcox', 'Low_boxcox', 'Adj Close_boxcox',
                           'Close_boxcox']]

print(df_aapl_cleaned.head())


         Date      Open      High       Low  Adj Close     Close     Volume  \
0  1980-12-12  0.128348  0.128906  0.128348   0.098943  0.128348  469033600   
1  1980-12-15  0.122210  0.122210  0.121652   0.093781  0.121652  175884800   
2  1980-12-16  0.113281  0.113281  0.112723   0.086898  0.112723  105728000   
3  1980-12-17  0.115513  0.116071  0.115513   0.089049  0.115513   86441600   
4  1980-12-18  0.118862  0.119420  0.118862   0.091630  0.118862   73449600   

   Open_boxcox  High_boxcox  Low_boxcox  Adj Close_boxcox  Close_boxcox  
0     0.117689     0.118173    0.117674          0.092374      0.117689  
1     0.112503     0.112516    0.112016          0.087857      0.112030  
2     0.104886     0.104897    0.104395          0.081785      0.104407  
3     0.106798     0.107287    0.106786          0.083688      0.106798  
4     0.109657     0.110145    0.109644          0.085966      0.109657  


### Train Validation Test Split

The code splits the data into training, validation, and test sets. The features `X` and target `Y` are split as follows:

- 70% for training (`X_train`, `Y_train`)
- 15% for validation (`X_val`, `Y_val`)
- 15% for testing (`X_test`, `Y_test`)

The split is done using a 30% test size, followed by splitting the remaining 70% into validation and test sets without shuffling (time series data).

In [11]:
from sklearn.model_selection import train_test_split

X = df_aapl_cleaned[['Open_boxcox', 'High_boxcox', 'Low_boxcox']]
Y = df_aapl_cleaned['Close_boxcox']

X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, test_size=0.3, shuffle=False)
X_val, X_test, Y_val, Y_test = train_test_split(X_temp, Y_temp, test_size=0.5, shuffle=False)

print(f"Training set: {X_train.shape}, Validation set: {X_val.shape}, Test set: {X_test.shape}")


Training set: (7736, 3), Validation set: (1658, 3), Test set: (1658, 3)


## GPU Activation

In [12]:
import torch

# Check GPU status
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("GPU is enabled:", torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print("No GPU found, using CPU.")


GPU is enabled: Tesla T4


## Bi-LSTM - XGB

In [13]:
!pip install xgboost




### Initial

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define Bi-LSTM Model
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(BiLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # Bi-directional → *2

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        return self.fc(out[:, -1, :])  # Last time step output

# Parameters
input_size = 3
hidden_size = 64
num_layers_list = [2, 3, 5]  # Different Bi-LSTM layers
learning_rate = 0.001
num_epochs = 100

# MinMax Scaling (helps LSTM)
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).unsqueeze(1).to(device)

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-8))) * 100   # Avoid divide-by-zero
    return mae, mse, rmse, r2, mape

# DataFrame to store results
columns = ["Layers", "Dataset", "MAE", "MSE", "RMSE", "R²", "MAPE"]
results_df = pd.DataFrame(columns=columns)

# Train multiple Bi-LSTM models
bilstm_outputs = {}  # Store Bi-LSTM embeddings for XGBoost later

for num_layers in num_layers_list:
    print(f"\nTraining Bi-LSTM with {num_layers} layers...")

    # Initialize model, loss function, and optimizer
    model = BiLSTMModel(input_size, hidden_size, num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        outputs = model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Evaluation
    model.eval()
    with torch.no_grad():
        train_pred = model(X_train_torch).cpu().numpy()
        val_pred = model(X_val_torch).cpu().numpy()
        test_pred = model(X_test_torch).cpu().numpy()

    # Inverse transform predictions
    train_pred_actual = scaler.inverse_transform(train_pred.reshape(-1, 1))
    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))
    test_pred_actual = scaler.inverse_transform(test_pred.reshape(-1, 1))

    # Compute metrics for each dataset
    metrics_train = compute_metrics(Y_train.values.flatten(), train_pred_actual.flatten())
    metrics_val = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())
    metrics_test = compute_metrics(Y_test.values.flatten(), test_pred_actual.flatten())

    # Append results to DataFrame
    results_df = pd.concat([
        results_df,
        pd.DataFrame([
            [num_layers, "Train", *metrics_train],
            [num_layers, "Validation", *metrics_val],
            [num_layers, "Test", *metrics_test]
        ], columns=columns)
    ], ignore_index=True)

    # Store Bi-LSTM embeddings for XGBoost
    bilstm_outputs[num_layers] = {
        "train": train_pred_actual,
        "val": val_pred_actual,
        "test": test_pred_actual
    }

# Find the best Bi-LSTM model (Lowest Validation MAPE)
best_model = results_df[results_df["Dataset"] == "Validation"].sort_values("MAPE").iloc[0]
best_layers = int(best_model["Layers"])

# Display Bi-LSTM Results
print("\nBi-LSTM Model Performance Comparison (2, 3, and 5 Layers)\n")
print(results_df.to_string(index=False))

print(f"\nBest Bi-LSTM Model: {best_layers} Layers (Based on Lowest Validation MAPE)\n")

# ---------- XGBoost on Best Bi-LSTM Embeddings ----------
print("\nTraining XGBoost on Best Bi-LSTM Embeddings...")

# Use the best Bi-LSTM's output as features for XGBoost
X_train_xgb = bilstm_outputs[best_layers]["train"]
X_val_xgb = bilstm_outputs[best_layers]["val"]
X_test_xgb = bilstm_outputs[best_layers]["test"]

# XGBoost Model
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.05)
xgb_model.fit(X_train_xgb, Y_train.values)

# Predictions
train_pred_xgb = xgb_model.predict(X_train_xgb)
val_pred_xgb = xgb_model.predict(X_val_xgb)
test_pred_xgb = xgb_model.predict(X_test_xgb)

# Compute metrics for XGBoost
metrics_train_xgb = compute_metrics(Y_train.values.flatten(), train_pred_xgb.flatten())
metrics_val_xgb = compute_metrics(Y_val.values.flatten(), val_pred_xgb.flatten())
metrics_test_xgb = compute_metrics(Y_test.values.flatten(), test_pred_xgb.flatten())

# Append XGBoost results to DataFrame
results_df = pd.concat([
    results_df,
    pd.DataFrame([
        [f"Bi-LSTM({best_layers}) + XGBoost", "Train", *metrics_train_xgb],
        [f"Bi-LSTM({best_layers}) + XGBoost", "Validation", *metrics_val_xgb],
        [f"Bi-LSTM({best_layers}) + XGBoost", "Test", *metrics_test_xgb]
    ], columns=columns)
], ignore_index=True)

# Display Final Results
print("\nFinal Model Performance (Bi-LSTM vs Bi-LSTM + XGBoost)\n")
print(results_df.to_string(index=False))

# Best Model Selection
best_overall = results_df[results_df["Dataset"] == "Validation"].sort_values("MAPE").iloc[0]
print(f"\nBest Overall Model: {best_overall['Layers']} (Based on Lowest Validation MAPE)")



Training Bi-LSTM with 2 layers...
Epoch [10/100], Loss: 0.0902
Epoch [20/100], Loss: 0.0734
Epoch [30/100], Loss: 0.0731
Epoch [40/100], Loss: 0.0719
Epoch [50/100], Loss: 0.0716
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training Bi-LSTM with 3 layers...
Epoch [10/100], Loss: 0.0751
Epoch [20/100], Loss: 0.0738
Epoch [30/100], Loss: 0.0716
Epoch [40/100], Loss: 0.0717
Epoch [50/100], Loss: 0.0715
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training Bi-LSTM with 5 layers...
Epoch [10/100], Loss: 0.1117
Epoch [20/100], Loss: 0.0739
Epoch [30/100], Loss: 0.0715
Epoch [40/100], Loss: 0.0724
Epoch [50/100], Loss: 0.0716
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Bi-LSTM Model Perform

### optuna

In [15]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.1-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.9-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.1-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.6/233.6 kB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.9-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import xgboost as xgb
import pandas as pd
import optuna
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define Bi-LSTM Model
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(BiLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # Bi-directional LSTM has 2x hidden size

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        return self.fc(out[:, -1, :])  # Last time step output

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-8))) * 100   # Avoid divide-by-zero
    return mae, mse, rmse, r2, mape

# MinMax Scaling
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).unsqueeze(1).to(device)

# Optuna Bi-LSTM Optimization
def objective_bilstm(trial):
    num_layers = trial.suggest_int("num_layers", 2, 5)
    hidden_size = trial.suggest_int("hidden_size", 32, 128)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-2)
    num_epochs = 100

    # Train Bi-LSTM
    model = BiLSTMModel(input_size=3, hidden_size=hidden_size, num_layers=num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        outputs = model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluate Bi-LSTM
    model.eval()
    with torch.no_grad():
        val_pred = model(X_val_torch).cpu().numpy()

    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))
    mape = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())[-1]  # MAPE

    return mape  # Minimize Validation MAPE

# Run Optuna for Bi-LSTM
study_bilstm = optuna.create_study(direction="minimize")
study_bilstm.optimize(objective_bilstm, n_trials=20)
best_bilstm_params = study_bilstm.best_params

# Train Best Bi-LSTM Model
best_bilstm_model = BiLSTMModel(input_size=3, hidden_size=best_bilstm_params["hidden_size"], num_layers=best_bilstm_params["num_layers"]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_bilstm_model.parameters(), lr=best_bilstm_params["learning_rate"])

for epoch in range(100):
    best_bilstm_model.train()
    outputs = best_bilstm_model(X_train_torch)
    loss = criterion(outputs, Y_train_torch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Extract Bi-LSTM Embeddings
best_bilstm_model.eval()
with torch.no_grad():
    X_train_xgb = best_bilstm_model(X_train_torch).cpu().numpy()
    X_val_xgb = best_bilstm_model(X_val_torch).cpu().numpy()
    X_test_xgb = best_bilstm_model(X_test_torch).cpu().numpy()

# Optuna XGBoost Optimization
def objective_xgb(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_loguniform("learning_rate", 0.01, 0.2),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        "objective": "reg:squarederror"
    }

    xgb_model = xgb.XGBRegressor(**params)
    xgb_model.fit(X_train_xgb, Y_train.values)

    val_pred_xgb = xgb_model.predict(X_val_xgb)
    mape = compute_metrics(Y_val.values.flatten(), val_pred_xgb.flatten())[-1]  # MAPE

    return mape  # Minimize Validation MAPE

# Run Optuna for XGBoost
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective_xgb, n_trials=20)
best_xgb_params = study_xgb.best_params

# Train Best XGBoost Model
best_xgb_model = xgb.XGBRegressor(**best_xgb_params)
best_xgb_model.fit(X_train_xgb, Y_train.values)

# Predictions
train_pred_xgb = best_xgb_model.predict(X_train_xgb)
val_pred_xgb = best_xgb_model.predict(X_val_xgb)
test_pred_xgb = best_xgb_model.predict(X_test_xgb)

# Compute final metrics
metrics_train_xgb = compute_metrics(Y_train.values.flatten(), train_pred_xgb.flatten())
metrics_val_xgb = compute_metrics(Y_val.values.flatten(), val_pred_xgb.flatten())
metrics_test_xgb = compute_metrics(Y_test.values.flatten(), test_pred_xgb.flatten())

# Print Final Results
print("\nFinal Model Performance (Bi-LSTM + XGBoost):\n")
print(f"Training:    MAE={metrics_train_xgb[0]:.4f}, MSE={metrics_train_xgb[1]:.4f}, RMSE={metrics_train_xgb[2]:.4f}, R²={metrics_train_xgb[3]:.4f}, MAPE={metrics_train_xgb[4]:.2f}%")
print(f"Validation:  MAE={metrics_val_xgb[0]:.4f}, MSE={metrics_val_xgb[1]:.4f}, RMSE={metrics_val_xgb[2]:.4f}, R²={metrics_val_xgb[3]:.4f}, MAPE={metrics_val_xgb[4]:.2f}%")
print(f"Test:        MAE={metrics_test_xgb[0]:.4f}, MSE={metrics_test_xgb[1]:.4f}, RMSE={metrics_test_xgb[2]:.4f}, R²={metrics_test_xgb[3]:.4f}, MAPE={metrics_test_xgb[4]:.2f}%")

print("\nBest Bi-LSTM Parameters:", best_bilstm_params)
print("Best XGBoost Parameters:", best_xgb_params)


[I 2025-02-13 11:01:45,781] A new study created in memory with name: no-name-b1424143-bf1f-4f67-9cbb-f3684a142838
[I 2025-02-13 11:01:48,119] Trial 0 finished with value: 73.34624428452885 and parameters: {'num_layers': 3, 'hidden_size': 38, 'learning_rate': 0.0006057502407408701}. Best is trial 0 with value: 73.34624428452885.
[I 2025-02-13 11:01:54,593] Trial 1 finished with value: 73.7891409624451 and parameters: {'num_layers': 4, 'hidden_size': 83, 'learning_rate': 0.0006177757617505723}. Best is trial 0 with value: 73.34624428452885.
[I 2025-02-13 11:02:07,135] Trial 2 finished with value: 73.98659026626059 and parameters: {'num_layers': 4, 'hidden_size': 115, 'learning_rate': 0.0007457937543603483}. Best is trial 0 with value: 73.34624428452885.
[I 2025-02-13 11:02:26,266] Trial 3 finished with value: 73.9826529459119 and parameters: {'num_layers': 5, 'hidden_size': 112, 'learning_rate': 0.006138532859104829}. Best is trial 0 with value: 73.34624428452885.
[I 2025-02-13 11:02:36,


Final Model Performance (Bi-LSTM + XGBoost):

Training:    MAE=0.0034, MSE=0.0000, RMSE=0.0051, R²=0.9998, MAPE=0.98%
Validation:  MAE=0.1572, MSE=0.0306, RMSE=0.1749, R²=-4.2004, MAPE=8.83%
Test:        MAE=0.4262, MSE=0.1877, RMSE=0.4332, R²=-30.1007, MAPE=21.03%

Best Bi-LSTM Parameters: {'num_layers': 2, 'hidden_size': 105, 'learning_rate': 0.00022141301540287622}
Best XGBoost Parameters: {'n_estimators': 57, 'learning_rate': 0.19856376662696132, 'max_depth': 6, 'subsample': 0.6688614379208031, 'colsample_bytree': 0.6656730417110022}


### bohb

In [17]:
!pip install ConfigSpace hpbandster

Collecting ConfigSpace
  Downloading configspace-1.2.1.tar.gz (130 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/131.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m131.0/131.0 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting hpbandster
  Downloading hpbandster-0.7.4.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting Pyro4 (from hpbandster)
  Downloading Pyro4-4.82-py2.py3-none-any.whl.metadata (2.2 kB)
Collecting serpent (from hpbandster)
  Downloading serpent-1.41-py3-none-any.whl.metadata (5.8 kB)
Collecting netifaces (from hpbandster)
  Downloading netifaces-0.11.0.

In [18]:
import numpy as np
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import hpbandster.core.nameserver as hpns
from hpbandster.optimizers import BOHB
from hpbandster.core.worker import Worker
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Define Bi-LSTM Model
class BiLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(BiLSTMModel, self).__init__()
        self.bilstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Bi-LSTM has 2x hidden_dim

    def forward(self, x):
        out, _ = self.bilstm(x)
        out = self.fc(out[:, -1, :])
        return out

# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    return mae, mse, rmse, r2, mape

# Convert datasets to PyTorch tensors
Y_train_torch = torch.tensor(Y_train.values, dtype=torch.float32).unsqueeze(1)
Y_val_torch = torch.tensor(Y_val.values, dtype=torch.float32).unsqueeze(1)
Y_test_torch = torch.tensor(Y_test.values, dtype=torch.float32).unsqueeze(1)

X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1)

# Bi-LSTM Configurations (2, 3, and 5 layers)
bilstm_layers = [2, 3, 5]
hidden_dim = 64
output_dim = 1
input_dim = X_train.shape[1]

# Dictionary to store Bi-LSTM feature representations
bilstm_features = {}

for num_layers in bilstm_layers:
    print(f"Training Bi-LSTM with {num_layers} layers...")

    bilstm_model = BiLSTMModel(input_dim, hidden_dim, num_layers, output_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(bilstm_model.parameters(), lr=0.001)
    num_epochs = 100

    for epoch in range(num_epochs):
        bilstm_model.train()
        optimizer.zero_grad()
        outputs = bilstm_model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)
        loss.backward()
        optimizer.step()

    # Extract Feature Representations
    bilstm_model.eval()
    with torch.no_grad():
        train_features = bilstm_model(X_train_torch).numpy()
        val_features = bilstm_model(X_val_torch).numpy()
        test_features = bilstm_model(X_test_torch).numpy()

    bilstm_features[num_layers] = (train_features, val_features, test_features)

# Define ConfigSpace for BOHB
def get_config_space():
    cs = CS.ConfigurationSpace()
    cs.add_hyperparameter(CSH.UniformIntegerHyperparameter("n_estimators", 50, 500, default_value=100))
    cs.add_hyperparameter(CSH.UniformFloatHyperparameter("learning_rate", 0.01, 0.3, default_value=0.1))
    cs.add_hyperparameter(CSH.UniformIntegerHyperparameter("max_depth", 3, 10, default_value=6))
    cs.add_hyperparameter(CSH.UniformFloatHyperparameter("subsample", 0.5, 1.0, default_value=0.8))
    cs.add_hyperparameter(CSH.UniformFloatHyperparameter("colsample_bytree", 0.5, 1.0, default_value=0.8))
    return cs

# BOHB Worker for XGBoost
class XGBoostWorker(Worker):
    def __init__(self, train_features, val_features, **kwargs):
        super().__init__(**kwargs)
        self.train_features = train_features
        self.val_features = val_features

    def compute(self, config, budget, **kwargs):
        model = xgb.XGBRegressor(
            n_estimators=config["n_estimators"],
            learning_rate=config["learning_rate"],
            max_depth=config["max_depth"],
            subsample=config["subsample"],
            colsample_bytree=config["colsample_bytree"],
            random_state=42
        )
        model.fit(self.train_features, Y_train)
        Y_val_pred = model.predict(self.val_features)
        mae = mean_absolute_error(Y_val, Y_val_pred)
        return {"loss": mae, "info": config}

# Run BOHB for each Bi-LSTM configuration
best_models = {}

for num_layers in bilstm_layers:
    print(f"\nRunning BOHB for Bi-LSTM ({num_layers} layers) + XGBoost...")

    train_features, val_features, test_features = bilstm_features[num_layers]

    # Start NameServer
    NS = hpns.NameServer(run_id=f"bilstm_{num_layers}_xgb_bohb", host="127.0.0.1", port=None)
    NS.start()

    worker = XGBoostWorker(
        train_features=train_features,
        val_features=val_features,
        nameserver="127.0.0.1",
        run_id=f"bilstm_{num_layers}_xgb_bohb"
    )
    worker.run(background=True)

    bohb = BOHB(
        configspace=get_config_space(),
        run_id=f"bilstm_{num_layers}_xgb_bohb",
        nameserver="127.0.0.1",
        min_budget=1,
        max_budget=3
    )

    res = bohb.run(n_iterations=50)

    # Shutdown BOHB
    bohb.shutdown()
    NS.shutdown()

    # Retrieve Best Configuration
    best_config = res.get_incumbent_id()
    best_params = res.get_id2config_mapping()[best_config]["config"]

    # Train Best XGB Model on Bi-LSTM Features
    best_xgb_model = xgb.XGBRegressor(
        n_estimators=best_params["n_estimators"],
        learning_rate=best_params["learning_rate"],
        max_depth=best_params["max_depth"],
        subsample=best_params["subsample"],
        colsample_bytree=best_params["colsample_bytree"],
        random_state=42
    )

    best_xgb_model.fit(train_features, Y_train)

    # Make Predictions
    Y_train_pred = best_xgb_model.predict(train_features)
    Y_val_pred = best_xgb_model.predict(val_features)
    Y_test_pred = best_xgb_model.predict(test_features)

    # Calculate Metrics
    train_metrics = calculate_metrics(Y_train, Y_train_pred)
    val_metrics = calculate_metrics(Y_val, Y_val_pred)
    test_metrics = calculate_metrics(Y_test, Y_test_pred)

    # Store best model and metrics
    best_models[num_layers] = {
        "params": best_params,
        "train_metrics": train_metrics,
        "val_metrics": val_metrics,
        "test_metrics": test_metrics
    }

    # Print Results
    print(f"\nBest Parameters for Bi-LSTM ({num_layers} layers) + XGBoost:")
    print(best_params)

    print("\nTraining set metrics:")
    print(f"MAE: {train_metrics[0]:.4f}, MSE: {train_metrics[1]:.4f}, RMSE: {train_metrics[2]:.4f}, R²: {train_metrics[3]:.4f}, MAPE: {train_metrics[4]:.2f}%")

    print("\nValidation set metrics:")
    print(f"MAE: {val_metrics[0]:.4f}, MSE: {val_metrics[1]:.4f}, RMSE: {val_metrics[2]:.4f}, R²: {val_metrics[3]:.4f}, MAPE: {val_metrics[4]:.2f}%")

    print("\nTest set metrics:")
    print(f"MAE: {test_metrics[0]:.4f}, MSE: {test_metrics[1]:.4f}, RMSE: {test_metrics[2]:.4f}, R²: {test_metrics[3]:.4f}, MAPE: {test_metrics[4]:.2f}%")


Training Bi-LSTM with 2 layers...
Training Bi-LSTM with 3 layers...
Training Bi-LSTM with 5 layers...

Running BOHB for Bi-LSTM (2 layers) + XGBoost...

Best Parameters for Bi-LSTM (2 layers) + XGBoost:
{'colsample_bytree': 0.9861118922457, 'learning_rate': 0.2824946990358, 'max_depth': 6, 'n_estimators': 494, 'subsample': 0.7901141660438}

Training set metrics:
MAE: 0.0036, MSE: 0.0000, RMSE: 0.0055, R²: 0.9998, MAPE: 1.04%

Validation set metrics:
MAE: 0.1568, MSE: 0.0304, RMSE: 0.1745, R²: -4.1756, MAPE: 8.80%

Test set metrics:
MAE: 0.4258, MSE: 0.1873, RMSE: 0.4328, R²: -30.0351, MAPE: 21.00%

Running BOHB for Bi-LSTM (3 layers) + XGBoost...

Best Parameters for Bi-LSTM (3 layers) + XGBoost:
{'colsample_bytree': 0.9656658580314, 'learning_rate': 0.1676121612285, 'max_depth': 5, 'n_estimators': 82, 'subsample': 0.5409891053982}

Training set metrics:
MAE: 0.0037, MSE: 0.0000, RMSE: 0.0056, R²: 0.9998, MAPE: 1.08%

Validation set metrics:
MAE: 0.1567, MSE: 0.0304, RMSE: 0.1744, R²: 

## Bi-LSTM [CatBoost]

In [19]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


### initial

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define BiLSTM Model
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(BiLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # *2 for bidirectional

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)  # *2 for bidirectional
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        return self.fc(out[:, -1, :])

# Set Parameters
input_size = 3
hidden_size = 64
num_layers_list = [2, 3, 5]
learning_rate = 0.001
num_epochs = 100

# MinMax Scaling
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).unsqueeze(1).to(device)

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-8))) * 100
    return mae, mse, rmse, r2, mape

# DataFrame to store results
columns = ["Layers", "Dataset", "MAE", "MSE", "RMSE", "R²", "MAPE"]
results_df = pd.DataFrame(columns=columns)

# Train multiple Bi-LSTM models
bilstm_outputs = {}

for num_layers in num_layers_list:
    print(f"\nTraining Bi-LSTM with {num_layers} layers...")

    model = BiLSTMModel(input_size, hidden_size, num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        outputs = model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Evaluation
    model.eval()
    with torch.no_grad():
        train_pred = model(X_train_torch).cpu().numpy()
        val_pred = model(X_val_torch).cpu().numpy()
        test_pred = model(X_test_torch).cpu().numpy()

    # Inverse transform predictions
    train_pred_actual = scaler.inverse_transform(train_pred.reshape(-1, 1))
    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))
    test_pred_actual = scaler.inverse_transform(test_pred.reshape(-1, 1))

    # Compute metrics for each dataset
    metrics_train = compute_metrics(Y_train.values.flatten(), train_pred_actual.flatten())
    metrics_val = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())
    metrics_test = compute_metrics(Y_test.values.flatten(), test_pred_actual.flatten())

    # Append results to DataFrame
    results_df = pd.concat([
        results_df,
        pd.DataFrame([
            [num_layers, "Train", *metrics_train],
            [num_layers, "Validation", *metrics_val],
            [num_layers, "Test", *metrics_test]
        ], columns=columns)
    ], ignore_index=True)

    # Store Bi-LSTM embeddings for CatBoost
    bilstm_outputs[num_layers] = {
        "train": train_pred_actual,
        "val": val_pred_actual,
        "test": test_pred_actual
    }

# Find the best Bi-LSTM model (Lowest Validation MAPE)
best_model = results_df[results_df["Dataset"] == "Validation"].sort_values("MAPE").iloc[0]
best_layers = int(best_model["Layers"])

# Display Bi-LSTM Results
print("\nBi-LSTM Model Performance Comparison (2, 3, and 5 Layers)\n")
print(results_df.to_string(index=False))

print(f"\nBest Bi-LSTM Model: {best_layers} Layers (Based on Lowest Validation MAPE)\n")

# ---------- CatBoost on Best Bi-LSTM Embeddings ----------
print("\nTraining CatBoost on Best Bi-LSTM Embeddings...")

# Use the best Bi-LSTM's output as features for CatBoost
X_train_cat = bilstm_outputs[best_layers]["train"]
X_val_cat = bilstm_outputs[best_layers]["val"]
X_test_cat = bilstm_outputs[best_layers]["test"]

# CatBoost Model
cat_model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, loss_function="RMSE", verbose=100)
cat_model.fit(X_train_cat, Y_train.values, eval_set=(X_val_cat, Y_val.values), early_stopping_rounds=100)

# Predictions
train_pred_cat = cat_model.predict(X_train_cat)
val_pred_cat = cat_model.predict(X_val_cat)
test_pred_cat = cat_model.predict(X_test_cat)

# Compute metrics for CatBoost
metrics_train_cat = compute_metrics(Y_train.values.flatten(), train_pred_cat.flatten())
metrics_val_cat = compute_metrics(Y_val.values.flatten(), val_pred_cat.flatten())
metrics_test_cat = compute_metrics(Y_test.values.flatten(), test_pred_cat.flatten())

# Append CatBoost results to DataFrame
results_df = pd.concat([
    results_df,
    pd.DataFrame([
        [f"Bi-LSTM({best_layers}) + CatBoost", "Train", *metrics_train_cat],
        [f"Bi-LSTM({best_layers}) + CatBoost", "Validation", *metrics_val_cat],
        [f"Bi-LSTM({best_layers}) + CatBoost", "Test", *metrics_test_cat]
    ], columns=columns)
], ignore_index=True)

# Display Final Results
print("\nFinal Model Performance (Bi-LSTM vs Bi-LSTM + CatBoost)\n")
print(results_df.to_string(index=False))

# Best Model Selection
best_overall = results_df[results_df["Dataset"] == "Validation"].sort_values("MAPE").iloc[0]
print(f"\nBest Overall Model: {best_overall['Layers']} (Based on Lowest Validation MAPE)")



Training Bi-LSTM with 2 layers...
Epoch [10/100], Loss: 0.1042
Epoch [20/100], Loss: 0.0741
Epoch [30/100], Loss: 0.0754
Epoch [40/100], Loss: 0.0720
Epoch [50/100], Loss: 0.0719
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training Bi-LSTM with 3 layers...
Epoch [10/100], Loss: 0.0743
Epoch [20/100], Loss: 0.0734
Epoch [30/100], Loss: 0.0716
Epoch [40/100], Loss: 0.0716
Epoch [50/100], Loss: 0.0715
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training Bi-LSTM with 5 layers...
Epoch [10/100], Loss: 0.0885
Epoch [20/100], Loss: 0.0755
Epoch [30/100], Loss: 0.0715
Epoch [40/100], Loss: 0.0719
Epoch [50/100], Loss: 0.0715
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Bi-LSTM Model Perform

### optuna

In [21]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import optuna
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define Bi-LSTM Model
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(BiLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # Bi-directional => hidden_size * 2

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)  # *2 for bidirectional
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        return self.fc(out[:, -1, :])

# MinMax Scaling
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).unsqueeze(1).to(device)

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-8))) * 100
    return mae, mse, rmse, r2, mape

# Optuna objective function
def objective(trial):
    hidden_size = trial.suggest_categorical("hidden_size", [32, 64, 128])
    num_layers = trial.suggest_int("num_layers", 2, 5)
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-4, 1e-2)

    model = BiLSTMModel(input_size=3, hidden_size=hidden_size, num_layers=num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(50):  # Reduce epochs for faster tuning
        model.train()
        outputs = model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    with torch.no_grad():
        val_pred = model(X_val_torch).cpu().numpy()

    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))
    _, _, _, _, val_mape = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())

    return val_mape

# Run Optuna optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

# Get best hyperparameters
best_params = study.best_params
print(f"\nBest Bi-LSTM Hyperparameters: {best_params}")

# Train Best Bi-LSTM Model
best_bilstm = BiLSTMModel(input_size=3, hidden_size=best_params["hidden_size"], num_layers=best_params["num_layers"]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_bilstm.parameters(), lr=best_params["learning_rate"])

for epoch in range(100):
    best_bilstm.train()
    outputs = best_bilstm(X_train_torch)
    loss = criterion(outputs, Y_train_torch)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Evaluate Best Bi-LSTM Model
best_bilstm.eval()
with torch.no_grad():
    train_pred = best_bilstm(X_train_torch).cpu().numpy()
    val_pred = best_bilstm(X_val_torch).cpu().numpy()
    test_pred = best_bilstm(X_test_torch).cpu().numpy()

# Inverse transform predictions
train_pred_actual = scaler.inverse_transform(train_pred.reshape(-1, 1))
val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))
test_pred_actual = scaler.inverse_transform(test_pred.reshape(-1, 1))

# Compute metrics
metrics_train = compute_metrics(Y_train.values.flatten(), train_pred_actual.flatten())
metrics_val = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())
metrics_test = compute_metrics(Y_test.values.flatten(), test_pred_actual.flatten())

# Store Bi-LSTM embeddings
bilstm_outputs = {
    "train": train_pred_actual,
    "val": val_pred_actual,
    "test": test_pred_actual
}

# Display Bi-LSTM Results
print("\nBi-LSTM Model Performance (Best Hyperparameters):\n")
print(f"Train: {metrics_train}\nValidation: {metrics_val}\nTest: {metrics_test}")

# ---------- CatBoost on Best Bi-LSTM Embeddings ----------
print("\nTraining CatBoost on Best Bi-LSTM Embeddings...")

# Use Bi-LSTM embeddings as input for CatBoost
X_train_cat = bilstm_outputs["train"]
X_val_cat = bilstm_outputs["val"]
X_test_cat = bilstm_outputs["test"]

# CatBoost Model
cat_model = CatBoostRegressor(iterations=1000, learning_rate=0.05, depth=6, loss_function="RMSE", verbose=100)
cat_model.fit(X_train_cat, Y_train.values, eval_set=(X_val_cat, Y_val.values), early_stopping_rounds=100)

# Predictions
train_pred_cat = cat_model.predict(X_train_cat)
val_pred_cat = cat_model.predict(X_val_cat)
test_pred_cat = cat_model.predict(X_test_cat)

# Compute metrics for CatBoost
metrics_train_cat = compute_metrics(Y_train.values.flatten(), train_pred_cat.flatten())
metrics_val_cat = compute_metrics(Y_val.values.flatten(), val_pred_cat.flatten())
metrics_test_cat = compute_metrics(Y_test.values.flatten(), test_pred_cat.flatten())

# Display Final Results
print("\nFinal Model Performance (Bi-LSTM vs Bi-LSTM + CatBoost)\n")
print(f"Train: {metrics_train_cat}\nValidation: {metrics_val_cat}\nTest: {metrics_test_cat}")

# Best Model Selection
best_model = "Bi-LSTM + CatBoost" if metrics_val_cat[4] < metrics_val[4] else "Bi-LSTM Only"
print(f"\nBest Overall Model: {best_model} (Based on Lowest Validation MAPE)")


[I 2025-02-13 11:23:52,988] A new study created in memory with name: no-name-207cd5da-a387-4f42-8bee-34e212ad8413
[I 2025-02-13 11:23:56,949] Trial 0 finished with value: 73.24154832223407 and parameters: {'hidden_size': 64, 'num_layers': 5, 'learning_rate': 0.002925754877493491}. Best is trial 0 with value: 73.24154832223407.
[I 2025-02-13 11:24:07,531] Trial 1 finished with value: 73.35417958185919 and parameters: {'hidden_size': 128, 'num_layers': 4, 'learning_rate': 0.0019432425149269342}. Best is trial 0 with value: 73.24154832223407.
[I 2025-02-13 11:24:08,208] Trial 2 finished with value: 71.77385283640025 and parameters: {'hidden_size': 32, 'num_layers': 2, 'learning_rate': 0.0013283048016925125}. Best is trial 2 with value: 71.77385283640025.
[I 2025-02-13 11:24:11,753] Trial 3 finished with value: 73.13190834486093 and parameters: {'hidden_size': 64, 'num_layers': 5, 'learning_rate': 0.0028865533935638573}. Best is trial 2 with value: 71.77385283640025.
[I 2025-02-13 11:24:25


Best Bi-LSTM Hyperparameters: {'hidden_size': 32, 'num_layers': 3, 'learning_rate': 0.000644812874377265}

Bi-LSTM Model Performance (Best Hyperparameters):

Train: (0.32683288987496467, 0.1744029795416491, 0.4176158276953223, 0.0035310996003891093, 124.62617348662235)
Validation: (1.2914852984509062, 1.6738109496480915, 1.2937584587735422, -283.58707968360284, 73.89607738874251)
Test: (1.5604630193532787, 2.441078961798212, 1.562395264265164, -403.48237692783965, 77.38704655921136)

Training CatBoost on Best Bi-LSTM Embeddings...
0:	learn: 0.3980532	test: 1.2474559	best: 1.2474559 (0)	total: 1.7ms	remaining: 1.7s
100:	learn: 0.0080367	test: 0.1988989	best: 0.1988989 (100)	total: 156ms	remaining: 1.39s
200:	learn: 0.0056610	test: 0.1772462	best: 0.1772389 (199)	total: 297ms	remaining: 1.18s
300:	learn: 0.0050505	test: 0.1758311	best: 0.1758311 (300)	total: 434ms	remaining: 1.01s
400:	learn: 0.0047966	test: 0.1754897	best: 0.1754742 (392)	total: 574ms	remaining: 858ms
500:	learn: 0.004

### BOHB

In [22]:
import numpy as np
import catboost as cb
import torch
import torch.nn as nn
import torch.optim as optim
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import hpbandster.core.nameserver as hpns
from hpbandster.optimizers import BOHB
from hpbandster.core.worker import Worker
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Define Bi-LSTM Model
class BiLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(BiLSTMModel, self).__init__()
        self.bilstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # Bi-directional => hidden_dim * 2

    def forward(self, x):
        out, _ = self.bilstm(x)
        out = self.fc(out[:, -1, :])  # Take last timestep
        return out

# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    return mae, mse, rmse, r2, mape

# Convert datasets to PyTorch tensors
Y_train_torch = torch.tensor(Y_train.values, dtype=torch.float32).unsqueeze(1)
Y_val_torch = torch.tensor(Y_val.values, dtype=torch.float32).unsqueeze(1)
Y_test_torch = torch.tensor(Y_test.values, dtype=torch.float32).unsqueeze(1)

X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1)

# Bi-LSTM Configurations (2, 3, and 5 layers)
bilstm_layers = [2, 3, 5]
hidden_dim = 64
output_dim = 1
input_dim = X_train.shape[1]

# Dictionary to store Bi-LSTM feature representations
bilstm_features = {}

for num_layers in bilstm_layers:
    print(f"Training Bi-LSTM with {num_layers} layers...")

    bilstm_model = BiLSTMModel(input_dim, hidden_dim, num_layers, output_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(bilstm_model.parameters(), lr=0.001)
    num_epochs = 100

    for epoch in range(num_epochs):
        bilstm_model.train()
        optimizer.zero_grad()
        outputs = bilstm_model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)
        loss.backward()
        optimizer.step()

    # Extract Feature Representations
    bilstm_model.eval()
    with torch.no_grad():
        train_features = bilstm_model(X_train_torch).numpy()
        val_features = bilstm_model(X_val_torch).numpy()
        test_features = bilstm_model(X_test_torch).numpy()

    bilstm_features[num_layers] = (train_features, val_features, test_features)

# Define ConfigSpace for BOHB
def get_config_space():
    cs = CS.ConfigurationSpace()
    cs.add_hyperparameter(CSH.UniformIntegerHyperparameter("iterations", 50, 500, default_value=100))
    cs.add_hyperparameter(CSH.UniformFloatHyperparameter("learning_rate", 0.01, 0.3, default_value=0.1))
    cs.add_hyperparameter(CSH.UniformIntegerHyperparameter("depth", 3, 10, default_value=6))
    cs.add_hyperparameter(CSH.UniformFloatHyperparameter("bagging_temperature", 0.0, 1.0, default_value=0.8))
    cs.add_hyperparameter(CSH.UniformFloatHyperparameter("colsample_bylevel", 0.5, 1.0, default_value=0.8))
    return cs

# BOHB Worker for CatBoost
class CatBoostWorker(Worker):
    def __init__(self, train_features, val_features, **kwargs):
        super().__init__(**kwargs)
        self.train_features = train_features
        self.val_features = val_features

    def compute(self, config, budget, **kwargs):
        model = cb.CatBoostRegressor(
            iterations=config["iterations"],
            learning_rate=config["learning_rate"],
            depth=config["depth"],
            bagging_temperature=config["bagging_temperature"],
            colsample_bylevel=config["colsample_bylevel"],
            random_seed=42,
            verbose=0
        )
        model.fit(self.train_features, Y_train)
        Y_val_pred = model.predict(self.val_features)
        mae = mean_absolute_error(Y_val, Y_val_pred)
        return {"loss": mae, "info": config}

# Run BOHB for each Bi-LSTM configuration
best_models = {}

for num_layers in bilstm_layers:
    print(f"\nRunning BOHB for Bi-LSTM ({num_layers} layers) + CatBoost...")

    train_features, val_features, test_features = bilstm_features[num_layers]

    # Start NameServer
    NS = hpns.NameServer(run_id=f"bilstm_{num_layers}_catboost_bohb", host="127.0.0.1", port=None)
    NS.start()

    worker = CatBoostWorker(
        train_features=train_features,
        val_features=val_features,
        nameserver="127.0.0.1",
        run_id=f"bilstm_{num_layers}_catboost_bohb"
    )
    worker.run(background=True)

    bohb = BOHB(
        configspace=get_config_space(),
        run_id=f"bilstm_{num_layers}_catboost_bohb",
        nameserver="127.0.0.1",
        min_budget=1,
        max_budget=3
    )

    res = bohb.run(n_iterations=50)

    # Shutdown BOHB
    bohb.shutdown()
    NS.shutdown()

    # Retrieve Best Configuration
    best_config = res.get_incumbent_id()
    best_params = res.get_id2config_mapping()[best_config]["config"]

    # Train Best CatBoost Model on Bi-LSTM Features
    best_catboost_model = cb.CatBoostRegressor(
        iterations=best_params["iterations"],
        learning_rate=best_params["learning_rate"],
        depth=best_params["depth"],
        bagging_temperature=best_params["bagging_temperature"],
        colsample_bylevel=best_params["colsample_bylevel"],
        random_seed=42,
        verbose=0
    )

    best_catboost_model.fit(train_features, Y_train)

    # Make Predictions
    Y_train_pred = best_catboost_model.predict(train_features)
    Y_val_pred = best_catboost_model.predict(val_features)
    Y_test_pred = best_catboost_model.predict(test_features)

    # Calculate Metrics
    train_metrics = calculate_metrics(Y_train, Y_train_pred)
    val_metrics = calculate_metrics(Y_val, Y_val_pred)
    test_metrics = calculate_metrics(Y_test, Y_test_pred)

    # Store best model and metrics
    best_models[num_layers] = {
        "params": best_params,
        "train_metrics": train_metrics,
        "val_metrics": val_metrics,
        "test_metrics": test_metrics
    }

    # Print Results
    print(f"\nBest Parameters for Bi-LSTM ({num_layers} layers) + CatBoost:")
    print(best_params)

    print("\nTraining set metrics:")
    print(train_metrics)

    print("\nValidation set metrics:")
    print(val_metrics)

    print("\nTest set metrics:")
    print(test_metrics)


Training Bi-LSTM with 2 layers...
Training Bi-LSTM with 3 layers...
Training Bi-LSTM with 5 layers...

Running BOHB for Bi-LSTM (2 layers) + CatBoost...

Best Parameters for Bi-LSTM (2 layers) + CatBoost:
{'bagging_temperature': 0.5488155518478, 'colsample_bylevel': 0.9577685831541, 'depth': 9, 'iterations': 471, 'learning_rate': 0.2826543846672}

Training set metrics:
(0.0035974328653817647, 3.0181861275629613e-05, 0.005493802078308757, 0.9998275529111006, 1.0361633306967521)

Validation set metrics:
(0.15750859843334464, 0.030671020072395385, 0.17513143656235847, -4.214792049935686, 8.840991623781303)

Test set metrics:
(0.4264838004683717, 0.18792350083388817, 0.4335014427125799, -30.138584817386725, 21.039113477536315)

Running BOHB for Bi-LSTM (3 layers) + CatBoost...

Best Parameters for Bi-LSTM (3 layers) + CatBoost:
{'bagging_temperature': 0.2762614862038, 'colsample_bylevel': 0.9827069728425, 'depth': 9, 'iterations': 481, 'learning_rate': 0.2800476871542}

Training set metric

## Bi-LSTM [Lightboost]

### initial

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define Bi-LSTM Model
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(BiLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # Bi-directional (2x hidden size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)  # Bi-directional
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.bilstm(x, (h0, c0))
        return self.fc(out[:, -1, :])  # Take last time step output

# Set Parameters
input_size = 3
hidden_size = 64
num_layers_list = [2, 3, 5]  # Different Bi-LSTM layers
learning_rate = 0.001
num_epochs = 100

# MinMax Scaling
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).unsqueeze(1).to(device)

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-8))) * 100  # Avoid divide-by-zero
    return mae, mse, rmse, r2, mape

# DataFrame to store results
columns = ["Layers", "Dataset", "MAE", "MSE", "RMSE", "R²", "MAPE"]
results_df = pd.DataFrame(columns=columns)

# Train multiple Bi-LSTM models
bilstm_outputs = {}  # Store Bi-LSTM embeddings for LGBM

for num_layers in num_layers_list:
    print(f"\nTraining Bi-LSTM with {num_layers} layers...")

    # Initialize model, loss function, and optimizer
    model = BiLSTMModel(input_size, hidden_size, num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        outputs = model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

    # Evaluation
    model.eval()
    with torch.no_grad():
        train_pred = model(X_train_torch).cpu().numpy()
        val_pred = model(X_val_torch).cpu().numpy()
        test_pred = model(X_test_torch).cpu().numpy()

    # Inverse transform predictions
    train_pred_actual = scaler.inverse_transform(train_pred.reshape(-1, 1))
    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))
    test_pred_actual = scaler.inverse_transform(test_pred.reshape(-1, 1))

    # Compute metrics for each dataset
    metrics_train = compute_metrics(Y_train.values.flatten(), train_pred_actual.flatten())
    metrics_val = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())
    metrics_test = compute_metrics(Y_test.values.flatten(), test_pred_actual.flatten())

    # Append results to DataFrame
    results_df = pd.concat([
        results_df,
        pd.DataFrame([
            [num_layers, "Train", *metrics_train],
            [num_layers, "Validation", *metrics_val],
            [num_layers, "Test", *metrics_test]
        ], columns=columns)
    ], ignore_index=True)

    # Store Bi-LSTM embeddings for LGBM
    bilstm_outputs[num_layers] = {
        "train": train_pred_actual,
        "val": val_pred_actual,
        "test": test_pred_actual
    }

# Find the best Bi-LSTM model (Lowest Validation MAPE)
best_model = results_df[results_df["Dataset"] == "Validation"].sort_values("MAPE").iloc[0]
best_layers = int(best_model["Layers"])

# Display Bi-LSTM Results
print("\nBi-LSTM Model Performance Comparison (2, 3, and 5 Layers)\n")
print(results_df.to_string(index=False))

print(f"\nBest Bi-LSTM Model: {best_layers} Layers (Based on Lowest Validation MAPE)\n")

# ---------- LightGBM on Best Bi-LSTM Embeddings ----------
print("\nTraining LightGBM on Best Bi-LSTM Embeddings...")

# Use the best Bi-LSTM's output as features for LGBM
X_train_lgb = bilstm_outputs[best_layers]["train"]
X_val_lgb = bilstm_outputs[best_layers]["val"]
X_test_lgb = bilstm_outputs[best_layers]["test"]

# LightGBM Dataset
lgb_train = lgb.Dataset(X_train_lgb, label=Y_train)
lgb_val = lgb.Dataset(X_val_lgb, label=Y_val, reference=lgb_train)

# LGBM Parameters
lgb_params = {
    "objective": "regression",
    "metric": "rmse",
    "boosting_type": "gbdt",
    "learning_rate": 0.05,
    "num_leaves": 31
}

# Train LGBM
lgb_model = lgb.train(lgb_params, lgb_train, valid_sets=[lgb_train, lgb_val], num_boost_round=200, callbacks=[lgb.log_evaluation(50)])

# Predictions
train_pred_lgb = lgb_model.predict(X_train_lgb)
val_pred_lgb = lgb_model.predict(X_val_lgb)
test_pred_lgb = lgb_model.predict(X_test_lgb)

# Compute metrics for LGBM
metrics_train_lgb = compute_metrics(Y_train.values.flatten(), train_pred_lgb.flatten())
metrics_val_lgb = compute_metrics(Y_val.values.flatten(), val_pred_lgb.flatten())
metrics_test_lgb = compute_metrics(Y_test.values.flatten(), test_pred_lgb.flatten())

# Append LGBM results to DataFrame
results_df = pd.concat([
    results_df,
    pd.DataFrame([
        [f"Bi-LSTM({best_layers}) + LGBM", "Train", *metrics_train_lgb],
        [f"Bi-LSTM({best_layers}) + LGBM", "Validation", *metrics_val_lgb],
        [f"Bi-LSTM({best_layers}) + LGBM", "Test", *metrics_test_lgb]
    ], columns=columns)
], ignore_index=True)

# Display Final Results
print("\nFinal Model Performance (Bi-LSTM vs Bi-LSTM + LGBM)\n")
print(results_df.to_string(index=False))

# Best Model Selection
best_overall = results_df[results_df["Dataset"] == "Validation"].sort_values("MAPE").iloc[0]
print(f"\nBest Overall Model: {best_overall['Layers']} (Based on Lowest Validation MAPE)")



Training Bi-LSTM with 2 layers...
Epoch [10/100], Loss: 0.1079
Epoch [20/100], Loss: 0.0741
Epoch [30/100], Loss: 0.0755
Epoch [40/100], Loss: 0.0720
Epoch [50/100], Loss: 0.0719
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training Bi-LSTM with 3 layers...
Epoch [10/100], Loss: 0.0759
Epoch [20/100], Loss: 0.0741
Epoch [30/100], Loss: 0.0716
Epoch [40/100], Loss: 0.0717
Epoch [50/100], Loss: 0.0715
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Training Bi-LSTM with 5 layers...
Epoch [10/100], Loss: 0.0880
Epoch [20/100], Loss: 0.0762
Epoch [30/100], Loss: 0.0716
Epoch [40/100], Loss: 0.0719
Epoch [50/100], Loss: 0.0716
Epoch [60/100], Loss: 0.0715
Epoch [70/100], Loss: 0.0715
Epoch [80/100], Loss: 0.0715
Epoch [90/100], Loss: 0.0715
Epoch [100/100], Loss: 0.0715

Bi-LSTM Model Perform

### optuna

In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import lightgbm as lgb
import pandas as pd
import optuna
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define Bi-LSTM Model
class BiLSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(BiLSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # Bi-directional doubles the hidden size

    def forward(self, x):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(x.device)
        out, _ = self.lstm(x, (h0, c0))
        return self.fc(out[:, -1, :])

# Function to compute evaluation metrics
def compute_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / np.maximum(np.abs(y_true), 1e-8))) * 100
    return mae, mse, rmse, r2, mape

# MinMax Scaling
scaler = MinMaxScaler()
Y_train_scaled = scaler.fit_transform(Y_train.values.reshape(-1, 1))
Y_val_scaled = scaler.transform(Y_val.values.reshape(-1, 1))
Y_test_scaled = scaler.transform(Y_test.values.reshape(-1, 1))

# Convert data to PyTorch tensors
X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_train_torch = torch.tensor(Y_train_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_val_torch = torch.tensor(Y_val_scaled, dtype=torch.float32).unsqueeze(1).to(device)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1).to(device)
Y_test_torch = torch.tensor(Y_test_scaled, dtype=torch.float32).unsqueeze(1).to(device)

# ----------- OPTUNA OPTIMIZATION FUNCTION -----------

def objective(trial):
    # Sample Bi-LSTM hyperparameters
    hidden_size = trial.suggest_int("hidden_size", 32, 128, step=16)
    num_layers = trial.suggest_int("num_layers", 2, 5)
    learning_rate = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)

    # Initialize model, loss function, and optimizer
    model = BiLSTMModel(input_size=3, hidden_size=hidden_size, num_layers=num_layers).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    num_epochs = 50
    for epoch in range(num_epochs):
        model.train()
        outputs = model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Evaluation
    model.eval()
    with torch.no_grad():
        val_pred = model(X_val_torch).cpu().numpy()

    # Inverse transform predictions
    val_pred_actual = scaler.inverse_transform(val_pred.reshape(-1, 1))

    # Compute MAPE (minimization objective)
    _, _, _, _, mape = compute_metrics(Y_val.values.flatten(), val_pred_actual.flatten())

    return mape  # Optuna minimizes MAPE

# Run Optuna for Bi-LSTM
study_bilstm = optuna.create_study(direction="minimize")
study_bilstm.optimize(objective, n_trials=20)

# Best Bi-LSTM Model Parameters
best_bilstm_params = study_bilstm.best_params
print("\nBest Bi-LSTM Model:", best_bilstm_params)

# ----------- Train Best Bi-LSTM and Get Embeddings -----------

best_bilstm = BiLSTMModel(input_size=3, hidden_size=best_bilstm_params["hidden_size"], num_layers=best_bilstm_params["num_layers"]).to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(best_bilstm.parameters(), lr=best_bilstm_params["learning_rate"])

# Train Best Bi-LSTM
for epoch in range(50):
    best_bilstm.train()
    outputs = best_bilstm(X_train_torch)
    loss = criterion(outputs, Y_train_torch)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Get Bi-LSTM embeddings
best_bilstm.eval()
with torch.no_grad():
    train_pred = best_bilstm(X_train_torch).cpu().numpy()
    val_pred = best_bilstm(X_val_torch).cpu().numpy()
    test_pred = best_bilstm(X_test_torch).cpu().numpy()

# Inverse transform predictions
X_train_lgb = scaler.inverse_transform(train_pred.reshape(-1, 1))
X_val_lgb = scaler.inverse_transform(val_pred.reshape(-1, 1))
X_test_lgb = scaler.inverse_transform(test_pred.reshape(-1, 1))

# ----------- OPTUNA OPTIMIZATION FOR LIGHTGBM -----------

def objective_lgb(trial):
    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "num_leaves": trial.suggest_int("num_leaves", 20, 50),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 5, 30)
    }

    lgb_train = lgb.Dataset(X_train_lgb, label=Y_train)
    lgb_val = lgb.Dataset(X_val_lgb, label=Y_val, reference=lgb_train)

    model = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_val], num_boost_round=200, callbacks=[lgb.log_evaluation(50)])

    val_pred = model.predict(X_val_lgb)
    _, _, _, _, mape = compute_metrics(Y_val.values.flatten(), val_pred.flatten())

    return mape  # Optuna minimizes MAPE

# Run Optuna for LGBM
study_lgb = optuna.create_study(direction="minimize")
study_lgb.optimize(objective_lgb, n_trials=20)

# Best LGBM Model Parameters
best_lgb_params = study_lgb.best_params
print("\nBest LightGBM Model:", best_lgb_params)

# Train Final LightGBM Model
lgb_train = lgb.Dataset(X_train_lgb, label=Y_train)
lgb_val = lgb.Dataset(X_val_lgb, label=Y_val, reference=lgb_train)

final_lgb = lgb.train(best_lgb_params, lgb_train, valid_sets=[lgb_train, lgb_val], num_boost_round=200, callbacks=[lgb.log_evaluation(50)])

# Predictions
train_pred_lgb = final_lgb.predict(X_train_lgb)
val_pred_lgb = final_lgb.predict(X_val_lgb)
test_pred_lgb = final_lgb.predict(X_test_lgb)

# Compute metrics
metrics_train_lgb = compute_metrics(Y_train.values.flatten(), train_pred_lgb.flatten())
metrics_val_lgb = compute_metrics(Y_val.values.flatten(), val_pred_lgb.flatten())
metrics_test_lgb = compute_metrics(Y_test.values.flatten(), test_pred_lgb.flatten())

print("\nFinal Bi-LSTM + LightGBM Performance:")
print("Train:", metrics_train_lgb)
print("Validation:", metrics_val_lgb)
print("Test:", metrics_test_lgb)


[I 2025-02-13 11:42:50,180] A new study created in memory with name: no-name-2c9b39e7-ec8d-4c27-aa90-62f453f115fa
[I 2025-02-13 11:42:51,494] Trial 0 finished with value: 65.96145833836951 and parameters: {'hidden_size': 32, 'num_layers': 3, 'learning_rate': 0.0005810662956926598}. Best is trial 0 with value: 65.96145833836951.
[I 2025-02-13 11:42:56,596] Trial 1 finished with value: 74.33416777285065 and parameters: {'hidden_size': 96, 'num_layers': 3, 'learning_rate': 0.004190507737909965}. Best is trial 0 with value: 65.96145833836951.
[I 2025-02-13 11:42:58,587] Trial 2 finished with value: 73.82826630362119 and parameters: {'hidden_size': 48, 'num_layers': 3, 'learning_rate': 0.0095897632099731}. Best is trial 0 with value: 65.96145833836951.
[I 2025-02-13 11:43:04,613] Trial 3 finished with value: 74.6627267901344 and parameters: {'hidden_size': 128, 'num_layers': 2, 'learning_rate': 0.006302973092539772}. Best is trial 0 with value: 65.96145833836951.
[I 2025-02-13 11:43:06,044]


Best Bi-LSTM Model: {'hidden_size': 32, 'num_layers': 3, 'learning_rate': 0.0005810662956926598}


[I 2025-02-13 11:43:57,657] A new study created in memory with name: no-name-b853f1a7-9e7f-4b9b-bd28-22fd840a42c5


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000211 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.129609	valid_1's rmse: 0.516168
[100]	training's rmse: 0.0404497	valid_1's rmse: 0.277452
[150]	training's rmse: 0.0135326	valid_1's rmse: 0.205513


[I 2025-02-13 11:43:57,944] Trial 0 finished with value: 9.388178685176745 and parameters: {'num_leaves': 50, 'learning_rate': 0.02318593982807697, 'max_depth': 10, 'min_data_in_leaf': 10}. Best is trial 0 with value: 9.388178685176745.


[200]	training's rmse: 0.00662389	valid_1's rmse: 0.183806
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.146501	valid_1's rmse: 0.563857
[100]	training's rmse: 0.0515211	valid_1's rmse: 0.30778


[I 2025-02-13 11:43:58,213] Trial 1 finished with value: 9.783983640556343 and parameters: {'num_leaves': 41, 'learning_rate': 0.020788337592759658, 'max_depth': 10, 'min_data_in_leaf': 19}. Best is trial 0 with value: 9.388178685176745.


[150]	training's rmse: 0.018727	valid_1's rmse: 0.220096
[200]	training's rmse: 0.00827725	valid_1's rmse: 0.190108
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0365886	valid_1's rmse: 0.269326


[I 2025-02-13 11:43:58,444] Trial 2 finished with value: 8.785363864378352 and parameters: {'num_leaves': 33, 'learning_rate': 0.04781283066332281, 'max_depth': 5, 'min_data_in_leaf': 23}. Best is trial 2 with value: 8.785363864378352.


[100]	training's rmse: 0.00623922	valid_1's rmse: 0.18236
[150]	training's rmse: 0.00540303	valid_1's rmse: 0.174902
[200]	training's rmse: 0.00539539	valid_1's rmse: 0.174249
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.216208	valid_1's rmse: 0.76089


[I 2025-02-13 11:43:58,701] Trial 3 finished with value: 14.129720627047753 and parameters: {'num_leaves': 47, 'learning_rate': 0.013164449104261794, 'max_depth': 4, 'min_data_in_leaf': 5}. Best is trial 2 with value: 8.785363864378352.


[100]	training's rmse: 0.111821	valid_1's rmse: 0.485557
[150]	training's rmse: 0.0579822	valid_1's rmse: 0.339247
[200]	training's rmse: 0.0303336	valid_1's rmse: 0.261214
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.00613784	valid_1's rmse: 0.181551
[100]	training's rmse: 0.00539474	valid_1's rmse: 0.174242
[150]	training's rmse: 0.00539465	valid_1's rmse: 0.174186
[200]	training's rmse: 0.00539465	valid_1's rmse: 0.174185


[I 2025-02-13 11:43:58,992] Trial 4 finished with value: 8.781365066291881 and parameters: {'num_leaves': 47, 'learning_rate': 0.09452580397374, 'max_depth': 10, 'min_data_in_leaf': 10}. Best is trial 4 with value: 8.781365066291881.
[I 2025-02-13 11:43:59,182] Trial 5 finished with value: 12.261347124451238 and parameters: {'num_leaves': 23, 'learning_rate': 0.014940011649719134, 'max_depth': 6, 'min_data_in_leaf': 21}. Best is trial 4 with value: 8.781365066291881.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.19739	valid_1's rmse: 0.710676
[100]	training's rmse: 0.0932389	valid_1's rmse: 0.429762
[150]	training's rmse: 0.0442447	valid_1's rmse: 0.294247
[200]	training's rmse: 0.0214104	valid_1's rmse: 0.230281
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038


[I 2025-02-13 11:43:59,412] Trial 6 finished with value: 9.50836507020083 and parameters: {'num_leaves': 35, 'learning_rate': 0.022472208158916023, 'max_depth': 6, 'min_data_in_leaf': 25}. Best is trial 4 with value: 8.781365066291881.


[50]	training's rmse: 0.134474	valid_1's rmse: 0.534735
[100]	training's rmse: 0.0434966	valid_1's rmse: 0.28788
[150]	training's rmse: 0.0148776	valid_1's rmse: 0.21018
[200]	training's rmse: 0.00699693	valid_1's rmse: 0.185717
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038


[I 2025-02-13 11:43:59,627] Trial 7 finished with value: 8.976299915408646 and parameters: {'num_leaves': 34, 'learning_rate': 0.02892787457788649, 'max_depth': 6, 'min_data_in_leaf': 15}. Best is trial 4 with value: 8.781365066291881.


[50]	training's rmse: 0.096652	valid_1's rmse: 0.432252
[100]	training's rmse: 0.022908	valid_1's rmse: 0.232456
[150]	training's rmse: 0.00745016	valid_1's rmse: 0.187557
[200]	training's rmse: 0.00552489	valid_1's rmse: 0.177275
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038


[I 2025-02-13 11:43:59,842] Trial 8 finished with value: 8.936054812576947 and parameters: {'num_leaves': 35, 'learning_rate': 0.030059955572117446, 'max_depth': 7, 'min_data_in_leaf': 14}. Best is trial 4 with value: 8.781365066291881.


[50]	training's rmse: 0.0911915	valid_1's rmse: 0.416773
[100]	training's rmse: 0.0205331	valid_1's rmse: 0.225952
[150]	training's rmse: 0.00690742	valid_1's rmse: 0.185351
[200]	training's rmse: 0.00547676	valid_1's rmse: 0.176637
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0882904	valid_1's rmse: 0.423154
[100]	training's rmse: 0.0193738	valid_1's rmse: 0.233379
[150]	training's rmse: 0.00672324	valid_1's rmse: 0.193452
[200]	training's rmse: 0.00551912	valid_1's rmse: 0.184888


[I 2025-02-13 11:44:00,092] Trial 9 finished with value: 9.456247396252861 and parameters: {'num_leaves': 40, 'learning_rate': 0.030778126778320317, 'max_depth': 4, 'min_data_in_leaf': 30}. Best is trial 4 with value: 8.781365066291881.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000249 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.00680973	valid_1's rmse: 0.185436
[100]	training's rmse: 0.00539689	valid_1's rmse: 0.174348
[150]	training's rmse: 0.00539512	valid_1's rmse: 0.174201
[200]	training's rmse: 0.00539477	valid_1's rmse: 0.174192


[I 2025-02-13 11:44:00,359] Trial 10 finished with value: 8.78183713944408 and parameters: {'num_leaves': 24, 'learning_rate': 0.08830786394418103, 'max_depth': 8, 'min_data_in_leaf': 5}. Best is trial 4 with value: 8.781365066291881.
[I 2025-02-13 11:44:00,594] Trial 11 finished with value: 8.782039992982915 and parameters: {'num_leaves': 23, 'learning_rate': 0.09941330980510743, 'max_depth': 8, 'min_data_in_leaf': 5}. Best is trial 4 with value: 8.781365066291881.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000203 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0058571	valid_1's rmse: 0.180419
[100]	training's rmse: 0.0053979	valid_1's rmse: 0.174258
[150]	training's rmse: 0.00539544	valid_1's rmse: 0.174205
[200]	training's rmse: 0.00539484	valid_1's rmse: 0.174196
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.00600705	valid_1's rmse: 0.181251
[100]	training's rmse: 0.005395

[I 2025-02-13 11:44:00,849] Trial 12 finished with value: 8.781569511554514 and parameters: {'num_leaves': 28, 'learning_rate': 0.09651676726393442, 'max_depth': 9, 'min_data_in_leaf': 10}. Best is trial 4 with value: 8.781365066291881.
[I 2025-02-13 11:44:01,066] Trial 13 finished with value: 8.78186685093259 and parameters: {'num_leaves': 29, 'learning_rate': 0.061744197640949385, 'max_depth': 9, 'min_data_in_leaf': 11}. Best is trial 4 with value: 8.781365066291881.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000202 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.018162	valid_1's rmse: 0.220222
[100]	training's rmse: 0.00544371	valid_1's rmse: 0.176121
[150]	training's rmse: 0.00539496	valid_1's rmse: 0.17428
[200]	training's rmse: 0.00539471	valid_1's rmse: 0.174193


[I 2025-02-13 11:44:01,311] Trial 14 finished with value: 8.78183931884714 and parameters: {'num_leaves': 28, 'learning_rate': 0.061653104371179185, 'max_depth': 9, 'min_data_in_leaf': 10}. Best is trial 4 with value: 8.781365066291881.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000211 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0182472	valid_1's rmse: 0.220561
[100]	training's rmse: 0.00544481	valid_1's rmse: 0.176166
[150]	training's rmse: 0.00539501	valid_1's rmse: 0.174285
[200]	training's rmse: 0.00539473	valid_1's rmse: 0.174192
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0126401	valid_1's rmse: 0.203249
[100]	training's rmse: 0.005403

[I 2025-02-13 11:44:01,636] Trial 15 finished with value: 8.781405558547345 and parameters: {'num_leaves': 43, 'learning_rate': 0.0694979712502263, 'max_depth': 9, 'min_data_in_leaf': 14}. Best is trial 4 with value: 8.781365066291881.


[200]	training's rmse: 0.00539465	valid_1's rmse: 0.174186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000226 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0357354	valid_1's rmse: 0.265424


[I 2025-02-13 11:44:01,935] Trial 16 finished with value: 8.784998652275856 and parameters: {'num_leaves': 44, 'learning_rate': 0.04824985150947946, 'max_depth': 10, 'min_data_in_leaf': 15}. Best is trial 4 with value: 8.781365066291881.


[100]	training's rmse: 0.00616577	valid_1's rmse: 0.181727
[150]	training's rmse: 0.00540068	valid_1's rmse: 0.174837
[200]	training's rmse: 0.0053947	valid_1's rmse: 0.174243
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038


[I 2025-02-13 11:44:02,223] Trial 17 finished with value: 8.781405457572564 and parameters: {'num_leaves': 42, 'learning_rate': 0.06991333527652448, 'max_depth': 8, 'min_data_in_leaf': 13}. Best is trial 4 with value: 8.781365066291881.


[50]	training's rmse: 0.0124133	valid_1's rmse: 0.202585
[100]	training's rmse: 0.00540328	valid_1's rmse: 0.174958
[150]	training's rmse: 0.00539468	valid_1's rmse: 0.174206
[200]	training's rmse: 0.00539465	valid_1's rmse: 0.174186
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000545 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0382904	valid_1's rmse: 0.271586
[100]	training's rmse: 0.00639663	valid_1's rmse: 0.182791
[150]	training's rmse: 0.00540383	valid_1's rmse: 0.174971
[200]	training's rmse: 0.00539475	valid_1's rmse: 0.174257


[I 2025-02-13 11:44:02,495] Trial 18 finished with value: 8.785889269054644 and parameters: {'num_leaves': 50, 'learning_rate': 0.04690101596653362, 'max_depth': 8, 'min_data_in_leaf': 8}. Best is trial 4 with value: 8.781365066291881.
[I 2025-02-13 11:44:02,687] Trial 19 finished with value: 8.781406431394585 and parameters: {'num_leaves': 39, 'learning_rate': 0.07339324118154672, 'max_depth': 7, 'min_data_in_leaf': 17}. Best is trial 4 with value: 8.781365066291881.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038
[50]	training's rmse: 0.0107279	valid_1's rmse: 0.197937
[100]	training's rmse: 0.00539906	valid_1's rmse: 0.174723
[150]	training's rmse: 0.00539471	valid_1's rmse: 0.174197
[200]	training's rmse: 0.00539466	valid_1's rmse: 0.174186

Best LightGBM Model: {'num_leaves': 47, 'learning_rate': 0.09452580397374, 'max_depth': 10, 'min_data_in_leaf': 10}
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start 

### BOHB

In [26]:
import numpy as np
import lightgbm as lgb
import torch
import torch.nn as nn
import torch.optim as optim
import ConfigSpace as CS
import ConfigSpace.hyperparameters as CSH
import hpbandster.core.nameserver as hpns
from hpbandster.optimizers import BOHB
from hpbandster.core.worker import Worker
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# Define Bi-LSTM Model
class BiLSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim):
        super(BiLSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)  # *2 for bidirectional

    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

# Function to calculate metrics
def calculate_metrics(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred) * 100
    return mae, mse, rmse, r2, mape

# Convert datasets to PyTorch tensors
Y_train_torch = torch.tensor(Y_train.values, dtype=torch.float32).unsqueeze(1)
Y_val_torch = torch.tensor(Y_val.values, dtype=torch.float32).unsqueeze(1)
Y_test_torch = torch.tensor(Y_test.values, dtype=torch.float32).unsqueeze(1)

X_train_torch = torch.tensor(X_train.values, dtype=torch.float32).unsqueeze(1)
X_val_torch = torch.tensor(X_val.values, dtype=torch.float32).unsqueeze(1)
X_test_torch = torch.tensor(X_test.values, dtype=torch.float32).unsqueeze(1)

# Bi-LSTM Configurations (2, 3, and 5 layers)
bilstm_layers = [2, 3, 5]
hidden_dim = 64
output_dim = 1
input_dim = X_train.shape[1]

# Dictionary to store Bi-LSTM feature representations
bilstm_features = {}

for num_layers in bilstm_layers:
    print(f"Training Bi-LSTM with {num_layers} layers...")

    bilstm_model = BiLSTMModel(input_dim, hidden_dim, num_layers, output_dim)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(bilstm_model.parameters(), lr=0.001)
    num_epochs = 100

    for epoch in range(num_epochs):
        bilstm_model.train()
        optimizer.zero_grad()
        outputs = bilstm_model(X_train_torch)
        loss = criterion(outputs, Y_train_torch)
        loss.backward()
        optimizer.step()

    # Extract Feature Representations
    bilstm_model.eval()
    with torch.no_grad():
        train_features = bilstm_model(X_train_torch).numpy()
        val_features = bilstm_model(X_val_torch).numpy()
        test_features = bilstm_model(X_test_torch).numpy()

    bilstm_features[num_layers] = (train_features, val_features, test_features)

# Define ConfigSpace for BOHB (LightGBM)
def get_config_space():
    cs = CS.ConfigurationSpace()
    cs.add_hyperparameter(CSH.UniformIntegerHyperparameter("num_leaves", 20, 300, default_value=50))
    cs.add_hyperparameter(CSH.UniformIntegerHyperparameter("max_depth", 3, 12, default_value=6))
    cs.add_hyperparameter(CSH.UniformFloatHyperparameter("learning_rate", 0.01, 0.3, default_value=0.1))
    cs.add_hyperparameter(CSH.UniformFloatHyperparameter("feature_fraction", 0.5, 1.0, default_value=0.8))
    return cs

# BOHB Worker for LightGBM
class LightGBMWorker(Worker):
    def __init__(self, train_features, val_features, **kwargs):
        super().__init__(**kwargs)
        self.train_features = train_features
        self.val_features = val_features

    def compute(self, config, budget, **kwargs):
        model = lgb.LGBMRegressor(
            num_leaves=config["num_leaves"],
            max_depth=config["max_depth"],
            learning_rate=config["learning_rate"],
            feature_fraction=config["feature_fraction"],
            random_state=42
        )
        model.fit(self.train_features, Y_train)
        Y_val_pred = model.predict(self.val_features)
        mae = mean_absolute_error(Y_val, Y_val_pred)
        return {"loss": mae, "info": config}

# Run BOHB for each Bi-LSTM configuration
best_models = {}

for num_layers in bilstm_layers:
    print(f"\nRunning BOHB for Bi-LSTM ({num_layers} layers) + LightGBM...")

    train_features, val_features, test_features = bilstm_features[num_layers]

    # Start NameServer
    NS = hpns.NameServer(run_id=f"bilstm_{num_layers}_lgb_bohb", host="127.0.0.1", port=None)
    NS.start()

    worker = LightGBMWorker(
        train_features=train_features,
        val_features=val_features,
        nameserver="127.0.0.1",
        run_id=f"bilstm_{num_layers}_lgb_bohb"
    )
    worker.run(background=True)

    bohb = BOHB(
        configspace=get_config_space(),
        run_id=f"bilstm_{num_layers}_lgb_bohb",
        nameserver="127.0.0.1",
        min_budget=1,
        max_budget=3
    )

    res = bohb.run(n_iterations=50)

    # Shutdown BOHB
    bohb.shutdown()
    NS.shutdown()

    # Retrieve Best Configuration
    best_config = res.get_incumbent_id()
    best_params = res.get_id2config_mapping()[best_config]["config"]

    # Train Best LightGBM Model on Bi-LSTM Features
    best_lgb_model = lgb.LGBMRegressor(
        num_leaves=best_params["num_leaves"],
        max_depth=best_params["max_depth"],
        learning_rate=best_params["learning_rate"],
        feature_fraction=best_params["feature_fraction"],
        random_state=42
    )

    best_lgb_model.fit(train_features, Y_train)

    # Make Predictions
    Y_train_pred = best_lgb_model.predict(train_features)
    Y_val_pred = best_lgb_model.predict(val_features)
    Y_test_pred = best_lgb_model.predict(test_features)

    # Calculate Metrics
    train_metrics = calculate_metrics(Y_train, Y_train_pred)
    val_metrics = calculate_metrics(Y_val, Y_val_pred)
    test_metrics = calculate_metrics(Y_test, Y_test_pred)

    # Store best model and metrics
    best_models[num_layers] = {
        "params": best_params,
        "train_metrics": train_metrics,
        "val_metrics": val_metrics,
        "test_metrics": test_metrics
    }

    # Print Results
    print(f"\nBest Parameters for Bi-LSTM ({num_layers} layers) + LightGBM:")
    print(best_params)

    print("\nTraining set metrics:")
    print(f"MAE: {train_metrics[0]:.4f}, MSE: {train_metrics[1]:.4f}, RMSE: {train_metrics[2]:.4f}, R²: {train_metrics[3]:.4f}, MAPE: {train_metrics[4]:.2f}%")

    print("\nValidation set metrics:")
    print(f"MAE: {val_metrics[0]:.4f}, MSE: {val_metrics[1]:.4f}, RMSE: {val_metrics[2]:.4f}, R²: {val_metrics[3]:.4f}, MAPE: {val_metrics[4]:.2f}%")

    print("\nTest set metrics:")
    print(f"MAE: {test_metrics[0]:.4f}, MSE: {test_metrics[1]:.4f}, RMSE: {test_metrics[2]:.4f}, R²: {test_metrics[3]:.4f}, MAPE: {test_metrics[4]:.2f}%")


Training Bi-LSTM with 2 layers...
Training Bi-LSTM with 3 layers...
Training Bi-LSTM with 5 layers...

Running BOHB for Bi-LSTM (2 layers) + LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 7736, number of used features: 1
[LightGBM] [Info] Start training from score 0.454038

Best Parameters for Bi-LSTM (2 layers) + LightGBM:
{'feature_fraction': 0.9800558313223, 'learning_rate': 0.2845468961482, 'max_depth': 9, 'num_leaves': 229}

Training set metrics:
MAE: 0.0036, MSE: 0.0000, RMSE: 0.0054, R²: 0.9998, MAPE: 1.02%

Validation set metrics:
MAE: 0.1563, MSE: 0.0303, RMSE: 0.1741, R²: -4.1521, MAPE: 8.77%

Test set metrics:
MAE: 0.4253, MSE: 0.1869, RMSE: 0.4323, R²: -29.9727, MAPE: 20.98%

Running BOHB for Bi-LSTM (3 layers) + LightGBM...
[LightGBM] [Info] Auto-choosing col