# Experiments
## 1. Basic hyperparameter training
First, we use the **train.py** script to train a model with a basic set of hyperparameters to check that the architecture works.

In [8]:
%run notebook_setup
from src.train import train_model, evaluate, config
from src.data_loader import get_dataloader, get_all_dataloaders
from src.metrics import mae, mse, rmse, r_squared, mape, smape, explained_variance, peak_error
from src.model import GRUNetwork
import logging
import torch
import torch.nn as nn

# Baseline run: train a small unidirectional GRU with hand-picked
# hyperparameters to confirm that the training pipeline runs end to end.
logger = logging.getLogger('train')

# Use the GPU when torch can see one, otherwise stay on the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
logger.info(f"Using device: {device}")

# Seed torch (and every CUDA device, when present) from the project config.
seed_value = config.data.seed
torch.manual_seed(seed_value)
if cuda_available:
    torch.cuda.manual_seed_all(seed_value)

# Build all dataloaders; the 'train' and 'test' entries are read directly below.
dataloaders = get_all_dataloaders(config)

# Peek at one training batch and read the feature count from axis 2.
# Layout per the original author's note: (batch_size, seq_len, input_size).
sample_batch, _ = next(iter(dataloaders['train']))
input_size = sample_batch.shape[2]

# Network definition: one output per configured target index.
gru_settings = {
    'input_size': input_size,
    'hidden_size': 64,
    'num_layers': 2,
    'dropout': 0.2,
    'output_size': len(config.preprocess.target_idx),
    'bidirectional': False,
    'return_sequences': False,
}
model = GRUNetwork(**gru_settings)
model.to(device)

# MSE loss, Adam, and a plateau scheduler that halves the learning rate
# after 5 epochs without improvement of the monitored value.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=5,
)

# Keep every phase quiet; the same flags are forwarded to train_model/evaluate.
verbose_modes = dict.fromkeys(('training', 'validation', 'testing'), False)

# Train for at most 100 epochs with an early-stopping patience of 15.
trained_model, history, run_time = train_model(
    model=model,
    dataloaders=dataloaders,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=100,
    device=device,
    checkpoint_dir=config.dirs.checkpoint_dir,
    patience=15,
    verbose_modes=verbose_modes,
)

# Score the trained model on the held-out test split.
evaluate(
    model=trained_model,
    dataloader=dataloaders['test'],
    criterion=criterion,
    device=device,
    verbose=verbose_modes['testing'],
    run_time=run_time,
)

2025-05-05 04:23:44,850 - train - INFO - Using device: cuda
2025-05-05 04:23:44,860 - data_loader - INFO - Creating train dataloader
2025-05-05 04:23:44,900 - data_loader - INFO - Creating val dataloader
2025-05-05 04:23:44,915 - data_loader - INFO - Creating test dataloader
2025-05-05 04:23:45,100 - model - INFO - Initialized GRUNetwork with 2 layers, hidden sizes [64, 64], input size 14, output size 1
2025-05-05 04:23:46,656 - train - INFO - Starting training for 100 epochs
2025-05-05 04:23:48,504 - train - INFO - Epoch 1 completed in 1.85s
2025-05-05 04:24:02,976 - train - INFO - Epoch 11 completed in 1.33s
2025-05-05 04:24:17,112 - train - INFO - Epoch 21 completed in 1.48s
2025-05-05 04:24:20,455 - train - INFO - Early stopping triggered after 23 epochs
2025-05-05 04:24:20,458 - train - INFO - Evaluating model on test set...
2025-05-05 04:24:20,810 - train - INFO - Test MAE: 0.0556
2025-05-05 04:24:20,811 - train - INFO - Test MSE: 0.0054
2025-05-05 04:24:20,811 - train - INFO - T

As we can see, the model's training loop works fine.

## 2. Hyperparameter search
Now that we have seen that the architecture works, we can start the hyperparameter search. We'll begin with a random search to get a baseline.

### Define hyperparameter space
Start with the following set of hyperparameters to search over:

- hidden size: **32, 64, 128, 256, 512**
- num layers: **1, 2, 3, 4, 5**
- dropout: **0.0, 0.1, 0.2, 0.3, 0.4, 0.5**
- learning rate: **0.01, 0.001, 0.0001, 0.00001**
- weight decay: **0.0, 0.0001, 0.001, 0.00001**
- bidirectional: **False, True**
- early stopping patience: **5, 10, 15**
- learning rate factor: **0.5, 0.25, 0.1**
- learning rate patience: **5, 10**

### 2.1 Random search

In [15]:
%run notebook_setup
import logging
from src.search import HyperparameterSearch

logger = logging.getLogger('train')

# Candidate values for every searched hyperparameter.
# 'num_epochs' has a single value, so it is effectively fixed at 100.
param_grid = {
    'hidden_size': [32, 64, 128, 256, 512],
    'num_layers': [1, 2, 3, 4, 5],
    'dropout': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'learning_rate': [0.01, 0.001, 0.0001, 0.00001],
    'weight_decay': [0.0, 0.0001, 0.001, 0.00001],
    'bidirectional': [False, True],
    'num_epochs': [100],
    'early_stopping_patience': [5, 10, 15],
    'lr_factor': [0.5, 0.25, 0.1],
    'lr_patience': [5, 10],
}

# Random search: 40 trials, run sequentially, minimizing validation RMSE.
search = HyperparameterSearch(
    base_config_path="/home/kitne/University/2lvl/SU/bike-gru-experiments/config/default.yaml",
    param_grid=param_grid,
    search_method='random',
    n_trials=40,  # per the original note: only used for random and bayesian search
    metric='val_rmse',
    direction='minimize',  # 'minimize' or 'maximize'
    n_jobs=1,
    experiment_name="random_search",
    verbose=False,
)

results = search.run()

# Report the best trial among those that finished with status 'completed'.
completed_results = [trial for trial in results if trial['status'] == 'completed']
if not completed_results:
    print("No successful trials completed")
else:
    # Smallest value wins when minimizing, largest otherwise.
    choose = min if search.direction == 'minimize' else max
    best_result = choose(completed_results, key=lambda trial: trial['best_value'])

    print("\nBest configuration:")
    print(f"  {search.metric}: {best_result['best_value']:.6f}")
    print(f"  Parameters: {best_result['params']}")

2025-05-05 04:28:11,580 - search - INFO - Initialized random search with 40 configurations
2025-05-05 04:28:11,581 - search - INFO - Starting random search with 1 parallel jobs
2025-05-05 04:28:11,599 - search - INFO - Random search with 40 trials
2025-05-05 04:28:11,600 - search - INFO - Starting trial_0000
2025-05-05 04:28:11,605 - data_loader - INFO - Creating train dataloader
2025-05-05 04:28:11,639 - data_loader - INFO - Creating val dataloader
2025-05-05 04:28:11,651 - data_loader - INFO - Creating test dataloader
2025-05-05 04:28:11,761 - model - INFO - Initialized GRUNetwork with 4 layers, hidden sizes [64, 64, 64, 64], input size 14, output size 1
2025-05-05 04:28:11,774 - train - INFO - Starting training for 100 epochs
2025-05-05 04:28:14,038 - train - INFO - Epoch 1 completed in 2.26s
2025-05-05 04:28:35,973 - train - INFO - Epoch 11 completed in 2.16s
2025-05-05 04:28:40,385 - train - INFO - Early stopping triggered after 13 epochs
2025-05-05 04:28:40,387 - search - INFO - 


Best configuration:
  val_rmse: 0.054382
  Parameters: {'hidden_size': 512, 'num_layers': 1, 'dropout': 0.0, 'learning_rate': 0.0001, 'weight_decay': 0.0001, 'bidirectional': 'True', 'num_epochs': 100, 'early_stopping_patience': 15, 'lr_factor': 0.5, 'lr_patience': 10}


### 2.2 Bayesian Optimization search

In [17]:
%run notebook_setup
import logging
from src.search import HyperparameterSearch

# Bayesian optimization needs optuna; warn when it cannot be imported.
# NOTE(review): OPTUNA_AVAILABLE is only set here and is not consulted
# anywhere else in this cell, so the search below starts regardless.
try:
    import optuna
except ImportError:
    OPTUNA_AVAILABLE = False
    print("Warning: optuna not installed. Bayesian optimization will not be available.")
    print("Install with: pip install optuna")
else:
    OPTUNA_AVAILABLE = True

logger = logging.getLogger('train')

# Same search space as the random search cell.
param_grid = {
    'hidden_size': [32, 64, 128, 256, 512],
    'num_layers': [1, 2, 3, 4, 5],
    'dropout': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5],
    'learning_rate': [0.01, 0.001, 0.0001, 0.00001],
    'weight_decay': [0.0, 0.0001, 0.001, 0.00001],
    'bidirectional': [False, True],
    'num_epochs': [100],
    'early_stopping_patience': [5, 10, 15],
    'lr_factor': [0.5, 0.25, 0.1],
    'lr_patience': [5, 10],
}

# Bayesian search: 40 trials on 2 parallel jobs, minimizing validation RMSE.
search = HyperparameterSearch(
    base_config_path="/home/kitne/University/2lvl/SU/bike-gru-experiments/config/default.yaml",
    param_grid=param_grid,
    search_method='bayesian',
    n_trials=40,  # per the original note: only used for random and bayesian search
    metric='val_rmse',
    direction='minimize',  # 'minimize' or 'maximize'
    n_jobs=2,
    experiment_name="bayesian_search",
    verbose=False,
)

results = search.run()

# Report the best trial among those that finished with status 'completed'.
completed_results = [trial for trial in results if trial['status'] == 'completed']
if not completed_results:
    print("No successful trials completed")
else:
    # Smallest value wins when minimizing, largest otherwise.
    choose = min if search.direction == 'minimize' else max
    best_result = choose(completed_results, key=lambda trial: trial['best_value'])

    print("\nBest configuration:")
    print(f"  {search.metric}: {best_result['best_value']:.6f}")
    print(f"  Parameters: {best_result['params']}")

2025-05-05 05:42:18,915 - search - INFO - Initialized bayesian search with 40 configurations
2025-05-05 05:42:18,916 - search - INFO - Starting bayesian search with 2 parallel jobs
[I 2025-05-05 05:42:18,917] A new study created in memory with name: no-name-f456b269-c3ff-4c5f-b090-7a5d19eacd9b
2025-05-05 05:42:18,920 - search - INFO - Starting trial_0000
2025-05-05 05:42:18,924 - data_loader - INFO - Creating train dataloader
2025-05-05 05:42:19,002 - data_loader - INFO - Creating val dataloader
2025-05-05 05:42:19,021 - data_loader - INFO - Creating test dataloader
2025-05-05 05:42:19,199 - model - INFO - Initialized GRUNetwork with 5 layers, hidden sizes [256, 256, 256, 256, 256], input size 14, output size 1
2025-05-05 05:42:19,252 - train - INFO - Starting training for 100 epochs
2025-05-05 05:42:33,730 - train - INFO - Epoch 1 completed in 14.48s
2025-05-05 05:45:00,393 - train - INFO - Epoch 11 completed in 14.71s
2025-05-05 05:47:28,551 - train - INFO - Epoch 21 completed in 14.


Best configuration:
  val_rmse: 0.054269
  Parameters: {'hidden_size': 128, 'num_layers': 2, 'dropout': 0.1, 'learning_rate': 0.01, 'weight_decay': 0.0, 'bidirectional': True, 'num_epochs': 100, 'early_stopping_patience': 15, 'lr_factor': 0.1, 'lr_patience': 10}


### Most promising hyperparameters

Let's examine the results of the random and Bayesian searches to find the most promising hyperparameters.

In [1]:
%run notebook_setup
from src.utils import get_best_trials, format_parameter_output, extract_best_parameter_values
    
# Inputs for the ranking: both search result files, the optimization
# direction, and how many of the best trials to show.
args = {
    "json_files": [
        "/home/kitne/University/2lvl/SU/bike-gru-experiments/experiments/checkpoints/search/random_search/all_results.json",
        "/home/kitne/University/2lvl/SU/bike-gru-experiments/experiments/checkpoints/search/bayesian_search/study_results.json",
    ],
    "direction": "minimize",
    "top": 3,
}
print(f"Finding top {args['top']} trials ({args['direction']} metric)...")
best_trials = get_best_trials(args['json_files'], args['direction'], args['top'])

if best_trials:
    separator = "-" * 50
    print(f"\nTop {len(best_trials)} Parameter Sets:")
    print(separator)
    # One formatted entry per trial, numbered from 1 in the returned order.
    for rank, trial in enumerate(best_trials, start=1):
        print(f"Rank #{rank}:")
        print(format_parameter_output(trial))
        print(separator)
else:
    print("No valid trials found.")

Finding top 3 trials (minimize metric)...

Top 3 Parameter Sets:
--------------------------------------------------
Rank #1:
Trial ID: 11
Best Value: 0.054269
Best Epoch: COMPLETE
File: study_results.json
Parameters:
{
  "hidden_size": 128,
  "num_layers": 2,
  "dropout": 0.1,
  "learning_rate": 0.01,
  "weight_decay": 0.0,
  "bidirectional": true,
  "num_epochs": 100,
  "early_stopping_patience": 15,
  "lr_factor": 0.1,
  "lr_patience": 10
}

--------------------------------------------------
Rank #2:
Trial ID: 9
Best Value: 0.054382
Best Epoch: 15
File: all_results.json
Parameters:
{
  "hidden_size": 512,
  "num_layers": 1,
  "dropout": 0.0,
  "learning_rate": 0.0001,
  "weight_decay": 0.0001,
  "bidirectional": "True",
  "num_epochs": 100,
  "early_stopping_patience": 15,
  "lr_factor": 0.5,
  "lr_patience": 10
}

--------------------------------------------------
Rank #3:
Trial ID: 6
Best Value: 0.054632
Best Epoch: 18
File: all_results.json
Parameters:
{
  "hidden_size": 64,
  "nu

Based on the results of the random and Bayesian searches, we can now pick the best parameter values for the grid search.

In [2]:
%run notebook_setup
from src.utils import extract_best_parameter_values

# Collect up to three values per hyperparameter from the top trials.
# `best_trials` is not defined in this cell; it comes from the previous one.
best_param_values = extract_best_parameter_values(best_trials, max_values_per_param=3)
if best_param_values:
    rule = "-" * 50
    print("\nBest Parameter Values for Grid Search:")
    print(rule)
    for name, candidates in best_param_values.items():
        print(f"{name}: {candidates}")
    print(rule)



Best Parameter Values for Grid Search:
--------------------------------------------------
lr_factor: [0.1, 0.5]
num_layers: [2, 1]
bidirectional: ['True', 'False']
early_stopping_patience: [15, 10]
weight_decay: [0.0, 0.0001, 1e-05]
lr_patience: [10]
hidden_size: [128, 512, 64]
num_epochs: [100]
learning_rate: [0.01, 0.0001]
dropout: [0.1, 0.0]
--------------------------------------------------


### 2.3 Grid search

Now let's search across all combinations of the parameter values taken from the three best models.
Some parameter values are not included due to their slow training time and low impact on model performance.

Selected parameters:

- hidden size: **64, 128, 512**
- num layers: **1, 2**
- dropout: **0.0, 0.1**
- learning rate: **0.01, 0.0001**
- weight decay: **0.0, 0.0001, 0.00001**
- bidirectional: **True, False**
- early stopping patience: **15, 10**
- learning rate factor: **0.1, 0.5**
- learning rate patience: **10**

In [None]:
%run notebook_setup
import logging
from src.search import HyperparameterSearch

logger = logging.getLogger('train')

# Grid built from the best values found by the earlier searches.
# The original author notes that some parameters were changed to speed up the search.
# NOTE(review): these values match the "Best Parameter Values for Grid Search"
# output shown earlier in the notebook.
param_grid = {
    'hidden_size': [64, 128, 512],
    'num_layers': [1, 2],
    'dropout': [0.0, 0.1],
    'learning_rate': [0.01, 0.0001],
    'weight_decay': [0.0, 0.0001, 1e-05],
    'bidirectional': [True, False],
    'num_epochs': [100],
    'early_stopping_patience': [15, 10],
    'lr_factor': [0.1, 0.5],
    'lr_patience': [10],
}

# Grid search, run sequentially, minimizing validation RMSE.
search = HyperparameterSearch(
    base_config_path="/home/kitne/University/2lvl/SU/bike-gru-experiments/config/default.yaml",
    param_grid=param_grid,
    search_method='grid',
    metric='val_rmse',
    direction='minimize',  # 'minimize' or 'maximize'
    n_jobs=1,
    experiment_name="grid_search",
    verbose=False,
)

results = search.run()

# Report the best trial among those that finished with status 'completed'.
completed_results = [trial for trial in results if trial['status'] == 'completed']
if not completed_results:
    print("No successful trials completed")
else:
    # Smallest value wins when minimizing, largest otherwise.
    choose = min if search.direction == 'minimize' else max
    best_result = choose(completed_results, key=lambda trial: trial['best_value'])

    print("\nBest configuration:")
    print(f"  {search.metric}: {best_result['best_value']:.6f}")
    print(f"  Parameters: {best_result['params']}")

The output is cleared because of its size.
The grid search failed on the last 50 trials. Because of this, the results were not saved into a single file, so let's combine them manually.

In [1]:
%run notebook_setup
import json
import glob
import os

# Merge the per-trial result files of the grid search into one JSON file.
# Directory that holds the grid search results
grid_search_dir = '/home/kitne/University/2lvl/SU/bike-gru-experiments/experiments/checkpoints/search/grid_search'

# Pattern that matches the per-trial result files
pattern = os.path.join(grid_search_dir, 'trial_*_result.json')

# glob returns paths in arbitrary order; sort them so files are processed
# (and ties in the trial_id sort below are resolved) deterministically.
result_files = sorted(glob.glob(pattern))

print(f"Found {len(result_files)} result files")

# Load every readable file; report and skip the ones that cannot be parsed.
all_results = []
for result_file in result_files:
    try:
        with open(result_file, 'r', encoding='utf-8') as f:
            result_data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: unreadable file; ValueError: invalid JSON or bad encoding
        # (json.JSONDecodeError and UnicodeDecodeError are both subclasses).
        print(f"Error processing {result_file}: {e}")
    else:
        all_results.append(result_data)
        print(f"Processed {os.path.basename(result_file)}")

# Sort results by trial_id to maintain order (entries without one sort as 0)
all_results.sort(key=lambda x: x.get('trial_id', 0))

# Save the combined results next to the per-trial files
output_file = os.path.join(grid_search_dir, 'all_results.json')

if all_results:
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_results, f, indent=2)
    print(f"Combined {len(all_results)} results into {output_file}")
else:
    # Nothing was loaded: do not create an empty (misleading) combined file.
    print(f"No trial results loaded from {grid_search_dir}; {output_file} was not written")

Found 576 result files
Processed trial_0000_result.json
Processed trial_0001_result.json
Processed trial_0002_result.json
Processed trial_0003_result.json
Processed trial_0004_result.json
Processed trial_0005_result.json
Processed trial_0006_result.json
Processed trial_0007_result.json
Processed trial_0008_result.json
Processed trial_0009_result.json
Processed trial_0010_result.json
Processed trial_0011_result.json
Processed trial_0012_result.json
Processed trial_0013_result.json
Processed trial_0014_result.json
Processed trial_0015_result.json
Processed trial_0016_result.json
Processed trial_0017_result.json
Processed trial_0018_result.json
Processed trial_0019_result.json
Processed trial_0020_result.json
Processed trial_0021_result.json
Processed trial_0022_result.json
Processed trial_0023_result.json
Processed trial_0024_result.json
Processed trial_0025_result.json
Processed trial_0026_result.json
Processed trial_0027_result.json
Processed trial_0028_result.json
Processed trial_0029

## Best model parameters
The grid search found the following best parameters:

In [1]:
%run notebook_setup
from src.utils import get_best_trials, format_parameter_output, extract_best_parameter_values
    
# Inputs for the ranking: the combined grid search results, the
# optimization direction, and how many of the best trials to show.
args = {
    "json_files": [
        "/home/kitne/University/2lvl/SU/bike-gru-experiments/experiments/checkpoints/search/grid_search/all_results.json",
    ],
    "direction": "minimize",
    "top": 3,
}
print(f"Finding top {args['top']} trials ({args['direction']} metric)...")
best_trials = get_best_trials(args['json_files'], args['direction'], args['top'])

if best_trials:
    separator = "-" * 50
    print(f"\nTop {len(best_trials)} Parameter Sets:")
    print(separator)
    # One formatted entry per trial, numbered from 1 in the returned order.
    for rank, trial in enumerate(best_trials, start=1):
        print(f"Rank #{rank}:")
        print(format_parameter_output(trial))
        print(separator)
else:
    print("No valid trials found.")

Finding top 3 trials (minimize metric)...

Top 3 Parameter Sets:
--------------------------------------------------
Rank #1:
Trial ID: 116
Best Value: 0.052874
Best Epoch: 22
File: all_results.json
Parameters:
{
  "hidden_size": 64,
  "num_layers": 2,
  "dropout": 0.0,
  "learning_rate": 0.01,
  "weight_decay": 1e-05,
  "bidirectional": false,
  "num_epochs": 100,
  "early_stopping_patience": 15,
  "lr_factor": 0.1,
  "lr_patience": 10
}

--------------------------------------------------
Rank #2:
Trial ID: 21
Best Value: 0.052911
Best Epoch: 31
File: all_results.json
Parameters:
{
  "hidden_size": 64,
  "num_layers": 1,
  "dropout": 0.0,
  "learning_rate": 0.01,
  "weight_decay": 1e-05,
  "bidirectional": false,
  "num_epochs": 100,
  "early_stopping_patience": 15,
  "lr_factor": 0.5,
  "lr_patience": 10
}

--------------------------------------------------
Rank #3:
Trial ID: 257
Best Value: 0.052915
Best Epoch: 23
File: all_results.json
Parameters:
{
  "hidden_size": 128,
  "num_laye

### Best model

Now that we have the best parameters, let's train the model with them.

In [1]:
%run notebook_setup
from src.train import train_model, evaluate, config
from src.data_loader import get_dataloader, get_all_dataloaders
from src.metrics import mae, mse, rmse, r_squared, mape, smape, explained_variance, peak_error
from src.model import GRUNetwork
import logging
import torch
import torch.nn as nn

# Final run: retrain the GRU with the hyperparameters of the rank #1
# grid search trial (hidden size 64, 2 layers, no dropout, lr 0.01,
# weight decay 1e-5, lr factor 0.1, lr patience 10, early stopping 15).
logger = logging.getLogger('train')

# Use the GPU when torch can see one, otherwise stay on the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
logger.info(f"Using device: {device}")

# Seed torch (and every CUDA device, when present) from the project config.
seed_value = config.data.seed
torch.manual_seed(seed_value)
if cuda_available:
    torch.cuda.manual_seed_all(seed_value)

# Build all dataloaders; the 'train' and 'test' entries are read directly below.
dataloaders = get_all_dataloaders(config)

# Peek at one training batch and read the feature count from axis 2.
# Layout per the original author's note: (batch_size, seq_len, input_size).
sample_batch, _ = next(iter(dataloaders['train']))
input_size = sample_batch.shape[2]

# Network definition: one output per configured target index.
gru_settings = {
    'input_size': input_size,
    'hidden_size': 64,
    'num_layers': 2,
    'dropout': 0.0,
    'output_size': len(config.preprocess.target_idx),
    'bidirectional': False,
    'return_sequences': False,
}
model = GRUNetwork(**gru_settings)
model.to(device)

# MSE loss, Adam with weight decay, and a plateau scheduler that cuts the
# learning rate to a tenth after 10 epochs without improvement.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.1,
    patience=10,
)

# Keep every phase quiet; the same flags are forwarded to train_model/evaluate.
verbose_modes = dict.fromkeys(('training', 'validation', 'testing'), False)

# Train for at most 100 epochs with an early-stopping patience of 15.
trained_model, history, run_time = train_model(
    model=model,
    dataloaders=dataloaders,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    num_epochs=100,
    device=device,
    checkpoint_dir=config.dirs.checkpoint_dir,
    patience=15,
    verbose_modes=verbose_modes,
)

# Score the trained model on the held-out test split.
evaluate(
    model=trained_model,
    dataloader=dataloaders['test'],
    criterion=criterion,
    device=device,
    verbose=verbose_modes['testing'],
    run_time=run_time,
)

2025-05-07 23:18:43,960 - train - INFO - Using device: cuda
2025-05-07 23:18:44,285 - model - INFO - Initialized GRUNetwork with 2 layers, hidden sizes [64, 64], input size 14, output size 1
2025-05-07 23:18:45,798 - train - INFO - Starting training for 100 epochs
2025-05-07 23:18:48,784 - train - INFO - Epoch 1 completed in 2.98s
2025-05-07 23:19:02,956 - train - INFO - Epoch 11 completed in 1.34s
2025-05-07 23:19:16,726 - train - INFO - Epoch 21 completed in 1.38s
2025-05-07 23:19:31,266 - train - INFO - Epoch 31 completed in 1.52s
2025-05-07 23:19:43,690 - train - INFO - Early stopping triggered after 40 epochs
2025-05-07 23:19:43,693 - train - INFO - Evaluating model on test set...
2025-05-07 23:19:44,058 - train - INFO - Test MAE: 0.0516
2025-05-07 23:19:44,059 - train - INFO - Test MSE: 0.0046
2025-05-07 23:19:44,060 - train - INFO - Test RMSE: 0.0675
2025-05-07 23:19:44,060 - train - INFO - Test R²: 0.8312


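The model trained with the best grid-search parameters beats the baseline from section 1 on the test set (MAE 0.0516 vs. 0.0556, MSE 0.0046 vs. 0.0054). Note that its test RMSE (0.0675) is not directly comparable to the validation RMSE (0.052874) reported by the grid search, since the two are computed on different splits.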

Now it's time to do some visualization in the visualization notebook.