In [8]:
%load_ext autoreload
%autoreload 2

import os
import sys
sys.path.insert(0, "biotrainer")
import optuna
from biotrainer.protocols import Protocol
from biotrainer.utilities.cli import train

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
config = {
    "input_file": "data/sampled_3Dii.fasta",
    "protocol": Protocol.residue_to_class.name,
    "model_choice": "CNN",
    "device": "mps",
    "optimizer_choice": "adam",
    "learning_rate": 1e-3,
    "loss_choice": "cross_entropy_loss",
    "num_epochs": 200,
    "batch_size": 128,
    "patience": 3,
    "ignore_file_inconsistencies": True,
    "cross_validation_config": {
        "method": "hold_out"
    },
    "embeddings_file": "output/residue_to_class/ProstT5/embeddings_file_ProstT5.h5",
    #"embedder_name": "RostLab/ProstT5",
    "model_params": {
            "dropout_rate": 0.15,
            "n_layers": 3,
            "kernel_sizes": [(7, 1), (7, 1), (5, 1)],
            "padding": [(3, 0), (3, 0), (2, 0)],
            "hidden_dims": [256, 32]
    },
}

In [None]:
def objective(trial):
    config["output_dir"] = f"optuna_results/{trial.number}"
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    config["learning_rate"] = learning_rate
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.5)
    n_layers = trial.suggest_int("n_layers", 1, 3)
    kernel_sizes = [(trial.suggest_categorical(f"kernel_size_{i}", [3, 5, 7]), 1) for i in range(n_layers)]
    padding = [(k[0] // 2, 0) for k in kernel_sizes]
    hidden_dims = [trial.suggest_int(f"hidden_dim_{i}", 32, 512) for i in range(n_layers-1)]
    last_layer_FNN = trial.suggest_categorical("last_layer_FNN", [True, False])
    config["model_params"] = {
        "dropout_rate": dropout_rate,
        "kernel_sizes": kernel_sizes,
        "padding": padding,
        "hidden_dims": hidden_dims,
        "last_layer_FNN": last_layer_FNN
    }

    print(config)
    res = train(config)
    return res["training_results"]["hold_out"]["best_training_epoch_metrics"]["training"]["accuracy"]

In [None]:
study = optuna.create_study(direction="maximize")

''

Sequential(
  (0): Conv2d(1024, 78, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
  (1): ReLU()
  (2): Dropout(p=0.21810307861772615, inplace=False)
  (3): Conv2d(78, 321, kernel_size=(7, 1), stride=(1, 1), padding=(3, 0))
  (4): ReLU()
  (5): Dropout(p=0.21810307861772615, inplace=False)
  (6): Conv2d(321, 12, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
)
0.0009110601168612554


Sequential(
  (0): Conv2d(1024, 78, kernel_size=(3, 1), stride=(1, 1), padding=(1, 0))
  (1): ReLU()
  (2): Dropout(p=0.21810307861772615, inplace=False)
  (3): Conv2d(78, 321, kernel_size=(7, 1), stride=(1, 1), padding=(3, 0))
  (4): ReLU()
  (5): Dropout(p=0.21810307861772615, inplace=False)
  (6): Conv2d(321, 12, kernel_size=(5, 1), stride=(1, 1), padding=(2, 0))
)
0.0009110601168612554


[I 2025-08-11 13:45:04,813] Trial 0 finished with value: 0.5684136152267456 and parameters: {'learning_rate': 0.0009110601168612554, 'dropout_rate': 0.21810307861772615, 'n_layers': 3, 'kernel_size_0_1': 3, 'kernel_size_1_1': 7, 'kernel_size_2_1': 5, 'hidden_dim_0': 78, 'hidden_dim_1': 321}. Best is trial 0 with value: 0.5684136152267456.


In [45]:
study.optimize(objective, n_trials=1)

''

Sequential(
  (0): Conv2d(1024, 363, kernel_size=(7, 1), stride=(1, 1), padding=(3, 0))
  (1): ReLU()
  (2): Dropout(p=0.11514664243414385, inplace=False)
  (3): Conv2d(363, 12, kernel_size=(7, 1), stride=(1, 1), padding=(3, 0))
)
3.443709673629252e-05


[W 2025-08-11 18:32:02,188] Trial 7 failed with parameters: {'learning_rate': 3.443709673629252e-05, 'dropout_rate': 0.11514664243414385, 'n_layers': 2, 'kernel_size_0': 7, 'kernel_size_1': 7, 'hidden_dim_0': 363} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/janleusch/anaconda3/envs/biotrainer/lib/python3.12/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/6b/fx6pp2w50tjc2skb5z2mjxph0000gn/T/ipykernel_10698/3304014329.py", line 19, in objective
    res = train(config)
          ^^^^^^^^^^^^^
  File "/Users/janleusch/Documents/phd/biotrainer/biotrainer/biotrainer/utilities/cli.py", line 28, in train
    return parse_config_file_and_execute_run(config)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/janleusch/Documents/phd/biotrainer/biotrainer/biotrainer/utilities/executer.py", line 46, in parse_config_fil

KeyboardInterrupt: 

In [41]:
study.best_trial

FrozenTrial(number=4, state=1, values=[0.5685243010520935], datetime_start=datetime.datetime(2025, 8, 11, 14, 9, 14, 485260), datetime_complete=datetime.datetime(2025, 8, 11, 15, 2, 39, 923205), params={'learning_rate': 0.000918391052072852, 'dropout_rate': 0.20172967629806815, 'n_layers': 2, 'kernel_size_0_1': 3, 'kernel_size_1_1': 5, 'hidden_dim_0': 123}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': FloatDistribution(high=0.01, log=True, low=1e-05, step=None), 'dropout_rate': FloatDistribution(high=0.5, log=False, low=0.1, step=None), 'n_layers': IntDistribution(high=3, log=False, low=1, step=1), 'kernel_size_0_1': IntDistribution(high=7, log=False, low=3, step=1), 'kernel_size_1_1': IntDistribution(high=7, log=False, low=3, step=1), 'hidden_dim_0': IntDistribution(high=512, log=False, low=32, step=1)}, trial_id=4, value=None)

In [7]:
res = train(config)

''

{'device': device(type='mps'), 'seed': 42, 'save_split_ids': False, 'sanity_check': True, 'ignore_file_inconsistencies': False, 'external_writer': 'tensorboard', 'output_dir': PosixPath('/Users/janleusch/Documents/phd/biotrainer/output'), 'bootstrapping_iterations': 30, 'validate_input': True, 'input_file': '/Users/janleusch/Documents/phd/biotrainer/data/sampled_3Dii.fasta', 'optimizer_choice': 'adam', 'learning_rate': 0.001, 'epsilon': 0.001, 'loss_choice': 'cross_entropy_loss', 'model_params': {'dropout_rate': 0.15, 'n_layers': 3, 'kernel_sizes': [(7, 1), (7, 1), (5, 1)], 'padding': [(3, 0), (3, 0), (2, 0)], 'hidden_dims': [256, 32]}, 'num_epochs': 200, 'batch_size': 128, 'patience': 10, 'shuffle': True, 'use_class_weights': False, 'auto_resume': False, 'limited_sample_size': -1, 'embedder_name': 'one_hot_encoding', 'use_half_precision': False, 'cross_validation_config': {'method': 'hold_out', 'choose_by': 'loss'}, 'log_dir': '/Users/janleusch/Documents/phd/biotrainer/output/CNN/one_

{'device': device(type='mps'), 'seed': 42, 'save_split_ids': False, 'sanity_check': True, 'ignore_file_inconsistencies': False, 'external_writer': 'tensorboard', 'output_dir': PosixPath('/Users/janleusch/Documents/phd/biotrainer/output'), 'bootstrapping_iterations': 30, 'validate_input': True, 'input_file': '/Users/janleusch/Documents/phd/biotrainer/data/sampled_3Dii.fasta', 'optimizer_choice': 'adam', 'learning_rate': 0.001, 'epsilon': 0.001, 'loss_choice': 'cross_entropy_loss', 'model_params': {'dropout_rate': 0.15, 'n_layers': 3, 'kernel_sizes': [(7, 1), (7, 1), (5, 1)], 'padding': [(3, 0), (3, 0), (2, 0)], 'hidden_dims': [256, 32]}, 'num_epochs': 200, 'batch_size': 128, 'patience': 10, 'shuffle': True, 'use_class_weights': False, 'auto_resume': False, 'limited_sample_size': -1, 'embedder_name': 'one_hot_encoding', 'use_half_precision': False, 'cross_validation_config': {'method': 'hold_out', 'choose_by': 'loss'}, 'log_dir': '/Users/janleusch/Documents/phd/biotrainer/output/CNN/one_

In [23]:
res["training_results"]["hold_out"]["best_training_epoch_metrics"]["training"]["accuracy"]

0.20687341690063477