In [1]:
#imports
import optuna
import sys
sys.path.append('/home/harry/personal/uni/project/individual/harry/model')

from model import *

# Full candidate feature list. The objective functions in this notebook pass it
# to the model as params['features'], prefixed with 'sum_30_min_demand'.
# Order is significant only in that it is preserved exactly as originally written.
feature_names = (
    # Season indicator columns.
    [f'is_{season}' for season in ('summer', 'autumn', 'winter', 'spring')]
    # Day-of-week indicator columns.
    + [
        f'is_{day}'
        for day in (
            'monday', 'tuesday', 'wednesday', 'thursday',
            'friday', 'saturday', 'sunday',
        )
    ]
    + ['is_weekday', 'is_weekend']
    # Month indicator columns.
    + [
        f'is_{month}'
        for month in (
            'jan', 'feb', 'mar', 'apr', 'may', 'jun',
            'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
        )
    ]
    # Demand and weather columns.
    + [
        'min_30_min_demand',
        'avg_30_min_demand',
        'max_30_min_demand',
        'avg_temp',
        'max_temp',
        'min_temp',
        'hd_next_24h',
        'cd_next_24h',
        'sunlight',
        'precipitation',
    ]
)
# Reduced feature list (every entry also appears in the full candidate list).
# It is not referenced by the cells visible in this part of the notebook.
optimal_feature_names = (
    # Season indicator columns.
    [f'is_{season}' for season in ('summer', 'autumn', 'winter', 'spring')]
    # Day-of-week indicator columns.
    + [
        f'is_{day}'
        for day in (
            'monday', 'tuesday', 'wednesday', 'thursday',
            'friday', 'saturday', 'sunday',
        )
    ]
    + ['is_weekday', 'is_weekend']
    # Retained demand and weather columns.
    + [
        'avg_30_min_demand',
        'avg_temp',
        'max_temp',
        'cd_next_24h',
    ]
)


In [None]:

## Optuna Function
def objective(trial):
    """Optuna objective: sample one hyperparameter set and score it.

    Args:
        trial: Optuna trial used to draw the hyperparameters via
            ``suggest_float`` / ``suggest_int``.

    Returns:
        Whatever ``median_mape(params)`` returns (the helper comes from the
        ``model`` module; presumably a median MAPE in percent - confirm).
    """
    # Draw the searched hyperparameters first, in a fixed order.
    lr = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    batch = trial.suggest_int("batch_size", 8, 64)
    window = trial.suggest_int("seq_length", 3, 21)
    ff_width = trial.suggest_int("dim_feedforward", 128, 512)
    drop_prob = trial.suggest_float("dropout", 0.0, 0.1)
    depth = trial.suggest_int("num_layers", 1, 5)

    # d_model and nhead are pinned rather than searched: per the original
    # author's note they are the best values from the first Optuna study,
    # fixed to get cleaner insights (and avoid inf results).
    encoder_layer_cfg = {
        'd_model': 64,
        'nhead': 4,
        'dim_feedforward': ff_width,
        'dropout': drop_prob,
        'activation': 'relu',
    }
    encoder_stack_cfg = {'num_layers': depth}

    config = {
        'learning_rate': lr,
        'batch_size': batch,
        'seq_length': window,
        'transformer_encoder_layer_params': encoder_layer_cfg,
        'transformer_layer_params': encoder_stack_cfg,
        'dataset': '2016-2019',
        'train_test_split': 'prior:2019',
        'features': ['sum_30_min_demand'] + feature_names,
        'visualise': False,
    }
    return median_mape(config)

In [None]:
# Create the SQLite-backed study, or resume it if one with this name already
# exists in the database. Lower objective values are better, so the direction
# is spelled out (it matches Optuna's default).
study = optuna.create_study(
    study_name="2016_2019+all_features",
    storage="sqlite:///../db.sqlite3",
    direction="minimize",
    load_if_exists=True,
)

# Run 100 trials one after another in this process.
study.optimize(objective, n_trials=100, n_jobs=1)

# Last expression of the cell: displays the best hyperparameters found.
study.best_params

[I 2025-09-29 15:40:51,593] A new study created in RDB with name: 2016_2019+all_features


Using device: cuda


[I 2025-09-29 15:41:14,757] Trial 0 finished with value: 3.3257223871712616 and parameters: {'learning_rate': 0.002030899356209592, 'batch_size': 21, 'seq_length': 14, 'dim_feedforward': 267, 'dropout': 0.0671148069642173, 'num_layers': 4}. Best is trial 0 with value: 3.3257223871712616.


Using device: cuda


[I 2025-09-29 15:41:22,246] Trial 1 finished with value: 3.2074449919348686 and parameters: {'learning_rate': 0.0004912525481300255, 'batch_size': 42, 'seq_length': 19, 'dim_feedforward': 168, 'dropout': 0.09432145815761739, 'num_layers': 3}. Best is trial 1 with value: 3.2074449919348686.


Using device: cuda


[I 2025-09-29 15:41:35,934] Trial 2 finished with value: 3.0010241207241846 and parameters: {'learning_rate': 0.004373605534635941, 'batch_size': 13, 'seq_length': 13, 'dim_feedforward': 504, 'dropout': 0.03170187763917859, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:41:41,455] Trial 3 finished with value: 5.9147364692565425 and parameters: {'learning_rate': 0.009296994978726757, 'batch_size': 43, 'seq_length': 19, 'dim_feedforward': 274, 'dropout': 0.042163050261384, 'num_layers': 3}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:42:03,003] Trial 4 finished with value: 3.3871620499808213 and parameters: {'learning_rate': 0.0011142948296489245, 'batch_size': 14, 'seq_length': 18, 'dim_feedforward': 388, 'dropout': 0.0484119560643522, 'num_layers': 3}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:42:07,327] Trial 5 finished with value: 3.479723757261441 and parameters: {'learning_rate': 0.0024779297803427464, 'batch_size': 60, 'seq_length': 13, 'dim_feedforward': 455, 'dropout': 0.011062607352148025, 'num_layers': 3}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:42:17,402] Trial 6 finished with value: 3.5290421546138604 and parameters: {'learning_rate': 0.0004647674977772755, 'batch_size': 39, 'seq_length': 17, 'dim_feedforward': 202, 'dropout': 0.05776597925188734, 'num_layers': 5}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:42:24,297] Trial 7 finished with value: 3.2961582985817515 and parameters: {'learning_rate': 0.00018128114913263933, 'batch_size': 30, 'seq_length': 6, 'dim_feedforward': 235, 'dropout': 0.07671381178278877, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:42:32,471] Trial 8 finished with value: 3.4111685004837757 and parameters: {'learning_rate': 0.001391828804385173, 'batch_size': 51, 'seq_length': 5, 'dim_feedforward': 295, 'dropout': 0.08877820627109545, 'num_layers': 4}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:42:53,115] Trial 9 finished with value: 3.3942134592147606 and parameters: {'learning_rate': 0.0004067493759741905, 'batch_size': 34, 'seq_length': 21, 'dim_feedforward': 427, 'dropout': 0.026807333065729345, 'num_layers': 5}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:43:36,079] Trial 10 finished with value: 3.126725179584623 and parameters: {'learning_rate': 0.007905978285680668, 'batch_size': 8, 'seq_length': 9, 'dim_feedforward': 512, 'dropout': 0.0026464262396251118, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:44:07,184] Trial 11 finished with value: 3.273344564915057 and parameters: {'learning_rate': 0.009863495564459876, 'batch_size': 10, 'seq_length': 9, 'dim_feedforward': 507, 'dropout': 0.0005313608502770968, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:44:14,970] Trial 12 finished with value: 3.320261273194415 and parameters: {'learning_rate': 0.004642933932378061, 'batch_size': 21, 'seq_length': 10, 'dim_feedforward': 502, 'dropout': 0.02516420496024714, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:44:42,131] Trial 13 finished with value: 3.1140008555771477 and parameters: {'learning_rate': 0.0044827987079312254, 'batch_size': 9, 'seq_length': 9, 'dim_feedforward': 367, 'dropout': 0.031994582568382224, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:44:57,443] Trial 14 finished with value: 3.3225169292022287 and parameters: {'learning_rate': 0.0034336294468423805, 'batch_size': 21, 'seq_length': 15, 'dim_feedforward': 355, 'dropout': 0.02882538465301981, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:45:18,738] Trial 15 finished with value: 3.218274017638666 and parameters: {'learning_rate': 0.004821173608091746, 'batch_size': 16, 'seq_length': 11, 'dim_feedforward': 338, 'dropout': 0.03603226452149999, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:45:25,175] Trial 16 finished with value: 3.1077623658715168 and parameters: {'learning_rate': 0.005108084946811526, 'batch_size': 29, 'seq_length': 7, 'dim_feedforward': 416, 'dropout': 0.01776410884713855, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:45:32,390] Trial 17 finished with value: 3.46125428129976 and parameters: {'learning_rate': 0.0019579527377547993, 'batch_size': 28, 'seq_length': 3, 'dim_feedforward': 438, 'dropout': 0.01347187015221402, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:45:37,207] Trial 18 finished with value: 3.331650532392396 and parameters: {'learning_rate': 0.0007423606258110054, 'batch_size': 27, 'seq_length': 7, 'dim_feedforward': 401, 'dropout': 0.01670201076710633, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:45:42,692] Trial 19 finished with value: 3.1661332713801027 and parameters: {'learning_rate': 0.00010233071240177725, 'batch_size': 49, 'seq_length': 12, 'dim_feedforward': 473, 'dropout': 0.05566187890463358, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:45:48,536] Trial 20 finished with value: 3.198554250865741 and parameters: {'learning_rate': 0.006193009454441735, 'batch_size': 34, 'seq_length': 3, 'dim_feedforward': 412, 'dropout': 0.04200360202519187, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:45:58,412] Trial 21 finished with value: 3.4064224172303974 and parameters: {'learning_rate': 0.0032219299792630805, 'batch_size': 15, 'seq_length': 7, 'dim_feedforward': 372, 'dropout': 0.02119041301649504, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:46:37,256] Trial 22 finished with value: 3.252608183086312 and parameters: {'learning_rate': 0.004943032068737613, 'batch_size': 8, 'seq_length': 8, 'dim_feedforward': 320, 'dropout': 0.03516093487632313, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:46:49,369] Trial 23 finished with value: 3.3204197436308314 and parameters: {'learning_rate': 0.003312603351005178, 'batch_size': 18, 'seq_length': 11, 'dim_feedforward': 470, 'dropout': 0.034751854671728886, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:47:03,314] Trial 24 finished with value: 3.206630704103624 and parameters: {'learning_rate': 0.006506672550037614, 'batch_size': 25, 'seq_length': 15, 'dim_feedforward': 374, 'dropout': 0.009453156645302994, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:47:20,602] Trial 25 finished with value: 3.404450165675206 and parameters: {'learning_rate': 0.0023707726365779732, 'batch_size': 12, 'seq_length': 5, 'dim_feedforward': 130, 'dropout': 0.04660656392277736, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:47:25,915] Trial 26 finished with value: 3.093746695158865 and parameters: {'learning_rate': 0.001458095406897964, 'batch_size': 23, 'seq_length': 10, 'dim_feedforward': 323, 'dropout': 0.020138105296764894, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:47:34,120] Trial 27 finished with value: 3.396964734065084 and parameters: {'learning_rate': 0.0013825743913576814, 'batch_size': 31, 'seq_length': 12, 'dim_feedforward': 474, 'dropout': 0.02146827143696959, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:47:45,689] Trial 28 finished with value: 3.4742523896024036 and parameters: {'learning_rate': 0.0008037734846651791, 'batch_size': 25, 'seq_length': 16, 'dim_feedforward': 308, 'dropout': 0.006401219255842749, 'num_layers': 4}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:48:07,931] Trial 29 finished with value: 3.5448778716025795 and parameters: {'learning_rate': 0.0017835537226105343, 'batch_size': 20, 'seq_length': 13, 'dim_feedforward': 243, 'dropout': 0.016876305385052782, 'num_layers': 4}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:48:16,531] Trial 30 finished with value: 3.2640994245446557 and parameters: {'learning_rate': 0.002730115994643764, 'batch_size': 23, 'seq_length': 10, 'dim_feedforward': 441, 'dropout': 0.055657889632201495, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:48:34,200] Trial 31 finished with value: 3.203176306872228 and parameters: {'learning_rate': 0.004199305154459356, 'batch_size': 12, 'seq_length': 9, 'dim_feedforward': 329, 'dropout': 0.0304802505916211, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:48:48,552] Trial 32 finished with value: 3.0638699780919065 and parameters: {'learning_rate': 0.007426606746371314, 'batch_size': 17, 'seq_length': 7, 'dim_feedforward': 342, 'dropout': 0.021028205976819193, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:49:00,015] Trial 33 finished with value: 3.341486907200704 and parameters: {'learning_rate': 0.00649227126083594, 'batch_size': 18, 'seq_length': 5, 'dim_feedforward': 290, 'dropout': 0.018323892199611797, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:49:08,685] Trial 34 finished with value: 3.443668791292299 and parameters: {'learning_rate': 0.007457195013319058, 'batch_size': 24, 'seq_length': 7, 'dim_feedforward': 340, 'dropout': 0.022415066984698022, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:49:49,401] Trial 35 finished with value: 5.473182778762079 and parameters: {'learning_rate': 0.009240392352959457, 'batch_size': 16, 'seq_length': 11, 'dim_feedforward': 398, 'dropout': 0.03913153606650313, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:49:56,774] Trial 36 finished with value: 3.448602189506273 and parameters: {'learning_rate': 0.00029142663743294455, 'batch_size': 39, 'seq_length': 14, 'dim_feedforward': 254, 'dropout': 0.012449927537273142, 'num_layers': 3}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:50:06,808] Trial 37 finished with value: 3.444800240915604 and parameters: {'learning_rate': 0.001524274574737589, 'batch_size': 30, 'seq_length': 6, 'dim_feedforward': 273, 'dropout': 0.005502665660889333, 'num_layers': 3}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:50:21,339] Trial 38 finished with value: 3.1431545663578433 and parameters: {'learning_rate': 0.0010368265784631023, 'batch_size': 13, 'seq_length': 8, 'dim_feedforward': 202, 'dropout': 0.04698009284063927, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:50:27,148] Trial 39 finished with value: 3.326379853352636 and parameters: {'learning_rate': 0.005882483288086188, 'batch_size': 44, 'seq_length': 13, 'dim_feedforward': 422, 'dropout': 0.0619339231741148, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:50:31,028] Trial 40 finished with value: 3.1783726131543144 and parameters: {'learning_rate': 0.002593544384423781, 'batch_size': 62, 'seq_length': 4, 'dim_feedforward': 385, 'dropout': 0.027383313159097006, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:50:42,339] Trial 41 finished with value: 3.2424167619955444 and parameters: {'learning_rate': 0.0034838030910725104, 'batch_size': 18, 'seq_length': 8, 'dim_feedforward': 359, 'dropout': 0.03063836208921906, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:51:11,242] Trial 42 finished with value: 3.3277891294232793 and parameters: {'learning_rate': 0.004035780384180086, 'batch_size': 10, 'seq_length': 10, 'dim_feedforward': 304, 'dropout': 0.015161295854103595, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:51:31,019] Trial 43 finished with value: 3.215176770083742 and parameters: {'learning_rate': 0.00567096269478203, 'batch_size': 13, 'seq_length': 6, 'dim_feedforward': 360, 'dropout': 0.02447579447357406, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:51:40,577] Trial 44 finished with value: 3.1744618734504586 and parameters: {'learning_rate': 0.007406383344544445, 'batch_size': 20, 'seq_length': 9, 'dim_feedforward': 347, 'dropout': 0.07896143784878681, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:52:04,675] Trial 45 finished with value: 3.4764620991947868 and parameters: {'learning_rate': 0.0005868251249728112, 'batch_size': 8, 'seq_length': 10, 'dim_feedforward': 283, 'dropout': 0.032123865711023755, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:52:26,862] Trial 46 finished with value: 8.005834662712404 and parameters: {'learning_rate': 0.009542403401772492, 'batch_size': 22, 'seq_length': 8, 'dim_feedforward': 491, 'dropout': 0.04018908395499296, 'num_layers': 5}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:52:34,015] Trial 47 finished with value: 3.1714781199785587 and parameters: {'learning_rate': 0.0022514893201282227, 'batch_size': 27, 'seq_length': 12, 'dim_feedforward': 320, 'dropout': 0.009627899961212476, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:52:52,463] Trial 48 finished with value: 3.237818666327392 and parameters: {'learning_rate': 0.0050869137048406, 'batch_size': 16, 'seq_length': 6, 'dim_feedforward': 450, 'dropout': 0.02026200555171722, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:53:39,547] Trial 49 finished with value: 3.080003805566931 and parameters: {'learning_rate': 0.003991458352791616, 'batch_size': 10, 'seq_length': 14, 'dim_feedforward': 408, 'dropout': 0.09743142612781927, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:53:49,648] Trial 50 finished with value: 3.3384715436438195 and parameters: {'learning_rate': 0.003813419139350961, 'batch_size': 34, 'seq_length': 14, 'dim_feedforward': 416, 'dropout': 0.09795642908625961, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:54:06,650] Trial 51 finished with value: 3.027713644305698 and parameters: {'learning_rate': 0.0030641179983092962, 'batch_size': 10, 'seq_length': 18, 'dim_feedforward': 385, 'dropout': 0.08925951892837275, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:54:20,576] Trial 52 finished with value: 3.0191939330196633 and parameters: {'learning_rate': 0.002883809511195709, 'batch_size': 11, 'seq_length': 19, 'dim_feedforward': 388, 'dropout': 0.08375021623439088, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:54:36,364] Trial 53 finished with value: 3.1656837795545947 and parameters: {'learning_rate': 0.0028769812584641665, 'batch_size': 12, 'seq_length': 19, 'dim_feedforward': 385, 'dropout': 0.09039813996690177, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:55:02,484] Trial 54 finished with value: 3.190783477921153 and parameters: {'learning_rate': 0.0015637595867224194, 'batch_size': 10, 'seq_length': 21, 'dim_feedforward': 392, 'dropout': 0.08200343814838718, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:55:12,208] Trial 55 finished with value: 3.47153300495691 and parameters: {'learning_rate': 0.0011914207013689216, 'batch_size': 14, 'seq_length': 18, 'dim_feedforward': 405, 'dropout': 0.0696583164610117, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:55:43,737] Trial 56 finished with value: 3.424244448897152 and parameters: {'learning_rate': 0.0019455702878287415, 'batch_size': 11, 'seq_length': 20, 'dim_feedforward': 373, 'dropout': 0.08565390449821203, 'num_layers': 3}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:55:53,738] Trial 57 finished with value: 3.147070205067327 and parameters: {'learning_rate': 0.003194858579949033, 'batch_size': 15, 'seq_length': 17, 'dim_feedforward': 346, 'dropout': 0.09459985061530457, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:56:15,587] Trial 58 finished with value: 3.208837204919108 and parameters: {'learning_rate': 0.0022396165552229143, 'batch_size': 17, 'seq_length': 20, 'dim_feedforward': 432, 'dropout': 0.09242031089439338, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:56:30,766] Trial 59 finished with value: 3.027884243010086 and parameters: {'learning_rate': 0.003907847122453435, 'batch_size': 10, 'seq_length': 18, 'dim_feedforward': 329, 'dropout': 0.0987787696584525, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:57:39,090] Trial 60 finished with value: 3.1655621088075256 and parameters: {'learning_rate': 0.004213004693378151, 'batch_size': 8, 'seq_length': 17, 'dim_feedforward': 459, 'dropout': 0.09746895371106154, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:57:53,787] Trial 61 finished with value: 3.3426347911853007 and parameters: {'learning_rate': 0.0027435990285805574, 'batch_size': 14, 'seq_length': 18, 'dim_feedforward': 330, 'dropout': 0.08712501433293018, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:58:19,734] Trial 62 finished with value: 3.129177232983605 and parameters: {'learning_rate': 0.005419962408726463, 'batch_size': 10, 'seq_length': 16, 'dim_feedforward': 309, 'dropout': 0.0991593710318723, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:58:27,383] Trial 63 finished with value: 3.078595776806461 and parameters: {'learning_rate': 0.008006836956814192, 'batch_size': 57, 'seq_length': 19, 'dim_feedforward': 381, 'dropout': 0.08418600942221928, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:58:35,477] Trial 64 finished with value: 3.2090229347070793 and parameters: {'learning_rate': 0.00746199776955677, 'batch_size': 54, 'seq_length': 20, 'dim_feedforward': 377, 'dropout': 0.07425231336451535, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:59:10,633] Trial 65 finished with value: 3.228118209016246 and parameters: {'learning_rate': 0.008417261238821952, 'batch_size': 11, 'seq_length': 19, 'dim_feedforward': 353, 'dropout': 0.09463836610079629, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:59:17,631] Trial 66 finished with value: 3.2166130162809283 and parameters: {'learning_rate': 0.004440652461050668, 'batch_size': 50, 'seq_length': 16, 'dim_feedforward': 496, 'dropout': 0.08054801807469848, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:59:25,798] Trial 67 finished with value: 3.151791876632452 and parameters: {'learning_rate': 0.0069696360232830625, 'batch_size': 44, 'seq_length': 19, 'dim_feedforward': 363, 'dropout': 0.08402513248662749, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:59:34,327] Trial 68 finished with value: 3.18853637685079 and parameters: {'learning_rate': 0.008575030998736636, 'batch_size': 47, 'seq_length': 15, 'dim_feedforward': 406, 'dropout': 0.08998648397819625, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:59:47,025] Trial 69 finished with value: 3.3578355041994445 and parameters: {'learning_rate': 0.003625617901462527, 'batch_size': 38, 'seq_length': 18, 'dim_feedforward': 392, 'dropout': 0.07445256667665512, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 15:59:54,330] Trial 70 finished with value: 3.117105069383528 and parameters: {'learning_rate': 0.0062224730046301305, 'batch_size': 58, 'seq_length': 18, 'dim_feedforward': 335, 'dropout': 0.09266534940594018, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:00:26,322] Trial 71 finished with value: 3.238628412181874 and parameters: {'learning_rate': 0.002927941661090758, 'batch_size': 9, 'seq_length': 13, 'dim_feedforward': 322, 'dropout': 0.09978395101236441, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:00:41,502] Trial 72 finished with value: 3.1072087212723867 and parameters: {'learning_rate': 0.004905180931548434, 'batch_size': 19, 'seq_length': 20, 'dim_feedforward': 383, 'dropout': 0.08662535389916323, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:01:02,947] Trial 73 finished with value: 3.1783355768374344 and parameters: {'learning_rate': 0.0008591614058768834, 'batch_size': 13, 'seq_length': 21, 'dim_feedforward': 300, 'dropout': 0.05022745853176959, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:01:13,703] Trial 74 finished with value: 3.1117984638332827 and parameters: {'learning_rate': 0.0017436076264683703, 'batch_size': 16, 'seq_length': 14, 'dim_feedforward': 367, 'dropout': 0.08920340070609777, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:01:29,759] Trial 75 finished with value: 3.0576510585557397 and parameters: {'learning_rate': 0.00307035850056566, 'batch_size': 11, 'seq_length': 17, 'dim_feedforward': 343, 'dropout': 0.09600168093336912, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:01:50,058] Trial 76 finished with value: 3.089226816556511 and parameters: {'learning_rate': 0.003701709959665077, 'batch_size': 11, 'seq_length': 17, 'dim_feedforward': 354, 'dropout': 0.09526738463280646, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:02:24,114] Trial 77 finished with value: 8.018224626824233 and parameters: {'learning_rate': 0.003143857277418071, 'batch_size': 8, 'seq_length': 15, 'dim_feedforward': 430, 'dropout': 0.08276650523039658, 'num_layers': 4}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:02:37,685] Trial 78 finished with value: 3.12906417896627 and parameters: {'learning_rate': 0.002415530796632337, 'batch_size': 14, 'seq_length': 16, 'dim_feedforward': 341, 'dropout': 0.09254765627934944, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:02:46,217] Trial 79 finished with value: 3.2790313384579077 and parameters: {'learning_rate': 0.00585612805953686, 'batch_size': 55, 'seq_length': 19, 'dim_feedforward': 395, 'dropout': 0.07740242133626876, 'num_layers': 2}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:03:05,927] Trial 80 finished with value: 3.0881509639665325 and parameters: {'learning_rate': 0.004730041594619153, 'batch_size': 12, 'seq_length': 17, 'dim_feedforward': 483, 'dropout': 0.0966381293071829, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:03:28,989] Trial 81 finished with value: 3.1814578613618547 and parameters: {'learning_rate': 0.004540676227398018, 'batch_size': 12, 'seq_length': 17, 'dim_feedforward': 484, 'dropout': 0.09755628397566078, 'num_layers': 1}. Best is trial 2 with value: 3.0010241207241846.


Using device: cuda


[I 2025-09-29 16:03:57,730] Trial 82 finished with value: 3.0003390376632764 and parameters: {'learning_rate': 0.006699510228516179, 'batch_size': 10, 'seq_length': 18, 'dim_feedforward': 511, 'dropout': 0.0960344805347917, 'num_layers': 1}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:04:25,206] Trial 83 finished with value: 3.0846584340784204 and parameters: {'learning_rate': 0.008132854691168872, 'batch_size': 9, 'seq_length': 18, 'dim_feedforward': 510, 'dropout': 0.09071177408178088, 'num_layers': 1}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:04:42,932] Trial 84 finished with value: 7.724452155555957 and parameters: {'learning_rate': 0.00693041336253394, 'batch_size': 15, 'seq_length': 20, 'dim_feedforward': 447, 'dropout': 0.08815080856491898, 'num_layers': 3}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:05:15,534] Trial 85 finished with value: 3.2201779146643896 and parameters: {'learning_rate': 0.009809178555539179, 'batch_size': 9, 'seq_length': 18, 'dim_feedforward': 501, 'dropout': 0.08452387171708399, 'num_layers': 1}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:05:38,632] Trial 86 finished with value: 3.1471174685267287 and parameters: {'learning_rate': 0.005290492071673292, 'batch_size': 10, 'seq_length': 19, 'dim_feedforward': 468, 'dropout': 0.0925582194952702, 'num_layers': 1}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:05:55,318] Trial 87 finished with value: 3.2834773617953927 and parameters: {'learning_rate': 0.00010845208494672818, 'batch_size': 14, 'seq_length': 13, 'dim_feedforward': 417, 'dropout': 0.06195737640762782, 'num_layers': 1}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:06:08,699] Trial 88 finished with value: 3.2315536145981114 and parameters: {'learning_rate': 0.0039907346837110905, 'batch_size': 17, 'seq_length': 15, 'dim_feedforward': 461, 'dropout': 0.09441151644160037, 'num_layers': 1}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:06:29,548] Trial 89 finished with value: 3.1916032564064856 and parameters: {'learning_rate': 0.006403846565733592, 'batch_size': 11, 'seq_length': 11, 'dim_feedforward': 288, 'dropout': 0.0999229911139692, 'num_layers': 1}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:06:35,460] Trial 90 finished with value: 3.278788936311397 and parameters: {'learning_rate': 0.0003515772233149812, 'batch_size': 63, 'seq_length': 18, 'dim_feedforward': 313, 'dropout': 0.09635640313610729, 'num_layers': 2}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:07:06,898] Trial 91 finished with value: 3.1299953942627474 and parameters: {'learning_rate': 0.008385565965198962, 'batch_size': 9, 'seq_length': 18, 'dim_feedforward': 503, 'dropout': 0.09058600123811095, 'num_layers': 1}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:07:42,355] Trial 92 finished with value: 3.154234362861329 and parameters: {'learning_rate': 0.00804268991209152, 'batch_size': 8, 'seq_length': 17, 'dim_feedforward': 511, 'dropout': 0.08759169476072899, 'num_layers': 1}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:08:10,040] Trial 93 finished with value: 3.0580449978395214 and parameters: {'learning_rate': 0.005669072059468287, 'batch_size': 13, 'seq_length': 19, 'dim_feedforward': 487, 'dropout': 0.09091300937740711, 'num_layers': 1}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:08:29,062] Trial 94 finished with value: 3.1413447255260114 and parameters: {'learning_rate': 0.005403094138679943, 'batch_size': 13, 'seq_length': 19, 'dim_feedforward': 477, 'dropout': 0.07972242823829315, 'num_layers': 1}. Best is trial 82 with value: 3.0003390376632764.


Using device: cuda


[I 2025-09-29 16:09:01,779] Trial 95 finished with value: 2.9708868155075416 and parameters: {'learning_rate': 0.003490834806057147, 'batch_size': 11, 'seq_length': 19, 'dim_feedforward': 376, 'dropout': 0.09152546239599968, 'num_layers': 1}. Best is trial 95 with value: 2.9708868155075416.


Using device: cuda


[I 2025-09-29 16:09:22,166] Trial 96 finished with value: 3.1144341736575676 and parameters: {'learning_rate': 0.0032459401787999485, 'batch_size': 15, 'seq_length': 19, 'dim_feedforward': 488, 'dropout': 0.08530299041093943, 'num_layers': 1}. Best is trial 95 with value: 2.9708868155075416.


Using device: cuda


[I 2025-09-29 16:09:27,716] Trial 97 finished with value: 3.34006447753536 and parameters: {'learning_rate': 0.0020590024743729935, 'batch_size': 41, 'seq_length': 20, 'dim_feedforward': 213, 'dropout': 0.09252935498223824, 'num_layers': 1}. Best is trial 95 with value: 2.9708868155075416.


Using device: cuda


[I 2025-09-29 16:09:39,698] Trial 98 finished with value: 3.122596641443862 and parameters: {'learning_rate': 0.002574426849326108, 'batch_size': 17, 'seq_length': 21, 'dim_feedforward': 380, 'dropout': 0.06885002609781869, 'num_layers': 1}. Best is trial 95 with value: 2.9708868155075416.


Using device: cuda


[I 2025-09-29 16:09:59,019] Trial 99 finished with value: 3.203033582594926 and parameters: {'learning_rate': 0.005869940549712211, 'batch_size': 11, 'seq_length': 19, 'dim_feedforward': 347, 'dropout': 0.024134619125663077, 'num_layers': 1}. Best is trial 95 with value: 2.9708868155075416.


{'learning_rate': 0.003490834806057147,
 'batch_size': 11,
 'seq_length': 19,
 'dim_feedforward': 376,
 'dropout': 0.09152546239599968,
 'num_layers': 1}

In [5]:
def objective2(trial, n_runs=4):
    """Optuna objective: mean ``train_model`` score over repeated runs (80:20 split).

    Hyperparameters are drawn from ``trial`` once, then the same parameter
    dict is handed to ``train_model`` ``n_runs`` times and the returned values
    are averaged, so a single lucky or unlucky training run does not decide
    the trial's score.

    Args:
        trial: Optuna trial; only ``suggest_float`` and ``suggest_int`` are used.
        n_runs: How many times to train with the sampled parameters before
            averaging. The author measured a standard deviation of 0.08 over
            100 runs; averaging 4 (assumed independent) runs brings that to
            roughly 0.08 / sqrt(4) = 0.04, hence the default of 4.

    Returns:
        Arithmetic mean of the ``n_runs`` values returned by ``train_model``
        (presumably a MAPE, judging by the original variable name -- confirm
        against ``model.train_model``).

    Raises:
        ValueError: If ``n_runs`` is smaller than 1 (nothing to average).
    """
    if n_runs < 1:
        raise ValueError(f"n_runs must be >= 1, got {n_runs}")

    # Suggest hyperparameters
    # Restrict d_model and nhead to best from first optuna to get better insights (no inf)
    d_model = 64
    nhead = 4

    # NOTE: the suggest_* calls are kept in their original order
    # (learning_rate, batch_size, seq_length, dim_feedforward, dropout, num_layers).
    params = {
        'learning_rate': trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        'batch_size': trial.suggest_int("batch_size", 8, 64),
        'seq_length': trial.suggest_int("seq_length", 3, 21),
        'transformer_encoder_layer_params': {
            'd_model': d_model,
            'nhead': nhead,
            'dim_feedforward': trial.suggest_int("dim_feedforward", 128, 512),
            'dropout': trial.suggest_float("dropout", 0.0, 0.1),
            'activation': 'relu'
        },
        'transformer_layer_params': {
            'num_layers': trial.suggest_int("num_layers", 1, 5),
        },
        'dataset': '2016-2019',
        'train_test_split': '80:20',
        'features': ['sum_30_min_demand'] + feature_names,
        'visualise': False,
    }

    # The very same params dict is reused for every repetition.
    scores = [train_model(params) for _ in range(n_runs)]
    return sum(scores) / len(scores)

# Open the SQLite-backed study, re-using it if one with this name already exists.
# No direction is passed, so Optuna's default (minimise) applies.
study = optuna.create_study(
    study_name="2016_2019+all_features+80:20+4runs",
    storage="sqlite:///db.sqlite3",
    load_if_exists=True,
)
# 100 trials in total, with up to 10 running concurrently.
study.optimize(objective2, n_jobs=10, n_trials=100)

# Last expression of the cell, so the notebook displays the best parameter set.
study.best_params

[I 2025-09-29 22:21:33,082] A new study created in RDB with name: 2016_2019+all_features+80:20+4runs


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:26:36,129] Trial 0 finished with value: 2.7196023646650884 and parameters: {'learning_rate': 0.006889350197222347, 'batch_size': 25, 'seq_length': 16, 'dim_feedforward': 502, 'dropout': 0.014758552445583406, 'num_layers': 2}. Best is trial 0 with value: 2.7196023646650884.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:29:36,531] Trial 2 finished with value: 2.8148344294383354 and parameters: {'learning_rate': 0.0006038322467543339, 'batch_size': 9, 'seq_length': 17, 'dim_feedforward': 395, 'dropout': 0.03717878097805828, 'num_layers': 2}. Best is trial 0 with value: 2.7196023646650884.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:30:25,111] Trial 10 finished with value: 2.792522300540358 and parameters: {'learning_rate': 0.00013427082533922632, 'batch_size': 24, 'seq_length': 7, 'dim_feedforward': 266, 'dropout': 0.022424371556974243, 'num_layers': 3}. Best is trial 0 with value: 2.7196023646650884.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:30:51,034] Trial 6 finished with value: 4.912618152429336 and parameters: {'learning_rate': 0.004347007934794934, 'batch_size': 34, 'seq_length': 18, 'dim_feedforward': 296, 'dropout': 0.044087474484909496, 'num_layers': 5}. Best is trial 0 with value: 2.7196023646650884.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:32:13,047] Trial 9 finished with value: 7.087805047519385 and parameters: {'learning_rate': 0.00624128932181987, 'batch_size': 50, 'seq_length': 11, 'dim_feedforward': 433, 'dropout': 0.09558597060223833, 'num_layers': 5}. Best is trial 0 with value: 2.7196023646650884.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:32:54,105] Trial 11 finished with value: 2.6432362756027676 and parameters: {'learning_rate': 0.001166906551495673, 'batch_size': 26, 'seq_length': 4, 'dim_feedforward': 209, 'dropout': 0.05891210541854609, 'num_layers': 1}. Best is trial 11 with value: 2.6432362756027676.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:33:33,424] Trial 4 finished with value: 2.840268527916469 and parameters: {'learning_rate': 0.0009790414082557863, 'batch_size': 52, 'seq_length': 12, 'dim_feedforward': 253, 'dropout': 0.018148544090550733, 'num_layers': 5}. Best is trial 11 with value: 2.6432362756027676.


Using device: cuda


[I 2025-09-29 22:33:51,243] Trial 7 finished with value: 2.672340143244913 and parameters: {'learning_rate': 0.0031102213864134015, 'batch_size': 47, 'seq_length': 5, 'dim_feedforward': 373, 'dropout': 0.081118256285594, 'num_layers': 1}. Best is trial 11 with value: 2.6432362756027676.


Using device: cuda


[I 2025-09-29 22:34:08,665] Trial 12 finished with value: 2.9477662974633434 and parameters: {'learning_rate': 0.004330111332022696, 'batch_size': 41, 'seq_length': 19, 'dim_feedforward': 271, 'dropout': 0.0008077933529450632, 'num_layers': 4}. Best is trial 11 with value: 2.6432362756027676.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:35:24,953] Trial 18 finished with value: 2.8593481695282246 and parameters: {'learning_rate': 0.0001995232644759348, 'batch_size': 49, 'seq_length': 21, 'dim_feedforward': 153, 'dropout': 0.07434727620596546, 'num_layers': 1}. Best is trial 11 with value: 2.6432362756027676.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:36:54,296] Trial 15 finished with value: 2.840108145074334 and parameters: {'learning_rate': 0.00017667511160759822, 'batch_size': 14, 'seq_length': 7, 'dim_feedforward': 202, 'dropout': 0.03338636541067557, 'num_layers': 2}. Best is trial 11 with value: 2.6432362756027676.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:37:07,106] Trial 3 finished with value: 2.5777469976966 and parameters: {'learning_rate': 0.0028844719673065196, 'batch_size': 41, 'seq_length': 10, 'dim_feedforward': 238, 'dropout': 0.07009410111823772, 'num_layers': 1}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:37:45,858] Trial 20 finished with value: 2.6200890601251814 and parameters: {'learning_rate': 0.001990868586179084, 'batch_size': 64, 'seq_length': 3, 'dim_feedforward': 377, 'dropout': 0.06996444432711264, 'num_layers': 1}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:38:57,196] Trial 8 finished with value: 2.8048475835402713 and parameters: {'learning_rate': 0.0001782313541702206, 'batch_size': 19, 'seq_length': 12, 'dim_feedforward': 468, 'dropout': 0.02513567585276585, 'num_layers': 4}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:39:09,614] Trial 22 finished with value: 2.735607594144067 and parameters: {'learning_rate': 0.002152056215103207, 'batch_size': 64, 'seq_length': 9, 'dim_feedforward': 346, 'dropout': 0.06277778779096255, 'num_layers': 2}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:40:31,286] Trial 1 finished with value: 2.6895994168064554 and parameters: {'learning_rate': 0.0030934663679523327, 'batch_size': 38, 'seq_length': 17, 'dim_feedforward': 346, 'dropout': 0.07539006815248844, 'num_layers': 1}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda


[I 2025-09-29 22:40:37,731] Trial 13 finished with value: 2.6951165401771107 and parameters: {'learning_rate': 0.003158017860186118, 'batch_size': 25, 'seq_length': 13, 'dim_feedforward': 509, 'dropout': 0.024687576029130312, 'num_layers': 4}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:40:44,933] Trial 24 finished with value: 2.5780787635404465 and parameters: {'learning_rate': 0.0020396063516305058, 'batch_size': 63, 'seq_length': 3, 'dim_feedforward': 328, 'dropout': 0.09957071857431103, 'num_layers': 1}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:41:14,602] Trial 23 finished with value: 2.6499863416113434 and parameters: {'learning_rate': 0.001996538197859218, 'batch_size': 59, 'seq_length': 9, 'dim_feedforward': 346, 'dropout': 0.062361065781557895, 'num_layers': 1}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda


[I 2025-09-29 22:41:34,237] Trial 19 finished with value: 2.6665985638966476 and parameters: {'learning_rate': 0.0004554529751398293, 'batch_size': 8, 'seq_length': 3, 'dim_feedforward': 157, 'dropout': 0.06286393622955794, 'num_layers': 2}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda


[I 2025-09-29 22:41:36,509] Trial 14 finished with value: 2.7951136807289707 and parameters: {'learning_rate': 0.0020247267050826946, 'batch_size': 43, 'seq_length': 9, 'dim_feedforward': 358, 'dropout': 0.020058379010770302, 'num_layers': 1}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:42:44,367] Trial 27 finished with value: 2.653081333642847 and parameters: {'learning_rate': 0.0005389312280673682, 'batch_size': 58, 'seq_length': 14, 'dim_feedforward': 138, 'dropout': 0.094664948053054, 'num_layers': 3}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:43:01,958] Trial 29 finished with value: 5.981766694985657 and parameters: {'learning_rate': 0.00934940294118818, 'batch_size': 56, 'seq_length': 14, 'dim_feedforward': 229, 'dropout': 0.09383830589767413, 'num_layers': 3}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:44:01,321] Trial 26 finished with value: 2.651154414956058 and parameters: {'learning_rate': 0.0014406143430565044, 'batch_size': 63, 'seq_length': 3, 'dim_feedforward': 143, 'dropout': 0.09219517720337408, 'num_layers': 3}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda


[I 2025-09-29 22:44:04,978] Trial 32 finished with value: 2.6297153368048174 and parameters: {'learning_rate': 0.00111254330628851, 'batch_size': 63, 'seq_length': 6, 'dim_feedforward': 297, 'dropout': 0.08486179523173967, 'num_layers': 1}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:44:24,015] Trial 5 finished with value: 2.6379739893255847 and parameters: {'learning_rate': 0.0004119487493428584, 'batch_size': 40, 'seq_length': 12, 'dim_feedforward': 270, 'dropout': 0.0893685306536493, 'num_layers': 1}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda


[I 2025-09-29 22:44:32,631] Trial 16 finished with value: 2.7645288717767937 and parameters: {'learning_rate': 0.005772960759771816, 'batch_size': 51, 'seq_length': 4, 'dim_feedforward': 236, 'dropout': 0.03258978098535159, 'num_layers': 3}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:44:36,816] Trial 31 finished with value: 2.7400792327372283 and parameters: {'learning_rate': 0.0012505736030256525, 'batch_size': 63, 'seq_length': 6, 'dim_feedforward': 306, 'dropout': 0.0845949347454405, 'num_layers': 3}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:45:28,792] Trial 21 finished with value: 2.5804377774789193 and parameters: {'learning_rate': 0.0019649390332039103, 'batch_size': 61, 'seq_length': 4, 'dim_feedforward': 141, 'dropout': 0.06308501583592417, 'num_layers': 1}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda


[I 2025-09-29 22:45:34,493] Trial 34 finished with value: 2.719072496697629 and parameters: {'learning_rate': 0.0006955306786087401, 'batch_size': 32, 'seq_length': 5, 'dim_feedforward': 412, 'dropout': 0.052489949704074335, 'num_layers': 2}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:45:53,330] Trial 28 finished with value: 2.6491620210380336 and parameters: {'learning_rate': 0.0006175813441323151, 'batch_size': 57, 'seq_length': 15, 'dim_feedforward': 206, 'dropout': 0.09995189631013271, 'num_layers': 3}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:46:27,058] Trial 37 finished with value: 2.7280326680518057 and parameters: {'learning_rate': 0.0015716640221796947, 'batch_size': 32, 'seq_length': 8, 'dim_feedforward': 408, 'dropout': 0.07191122955769132, 'num_layers': 2}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda


[I 2025-09-29 22:46:31,174] Trial 39 finished with value: 2.8113102594379873 and parameters: {'learning_rate': 0.0008285499949524502, 'batch_size': 54, 'seq_length': 8, 'dim_feedforward': 178, 'dropout': 0.05036162304825465, 'num_layers': 2}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda


[I 2025-09-29 22:46:36,254] Trial 33 finished with value: 2.64927295397715 and parameters: {'learning_rate': 0.0007873173914372153, 'batch_size': 32, 'seq_length': 6, 'dim_feedforward': 311, 'dropout': 0.08365696176524966, 'num_layers': 1}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:47:03,366] Trial 41 finished with value: 2.6730663582236396 and parameters: {'learning_rate': 0.0025075009946457635, 'batch_size': 55, 'seq_length': 3, 'dim_feedforward': 183, 'dropout': 0.052739951848508465, 'num_layers': 1}. Best is trial 3 with value: 2.5777469976966.


Using device: cuda


[I 2025-09-29 22:47:06,046] Trial 42 finished with value: 2.576016665584721 and parameters: {'learning_rate': 0.002324509504066925, 'batch_size': 46, 'seq_length': 3, 'dim_feedforward': 327, 'dropout': 0.06945590198581707, 'num_layers': 1}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:47:39,759] Trial 43 finished with value: 2.6148547548111405 and parameters: {'learning_rate': 0.00231464938541052, 'batch_size': 60, 'seq_length': 4, 'dim_feedforward': 330, 'dropout': 0.07036532723388238, 'num_layers': 1}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:47:44,234] Trial 40 finished with value: 2.5879417247799603 and parameters: {'learning_rate': 0.0016984084352498936, 'batch_size': 54, 'seq_length': 3, 'dim_feedforward': 179, 'dropout': 0.07237317834798117, 'num_layers': 1}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:48:03,698] Trial 44 finished with value: 2.6831301052319696 and parameters: {'learning_rate': 0.004112373849360651, 'batch_size': 60, 'seq_length': 4, 'dim_feedforward': 379, 'dropout': 0.06991250869200633, 'num_layers': 1}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:48:20,144] Trial 45 finished with value: 2.7128448190760435 and parameters: {'learning_rate': 0.0038227226256512988, 'batch_size': 44, 'seq_length': 5, 'dim_feedforward': 331, 'dropout': 0.06842293405214551, 'num_layers': 2}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:48:26,865] Trial 30 finished with value: 2.7309763085613894 and parameters: {'learning_rate': 0.001202136051737844, 'batch_size': 59, 'seq_length': 6, 'dim_feedforward': 312, 'dropout': 0.099922698005699, 'num_layers': 3}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:49:29,468] Trial 46 finished with value: 2.7041816218082344 and parameters: {'learning_rate': 0.004122395998328168, 'batch_size': 46, 'seq_length': 5, 'dim_feedforward': 324, 'dropout': 0.0405940063235363, 'num_layers': 1}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda


[I 2025-09-29 22:49:39,448] Trial 49 finished with value: 2.6908844828576806 and parameters: {'learning_rate': 0.0029460627503693107, 'batch_size': 37, 'seq_length': 11, 'dim_feedforward': 252, 'dropout': 0.04419803468147823, 'num_layers': 1}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda


[I 2025-09-29 22:49:43,782] Trial 48 finished with value: 2.848129762204789 and parameters: {'learning_rate': 0.003005793433532307, 'batch_size': 47, 'seq_length': 11, 'dim_feedforward': 323, 'dropout': 0.0439519323726571, 'num_layers': 2}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:50:10,201] Trial 17 finished with value: 2.9886489987960845 and parameters: {'learning_rate': 0.0027324006064676546, 'batch_size': 28, 'seq_length': 5, 'dim_feedforward': 482, 'dropout': 0.018784130317698534, 'num_layers': 2}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda


[I 2025-09-29 22:50:17,008] Trial 52 finished with value: 2.62131549204116 and parameters: {'learning_rate': 0.0015737699162672008, 'batch_size': 53, 'seq_length': 3, 'dim_feedforward': 129, 'dropout': 0.07973350635696387, 'num_layers': 1}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda


[I 2025-09-29 22:50:19,010] Trial 53 finished with value: 2.628310880006164 and parameters: {'learning_rate': 0.0016015489987285182, 'batch_size': 53, 'seq_length': 3, 'dim_feedforward': 173, 'dropout': 0.059261871604086955, 'num_layers': 1}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:50:48,992] Trial 47 finished with value: 2.7257128787683795 and parameters: {'learning_rate': 0.004280207383012016, 'batch_size': 46, 'seq_length': 11, 'dim_feedforward': 280, 'dropout': 0.04295999566733084, 'num_layers': 2}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda


[I 2025-09-29 22:50:52,640] Trial 55 finished with value: 2.646887459957504 and parameters: {'learning_rate': 0.0017353257872171864, 'batch_size': 42, 'seq_length': 4, 'dim_feedforward': 169, 'dropout': 0.07839791139079197, 'num_layers': 1}. Best is trial 42 with value: 2.576016665584721.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:51:06,295] Trial 56 finished with value: 2.5692276700614882 and parameters: {'learning_rate': 0.0059517128574747115, 'batch_size': 42, 'seq_length': 4, 'dim_feedforward': 283, 'dropout': 0.07870400190564725, 'num_layers': 1}. Best is trial 56 with value: 2.5692276700614882.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:51:09,197] Trial 51 finished with value: 2.5875791973357667 and parameters: {'learning_rate': 0.001610587975502386, 'batch_size': 37, 'seq_length': 3, 'dim_feedforward': 172, 'dropout': 0.07939873301804069, 'num_layers': 1}. Best is trial 56 with value: 2.5692276700614882.


Using device: cuda


[I 2025-09-29 22:51:15,095] Trial 36 finished with value: 2.6918132674772526 and parameters: {'learning_rate': 0.0007925476277965316, 'batch_size': 32, 'seq_length': 7, 'dim_feedforward': 401, 'dropout': 0.07174923758113493, 'num_layers': 2}. Best is trial 56 with value: 2.5692276700614882.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:51:55,021] Trial 58 finished with value: 2.732666568786067 and parameters: {'learning_rate': 0.005666210837195988, 'batch_size': 49, 'seq_length': 16, 'dim_feedforward': 226, 'dropout': 0.05597666743438126, 'num_layers': 1}. Best is trial 56 with value: 2.5692276700614882.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:52:22,125] Trial 59 finished with value: 2.60859414861204 and parameters: {'learning_rate': 0.005296186517496327, 'batch_size': 39, 'seq_length': 7, 'dim_feedforward': 285, 'dropout': 0.08902047671402576, 'num_layers': 1}. Best is trial 56 with value: 2.5692276700614882.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:53:01,141] Trial 62 finished with value: 2.5674798173455344 and parameters: {'learning_rate': 0.007422887689583403, 'batch_size': 40, 'seq_length': 4, 'dim_feedforward': 286, 'dropout': 0.08859578928270256, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:53:18,778] Trial 57 finished with value: 2.721465895547308 and parameters: {'learning_rate': 0.0009831942016060703, 'batch_size': 41, 'seq_length': 17, 'dim_feedforward': 194, 'dropout': 0.07713857811551415, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:53:21,934] Trial 63 finished with value: 2.6520080410488407 and parameters: {'learning_rate': 0.0035079428500495104, 'batch_size': 36, 'seq_length': 4, 'dim_feedforward': 242, 'dropout': 0.07541152979153436, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda


[I 2025-09-29 22:53:29,023] Trial 60 finished with value: 2.574183215208831 and parameters: {'learning_rate': 0.00545608247568281, 'batch_size': 40, 'seq_length': 7, 'dim_feedforward': 287, 'dropout': 0.07772289808075006, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:53:39,934] Trial 64 finished with value: 2.595799062800029 and parameters: {'learning_rate': 0.006770091979917986, 'batch_size': 44, 'seq_length': 4, 'dim_feedforward': 243, 'dropout': 0.08884608576037625, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda


[I 2025-09-29 22:53:47,439] Trial 50 finished with value: 2.6213252498437725 and parameters: {'learning_rate': 0.0016472233754635027, 'batch_size': 47, 'seq_length': 11, 'dim_feedforward': 176, 'dropout': 0.0779299917382259, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:55:49,114] Trial 66 finished with value: 7.373520001421726 and parameters: {'learning_rate': 0.007946485491011651, 'batch_size': 44, 'seq_length': 4, 'dim_feedforward': 261, 'dropout': 0.08835511803086449, 'num_layers': 5}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:56:12,416] Trial 68 finished with value: 7.832360917593052 and parameters: {'learning_rate': 0.008733828608580152, 'batch_size': 35, 'seq_length': 10, 'dim_feedforward': 262, 'dropout': 0.08739191710085752, 'num_layers': 5}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:56:51,910] Trial 67 finished with value: 7.250208714992709 and parameters: {'learning_rate': 0.009717580784587922, 'batch_size': 35, 'seq_length': 10, 'dim_feedforward': 265, 'dropout': 0.08886994283943594, 'num_layers': 4}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:57:10,667] Trial 70 finished with value: 2.6252594764589237 and parameters: {'learning_rate': 0.0050353592181877765, 'batch_size': 39, 'seq_length': 5, 'dim_feedforward': 360, 'dropout': 0.06662709711448789, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda


[I 2025-09-29 22:57:12,056] Trial 61 finished with value: 2.6324520218411633 and parameters: {'learning_rate': 0.005365673416175614, 'batch_size': 39, 'seq_length': 4, 'dim_feedforward': 237, 'dropout': 0.0896906711247846, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:57:45,844] Trial 65 finished with value: 2.6409660790016494 and parameters: {'learning_rate': 0.008469310245603023, 'batch_size': 35, 'seq_length': 4, 'dim_feedforward': 257, 'dropout': 0.06489808459724901, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:58:20,387] Trial 73 finished with value: 2.642811891154702 and parameters: {'learning_rate': 0.007680450191671969, 'batch_size': 42, 'seq_length': 6, 'dim_feedforward': 286, 'dropout': 0.0826368132504983, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda


[I 2025-09-29 22:58:36,668] Trial 72 finished with value: 2.6720373530621364 and parameters: {'learning_rate': 0.007266238791455568, 'batch_size': 39, 'seq_length': 5, 'dim_feedforward': 289, 'dropout': 0.0654526440069686, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda


[I 2025-09-29 22:58:39,543] Trial 71 finished with value: 2.7716260977292646 and parameters: {'learning_rate': 0.005081279671310815, 'batch_size': 39, 'seq_length': 5, 'dim_feedforward': 290, 'dropout': 0.06552125387279986, 'num_layers': 4}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 22:59:30,463] Trial 78 finished with value: 2.6811346408813703 and parameters: {'learning_rate': 0.0034065347533076165, 'batch_size': 51, 'seq_length': 8, 'dim_feedforward': 219, 'dropout': 0.0968666129986853, 'num_layers': 2}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:00:07,501] Trial 76 finished with value: 2.776153568416448 and parameters: {'learning_rate': 0.0025844998975170503, 'batch_size': 18, 'seq_length': 8, 'dim_feedforward': 293, 'dropout': 0.060683368870828404, 'num_layers': 1}. Best is trial 62 with value: 2.5674798173455344.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:00:47,148] Trial 79 finished with value: 2.5454720927583168 and parameters: {'learning_rate': 0.0066779511652954255, 'batch_size': 28, 'seq_length': 7, 'dim_feedforward': 355, 'dropout': 0.07519874608589755, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:01:05,316] Trial 80 finished with value: 2.584885252163526 and parameters: {'learning_rate': 0.001992432588740973, 'batch_size': 29, 'seq_length': 3, 'dim_feedforward': 304, 'dropout': 0.0750021290256768, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda


[I 2025-09-29 23:01:10,330] Trial 74 finished with value: 2.603712985304159 and parameters: {'learning_rate': 0.007223297702341401, 'batch_size': 50, 'seq_length': 8, 'dim_feedforward': 288, 'dropout': 0.08278459028140796, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda


[I 2025-09-29 23:01:14,696] Trial 77 finished with value: 2.7653279097288803 and parameters: {'learning_rate': 0.0025858148246265373, 'batch_size': 29, 'seq_length': 8, 'dim_feedforward': 305, 'dropout': 0.07464926169783831, 'num_layers': 2}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:01:52,618] Trial 81 finished with value: 2.6275797864375905 and parameters: {'learning_rate': 0.005807876690196943, 'batch_size': 29, 'seq_length': 7, 'dim_feedforward': 343, 'dropout': 0.0736602460372834, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:02:29,323] Trial 85 finished with value: 2.675774807217559 and parameters: {'learning_rate': 0.0062583082600939, 'batch_size': 61, 'seq_length': 6, 'dim_feedforward': 361, 'dropout': 0.008421908007002536, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda


[I 2025-09-29 23:02:34,794] Trial 82 finished with value: 2.612681807292663 and parameters: {'learning_rate': 0.006616451374634196, 'batch_size': 29, 'seq_length': 6, 'dim_feedforward': 337, 'dropout': 0.08262342215177214, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:03:14,869] Trial 87 finished with value: 2.7350275055621407 and parameters: {'learning_rate': 0.000281234912048089, 'batch_size': 41, 'seq_length': 9, 'dim_feedforward': 278, 'dropout': 0.0922054815907706, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda


[I 2025-09-29 23:03:28,511] Trial 86 finished with value: 2.689356953206918 and parameters: {'learning_rate': 0.0022319442444513607, 'batch_size': 24, 'seq_length': 12, 'dim_feedforward': 388, 'dropout': 0.09824814629979271, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:04:05,006] Trial 84 finished with value: 2.64828662102182 and parameters: {'learning_rate': 0.006223320082138388, 'batch_size': 20, 'seq_length': 6, 'dim_feedforward': 340, 'dropout': 0.009474823133884966, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda


[I 2025-09-29 23:04:13,343] Trial 75 finished with value: 2.6190076940291704 and parameters: {'learning_rate': 0.0070504626899740245, 'batch_size': 22, 'seq_length': 6, 'dim_feedforward': 296, 'dropout': 0.081196487021376, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda


[I 2025-09-29 23:04:24,019] Trial 54 finished with value: 2.65579932561189 and parameters: {'learning_rate': 0.0016942044010100224, 'batch_size': 53, 'seq_length': 3, 'dim_feedforward': 283, 'dropout': 0.0794249792554007, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:04:45,421] Trial 88 finished with value: 2.5995338327410695 and parameters: {'learning_rate': 0.004690748180153653, 'batch_size': 25, 'seq_length': 13, 'dim_feedforward': 316, 'dropout': 0.09755377794079065, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda


[I 2025-09-29 23:04:51,532] Trial 89 finished with value: 2.634225853122942 and parameters: {'learning_rate': 0.0046251933940602615, 'batch_size': 21, 'seq_length': 21, 'dim_feedforward': 433, 'dropout': 0.05692190614090614, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:05:45,084] Trial 94 finished with value: 2.648838606010519 and parameters: {'learning_rate': 0.0020817550087667627, 'batch_size': 27, 'seq_length': 3, 'dim_feedforward': 305, 'dropout': 0.06927990972861142, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda


[I 2025-09-29 23:05:58,085] Trial 93 finished with value: 2.6444510757084023 and parameters: {'learning_rate': 0.002058434841282146, 'batch_size': 27, 'seq_length': 3, 'dim_feedforward': 273, 'dropout': 0.08607562437982567, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:06:18,350] Trial 90 finished with value: 2.689251991691036 and parameters: {'learning_rate': 0.0019119276850285906, 'batch_size': 26, 'seq_length': 3, 'dim_feedforward': 312, 'dropout': 0.08589438568916657, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda


[I 2025-09-29 23:06:33,297] Trial 95 finished with value: 2.630713388303525 and parameters: {'learning_rate': 0.0013490175067802487, 'batch_size': 31, 'seq_length': 3, 'dim_feedforward': 272, 'dropout': 0.07673752210570944, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:07:13,698] Trial 96 finished with value: 2.6945142772750867 and parameters: {'learning_rate': 0.0013673944740640455, 'batch_size': 33, 'seq_length': 4, 'dim_feedforward': 371, 'dropout': 0.07675836452533107, 'num_layers': 2}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:07:44,214] Trial 98 finished with value: 2.7206290711785326 and parameters: {'learning_rate': 0.0036310977766885735, 'batch_size': 37, 'seq_length': 7, 'dim_feedforward': 354, 'dropout': 0.07280239026201804, 'num_layers': 2}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:07:49,703] Trial 99 finished with value: 2.676230592748286 and parameters: {'learning_rate': 0.003728346645037659, 'batch_size': 62, 'seq_length': 5, 'dim_feedforward': 352, 'dropout': 0.06764805604326385, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.
[I 2025-09-29 23:08:01,315] Trial 91 finished with value: 2.679392281116605 and parameters: {'learning_rate': 0.0019186029630040586, 'batch_size': 27, 'seq_length': 3, 'dim_feedforward': 317, 'dropout': 0.0861030715366971, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.
[I 2025-09-29 23:08:03,230] Trial 69 finished with value: 8.01925787357725 and parameters: {'learning_rate': 0.007952676958545969, 'batch_size': 35, 'seq_length': 8, 'dim_feedforward': 264, 'dropout': 0.06582817151725441, 'num_layers': 5}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda
Using device: cuda


[I 2025-09-29 23:08:18,757] Trial 83 finished with value: 2.6144806061202392 and parameters: {'learning_rate': 0.005852877131491771, 'batch_size': 27, 'seq_length': 6, 'dim_feedforward': 354, 'dropout': 0.012881306934650602, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.
[I 2025-09-29 23:08:19,188] Trial 97 finished with value: 2.7790183989496944 and parameters: {'learning_rate': 0.0014068918353662768, 'batch_size': 33, 'seq_length': 5, 'dim_feedforward': 350, 'dropout': 0.07496198782066822, 'num_layers': 2}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda


[I 2025-09-29 23:08:27,640] Trial 38 finished with value: 2.7573125749945087 and parameters: {'learning_rate': 0.0007786283661312173, 'batch_size': 34, 'seq_length': 8, 'dim_feedforward': 197, 'dropout': 0.051922685707599114, 'num_layers': 2}. Best is trial 79 with value: 2.5454720927583168.
[I 2025-09-29 23:08:31,014] Trial 25 finished with value: 2.7090335582379206 and parameters: {'learning_rate': 0.001530145521934141, 'batch_size': 64, 'seq_length': 3, 'dim_feedforward': 132, 'dropout': 0.09870326257062419, 'num_layers': 3}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda
Using device: cuda


[I 2025-09-29 23:08:42,670] Trial 92 finished with value: 2.5666134264125957 and parameters: {'learning_rate': 0.0013980419158554714, 'batch_size': 33, 'seq_length': 3, 'dim_feedforward': 315, 'dropout': 0.06848087706976444, 'num_layers': 1}. Best is trial 79 with value: 2.5454720927583168.


Using device: cuda


[I 2025-09-29 23:08:59,760] Trial 35 finished with value: 2.7801835547436986 and parameters: {'learning_rate': 0.0008288445837630881, 'batch_size': 31, 'seq_length': 5, 'dim_feedforward': 402, 'dropout': 0.052006878352606846, 'num_layers': 2}. Best is trial 79 with value: 2.5454720927583168.


{'learning_rate': 0.0066779511652954255,
 'batch_size': 28,
 'seq_length': 7,
 'dim_feedforward': 355,
 'dropout': 0.07519874608589755,
 'num_layers': 1}

In [None]:
def objective4(trial, n_runs=5):
    """Optuna objective: median score over repeated trainings of one configuration.

    Samples learning rate, batch size, sequence length, feed-forward width,
    dropout and number of encoder layers from ``trial``, trains the model
    ``n_runs`` times with that configuration through ``train_model`` and
    returns the median of the returned scores.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        n_runs: Number of trainings per trial. Defaults to 5, the value that
            was previously hard-coded. Must be at least 1.

    Returns:
        The middle element of the sorted per-run scores. For an even
        ``n_runs`` this is the upper of the two middle values.

    Raises:
        ValueError: If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    # d_model and nhead are pinned to the best values from the first Optuna
    # search so this study gives clearer insight into the rest (no inf).
    d_model = 64
    nhead = 4

    params = {
        'learning_rate': trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        'batch_size': trial.suggest_int("batch_size", 8, 64),
        'seq_length': trial.suggest_int("seq_length", 3, 21),
        'transformer_encoder_layer_params': {
            'd_model': d_model,
            'nhead': nhead,
            'dim_feedforward': trial.suggest_int("dim_feedforward", 128, 512),
            'dropout': trial.suggest_float("dropout", 0.0, 0.1),
            'activation': 'relu'
        },
        'transformer_layer_params': {
            'num_layers': trial.suggest_int("num_layers", 1, 5),
        },
        'dataset': '2016-2019',
        'train_test_split': 'prior:2019',
        'features': ['sum_30_min_demand'] + feature_names,
        'visualise': False,
    }

    # Author's measurement: sd of 0.08 over 100 runs of one configuration, so
    # each trial is repeated (5 runs by default, expected spread ~0.04).
    # The median, not the mean, is reported to limit the impact of outliers.
    # NOTE(review): train_model presumably returns the test MAPE - confirm.
    run_scores = sorted(train_model(params) for _ in range(n_runs))
    return run_scores[len(run_scores) // 2]

# Open the SQLite-backed study (resumed when it already exists because of
# load_if_exists=True), then run 100 trials of objective4 with 5 parallel jobs.
study = optuna.create_study(
    study_name="2016_2019+sunlight_precipitation+prior:2019+5runs",
    storage="sqlite:///db.sqlite3",
    load_if_exists=True,
)
study.optimize(objective4, n_jobs=5, n_trials=100)

# Final expression of the cell: show the best hyperparameters recorded so far.
study.best_params

[I 2025-09-30 11:00:43,264] Using an existing study with name '2016_2019+sunlight_precipitation+prior:2019+5runs' instead of creating a new one.
[I 2025-09-30 11:03:14,068] Trial 5 finished with value: 3.3521779817646973 and parameters: {'learning_rate': 0.0003144995895281153, 'batch_size': 33, 'seq_length': 18, 'dim_feedforward': 447, 'dropout': 0.09972828946414458, 'num_layers': 5}. Best is trial 5 with value: 3.3521779817646973.
[I 2025-09-30 11:03:56,115] Trial 6 finished with value: 3.215110882080167 and parameters: {'learning_rate': 0.0005016988608835774, 'batch_size': 26, 'seq_length': 9, 'dim_feedforward': 316, 'dropout': 0.03986945130763411, 'num_layers': 3}. Best is trial 6 with value: 3.215110882080167.
[I 2025-09-30 11:05:08,578] Trial 10 finished with value: 3.243127719450048 and parameters: {'learning_rate': 0.00405119218860763, 'batch_size': 31, 'seq_length': 10, 'dim_feedforward': 253, 'dropout': 0.02121794642786904, 'num_layers': 2}. Best is trial 6 with value: 3.21511

{'learning_rate': 0.005749102129708673,
 'batch_size': 43,
 'seq_length': 5,
 'dim_feedforward': 379,
 'dropout': 0.06878088229261956,
 'num_layers': 1}

In [None]:
def objective5(trial):
    """Optuna objective for the demand-only baseline.

    The only input feature is ``'sum_30_min_demand'``. Hyperparameters are
    sampled from ``trial``, the configuration is handed to ``median_model``
    and the ``test_mape`` entry of its ``results`` is returned.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ``median_model(params)['results']['test_mape']``.
    """
    # Sampling order is kept fixed: learning rate, batch size, sequence
    # length, feed-forward width, dropout, number of layers.
    lr = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    batch = trial.suggest_int("batch_size", 8, 64)
    window = trial.suggest_int("seq_length", 3, 21)
    ff_width = trial.suggest_int("dim_feedforward", 128, 512)
    drop_rate = trial.suggest_float("dropout", 0.0, 0.1)
    depth = trial.suggest_int("num_layers", 1, 5)

    # d_model / nhead stay at the best values from the first Optuna search
    # (64 and 4) so the remaining parameters can be studied in isolation.
    encoder_layer_cfg = {
        'd_model': 64,
        'nhead': 4,
        'dim_feedforward': ff_width,
        'dropout': drop_rate,
        'activation': 'relu'
    }
    encoder_stack_cfg = {'num_layers': depth}

    params = {
        'learning_rate': lr,
        'batch_size': batch,
        'seq_length': window,
        'transformer_encoder_layer_params': encoder_layer_cfg,
        'transformer_layer_params': encoder_stack_cfg,
        'dataset': '2016-2019',
        'train_test_split': 'prior:2019',
        'features': ['sum_30_min_demand'],
        'visualise': False,
    }
    # NOTE(review): median_model is not defined in this notebook; presumably
    # it trains several models and returns the median one - confirm.
    return median_model(params)['results']['test_mape']

# Open the "75:25-base" study in the shared SQLite database (resumed when it
# already exists), then run 100 trials of objective5 with 5 parallel jobs.
study = optuna.create_study(
    study_name="75:25-base",
    storage="sqlite:///../../db.sqlite3",
    load_if_exists=True,
)
study.optimize(objective5, n_jobs=5, n_trials=100)

# Final expression of the cell: show the best hyperparameters recorded so far.
study.best_params

[I 2025-10-02 10:04:03,395] Using an existing study with name '75:25-base' instead of creating a new one.
[I 2025-10-02 10:10:38,798] Trial 37 finished with value: 3.9944380702471207 and parameters: {'learning_rate': 0.001580317854197986, 'batch_size': 12, 'seq_length': 14, 'dim_feedforward': 135, 'dropout': 0.021791949587060353, 'num_layers': 2}. Best is trial 21 with value: 3.8813432231978093.
[I 2025-10-02 10:12:59,173] Trial 38 finished with value: 3.8832749267350573 and parameters: {'learning_rate': 0.001473261522735139, 'batch_size': 8, 'seq_length': 14, 'dim_feedforward': 333, 'dropout': 0.024403476197233723, 'num_layers': 2}. Best is trial 21 with value: 3.8813432231978093.
[I 2025-10-02 10:14:34,878] Trial 42 finished with value: 4.1152942029813095 and parameters: {'learning_rate': 0.0015350167517641496, 'batch_size': 58, 'seq_length': 15, 'dim_feedforward': 329, 'dropout': 0.021290549398654494, 'num_layers': 2}. Best is trial 21 with value: 3.8813432231978093.
[I 2025-10-02 1

In [3]:
def objective6(trial):
    """Optuna objective using the reduced ``optimal_feature_names`` feature set.

    Hyperparameters are sampled from ``trial``; the feature list is
    ``'sum_30_min_demand'`` followed by ``optimal_feature_names``. The
    configuration is passed to ``median_model`` together with the argument
    ``5`` and the ``test_mape`` entry of its ``results`` is returned.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ``median_model(params, 5)['results']['test_mape']``.
    """
    # Draw every tunable value up front, in a fixed order.
    sampled_lr = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    sampled_batch = trial.suggest_int("batch_size", 8, 64)
    sampled_seq = trial.suggest_int("seq_length", 3, 21)
    sampled_ff = trial.suggest_int("dim_feedforward", 128, 512)
    sampled_dropout = trial.suggest_float("dropout", 0.0, 0.1)
    sampled_layers = trial.suggest_int("num_layers", 1, 5)

    feature_columns = ['sum_30_min_demand'] + optimal_feature_names

    params = {
        'learning_rate': sampled_lr,
        'batch_size': sampled_batch,
        'seq_length': sampled_seq,
        'transformer_encoder_layer_params': {
            # Fixed to the best d_model / nhead of the first Optuna search
            # so the other parameters give cleaner insight (no inf).
            'd_model': 64,
            'nhead': 4,
            'dim_feedforward': sampled_ff,
            'dropout': sampled_dropout,
            'activation': 'relu'
        },
        'transformer_layer_params': {
            'num_layers': sampled_layers,
        },
        'dataset': '2016-2019',
        'train_test_split': 'prior:2019',
        'features': feature_columns,
        'visualise': False,
    }
    # NOTE(review): the second argument is presumably the number of runs the
    # median is taken over - confirm against median_model's definition.
    outcome = median_model(params, 5)
    return outcome['results']['test_mape']

# Open the "75:25 - optimal features" study in the shared SQLite database
# (resumed when it already exists), then run 100 trials of objective6 with
# 5 parallel jobs.
study = optuna.create_study(
    study_name="75:25 - optimal features",
    storage="sqlite:///../../db.sqlite3",
    load_if_exists=True,
)
study.optimize(objective6, n_jobs=5, n_trials=100)

# Final expression of the cell: show the best hyperparameters recorded so far.
study.best_params

[I 2025-10-02 15:07:40,539] Using an existing study with name '75:25 - optimal features' instead of creating a new one.
[I 2025-10-02 15:10:57,448] Trial 5 finished with value: 3.1857422923969527 and parameters: {'learning_rate': 0.006385769772796186, 'batch_size': 36, 'seq_length': 10, 'dim_feedforward': 232, 'dropout': 0.038716876107915736, 'num_layers': 3}. Best is trial 5 with value: 3.1857422923969527.
[I 2025-10-02 15:11:14,338] Trial 6 finished with value: 3.1320018438806607 and parameters: {'learning_rate': 0.00047500427341821315, 'batch_size': 53, 'seq_length': 17, 'dim_feedforward': 487, 'dropout': 0.016052546670216463, 'num_layers': 1}. Best is trial 6 with value: 3.1320018438806607.
[I 2025-10-02 15:11:47,855] Trial 9 finished with value: 6.836800359977777 and parameters: {'learning_rate': 0.0045952848957766905, 'batch_size': 8, 'seq_length': 19, 'dim_feedforward': 266, 'dropout': 0.09051533028347072, 'num_layers': 3}. Best is trial 6 with value: 3.1320018438806607.
[I 2025

{'learning_rate': 0.00019631858761495606,
 'batch_size': 18,
 'seq_length': 3,
 'dim_feedforward': 182,
 'dropout': 0.07784433954123926,
 'num_layers': 2}

In [5]:
def objective7(trial):
    """Optuna objective using the full ``feature_names`` feature set.

    Hyperparameters are sampled from ``trial``; the feature list is
    ``'sum_30_min_demand'`` followed by ``feature_names``. The configuration
    is passed to ``median_model`` together with the argument ``5`` and the
    ``test_mape`` entry of its ``results`` is returned.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        ``median_model(params, 5)['results']['test_mape']``.
    """
    # Top-level training settings, sampled first.
    params = {
        'learning_rate': trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),
        'batch_size': trial.suggest_int("batch_size", 8, 64),
        'seq_length': trial.suggest_int("seq_length", 3, 21),
    }

    # Encoder-layer settings: model width and head count are constants here,
    # only the feed-forward width and dropout are searched.
    params['transformer_encoder_layer_params'] = {
        'd_model': 64,
        'nhead': 4,
        'dim_feedforward': trial.suggest_int("dim_feedforward", 128, 512),
        'dropout': trial.suggest_float("dropout", 0.0, 0.1),
        'activation': 'relu'
    }
    params['transformer_layer_params'] = {
        'num_layers': trial.suggest_int("num_layers", 1, 5),
    }

    # Fixed data selection and feature list for this study.
    params['dataset'] = '2016-2019'
    params['train_test_split'] = 'prior:2019'
    params['features'] = ['sum_30_min_demand'] + feature_names
    params['visualise'] = False

    # NOTE(review): the second argument is presumably the number of runs the
    # median is taken over - confirm against median_model's definition.
    return median_model(params, 5)['results']['test_mape']

# Open the "75:25 - all features" study in the shared SQLite database
# (created on first use, resumed afterwards), then run 100 trials of
# objective7 with 5 parallel jobs.
study = optuna.create_study(
    study_name="75:25 - all features",
    storage="sqlite:///../../db.sqlite3",
    load_if_exists=True,
)
study.optimize(objective7, n_jobs=5, n_trials=100)

# Final expression of the cell: show the best hyperparameters recorded so far.
study.best_params

[I 2025-10-03 10:23:20,474] A new study created in RDB with name: 75:25 - all features
[I 2025-10-03 10:24:12,525] Trial 1 finished with value: 6.380832659717736 and parameters: {'learning_rate': 0.004511987258548201, 'batch_size': 55, 'seq_length': 19, 'dim_feedforward': 395, 'dropout': 0.08204705393654972, 'num_layers': 5}. Best is trial 1 with value: 6.380832659717736.
[I 2025-10-03 10:24:59,725] Trial 5 finished with value: 2.987462829549501 and parameters: {'learning_rate': 0.0003778456272706621, 'batch_size': 32, 'seq_length': 6, 'dim_feedforward': 430, 'dropout': 0.037373846767619734, 'num_layers': 1}. Best is trial 5 with value: 2.987462829549501.
[I 2025-10-03 10:25:50,738] Trial 0 finished with value: 3.2346515894116226 and parameters: {'learning_rate': 0.0018265646152774678, 'batch_size': 24, 'seq_length': 6, 'dim_feedforward': 160, 'dropout': 0.010635137266631367, 'num_layers': 4}. Best is trial 5 with value: 2.987462829549501.
[I 2025-10-03 10:26:09,506] Trial 3 finished w

{'learning_rate': 0.00023012066138871276,
 'batch_size': 47,
 'seq_length': 14,
 'dim_feedforward': 355,
 'dropout': 0.043975107811000946,
 'num_layers': 1}