In [None]:
#imports
import optuna
import sys
sys.path.append('/home/harry/personal/uni/project/individual/harry/model')

from model import *

# Model input columns, assembled group by group. The resulting order is:
# season flags, day-of-week flags, weekday/weekend flags, month flags,
# 30-minute demand statistics, temperature statistics, then the two
# *_next_24h columns.
feature_names = [
    # Season indicators
    *(f'is_{season}' for season in ('summer', 'autumn', 'winter', 'spring')),
    # Day-of-week indicators
    *(f'is_{day}' for day in (
        'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
    )),
    'is_weekday', 'is_weekend',
    # Month indicators
    *(f'is_{month}' for month in (
        'jan', 'feb', 'mar', 'apr', 'may', 'jun',
        'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    )),
    # 30-minute demand statistics (min, avg, max — in that order)
    *(f'{stat}_30_min_demand' for stat in ('min', 'avg', 'max')),
    # Temperature statistics (avg, max, min — in that order)
    *(f'{stat}_temp' for stat in ('avg', 'max', 'min')),
    'hd_next_24h',
    'cd_next_24h',
]

## Optuna Function
def objective(trial):
    """Optuna objective: sample one configuration and score it with train_model.

    Args:
        trial: Object exposing ``suggest_float`` / ``suggest_int``; Optuna
            passes one in for every trial run by ``study.optimize``.

    Returns:
        Whatever ``train_model(params)`` returns. ``train_model`` is not
        defined in this notebook -- presumably it is supplied by the star
        import from ``model``; confirm what metric it reports.
    """
    # Draw the searched hyperparameters up front. The suggest_* calls are kept
    # in this order: learning_rate, batch_size, seq_length, dim_feedforward,
    # dropout, num_layers.
    lr = trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True)
    batch = trial.suggest_int("batch_size", 8, 64)
    window = trial.suggest_int("seq_length", 3, 21)
    ff_width = trial.suggest_int("dim_feedforward", 128, 512)
    drop_p = trial.suggest_float("dropout", 0.0, 0.1)
    depth = trial.suggest_int("num_layers", 1, 5)

    # d_model and nhead are not searched: they are pinned to the best values
    # from the first Optuna run to get better insights (no inf).
    encoder_layer_cfg = {
        'd_model': 64,
        'nhead': 4,
        'dim_feedforward': ff_width,
        'dropout': drop_p,
        'activation': 'relu',
    }
    encoder_stack_cfg = {'num_layers': depth}

    # Fixed experiment settings sit alongside the sampled values; the target
    # column 'sum_30_min_demand' is placed ahead of the shared feature list.
    params = {
        'learning_rate': lr,
        'batch_size': batch,
        'seq_length': window,
        'transformer_encoder_layer_params': encoder_layer_cfg,
        'transformer_layer_params': encoder_stack_cfg,
        'dataset': '2010-2019',
        'train_test_split': '80:20',
        'features': ['sum_30_min_demand'] + feature_names,
        'visualise': False,
    }

    return train_model(params)

In [2]:
# Create the study, or reopen it if it already exists: it is persisted in a
# local SQLite file, so every execution of this cell appends another 100 trials
# to the stored history (the captured log runs from trial 3 to trial 102).
# No direction is passed; the log reports the lowest value as "Best".
study = optuna.create_study(
    study_name="2010_2019+all_features",
    storage="sqlite:///db.sqlite3",
    load_if_exists=True,
)
study.optimize(
    objective,
    n_trials=100,
    n_jobs=1,
)

# Last expression of the cell: display the best hyperparameters found so far.
study.best_params

[I 2025-09-29 11:19:38,320] Using an existing study with name '2010_2019+all_features' instead of creating a new one.


Using device: cuda


[I 2025-09-29 11:20:08,000] Trial 3 finished with value: 3.1824056399172322 and parameters: {'learning_rate': 0.004805942134174947, 'batch_size': 47, 'seq_length': 10, 'dim_feedforward': 323, 'dropout': 0.07539051040836629, 'num_layers': 4}. Best is trial 1 with value: 3.0486290549032313.


Using device: cuda


[I 2025-09-29 11:20:32,570] Trial 4 finished with value: 3.001530727837248 and parameters: {'learning_rate': 0.0002878752531644364, 'batch_size': 56, 'seq_length': 6, 'dim_feedforward': 232, 'dropout': 0.07307726163621338, 'num_layers': 5}. Best is trial 4 with value: 3.001530727837248.


Using device: cuda


[I 2025-09-29 11:21:01,922] Trial 5 finished with value: 3.1115477510892338 and parameters: {'learning_rate': 0.0010205521037639552, 'batch_size': 52, 'seq_length': 17, 'dim_feedforward': 307, 'dropout': 0.07380976263213435, 'num_layers': 4}. Best is trial 4 with value: 3.001530727837248.


Using device: cuda


[I 2025-09-29 11:21:19,232] Trial 6 finished with value: 2.9445880118359256 and parameters: {'learning_rate': 0.008659258360609209, 'batch_size': 63, 'seq_length': 12, 'dim_feedforward': 354, 'dropout': 0.008324597856990369, 'num_layers': 1}. Best is trial 6 with value: 2.9445880118359256.


Using device: cuda


[I 2025-09-29 11:22:37,695] Trial 7 finished with value: 3.017898836093181 and parameters: {'learning_rate': 0.00016582936455100577, 'batch_size': 24, 'seq_length': 17, 'dim_feedforward': 129, 'dropout': 0.06291018973677207, 'num_layers': 5}. Best is trial 6 with value: 2.9445880118359256.


Using device: cuda


[I 2025-09-29 11:22:46,166] Trial 8 finished with value: 2.8774886339778973 and parameters: {'learning_rate': 0.006529525930381961, 'batch_size': 60, 'seq_length': 6, 'dim_feedforward': 340, 'dropout': 0.059293098909012944, 'num_layers': 1}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:23:00,083] Trial 9 finished with value: 3.007431944917096 and parameters: {'learning_rate': 0.0019802997032520547, 'batch_size': 64, 'seq_length': 6, 'dim_feedforward': 474, 'dropout': 0.0641210375988857, 'num_layers': 4}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:23:45,084] Trial 10 finished with value: 6.8842939205676394 and parameters: {'learning_rate': 0.008679454614105613, 'batch_size': 10, 'seq_length': 18, 'dim_feedforward': 347, 'dropout': 0.018773179276062812, 'num_layers': 2}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:24:07,876] Trial 11 finished with value: 3.06791597647111 and parameters: {'learning_rate': 0.00995536649029403, 'batch_size': 29, 'seq_length': 6, 'dim_feedforward': 342, 'dropout': 0.09963952730006977, 'num_layers': 1}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:24:21,969] Trial 12 finished with value: 2.9533571632190685 and parameters: {'learning_rate': 0.0033753589127150044, 'batch_size': 44, 'seq_length': 3, 'dim_feedforward': 493, 'dropout': 0.03827436384797589, 'num_layers': 2}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:24:29,012] Trial 13 finished with value: 3.018519208073878 and parameters: {'learning_rate': 0.004768576895398871, 'batch_size': 64, 'seq_length': 12, 'dim_feedforward': 417, 'dropout': 0.005649494253888296, 'num_layers': 1}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:24:35,695] Trial 14 finished with value: 3.1005944320033407 and parameters: {'learning_rate': 0.000346260164262477, 'batch_size': 56, 'seq_length': 14, 'dim_feedforward': 252, 'dropout': 0.041831571019833536, 'num_layers': 1}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:25:11,746] Trial 15 finished with value: 3.0770965732473865 and parameters: {'learning_rate': 0.002790999801268797, 'batch_size': 41, 'seq_length': 21, 'dim_feedforward': 270, 'dropout': 0.05241713812585839, 'num_layers': 3}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:25:18,682] Trial 16 finished with value: 3.008713138865924 and parameters: {'learning_rate': 0.006593043296756514, 'batch_size': 64, 'seq_length': 10, 'dim_feedforward': 411, 'dropout': 0.011168258791351714, 'num_layers': 1}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:25:31,176] Trial 17 finished with value: 2.9608274952296685 and parameters: {'learning_rate': 0.0010520850833653974, 'batch_size': 54, 'seq_length': 3, 'dim_feedforward': 188, 'dropout': 0.09503154618377507, 'num_layers': 2}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:26:30,833] Trial 18 finished with value: 3.128585455029769 and parameters: {'learning_rate': 0.0005777139567383755, 'batch_size': 10, 'seq_length': 9, 'dim_feedforward': 382, 'dropout': 0.023091876809295354, 'num_layers': 3}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:26:42,002] Trial 19 finished with value: 3.007497252628302 and parameters: {'learning_rate': 0.0001011871450308179, 'batch_size': 39, 'seq_length': 14, 'dim_feedforward': 454, 'dropout': 0.00010108673658373868, 'num_layers': 1}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:27:02,258] Trial 20 finished with value: 3.1284469767361776 and parameters: {'learning_rate': 0.0052108690851234065, 'batch_size': 48, 'seq_length': 14, 'dim_feedforward': 296, 'dropout': 0.05023853862440829, 'num_layers': 2}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:28:01,318] Trial 21 finished with value: 3.0710665195338818 and parameters: {'learning_rate': 0.002931915631681341, 'batch_size': 18, 'seq_length': 8, 'dim_feedforward': 362, 'dropout': 0.03833010334433096, 'num_layers': 3}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:28:13,752] Trial 22 finished with value: 2.925389035763739 and parameters: {'learning_rate': 0.0017031437046001676, 'batch_size': 59, 'seq_length': 12, 'dim_feedforward': 438, 'dropout': 0.08741887021665484, 'num_layers': 1}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:28:25,073] Trial 23 finished with value: 2.9311097579180148 and parameters: {'learning_rate': 0.002065116800460377, 'batch_size': 58, 'seq_length': 12, 'dim_feedforward': 444, 'dropout': 0.08625285795401257, 'num_layers': 1}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:28:39,862] Trial 24 finished with value: 2.891259154204279 and parameters: {'learning_rate': 0.0015752896342646192, 'batch_size': 58, 'seq_length': 11, 'dim_feedforward': 446, 'dropout': 0.08757795164826013, 'num_layers': 1}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:29:01,477] Trial 25 finished with value: 2.9328347679203306 and parameters: {'learning_rate': 0.0006832986659095371, 'batch_size': 51, 'seq_length': 10, 'dim_feedforward': 509, 'dropout': 0.08700937934171783, 'num_layers': 2}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:29:10,849] Trial 26 finished with value: 2.8985135850895567 and parameters: {'learning_rate': 0.0012979846060732678, 'batch_size': 59, 'seq_length': 4, 'dim_feedforward': 431, 'dropout': 0.085459610568028, 'num_layers': 1}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:29:21,159] Trial 27 finished with value: 3.0459499113318755 and parameters: {'learning_rate': 0.0006876142649266277, 'batch_size': 59, 'seq_length': 4, 'dim_feedforward': 466, 'dropout': 0.06351701441034711, 'num_layers': 2}. Best is trial 8 with value: 2.8774886339778973.


Using device: cuda


[I 2025-09-29 11:29:30,699] Trial 28 finished with value: 2.849691558465298 and parameters: {'learning_rate': 0.0012560519208899958, 'batch_size': 49, 'seq_length': 5, 'dim_feedforward': 413, 'dropout': 0.08317277065055476, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:29:46,154] Trial 29 finished with value: 2.984908513315785 and parameters: {'learning_rate': 0.000496880143361635, 'batch_size': 48, 'seq_length': 5, 'dim_feedforward': 378, 'dropout': 0.07810783438494691, 'num_layers': 2}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:30:06,413] Trial 30 finished with value: 3.1689874842230368 and parameters: {'learning_rate': 0.0013356092123784212, 'batch_size': 37, 'seq_length': 8, 'dim_feedforward': 408, 'dropout': 0.05658575858551594, 'num_layers': 3}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:30:20,178] Trial 31 finished with value: 3.06078407689288 and parameters: {'learning_rate': 0.002534736594333693, 'batch_size': 44, 'seq_length': 8, 'dim_feedforward': 385, 'dropout': 0.06805419635488501, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:30:41,282] Trial 32 finished with value: 3.0206856519209713 and parameters: {'learning_rate': 0.00038858757072065744, 'batch_size': 33, 'seq_length': 7, 'dim_feedforward': 326, 'dropout': 0.09447112582848904, 'num_layers': 2}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:30:50,811] Trial 33 finished with value: 2.947600089431085 and parameters: {'learning_rate': 0.0012545204679891189, 'batch_size': 59, 'seq_length': 4, 'dim_feedforward': 431, 'dropout': 0.08393629331508705, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:31:02,147] Trial 34 finished with value: 2.9190414944354264 and parameters: {'learning_rate': 0.0015395302722682636, 'batch_size': 51, 'seq_length': 5, 'dim_feedforward': 401, 'dropout': 0.08117768386371173, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:31:14,709] Trial 35 finished with value: 2.923492432338486 and parameters: {'learning_rate': 0.0008579347525724155, 'batch_size': 54, 'seq_length': 5, 'dim_feedforward': 492, 'dropout': 0.0922050611587245, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:31:27,558] Trial 36 finished with value: 3.0105791989937876 and parameters: {'learning_rate': 0.004164848699988268, 'batch_size': 61, 'seq_length': 7, 'dim_feedforward': 428, 'dropout': 0.07077046918284007, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:31:38,516] Trial 37 finished with value: 2.8790254642360416 and parameters: {'learning_rate': 0.0008539495048604844, 'batch_size': 55, 'seq_length': 3, 'dim_feedforward': 471, 'dropout': 0.07696336895399421, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:31:56,078] Trial 38 finished with value: 2.9246429747791907 and parameters: {'learning_rate': 0.0008680093578706935, 'batch_size': 55, 'seq_length': 3, 'dim_feedforward': 478, 'dropout': 0.07659788051147373, 'num_layers': 2}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:32:21,504] Trial 39 finished with value: 8.196122319700738 and parameters: {'learning_rate': 0.0066171853830933125, 'batch_size': 45, 'seq_length': 6, 'dim_feedforward': 458, 'dropout': 0.05527457321891023, 'num_layers': 5}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:32:48,414] Trial 40 finished with value: 3.0821180375549027 and parameters: {'learning_rate': 0.00019431121997706528, 'batch_size': 50, 'seq_length': 9, 'dim_feedforward': 300, 'dropout': 0.06859564166890827, 'num_layers': 4}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:32:59,305] Trial 41 finished with value: 3.014188264893173 and parameters: {'learning_rate': 0.00212843763600576, 'batch_size': 53, 'seq_length': 16, 'dim_feedforward': 365, 'dropout': 0.0758698044919992, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:33:12,498] Trial 42 finished with value: 2.9966876793027444 and parameters: {'learning_rate': 0.00048073881815187484, 'batch_size': 61, 'seq_length': 6, 'dim_feedforward': 506, 'dropout': 0.05874372240054842, 'num_layers': 2}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:33:20,791] Trial 43 finished with value: 2.948018144351028 and parameters: {'learning_rate': 0.0012103155909282033, 'batch_size': 56, 'seq_length': 4, 'dim_feedforward': 479, 'dropout': 0.08172200224161041, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:33:30,492] Trial 44 finished with value: 2.9222367189781324 and parameters: {'learning_rate': 0.0009309240675200567, 'batch_size': 61, 'seq_length': 4, 'dim_feedforward': 449, 'dropout': 0.09973803602161738, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:33:39,515] Trial 45 finished with value: 2.9749410842203257 and parameters: {'learning_rate': 0.0014985722302802667, 'batch_size': 58, 'seq_length': 3, 'dim_feedforward': 426, 'dropout': 0.09140788896061355, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:33:48,870] Trial 46 finished with value: 2.917028286263237 and parameters: {'learning_rate': 0.000758277929467497, 'batch_size': 62, 'seq_length': 5, 'dim_feedforward': 211, 'dropout': 0.045001341176984726, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:33:58,169] Trial 47 finished with value: 2.9280045766749736 and parameters: {'learning_rate': 0.003658450594875465, 'batch_size': 56, 'seq_length': 6, 'dim_feedforward': 331, 'dropout': 0.07241803216381139, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:34:18,284] Trial 48 finished with value: 3.1144142903359024 and parameters: {'learning_rate': 0.00110238603775806, 'batch_size': 47, 'seq_length': 11, 'dim_feedforward': 394, 'dropout': 0.07976655582508993, 'num_layers': 4}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:34:27,912] Trial 49 finished with value: 2.975408979329053 and parameters: {'learning_rate': 0.0023351482103141016, 'batch_size': 52, 'seq_length': 21, 'dim_feedforward': 274, 'dropout': 0.08973393650209359, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:34:41,069] Trial 50 finished with value: 2.996408078674392 and parameters: {'learning_rate': 0.00027755465032268286, 'batch_size': 41, 'seq_length': 19, 'dim_feedforward': 151, 'dropout': 0.060223371060819327, 'num_layers': 2}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:34:48,345] Trial 51 finished with value: 2.9476423658677966 and parameters: {'learning_rate': 0.001833521137903023, 'batch_size': 64, 'seq_length': 3, 'dim_feedforward': 487, 'dropout': 0.09567715524798333, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:34:57,201] Trial 52 finished with value: 3.0373489304843075 and parameters: {'learning_rate': 0.0005758603386455493, 'batch_size': 57, 'seq_length': 7, 'dim_feedforward': 418, 'dropout': 0.0740868857056641, 'num_layers': 2}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:35:04,871] Trial 53 finished with value: 2.921176350661955 and parameters: {'learning_rate': 0.0007322272478450759, 'batch_size': 62, 'seq_length': 5, 'dim_feedforward': 180, 'dropout': 0.04246916348037741, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:35:12,167] Trial 54 finished with value: 2.9003598826616095 and parameters: {'learning_rate': 0.0009895919148590936, 'batch_size': 61, 'seq_length': 4, 'dim_feedforward': 200, 'dropout': 0.033578447109106624, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:35:30,025] Trial 55 finished with value: 3.1395711926267973 and parameters: {'learning_rate': 0.0010821768935788381, 'batch_size': 26, 'seq_length': 4, 'dim_feedforward': 230, 'dropout': 0.014715884134650109, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:35:40,107] Trial 56 finished with value: 2.9201113496718225 and parameters: {'learning_rate': 0.0014207567327061102, 'batch_size': 54, 'seq_length': 3, 'dim_feedforward': 312, 'dropout': 0.02136781028163054, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:35:57,178] Trial 57 finished with value: 2.986222244137959 and parameters: {'learning_rate': 0.0017378607837102641, 'batch_size': 60, 'seq_length': 15, 'dim_feedforward': 464, 'dropout': 0.06607558524578497, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:36:06,196] Trial 58 finished with value: 2.939280347805351 and parameters: {'learning_rate': 0.0009578710128735929, 'batch_size': 50, 'seq_length': 4, 'dim_feedforward': 342, 'dropout': 0.027022107509853663, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:36:30,062] Trial 59 finished with value: 3.1183307911833764 and parameters: {'learning_rate': 0.0060636247250783135, 'batch_size': 57, 'seq_length': 13, 'dim_feedforward': 262, 'dropout': 0.03144114803798361, 'num_layers': 2}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:36:42,467] Trial 60 finished with value: 3.039606965585211 and parameters: {'learning_rate': 0.0030564808363597747, 'batch_size': 64, 'seq_length': 7, 'dim_feedforward': 287, 'dropout': 0.03376811498953752, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:37:11,655] Trial 61 finished with value: 3.0658465210004815 and parameters: {'learning_rate': 0.0005620324705430455, 'batch_size': 49, 'seq_length': 9, 'dim_feedforward': 444, 'dropout': 0.04656270759441882, 'num_layers': 3}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:37:22,456] Trial 62 finished with value: 2.969487530027635 and parameters: {'learning_rate': 0.0004134736368365124, 'batch_size': 52, 'seq_length': 5, 'dim_feedforward': 240, 'dropout': 0.0844772600737247, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:37:31,725] Trial 63 finished with value: 2.982039194435502 and parameters: {'learning_rate': 0.0008034003942809629, 'batch_size': 63, 'seq_length': 5, 'dim_feedforward': 193, 'dropout': 0.04402314094302146, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:37:45,087] Trial 64 finished with value: 3.1462457428068067 and parameters: {'learning_rate': 0.0007185433289554337, 'batch_size': 60, 'seq_length': 6, 'dim_feedforward': 214, 'dropout': 0.03668675365047748, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:37:53,993] Trial 65 finished with value: 2.931002784694771 and parameters: {'learning_rate': 0.0011802314588539044, 'batch_size': 62, 'seq_length': 4, 'dim_feedforward': 170, 'dropout': 0.025862366429374308, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:38:07,197] Trial 66 finished with value: 2.8917619789245586 and parameters: {'learning_rate': 0.0006453392801727208, 'batch_size': 58, 'seq_length': 3, 'dim_feedforward': 209, 'dropout': 0.04828973313287376, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:38:15,694] Trial 67 finished with value: 2.9172968458880386 and parameters: {'learning_rate': 0.0006447726821963531, 'batch_size': 58, 'seq_length': 3, 'dim_feedforward': 128, 'dropout': 0.04862941822610318, 'num_layers': 1}. Best is trial 28 with value: 2.849691558465298.


Using device: cuda


[I 2025-09-29 11:38:29,502] Trial 68 finished with value: 2.839785350800706 and parameters: {'learning_rate': 0.00030947067568322416, 'batch_size': 55, 'seq_length': 3, 'dim_feedforward': 154, 'dropout': 0.052385587797542, 'num_layers': 1}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:38:47,871] Trial 69 finished with value: 2.911546265537545 and parameters: {'learning_rate': 0.00024080696659748091, 'batch_size': 55, 'seq_length': 3, 'dim_feedforward': 152, 'dropout': 0.06244162777300251, 'num_layers': 2}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:39:23,789] Trial 70 finished with value: 2.9773259806435517 and parameters: {'learning_rate': 0.00016195134105504284, 'batch_size': 14, 'seq_length': 6, 'dim_feedforward': 147, 'dropout': 0.08832156502428114, 'num_layers': 1}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:39:31,516] Trial 71 finished with value: 2.921137409825177 and parameters: {'learning_rate': 0.000327569529796459, 'batch_size': 53, 'seq_length': 3, 'dim_feedforward': 404, 'dropout': 0.050733518668954446, 'num_layers': 1}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:39:53,478] Trial 72 finished with value: 3.102312569313855 and parameters: {'learning_rate': 0.00989939955764662, 'batch_size': 46, 'seq_length': 4, 'dim_feedforward': 373, 'dropout': 0.08341239139029827, 'num_layers': 2}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:40:00,875] Trial 73 finished with value: 3.0068384893409754 and parameters: {'learning_rate': 0.0004788155002881716, 'batch_size': 57, 'seq_length': 4, 'dim_feedforward': 198, 'dropout': 0.05336872269737773, 'num_layers': 1}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:40:13,560] Trial 74 finished with value: 2.9128631851869407 and parameters: {'learning_rate': 0.00014656996372597984, 'batch_size': 60, 'seq_length': 5, 'dim_feedforward': 169, 'dropout': 0.09646631856447378, 'num_layers': 1}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:40:19,814] Trial 75 finished with value: 2.841995605837357 and parameters: {'learning_rate': 0.001021380446997293, 'batch_size': 55, 'seq_length': 3, 'dim_feedforward': 473, 'dropout': 0.040088621912942816, 'num_layers': 1}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:40:48,048] Trial 76 finished with value: 2.891216671407315 and parameters: {'learning_rate': 0.001574603563122812, 'batch_size': 55, 'seq_length': 3, 'dim_feedforward': 469, 'dropout': 0.03971479115212132, 'num_layers': 5}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:41:10,886] Trial 77 finished with value: 3.0040523890921467 and parameters: {'learning_rate': 0.001558521123022576, 'batch_size': 55, 'seq_length': 3, 'dim_feedforward': 502, 'dropout': 0.040814914915982096, 'num_layers': 5}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:41:45,973] Trial 78 finished with value: 3.0018005475256313 and parameters: {'learning_rate': 0.00011141549229585354, 'batch_size': 43, 'seq_length': 3, 'dim_feedforward': 471, 'dropout': 0.03885887065732295, 'num_layers': 5}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:42:02,291] Trial 79 finished with value: 3.0538469741023153 and parameters: {'learning_rate': 0.002100245297261749, 'batch_size': 53, 'seq_length': 3, 'dim_feedforward': 458, 'dropout': 0.048300531633112403, 'num_layers': 4}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:42:14,951] Trial 80 finished with value: 3.2519920898803956 and parameters: {'learning_rate': 0.002399126591512168, 'batch_size': 51, 'seq_length': 11, 'dim_feedforward': 438, 'dropout': 0.054714190790970216, 'num_layers': 3}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:42:40,884] Trial 81 finished with value: 3.0309267460096905 and parameters: {'learning_rate': 0.0008684656550819713, 'batch_size': 48, 'seq_length': 5, 'dim_feedforward': 487, 'dropout': 0.057771103240434836, 'num_layers': 4}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:42:46,416] Trial 82 finished with value: 2.9885383064124813 and parameters: {'learning_rate': 0.0012718094104567834, 'batch_size': 56, 'seq_length': 4, 'dim_feedforward': 481, 'dropout': 0.03561419522112579, 'num_layers': 1}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:42:54,317] Trial 83 finished with value: 2.9231037026573072 and parameters: {'learning_rate': 0.0013969296212229334, 'batch_size': 59, 'seq_length': 3, 'dim_feedforward': 452, 'dropout': 0.09207068028738985, 'num_layers': 1}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:43:01,685] Trial 84 finished with value: 2.931437898292433 and parameters: {'learning_rate': 0.0018979130771246211, 'batch_size': 57, 'seq_length': 4, 'dim_feedforward': 415, 'dropout': 0.07816252004061251, 'num_layers': 1}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:43:08,961] Trial 85 finished with value: 2.9077715739287022 and parameters: {'learning_rate': 0.0011171935606107246, 'batch_size': 54, 'seq_length': 5, 'dim_feedforward': 497, 'dropout': 0.04040949212974862, 'num_layers': 1}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:43:20,945] Trial 86 finished with value: 2.8625147764803565 and parameters: {'learning_rate': 0.0006274059230010401, 'batch_size': 34, 'seq_length': 3, 'dim_feedforward': 436, 'dropout': 0.08540288394322232, 'num_layers': 1}. Best is trial 68 with value: 2.839785350800706.


Using device: cuda


[I 2025-09-29 11:43:35,776] Trial 87 finished with value: 2.791110011420222 and parameters: {'learning_rate': 0.000419166373275899, 'batch_size': 32, 'seq_length': 3, 'dim_feedforward': 467, 'dropout': 0.04715374171568079, 'num_layers': 1}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:43:48,848] Trial 88 finished with value: 2.8388758432988164 and parameters: {'learning_rate': 0.00036756227971282575, 'batch_size': 33, 'seq_length': 3, 'dim_feedforward': 466, 'dropout': 0.051292471879914155, 'num_layers': 1}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:44:00,527] Trial 89 finished with value: 2.9165008951627622 and parameters: {'learning_rate': 0.00042588876523181555, 'batch_size': 33, 'seq_length': 3, 'dim_feedforward': 470, 'dropout': 0.05062796322010633, 'num_layers': 1}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:44:41,742] Trial 90 finished with value: 2.982235902622501 and parameters: {'learning_rate': 0.00034034051111924416, 'batch_size': 30, 'seq_length': 4, 'dim_feedforward': 511, 'dropout': 0.04399922931225471, 'num_layers': 3}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:44:52,431] Trial 91 finished with value: 2.9607093378171236 and parameters: {'learning_rate': 0.00027405563271712505, 'batch_size': 35, 'seq_length': 6, 'dim_feedforward': 461, 'dropout': 0.060621517941560676, 'num_layers': 1}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:45:05,478] Trial 92 finished with value: 2.8472954758178153 and parameters: {'learning_rate': 0.00021752389666480595, 'batch_size': 29, 'seq_length': 3, 'dim_feedforward': 423, 'dropout': 0.05382819002697188, 'num_layers': 1}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:45:20,211] Trial 93 finished with value: 2.863608630521948 and parameters: {'learning_rate': 0.0003909570690441121, 'batch_size': 27, 'seq_length': 3, 'dim_feedforward': 437, 'dropout': 0.052587842745270466, 'num_layers': 1}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:45:37,618] Trial 94 finished with value: 2.95589261948694 and parameters: {'learning_rate': 0.00020556320688499573, 'batch_size': 27, 'seq_length': 4, 'dim_feedforward': 437, 'dropout': 0.05425783758451897, 'num_layers': 1}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:45:48,470] Trial 95 finished with value: 2.876158870426455 and parameters: {'learning_rate': 0.00037366063051466164, 'batch_size': 31, 'seq_length': 3, 'dim_feedforward': 422, 'dropout': 0.05163080605690909, 'num_layers': 1}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:46:01,587] Trial 96 finished with value: 2.8358049364959608 and parameters: {'learning_rate': 0.0003126184291574593, 'batch_size': 22, 'seq_length': 5, 'dim_feedforward': 420, 'dropout': 0.0572438227897383, 'num_layers': 1}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:46:13,964] Trial 97 finished with value: 2.905207353283212 and parameters: {'learning_rate': 0.00031675212730954545, 'batch_size': 23, 'seq_length': 5, 'dim_feedforward': 425, 'dropout': 0.052055125248603465, 'num_layers': 1}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:46:31,997] Trial 98 finished with value: 2.9005231278465127 and parameters: {'learning_rate': 0.000386781592063548, 'batch_size': 30, 'seq_length': 4, 'dim_feedforward': 393, 'dropout': 0.05718557949530892, 'num_layers': 1}. Best is trial 87 with value: 2.791110011420222.


Using device: cuda


[I 2025-09-29 11:46:54,604] Trial 99 finished with value: 2.7369358069832876 and parameters: {'learning_rate': 0.00024261380969184664, 'batch_size': 22, 'seq_length': 3, 'dim_feedforward': 420, 'dropout': 0.05278116820216414, 'num_layers': 1}. Best is trial 99 with value: 2.7369358069832876.


Using device: cuda


[I 2025-09-29 11:47:14,110] Trial 100 finished with value: 2.9697212869864464 and parameters: {'learning_rate': 0.00024043082953422178, 'batch_size': 21, 'seq_length': 4, 'dim_feedforward': 409, 'dropout': 0.04646528099003004, 'num_layers': 1}. Best is trial 99 with value: 2.7369358069832876.


Using device: cuda


[I 2025-09-29 11:47:27,454] Trial 101 finished with value: 2.9500937674742924 and parameters: {'learning_rate': 0.00029515456810600256, 'batch_size': 27, 'seq_length': 3, 'dim_feedforward': 436, 'dropout': 0.05644014426158325, 'num_layers': 1}. Best is trial 99 with value: 2.7369358069832876.


Using device: cuda


[I 2025-09-29 11:47:42,227] Trial 102 finished with value: 2.9225745837947796 and parameters: {'learning_rate': 0.00024820814576021464, 'batch_size': 37, 'seq_length': 5, 'dim_feedforward': 399, 'dropout': 0.06524949770853845, 'num_layers': 1}. Best is trial 99 with value: 2.7369358069832876.


{'learning_rate': 0.00024261380969184664,
 'batch_size': 22,
 'seq_length': 3,
 'dim_feedforward': 420,
 'dropout': 0.05278116820216414,
 'num_layers': 1}