In [None]:
from functools import partial
import os
import tempfile
import torch
from ray import tune, train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter
from Helper.ml_models import * 
import json
from datetime import datetime
import ray.cloudpickle as pickle

from ray.tune.search.optuna import OptunaSearch
from optuna.samplers import TPESampler

2025-01-10 15:12:19.675049: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def make_directory(model):
    dir_name = f'Hyperparameter/{model}'
    os.makedirs(dir_name, exist_ok=True)

# Variables

In [3]:
all_models = ['deeplabv3_resnet50', 'deeplabv3_resnet101', 'deeplabv3_mobilenet_v3_large', 'lraspp_mobilenet_v3_large']
not_yet_studied = ['fcn_resnet50', 'fcn_resnet101']

k_fold_dataset = K_Fold_Dataset('/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/CityscapesDaten/images',
                         '/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/CityscapesDaten/semantic',
                         k_fold_csv_dir='/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/Daten/CityscapesDaten',
                         leave_out_fold=0,
                         )

k_fold_dataset.check_for_data_leaks()               

No data leaks found.


In [4]:
model = all_models[0]

def train_hyper(config, checkpoint_dir=None):  
    try:
        make_directory(model)
        hyper_model = TrainedModel(model, 2048, 1024, weights_name='', folder_path=f'Hyperparameter/{model}', start_epoch='latest')
        
        # Checkpoint laden, falls vorhanden
        if checkpoint_dir:
            with open(os.path.join(checkpoint_dir, 'checkpoint.pkl'), 'rb') as fp:
                checkpoint = pickle.load(fp)
                hyper_model.model.load_state_dict(checkpoint["model_state"])
                hyper_model.optimizer.load_state_dict(checkpoint["optimizer_state"])
                start_epoch = checkpoint["epoch"]
        else:
            start_epoch = 0
        
        
        hyper_model.prepare_model_training(
            dataset_train=k_fold_dataset.train_dataset,
            dataset_val=k_fold_dataset.val_dataset,
            dataset_test=k_fold_dataset.test_dataset,
            batch_size=int(config['batch_size']), 
            val_batch_size=int(config['batch_size']),
            shuffle=True, 
            learning_rate=config['learning_rate'],
            weight_decay=config['weight_decay'], 
            num_workers=4, 
            pin_memory=True,
            ray_tune=True,
            )

        EPOCHS = 20 
        
        for epoch in range(start_epoch, EPOCHS):
            epoch_loss, epoch_acc, val_loss, _ = hyper_model.train(use_autocast=config['auto_cast']) 
            #miou = hyper_model.calculate_miou(k_fold_dataset.val_dataset)
            # with tune.checkpoint_dir(epoch) as cp_dir:
            #     hyper_model.save_model(file_management=False, save_path=cp_dir)
            # tune.report(loss=epoch_loss, val_loss= val_loss , acc=epoch_acc)
            
            with tempfile.TemporaryDirectory() as checkpoint_dir:
                checkpoint_data = {
                    "model_state": hyper_model.model.state_dict(),
                    "optimizer_state": hyper_model.optimizer.state_dict(),
                    "epoch": epoch,
                }
                with open(os.path.join(checkpoint_dir, 'checkpoint.pkl'), 'wb') as fp:
                    pickle.dump(checkpoint_data, fp)

                checkpoint = Checkpoint.from_directory(checkpoint_dir)
                train.report(
                    {"loss": epoch_loss, "val_loss": val_loss, "acc": epoch_acc},
                    checkpoint=checkpoint
                )
            
    except RuntimeError as e:
        if "out of memory" in str(e):
            tune.report(loss=float('inf'), val_loss= float('inf') , acc=0.0)
        else:
            raise e  
        
config = {
    "learning_rate": tune.loguniform(1e-12, 1e-2),
    'batch_size': tune.choice([2,4,6,8,12,14,16]),
    "weight_decay": tune.loguniform(1e-6, 1e-1), 
    "auto_cast": tune.choice([True, False]),
}

analysis = tune.run(
    train_hyper,
    config=config,
    resources_per_trial={"cpu": 6, "gpu": 1},
    scheduler=ASHAScheduler(
        metric="val_loss",
        mode="min",
        max_t=20,
        grace_period=5,
        reduction_factor=3,
    ),
    progress_reporter=CLIReporter(metric_columns=["loss", "val_loss", "acc", "training_iteration"]),
    local_dir=f"/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/HyperparameterLOG/{model}_{datetime.now().strftime('%Y%m%d-%H%M%S')}",
    search_alg=OptunaSearch(
        metric="val_loss",
        mode="min",
        sampler=TPESampler(seed=42),
    ),
    num_samples=100,
    #checkpoint_config=train.CheckpointConfig(
        #checkpoint_frequency=5,
        #checkpoint_at_end=True,
    #),
    resume=True,
)

print("Best hyperparameters found were: ", analysis.get_best_config(metric="val_loss", mode="min"))

best_config = analysis.best_config(metric="val_loss", mode="min")

# Save the best configuration to a JSON file
with open('hyper_best_config.json', 'w') as json_file:
    json.dump(best_config, json_file)

print("Best configuration saved to best_config.json.")

# Speichere alle getesteten Konfigurationen und Ergebnisse
all_trials = analysis.trials
with open('hyper_all_trials.json', 'w') as json_file:
    json.dump([trial.config for trial in all_trials], json_file)

print("All configurations saved to all_trials.json.")


2025-01-10 15:06:58,884	INFO worker.py:1749 -- Started a local Ray instance.
2025-01-10 15:06:59,361	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2025-01-10 15:06:59,365	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949
[I 2025-01-10 15:06:59,375] A new study created in memory with name: optuna
2025-01-10 15:06:59,394	ERROR tune_controller.py:235 -- Failed to restore the run state.
Traceback (most recent call last):
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/tune/execution/tune_controller.py", line 230, in __init__
    self.resume(resume_config=resume_config)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/tune/execution/tune_controller.py", line 437, in resume
    r

== Status ==
Current time: 2025-01-10 15:06:59 (running for 00:00:00.11)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 15.000: None | Iter 5.000: None
Logical resource usage: 6.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-01-10_15-06-57_086153_23985/artifacts/2025-01-10_15-06-59/train_hyper_2025-01-10_15-06-59/driver_artifacts
Number of trials: 1/100 (1 PENDING)
+----------------------+----------+-------+-------------+--------------+-----------------+----------------+
| Trial name           | status   | loc   | auto_cast   |   batch_size |   learning_rate |   weight_decay |
|----------------------+----------+-------+-------------+--------------+-----------------+----------------|
| train_hyper_6b05bbc8 | PENDING  |       | True        |            2 |     5.56418e-09 |     0.00101292 |
+----------------------+----------+-------+-------------+--------------+-----------------+----------------+




[36m(pid=24740)[0m 2025-01-10 15:07:02.011580: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[36m(pid=24740)[0m To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[36m(train_hyper pid=24740)[0m Using CUDA GPU
[36m(train_hyper pid=24740)[0m Model loaded: deeplabv3_resnet50 | Device: cuda 
[36m(train_hyper pid=24740)[0m Latest Epoch Save doesnt exist or Epoch Number Save doesnt exist, initialising new Save
[36m(train_hyper pid=24740)[0m Saved Model
[36m(train_hyper pid=24740)[0m Successfully loaded Model
[36m(train_hyper pid=24740)[0m Training Dataset prepared
[36m(train_hyper pid=24740)[0m Validation Dataset prepared
[36m(train_hyper pid=24740)[0m Test Dataset prepared
== Status ==
Current time: 2025-01-10 15:07:04 (running for 00:00:05.18)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 15.000: None | Iter 5.000: None
Logical resource usage: 6.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-01-10_15-06-57_086153_23985/artifacts/2025-01-10_15-06-59/train_hyper_2025-01-10_15-06-59/driver_artifacts
Number of trials: 2/100 (1 PENDING, 1 RUNNING)
+----------------------+----------+------------

2025-01-10 15:09:20,050	ERROR tune_controller.py:1331 -- Trial task failed for trial train_hyper_6b05bbc8
Traceback (most recent call last):
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskE

Trial name
train_hyper_6b05bbc8


== Status ==
Current time: 2025-01-10 15:09:20 (running for 00:02:20.95)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 15.000: None | Iter 5.000: None
Logical resource usage: 0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-01-10_15-06-57_086153_23985/artifacts/2025-01-10_15-06-59/train_hyper_2025-01-10_15-06-59/driver_artifacts
Number of trials: 2/100 (1 ERROR, 1 PENDING)
+----------------------+----------+----------------+-------------+--------------+-----------------+----------------+
| Trial name           | status   | loc            | auto_cast   |   batch_size |   learning_rate |   weight_decay |
|----------------------+----------+----------------+-------------+--------------+-----------------+----------------|
| train_hyper_9d18aff9 | PENDING  |                | True        |            2 |     0.00500148  |    2.85855e-05 |
| train_hyper_6b05bbc8 | ERROR    | 10.7.0.4:24740 | True        |            2 |     5.56418e-09 |    0.00101292  

[36m(pid=25106)[0m 2025-01-10 15:09:28.197737: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[36m(pid=25106)[0m To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[36m(train_hyper pid=25106)[0m Using CUDA GPU
[36m(train_hyper pid=25106)[0m Model loaded: deeplabv3_resnet50 | Device: cuda 
[36m(train_hyper pid=25106)[0m Latest Epoch Save doesnt exist or Epoch Number Save doesnt exist, initialising new Save
== Status ==
Current time: 2025-01-10 15:09:30 (running for 00:02:31.06)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 15.000: None | Iter 5.000: None
Logical resource usage: 6.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-01-10_15-06-57_086153_23985/artifacts/2025-01-10_15-06-59/train_hyper_2025-01-10_15-06-59/driver_artifacts
Number of trials: 3/100 (1 ERROR, 1 PENDING, 1 RUNNING)
+----------------------+----------+----------------+-------------+--------------+-----------------+----------------+
| Trial name           | status   | loc            | auto_cast   |   batch_size |   learning_rate |   weight_decay |
|----------------------+----------+----------------+-------------+--------------+--

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2025-01-10 15:09:33,881	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to '/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/HyperparameterLOG/deeplabv3_resnet50_20250110-150657/train_hyper_2025-01-10_15-06-59' in 0.0037s.


== Status ==
Current time: 2025-01-10 15:09:33 (running for 00:02:34.48)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 15.000: None | Iter 5.000: None
Logical resource usage: 6.0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-01-10_15-06-57_086153_23985/artifacts/2025-01-10_15-06-59/train_hyper_2025-01-10_15-06-59/driver_artifacts
Number of trials: 3/100 (1 ERROR, 1 PENDING, 1 RUNNING)
+----------------------+----------+----------------+-------------+--------------+-----------------+----------------+
| Trial name           | status   | loc            | auto_cast   |   batch_size |   learning_rate |   weight_decay |
|----------------------+----------+----------------+-------------+--------------+-----------------+----------------|
| train_hyper_9d18aff9 | RUNNING  | 10.7.0.4:25106 | True        |            2 |     0.00500148  |    2.85855e-05 |
| train_hyper_e4dd098f | PENDING  |                | True        |            6 |     8.34539e-10 | 

2025-01-10 15:09:43,889	ERROR tune.py:1035 -- Trials did not complete: [train_hyper_6b05bbc8]
2025-01-10 15:09:43,890	INFO tune.py:1039 -- Total run time: 164.52 seconds (154.48 seconds for the tuning loop).
Resume experiment with: tune.run(..., resume=True)
- train_hyper_e4dd098f: FileNotFoundError('Could not fetch metrics for train_hyper_e4dd098f: both result.json and progress.csv were not found at /home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/HyperparameterLOG/deeplabv3_resnet50_20250110-150657/train_hyper_2025-01-10_15-06-59/train_hyper_e4dd098f_3_auto_cast=True,batch_size=6,learning_rate=0.0000,weight_decay=0.0011_2025-01-10_15-09-29')


Best hyperparameters found were:  None


ValueError: To fetch the `best_config`, pass a `metric` and `mode` parameter to `tune.run()`. Alternatively, use the `get_best_config(metric, mode)` method to set the metric and mode explicitly.