In [1]:
from Helper.ml_models import MapillaryDataLoader, MapillaryTrainedModel

from functools import partial
from tqdm import tqdm
import os
import tempfile
import torch

from Helper.ml_models import * 
import json
from datetime import datetime
import ray.cloudpickle as pickle

from ray import tune, train
from ray.air.config import CheckpointConfig
from ray import tune, train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter
import ray

from ray.tune.search.optuna import OptunaSearch
from optuna.samplers import TPESampler

import ray
# Tear down any Ray instance that is still attached to this kernel (e.g. from a
# previous run of this cell) and start a fresh one, so re-running the notebook
# begins from a clean Ray state.
ray.shutdown()
ray.init() 


2025-03-07 14:53:17,604	INFO worker.py:1841 -- Started a local Ray instance.


0,1
Python version:,3.9.18
Ray version:,2.43.0


In [2]:
# Relative path to a JSON file (presumably the best configurations found so
# far -- it is not read in the cells shown here; TODO confirm its consumer).
best_config_path = "FINAL_DATEN/best_configs_Map_BIG.json"


# Load the Mapillary data.
# ALWAYS (!!!) use absolute paths here: Ray Tune starts from its own base
# directory and would otherwise not find the files.
#
# The dataset root is defined once and may be overridden through the
# MAPILLARY_ROOT environment variable; the default is the original location.
# `abspath` guarantees the absolute-path requirement even for an override.
MAPILLARY_ROOT = os.path.abspath(
    os.environ.get(
        'MAPILLARY_ROOT',
        '/home/iff-ros-station-1-studi/Sbiegay/Mapillary_Vistas',
    )
)

mapillary_loader = MapillaryDataLoader(
    train_images_dir=os.path.join(MAPILLARY_ROOT, 'training', 'images'),
    train_annotations_dir=os.path.join(MAPILLARY_ROOT, 'training', 'v2.0', 'labels_big'),
    val_images_dir=os.path.join(MAPILLARY_ROOT, 'validation', 'images'),
    val_annotations_dir=os.path.join(MAPILLARY_ROOT, 'validation', 'v2.0', 'labels_big'),
)

def make_directory(model):
    """Ensure the output folder ``Hyperparameter/<model>`` exists.

    The path is relative to the current working directory. Missing parent
    folders are created as well, and an already existing folder is not an
    error.

    Args:
        model: Name used as the sub-folder below ``Hyperparameter``.
    """
    os.makedirs(f'Hyperparameter/{model}', exist_ok=True)
    

  original_init(self, **validated_kwargs)


In [3]:
    
    
# Model architectures to run the hyperparameter search for. Each entry is used
# as the `model_name` of MapillaryTrainedModel and as the name of its output
# folder / Ray Tune experiment. Commented-out entries are currently disabled.
modells_to_study = [
    'deeplabv3_resnet50', 
    #'deeplabv3_resnet101', 
    #'fcn_resnet50',
    #'fcn_resnet101'
]

In [4]:

def train_hyper(config, checkpoint_dir=None):
    """Ray Tune function trainable: train one Mapillary segmentation model.

    Builds a ``MapillaryTrainedModel``, prepares it for training on the
    notebook-level ``mapillary_loader`` datasets, resumes from the trial's
    latest Ray checkpoint when one exists, and then trains epoch by epoch.
    After every epoch the metrics and a checkpoint (model state, optimizer
    state, epoch index) are reported to Ray Tune.

    Args:
        config: Trial configuration. Reads ``batch_size``, ``learning_rate``,
            ``weight_decay`` and ``auto_cast``; optionally ``max_epochs``
            (default 100, capped at 100) and ``model_name``. When
            ``model_name`` is absent, the notebook-level variable ``model``
            is used instead.
        checkpoint_dir: Unused. Kept only so existing callers that pass it
            keep working; resuming is driven by ``get_checkpoint()``.

    Raises:
        RuntimeError: Any runtime error whose message does not contain
            "out of memory" is re-raised. Out-of-memory errors are turned
            into a worst-case metric report instead.
    """
    # Prefer an explicit model name from the trial config; fall back to the
    # notebook-level loop variable so the function still works when the
    # search space does not provide one.
    model_name = config.get("model_name")
    if model_name is None:
        model_name = model

    try:
        make_directory(model_name)

        # Use the MapillaryTrainedModel class for this study.
        hyper_model = MapillaryTrainedModel(
            model_name=model_name,
            width=2048,
            height=1024,
            weights_name='',
            folder_path=f'Hyperparameter/{model_name}',
            start_epoch='latest'
        )

        # Number of classes check
        print(f"[INIT] Modell '{model_name}' initialisiert mit {hyper_model.num_classes} Klassen.")

        # Train and validate on the Mapillary datasets.
        hyper_model.prepare_model_training(
            dataset_train=mapillary_loader.train_dataset,
            dataset_val=mapillary_loader.val_dataset,
            batch_size=int(config['batch_size']),
            val_batch_size=int(config['batch_size']),
            shuffle=True,
            learning_rate=config['learning_rate'],
            weight_decay=config['weight_decay'],
            num_workers=4,
            pin_memory=True,
            ray_tune=True,
        )

        # Resume from the trial's latest checkpoint, if Ray Tune has one.
        # The previous gate on `checkpoint_dir` never fired because the
        # function-trainable API only passes `config`; `get_checkpoint()`
        # is the supported way to detect a resume.
        # The restore runs after `prepare_model_training` because the
        # checkpoints below are written from the prepared model/optimizer.
        # NOTE(review): assumes `prepare_model_training` is where the
        # optimizer gets (re)built -- confirm in Helper.ml_models.
        start_epoch = 0
        resume_checkpoint = get_checkpoint()
        if resume_checkpoint is not None:
            with resume_checkpoint.as_directory() as checkpoint_dir_path:
                # The file was written by this function (see below), so
                # unpickling it is loading our own data.
                with open(os.path.join(checkpoint_dir_path, 'checkpoint.pkl'), 'rb') as fp:
                    checkpoint_state = pickle.load(fp)
            hyper_model.model.load_state_dict(checkpoint_state["model_state"])
            hyper_model.optimizer.load_state_dict(checkpoint_state["optimizer_state"])
            start_epoch = checkpoint_state["epoch"] + 1

        max_epochs = min(config.get("max_epochs", 100), 100)

        for epoch in range(start_epoch, max_epochs):
            epoch_loss, epoch_acc, val_loss, val_acc = hyper_model.train(use_autocast=config['auto_cast'])

            print(
                f"Epoch: {epoch}, "
                f"Loss: {epoch_loss:.4f}, Train Acc: {epoch_acc:.2f}%, "
                f"Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%"
            )

            # The report must happen while the temporary directory still
            # exists, hence it stays inside the `with` block.
            with tempfile.TemporaryDirectory() as tmp_dir:
                checkpoint_data = {
                    "model_state": hyper_model.model.state_dict(),
                    "optimizer_state": hyper_model.optimizer.state_dict(),
                    "epoch": epoch,
                }
                with open(os.path.join(tmp_dir, 'checkpoint.pkl'), 'wb') as fp:
                    pickle.dump(checkpoint_data, fp)

                checkpoint_obj = Checkpoint.from_directory(tmp_dir)
                train.report(
                    {
                        "loss": epoch_loss,
                        "val_loss": val_loss,
                        "train_acc": epoch_acc,
                        "val_acc": val_acc,
                        "training_iteration": epoch,
                        "num_classes": hyper_model.num_classes,
                    },
                    checkpoint=checkpoint_obj
                )

        print("Training completed successfully!")

    except RuntimeError as e:
        if "out of memory" in str(e):
            # Report worst-case metrics so an out-of-memory configuration is
            # scored as bad instead of failing the trial with an exception.
            train.report({"loss": float('inf'), "val_loss": float('inf'), "train_acc": 0.0, "val_acc": 0.0})
        else:
            # Bare `raise` keeps the original traceback intact.
            raise




In [5]:

# Build one Ray Tune `Tuner` per model under study.
# NOTE(review): `tuner` is rebound on every iteration, so once the loop has
# finished only the Tuner of the LAST entry in `modells_to_study` is left.
# With more than one active model, the earlier ones are never fitted unless
# `tuner.fit()` is also moved into this loop.
for model in modells_to_study:
    # Restart Ray so every model's study begins on a fresh instance.
    ray.shutdown()
    ray.init()

    # Parameter search space. Only `learning_rate` is actually sampled; the
    # remaining entries are constants handed to every trial unchanged.
    config = {
        "learning_rate": tune.loguniform(0.0001, 0.00011),
        "batch_size": 8,
        "weight_decay": 0,
        "auto_cast": True,
        "max_epochs": 100,
        # Constant entry naming the model this study belongs to, so that a
        # trainable can take it from its config instead of relying on the
        # notebook-level `model` variable.
        "model_name": model,
    }

    # Create an OptunaSearch object (which implements TPE); the fixed seed
    # makes the sampled configurations repeatable.
    search_alg = OptunaSearch(
        sampler=TPESampler(seed=42),
        metric="val_loss",
        mode="min",
    )

    # Wrap the training function so each trial requests 1 CPU and 1 GPU.
    train_hyper_with_resources = tune.with_resources(
        train_hyper,
        resources={"cpu": 1, "gpu": 1}
    )

    tuner = tune.Tuner(
        train_hyper_with_resources,
        param_space=config,
        tune_config=tune.TuneConfig(
            num_samples=2,
            search_alg=search_alg,
            # ASHA stops poorly performing trials early: none before 5
            # reported iterations, none run past 100.
            scheduler=ASHAScheduler(
                max_t=100,
                grace_period=5,
                reduction_factor=3,
            ),
            metric="val_loss",
            mode="min",
        ),
        run_config=train.RunConfig(
            name=f"{model}",
            storage_path="/home/iff-ros-station-1-studi/Sbiegay/HyperparameterLOG",
            # Keep only the 5 checkpoints with the lowest validation loss.
            checkpoint_config=CheckpointConfig(
                num_to_keep=5,
                checkpoint_score_attribute="val_loss",
                checkpoint_score_order="min",
            ),
            progress_reporter=CLIReporter(
                metric_columns=["loss", "val_loss", "train_acc", "val_acc", "training_iteration", "num_classes"],
            ),
        ),
    )


2025-03-07 14:53:20,604	INFO worker.py:1841 -- Started a local Ray instance.


[36m(train_hyper pid=42675)[0m Using CUDA GPU
[36m(train_hyper pid=42675)[0m Error loading model in class Model with deeplabv3_resnet50
[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffffa686f5358d04b5df00c4ee2201000000 Worker ID: 3db0e10e158979c2fb8a5cc4becab3a3043a815b21983b631d1090c2 Node ID: 2e8c48bf7a17811a42cb52aa14728e96556074c6fcdefb6b162d839d Worker IP address: 134.169.206.100 Worker port: 38025 Worker PID: 42675 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code None.


In [6]:

# Run the hyperparameter search configured on `tuner` and keep the returned
# results object for later inspection.
# NOTE(review): `tuner` is whatever the preceding loop bound last, i.e. the
# Tuner for the final entry of `modells_to_study`.
analysis = tuner.fit() 

2025-03-07 14:53:21,166	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949
[I 2025-03-07 14:53:21,179] A new study created in memory with name: optuna


== Status ==
Current time: 2025-03-07 14:53:21 (running for 00:00:00.14)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 45.000: None | Iter 15.000: None | Iter 5.000: None
Logical resource usage: 1.0/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-03-07_14-53-19_896673_40525/artifacts/2025-03-07_14-53-21/deeplabv3_resnet50/driver_artifacts
Number of trials: 1/2 (1 PENDING)
+----------------------+----------+-------+-----------------+
| Trial name           | status   | loc   |   learning_rate |
|----------------------+----------+-------+-----------------|
| train_hyper_2b777fcc | PENDING  |       |     0.000103634 |
+----------------------+----------+-------+-----------------+




2025-03-07 14:53:24,917	ERROR tune_controller.py:1331 -- Trial task failed for trial train_hyper_2b777fcc
Traceback (most recent call last):
  File "/home/iff-ros-station-1-studi/miniconda3/envs/sbiegay/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/iff-ros-station-1-studi/miniconda3/envs/sbiegay/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/iff-ros-station-1-studi/miniconda3/envs/sbiegay/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/iff-ros-station-1-studi/miniconda3/envs/sbiegay/lib/python3.9/site-packages/ray/_private/worker.py", line 2771, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/home/iff-ros-station-1-studi/miniconda3/envs/sbiegay/lib/python3.9/site-packa

== Status ==
Current time: 2025-03-07 14:53:26 (running for 00:00:05.21)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 45.000: None | Iter 15.000: None | Iter 5.000: None
Logical resource usage: 1.0/16 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-03-07_14-53-19_896673_40525/artifacts/2025-03-07_14-53-21/deeplabv3_resnet50/driver_artifacts
Number of trials: 2/2 (1 ERROR, 1 PENDING)
+----------------------+----------+-----------------------+-----------------+
| Trial name           | status   | loc                   |   learning_rate |
|----------------------+----------+-----------------------+-----------------|
| train_hyper_fbcd2689 | PENDING  |                       |     0.000109484 |
| train_hyper_2b777fcc | ERROR    | 134.169.206.100:42675 |     0.000103634 |
+----------------------+----------+-----------------------+-----------------+
Number of errored trials: 1
+----------------------+--------------+---------------------------------------

2025-03-07 14:53:28,444	ERROR tune_controller.py:1331 -- Trial task failed for trial train_hyper_fbcd2689
Traceback (most recent call last):
  File "/home/iff-ros-station-1-studi/miniconda3/envs/sbiegay/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/iff-ros-station-1-studi/miniconda3/envs/sbiegay/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/iff-ros-station-1-studi/miniconda3/envs/sbiegay/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/iff-ros-station-1-studi/miniconda3/envs/sbiegay/lib/python3.9/site-packages/ray/_private/worker.py", line 2771, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/home/iff-ros-station-1-studi/miniconda3/envs/sbiegay/lib/python3.9/site-packa

== Status ==
Current time: 2025-03-07 14:53:28 (running for 00:00:07.13)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 45.000: None | Iter 15.000: None | Iter 5.000: None
Logical resource usage: 0/16 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2025-03-07_14-53-19_896673_40525/artifacts/2025-03-07_14-53-21/deeplabv3_resnet50/driver_artifacts
Number of trials: 2/2 (2 ERROR)
+----------------------+----------+-----------------------+-----------------+
| Trial name           | status   | loc                   |   learning_rate |
|----------------------+----------+-----------------------+-----------------|
| train_hyper_2b777fcc | ERROR    | 134.169.206.100:42675 |     0.000103634 |
| train_hyper_fbcd2689 | ERROR    | 134.169.206.100:42756 |     0.000109484 |
+----------------------+----------+-----------------------+-----------------+
Number of errored trials: 2
+----------------------+--------------+------------------------------------------------------