In [1]:
from functools import partial
import os
import tempfile
from pathlib import Path
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray import train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
import ray.cloudpickle as pickle
from ray.tune import CLIReporter
from Helper.ml_models import * 

In [2]:
def load_data(image_dir='CityscapesDaten/images', annotation_dir='CityscapesDaten/semantic',
              train_fraction=0.8, seed=None):
    """Build a ``CustomDataSet`` and randomly split it into a training and a test subset.

    Args:
        image_dir: Directory forwarded to ``CustomDataSet`` as ``image_dir``.
        annotation_dir: Directory forwarded to ``CustomDataSet`` as ``annotation_dir``.
        train_fraction: Share of the samples that goes into the training subset.
            Must lie in ``[0, 1]``. The default of 0.8 reproduces the former
            hard-coded 80/20 split.
        seed: Optional integer seed for the split. With ``None`` (default) the
            split draws from torch's global RNG, exactly as before, so it differs
            between runs unless the global RNG is seeded elsewhere.

    Returns:
        Tuple ``(trainset, testset)`` as produced by
        ``torch.utils.data.random_split``.

    Raises:
        ValueError: If ``train_fraction`` is outside ``[0, 1]``.
    """
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f'train_fraction must be within [0, 1], got {train_fraction}')

    full_dataset = CustomDataSet(image_dir=image_dir, annotation_dir=annotation_dir)

    # There is no separate test set on disk, so a share of the full dataset is held out.
    # The test subset takes the remainder so both sizes always add up to len(full_dataset).
    train_size = int(train_fraction * len(full_dataset))
    test_size = len(full_dataset) - train_size

    # Only hand over a dedicated generator when a seed was requested; otherwise
    # random_split keeps using its default generator (unchanged legacy behaviour).
    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)

    trainset, testset = torch.utils.data.random_split(
        full_dataset, [train_size, test_size], **split_kwargs
    )

    return trainset, testset

def make_directory(model):
    """Ensure the output folder ``Hyperparameter/<model>`` exists.

    Missing parent folders are created as well; an already existing folder is
    left untouched. Nothing is returned.
    """
    target = Path(f'Hyperparameter/{model}')
    target.mkdir(parents=True, exist_ok=True)
    


# Variables

In [3]:
# Identifiers of the candidate segmentation models; later cells pick one by index
# and pass it to TrainedModel.
all_models = [
    'deeplabv3_resnet50',
    'deeplabv3_resnet101',
    'deeplabv3_mobilenet_v3_large',
    'fcn_resnet50',
    'fcn_resnet101',
    'lraspp_mobilenet_v3_large',
]
test_epochs = 20

# Image folder and annotation folder are passed positionally.
# NOTE(review): leave_out_fold=0 presumably selects the held-out fold - confirm in Helper.ml_models.
k_fold_dataset = K_Fold_Dataset(
    'CityscapesDaten/images',
    'CityscapesDaten/semantic',
    k_fold_csv_dir='Daten/CityscapesDaten',
    leave_out_fold=0,
)

# Sanity check on the folds before any training starts.
k_fold_dataset.check_for_data_leaks()
        

No data leaks found.


In [4]:
# Smoke test: train one fixed hyperparameter configuration for a single epoch,
# directly in the notebook kernel (no Ray Tune trial involved).
model = all_models[0]
make_directory(model)

# Fixed values for the keys the Tune search space is meant to vary later.
config = {
    'batch_size': 6,
    'lr': 0.001,
    'momentum': 0.9,
    'weight_decay': 0.0005,
}

hyper_model = TrainedModel(
    model,
    2048,
    1024,
    weights_name='',
    folder_path=f'Hyperparameter/{model}',
    start_epoch='latest',
)
hyper_model.prepare_model_training(
    dataset_train=k_fold_dataset.train_dataset,
    dataset_val=k_fold_dataset.val_dataset,
    batch_size=int(config['batch_size']),
    shuffle=True,
    learning_rate=config['lr'],
    momentum=config['momentum'],
    weight_decay=config['weight_decay'],
)

hyper_model.train(1)  # Train for one epoch

# NOTE(review): the recorded run of this cell stopped after training with
# "ValueError: Expected input batch_size (1) to match target batch_size (20)."
# The cause is inside Helper.ml_models (validate or the mIoU helper) - to be fixed there.
val_loss = hyper_model.validate(k_fold_dataset.val_dataset)  # Validate on the validation dataset
# NOTE(review): "calculate_miou_miou" looks like a doubled name - confirm it exists on TrainedModel.
miou = hyper_model.calculate_miou_miou(k_fold_dataset.val_dataset)

# This cell runs in the driver, not inside a Tune trial, so there is no session to
# report to; show the metrics directly instead of calling tune.report().
print(f'val_loss: {val_loss} | miou: {miou}')

Using CUDA GPU
Model loaded: deeplabv3_resnet50 | Device: cuda 
Latest Epoch Save doesnt exist or Epoch Number Save doesnt exist, initialising new Save
Saved Model
Successfully loaded Model
Training Dataset prepared
Validation Dataset prepared
Epoch 1 von 1    |   Loss: 0.9581758890833173
Saved Model


ValueError: Expected input batch_size (1) to match target batch_size (20).

In [None]:
# NOTE(review): this whole cell is commented out; it is a disabled Ray Tune experiment.
# Points to resolve before re-enabling it:
#   * The search space below defines the key "learning_rate", but train_hyper reads
#     config['lr'] - the two names must agree.
#   * train_hyper trains one epoch and reports once, while the scheduler is configured
#     with grace_period=3 and max_t=10; presumably the function has to loop over epochs
#     and report after each one for the scheduler to act - confirm.
#   * tune.report(loss=..., miou=...) is the keyword-style call; check it against the
#     installed Ray version (ray.train.report takes a metrics dict).
# model = all_models[0]


# def train_hyper(config):
    
#     make_directory(model)
#     hyper_model = TrainedModel(model, 2048, 1024, weights_name='', folder_path=f'Hyperparameter/{model}', start_epoch='latest')
#     hyper_model.prepare_model_training(dataset_train=k_fold_dataset.train_dataset,
#                                                 dataset_val=k_fold_dataset.val_dataset,
#                                                 batch_size=int(config['batch_size']), 
#                                                 shuffle=True, 
#                                                 learning_rate=config['lr'], 
#                                                 momentum=config['momentum'],
#                                                 weight_decay=config['weight_decay'],)

    
#     hyper_model.train(1)  # Train for one epoch
#     val_loss = hyper_model.validate(k_fold_dataset.val_dataset)  # Validate on the validation dataset
#     miou = hyper_model.calculate_miou_miou(k_fold_dataset.val_dataset)
#     tune.report(loss=val_loss, miou=miou)
        

# config = {
#     "learning_rate": tune.loguniform(1e-12, 1e-2),
#     "batch_size": tune.choice([2, 3, 4]),
#     "momentum": tune.uniform(0.1, 0.9),
#     "weight_decay": tune.loguniform(1e-6, 1e-1)
# }

# scheduler = ASHAScheduler(
#     metric="miou",
#     mode="max",
#     max_t=10,
#     grace_period=3,
#     reduction_factor=2
# )

# reporter = CLIReporter(metric_columns=["loss", "miou", "training_iteration"])

# analysis = tune.run(train_hyper,
#                     config=config,
#                     resources_per_trial={"gpu": 1},
#                     scheduler=scheduler,
#                     progress_reporter=reporter)

2024-05-04 13:21:46,817	INFO worker.py:1749 -- Started a local Ray instance.
2024-05-04 13:21:47,309	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2024-05-04 13:21:47,310	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2024-05-04 13:21:47 (running for 00:00:00.11)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 6.000: None | Iter 3.000: None
Logical resource usage: 0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2024-05-04_13-21-45_240245_12924/artifacts/2024-05-04_13-21-47/train_hyper_2024-05-04_13-21-47/driver_artifacts
Number of trials: 1/1 (1 PENDING)
+-------------------------+----------+-------+--------------+-----------------+------------+----------------+
| Trial name              | status   | loc   |   batch_size |   learning_rate |   momentum |   weight_decay |
|-------------------------+----------+-------+--------------+-----------------+------------+----------------|
| train_hyper_7eaed_00000 | PENDING  |       |            2 |     1.05195e-06 |   0.192267 |    1.73692e-05 |
+-------------------------+----------+-------+--------------+-----------------+------------+----------------+


[36m(train_hyper pid=13654)[0m Using CU

2024-05-04 13:21:50,716	ERROR tune_controller.py:1331 -- Trial task failed for trial train_hyper_7eaed_00000
Traceback (most recent call last):
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/worker.py", line 863, in get_objects
    raise value
ray.exceptions.ActorDiedError: The actor d

[33m(raylet)[0m A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff5cb41565fe55f3a5fe68ac8f01000000 Worker ID: acacad46c545bd5be07685f172fcfc55b24d52955dab8a2d082b1e6d Node ID: 2498711258b425fdebcab6b10262372b301234b82601d67499adc2bd Worker IP address: 10.7.0.4 Worker port: 32973 Worker PID: 13654 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker exits unexpectedly. Worker exits with an exit code None.


Trial name
train_hyper_7eaed_00000


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
2024-05-04 13:21:50,728	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to '/home/jan/ray_results/train_hyper_2024-05-04_13-21-47' in 0.0040s.


== Status ==
Current time: 2024-05-04 13:21:50 (running for 00:00:03.40)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 6.000: None | Iter 3.000: None
Logical resource usage: 0/12 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2024-05-04_13-21-45_240245_12924/artifacts/2024-05-04_13-21-47/train_hyper_2024-05-04_13-21-47/driver_artifacts
Number of trials: 1/1 (1 ERROR)
+-------------------------+----------+----------------+--------------+-----------------+------------+----------------+
| Trial name              | status   | loc            |   batch_size |   learning_rate |   momentum |   weight_decay |
|-------------------------+----------+----------------+--------------+-----------------+------------+----------------|
| train_hyper_7eaed_00000 | ERROR    | 10.7.0.4:13654 |            2 |     1.05195e-06 |   0.192267 |    1.73692e-05 |
+-------------------------+----------+----------------+--------------+-----------------+------------+----------------+
Nu

TuneError: ('Trials did not complete', [train_hyper_7eaed_00000])