In [1]:
from functools import partial
import os
import torch
from ray import tune
from ray import train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter
from Helper.ml_models import * 
import json

2024-06-01 14:00:41.434427: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def load_data(image_dir='CityscapesDaten/images', annotation_dir='CityscapesDaten/semantic'):
    trainset = CustomDataSet(image_dir=image_dir, annotation_dir=annotation_dir)

    # If you have a separate set of images and annotations for testing, you can create a testset in a similar way:
    # testset = CustomDataSet(image_dir=test_image_dir, annotation_dir=test_annotation_dir)

    # If you don't have a separate test set, you can split the trainset into a training set and a test set:
    train_size = int(0.8 * len(trainset))
    test_size = len(trainset) - train_size
    trainset, testset = torch.utils.data.random_split(trainset, [train_size, test_size])

    return trainset, testset

def make_directory(model):
    dir_name = f'Hyperparameter/{model}'
    os.makedirs(dir_name, exist_ok=True)
    


# Variables

In [3]:
all_models = ['deeplabv3_resnet50', 'deeplabv3_resnet101', 'deeplabv3_mobilenet_v3_large', 'lraspp_mobilenet_v3_large']
not_yet_studied = ['fcn_resnet50', 'fcn_resnet101']
test_epochs = 60

k_fold_dataset = K_Fold_Dataset('/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/CityscapesDaten/images',
                         '/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/CityscapesDaten/semantic',
                         k_fold_csv_dir='/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/Daten/CityscapesDaten',
                         leave_out_fold=0,
                         )

k_fold_dataset.check_for_data_leaks()               
        

No data leaks found.


In [4]:
# model = all_models[0]
# make_directory(model)
# config = {  
#         'batch_size': 10,
#         'lr' : 0.001,
#         'momentum' : 0.9,
#         'weight_decay' : 0.0005,    
# }

# hyper_model = TrainedModel(model, 2048, 1024, weights_name='', folder_path=f'Hyperparameter/{model}', start_epoch='latest')

# hyper_model.prepare_model_training(dataset_train=k_fold_dataset.train_dataset,
#                                             dataset_val=k_fold_dataset.val_dataset,
#                                             dataset_test=k_fold_dataset.test_dataset,
#                                             batch_size=int(config['batch_size']), 
#                                             shuffle=True, 
#                                             learning_rate=config['lr'], 
#                                             momentum=config['momentum'],
#                                             weight_decay=config['weight_decay'], 
#                                             num_workers=4, 
#                                             pin_memory=True,
#                                             )


# epoch_loss, epoch_acc = hyper_model.train()  # Train for one epoch
# #miou = hyper_model.calculate_miou_miou(k_fold_dataset.val_dataset)
# tune.report(loss=epoch_loss, miou=epoch_acc)

In [5]:
model = all_models[0]


def train_hyper(config):
    try:
        make_directory(model)
        hyper_model = TrainedModel(model, 2048, 1024, weights_name='', folder_path=f'Hyperparameter/{model}', start_epoch='latest')
        hyper_model.prepare_model_training(dataset_train=k_fold_dataset.train_dataset,
                                                dataset_val=k_fold_dataset.val_dataset,
                                                dataset_test=k_fold_dataset.test_dataset,
                                                batch_size=int(config['batch_size']), 
                                                shuffle=True, 
                                                learning_rate=config['learning_rate'],
                                                weight_decay=config['weight_decay'], 
                                                num_workers=4, 
                                                pin_memory=True,
                                                ray_tune=True,
                                                )

        epoch_loss, epoch_acc = hyper_model.train() 
        miou = hyper_model.calculate_miou_miou(k_fold_dataset.val_dataset)
        tune.report(loss=epoch_loss, miou=miou)
    except RuntimeError as e:
        if "out of memory" in str(e):
            tune.report(loss=float('inf'), miou=0)  
        else:
            raise e  
        

config = {
    "learning_rate": tune.loguniform(1e-12, 1e-2),
    'batch_size': tune.choice([2,4,6,8,12,14,16]),
    "weight_decay": tune.loguniform(1e-6, 1e-1)
}

# Define the scheduler and reporter
scheduler = ASHAScheduler(
    metric="loss",
    mode="min",
    max_t=10,
    grace_period=3,
    reduction_factor=2
)

reporter = CLIReporter(metric_columns=["loss", "acc", "training_iteration"])

analysis = tune.run(train_hyper,
                    config=config,
                    resources_per_trial={"gpu": 1},
                    scheduler=scheduler,    
                    progress_reporter=reporter, 
                    )

print("Best hyperparameters found were: ", analysis.best_config)



best_config = analysis.best_config

# Save the best configuration to a JSON file
with open('best_config.json', 'w') as json_file:
    json.dump(best_config, json_file)

print("Best configuration saved to best_config.json.")

2024-06-01 14:00:44,349	INFO worker.py:1749 -- Started a local Ray instance.
2024-06-01 14:00:44,866	INFO tune.py:253 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `tune.run(...)`.
2024-06-01 14:00:44,867	INFO tune.py:614 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


== Status ==
Current time: 2024-06-01 14:00:44 (running for 00:00:00.11)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 6.000: None | Iter 3.000: None
Logical resource usage: 0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2024-06-01_14-00-42_652936_15596/artifacts/2024-06-01_14-00-44/train_hyper_2024-06-01_14-00-44/driver_artifacts
Number of trials: 1/1 (1 PENDING)
+-------------------------+----------+-------+--------------+-----------------+----------------+
| Trial name              | status   | loc   |   batch_size |   learning_rate |   weight_decay |
|-------------------------+----------+-------+--------------+-----------------+----------------|
| train_hyper_938ae_00000 | PENDING  |       |           12 |     1.34659e-06 |     0.00966882 |
+-------------------------+----------+-------+--------------+-----------------+----------------+




[36m(pid=16351)[0m 2024-06-01 14:00:47.516532: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
[36m(pid=16351)[0m To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


[36m(train_hyper pid=16351)[0m Using CUDA GPU
[36m(train_hyper pid=16351)[0m Model loaded: deeplabv3_resnet50 | Device: cuda 
[36m(train_hyper pid=16351)[0m own lrs: 1e-05
[36m(train_hyper pid=16351)[0m Latest Epoch Save doesnt exist or Epoch Number Save doesnt exist, initialising new Save
[36m(train_hyper pid=16351)[0m own lrs: 1e-05
[36m(train_hyper pid=16351)[0m Saved Model
[36m(train_hyper pid=16351)[0m Successfully loaded Model
[36m(train_hyper pid=16351)[0m Training Dataset prepared
[36m(train_hyper pid=16351)[0m Validation Dataset prepared
[36m(train_hyper pid=16351)[0m Test Dataset prepared
== Status ==
Current time: 2024-06-01 14:00:50 (running for 00:00:05.21)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 6.000: None | Iter 3.000: None
Logical resource usage: 0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2024-06-01_14-00-42_652936_15596/artifacts/2024-06-01_14-00-44/train_hyper_2024-06-01_14-00-44/driver_artifacts
N

2024-06-01 14:00:52,334	ERROR tune_controller.py:1331 -- Trial task failed for trial train_hyper_938ae_00000
Traceback (most recent call last):
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/air/execution/_internal/event_manager.py", line 110, in resolve_future
    result = ray.get(future)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper
    return fn(*args, **kwargs)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
    return func(*args, **kwargs)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/worker.py", line 2623, in get
    values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout)
  File "/home/jan/anaconda3/envs/studi/lib/python3.9/site-packages/ray/_private/worker.py", line 861, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTa

Trial name
train_hyper_938ae_00000


2024-06-01 14:00:52,346	INFO tune.py:1007 -- Wrote the latest version of all result files and experiment state to '/home/jan/ray_results/train_hyper_2024-06-01_14-00-44' in 0.0033s.


== Status ==
Current time: 2024-06-01 14:00:52 (running for 00:00:07.46)
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 6.000: None | Iter 3.000: None
Logical resource usage: 0/12 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:G)
Result logdir: /tmp/ray/session_2024-06-01_14-00-42_652936_15596/artifacts/2024-06-01_14-00-44/train_hyper_2024-06-01_14-00-44/driver_artifacts
Number of trials: 1/1 (1 ERROR)
+-------------------------+----------+----------------+--------------+-----------------+----------------+
| Trial name              | status   | loc            |   batch_size |   learning_rate |   weight_decay |
|-------------------------+----------+----------------+--------------+-----------------+----------------|
| train_hyper_938ae_00000 | ERROR    | 10.7.0.4:16351 |           12 |     1.34659e-06 |     0.00966882 |
+-------------------------+----------+----------------+--------------+-----------------+----------------+
Number of errored trials: 1
+-------------------------+----------

TuneError: ('Trials did not complete', [train_hyper_938ae_00000])