In [None]:
from ray import tune, train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
from ray.tune import CLIReporter
from Helper.ml_models import * 
import json
import ray.cloudpickle as pickle

from ray.tune.search.optuna import OptunaSearch
from optuna.samplers import TPESampler

In [None]:
def make_directory(model):
    """Ensure the per-model output folder ``Hyperparameter/<model>`` exists.

    The path is relative to the current working directory. Calling this for a
    folder that already exists is a no-op.

    Args:
        model: Identifier interpolated into the folder name.
    """
    os.makedirs(f'Hyperparameter/{model}', exist_ok=True)

# Variables

In [None]:
# Model names that can be selected for the hyperparameter search.
all_models = [
    'deeplabv3_resnet50',
    'deeplabv3_resnet101',
    'deeplabv3_mobilenet_v3_large',
    'lraspp_mobilenet_v3_large',
]
# Model names that are not part of `all_models` yet.
not_yet_studied = ['fcn_resnet50', 'fcn_resnet101']

# Locations handed to the dataset wrapper: the first two are its positional
# arguments, the third is passed as `k_fold_csv_dir`.
cityscapes_images_dir = '/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/CityscapesDaten/images'
cityscapes_semantic_dir = '/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/CityscapesDaten/semantic'
cityscapes_csv_dir = '/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/Daten/CityscapesDaten'

k_fold_dataset = K_Fold_Dataset(
    cityscapes_images_dir,
    cityscapes_semantic_dir,
    k_fold_csv_dir=cityscapes_csv_dir,
    leave_out_fold=0,
)

# NOTE(review): presumably verifies that the splits do not share samples;
# confirm against the K_Fold_Dataset implementation.
k_fold_dataset.check_for_data_leaks()

# Model tuned in this run; read as a module-level global by `train_hyper`.
model = all_models[0]

In [None]:
def _load_training_state(hyper_model, state_dir):
    """Restore model and optimizer state from ``state_dir/checkpoint.pkl``.

    Args:
        hyper_model: Object exposing ``model`` and ``optimizer`` attributes
            that both provide ``load_state_dict``.
        state_dir: Directory that contains the ``checkpoint.pkl`` file written
            by ``train_hyper``.

    Returns:
        The epoch index at which training should continue (stored epoch + 1).
    """
    # The pickle is produced by `train_hyper` itself; unpickling executes
    # arbitrary code, so never point this at files from an untrusted source.
    with open(os.path.join(state_dir, 'checkpoint.pkl'), 'rb') as fp:
        saved_state = pickle.load(fp)
    hyper_model.model.load_state_dict(saved_state["model_state"])
    hyper_model.optimizer.load_state_dict(saved_state["optimizer_state"])
    return saved_state["epoch"] + 1  # Resume from the next epoch


def train_hyper(config, checkpoint_dir=None):
    """Ray Tune trainable that trains one sampled hyperparameter configuration.

    Builds a ``TrainedModel`` for the module-level ``model``, restores the
    latest checkpoint of the trial if there is one, and then trains epoch by
    epoch. After every epoch the metrics and a checkpoint directory are
    reported to Ray Tune.

    Args:
        config: Sampled hyperparameters. Reads ``batch_size``,
            ``learning_rate``, ``weight_decay``, ``auto_cast`` and the
            optional ``max_epochs`` (default 100, capped at 100).
        checkpoint_dir: Optional directory containing ``checkpoint.pkl``.
            Kept for backward compatibility. When it is not supplied, the
            checkpoint is looked up through ``ray.train.get_checkpoint()``,
            which matches the ``train.report(..., checkpoint=...)`` call
            used below.
            NOTE(review): newer Ray releases appear to deprecate a
            ``checkpoint_dir`` parameter on function trainables - confirm
            against the installed Ray version.

    Raises:
        RuntimeError: Re-raised unchanged unless its message contains
            "out of memory"; in that case an infinite loss is reported for
            the trial instead.
    """
    try:
        make_directory(model)
        hyper_model = TrainedModel(
            model,
            2048,
            1024,
            weights_name='',
            folder_path=f'Hyperparameter/{model}',
            start_epoch='latest'
        )

        # Restore the previous state if available. The explicit argument wins;
        # otherwise ask Ray for the checkpoint that was last reported for this
        # trial, so that resumed trials do not silently restart at epoch 0.
        # NOTE(review): the state is restored before prepare_model_training(),
        # exactly as before - confirm that call does not re-create the
        # optimizer and thereby discard the restored optimizer state.
        start_epoch = 0
        if checkpoint_dir:
            start_epoch = _load_training_state(hyper_model, checkpoint_dir)
        else:
            trial_checkpoint = get_checkpoint()
            if trial_checkpoint is not None:
                with trial_checkpoint.as_directory() as restored_dir:
                    start_epoch = _load_training_state(hyper_model, restored_dir)

        # Prepare the datasets and dataloaders
        hyper_model.prepare_model_training(
            dataset_train=k_fold_dataset.train_dataset,
            dataset_val=k_fold_dataset.val_dataset,
            dataset_test=k_fold_dataset.test_dataset,
            batch_size=int(config['batch_size']),
            val_batch_size=int(config['batch_size']),
            shuffle=True,
            learning_rate=config['learning_rate'],
            weight_decay=config['weight_decay'],
            num_workers=4,
            pin_memory=True,
            ray_tune=True,
        )

        # Use a dynamic number of epochs up to 100
        max_epochs = min(config.get("max_epochs", 100), 100)
        log_dir = f"HyperparameterLOG/{model}"
        os.makedirs(log_dir, exist_ok=True)

        for epoch in range(start_epoch, max_epochs):
            # Train and evaluate
            epoch_loss, epoch_acc, val_loss, val_acc = hyper_model.train(use_autocast=config['auto_cast'])

            print(f'Epoch: {epoch}, Loss: {epoch_loss}, Train Acc: {epoch_acc}, Val Loss: {val_loss}, Val Acc: {val_acc}')

            # Save the training state into a per-epoch directory
            checkpoint_dir_epoch = os.path.join(log_dir, f"checkpoint_epoch_{epoch}")
            os.makedirs(checkpoint_dir_epoch, exist_ok=True)
            checkpoint_data = {
                "model_state": hyper_model.model.state_dict(),
                "optimizer_state": hyper_model.optimizer.state_dict(),
                "epoch": epoch,
            }
            with open(os.path.join(checkpoint_dir_epoch, 'checkpoint.pkl'), 'wb') as fp:
                pickle.dump(checkpoint_data, fp)

            # Hand the directory to Ray Tune together with the epoch metrics
            epoch_checkpoint = Checkpoint.from_directory(checkpoint_dir_epoch)
            train.report(
                {
                    "loss": epoch_loss,
                    "val_loss": val_loss,
                    "train_acc": epoch_acc,
                    "val_acc": val_acc
                },
                checkpoint=epoch_checkpoint
            )

    except RuntimeError as e:
        # Handle out-of-memory errors: report the worst possible result so the
        # configuration is ranked last instead of crashing the experiment.
        if "out of memory" in str(e):
            train.report({"loss": float('inf'), "val_loss": float('inf'), "train_acc": 0.0, "val_acc": 0.0})
        else:
            # Bare raise keeps the original traceback intact.
            raise


In [None]:
# Search space: one value per key is sampled for every trial.
config = dict(
    learning_rate=tune.loguniform(1e-5, 1e-2),
    batch_size=tune.choice([4, 8, 16]),
    weight_decay=tune.loguniform(1e-6, 1e-2),
    auto_cast=tune.choice([True, False]),
    max_epochs=100,  # Maximum epochs per trial
)

# Scheduler, progress reporter, output location and search algorithm are built
# up front so the `tune.run` call below stays short.
asha_scheduler = ASHAScheduler(
    metric="val_loss",
    mode="min",
    max_t=100,  # Maximum training iterations (epochs)
    grace_period=5,
    reduction_factor=3,
)
cli_reporter = CLIReporter(
    metric_columns=["loss", "val_loss", "train_acc", "val_acc", "training_iteration"]
)
experiment_dir = f"/home/jan/studienarbeit/Studienarbeit-CODE_Semantische_Segmentation/HyperparameterLOG/{model}"
optuna_search = OptunaSearch(
    metric="val_loss",
    mode="min",
    sampler=TPESampler(seed=42),  # fixed seed for the sampler
)

analysis = tune.run(
    train_hyper,
    config=config,
    resources_per_trial={"cpu": 6, "gpu": 1},
    scheduler=asha_scheduler,
    progress_reporter=cli_reporter,
    local_dir=experiment_dir,
    search_alg=optuna_search,
    num_samples=50,
    resume="AUTO",  # Automatically resumes from the last checkpoint
)


# Look the best configuration up once and reuse it. `get_best_config` is the
# method that accepts `metric`/`mode`; `best_config` on the analysis object is
# a property and cannot be called with arguments.
best_config = analysis.get_best_config(metric="val_loss", mode="min")
print("Best hyperparameters found were: ", best_config)

# Save the best configuration to a JSON file
with open('hyper_best_config.json', 'w') as json_file:
    json.dump(best_config, json_file)

print("Best configuration saved to hyper_best_config.json.")

# Save the configuration of every trial that was run
all_trials = analysis.trials
with open('hyper_all_trials.json', 'w') as json_file:
    json.dump([trial.config for trial in all_trials], json_file)

print("All configurations saved to hyper_all_trials.json.")
