# Imports

In [None]:
import os
import sys

import pandas as pd
import numpy as np
import torch
from ray.tune import Analysis
import progressbar

from rayTune_common.constants import metric, mode, ins, outs
from rayTune_common.test import test_model
from rayTune_common.utils import config_to_model

# Data loaders


In [None]:
# Load the pre-split training, validation and test CSV files.
# The first column of every file is used as the DataFrame index.
loaded_splits = []
for path in (
    "dataset/training_set.csv",
    "dataset/validation_set.csv",
    "dataset/test_set.csv",
):
    loaded_splits.append(pd.read_csv(path, index_col=0))
train_set, val_set, test_set = loaded_splits

def prepare_data(
        input_cols: list,
        output_cols: list,
        train_batch_size: int,
        data=None
):
    """
    Builds a shuffled torch data loader from the validation set

    Converts the selected columns to float tensors and wraps them in a
    TensorDataset / DataLoader pair.
    :param input_cols: list of strings, names of the input columns
    :param output_cols: list of strings, names of the output columns
    :param train_batch_size: Batch size
    :param data: optional pandas DataFrame to read the columns from;
        defaults to the module-level val_set
    :return: val_loader, a DataLoader yielding (inputs, outputs) batches
    """
    if data is None:
        # Default behaviour: use the validation set loaded at module level
        data = val_set

    # Get input and output arrays and convert them to torch float tensors
    x_val = torch.from_numpy(data[input_cols].values).to(torch.float)
    y_val = torch.from_numpy(data[output_cols].values).to(torch.float)

    # Create the dataset loader
    # Here we specify the batch size and that the dataset should be shuffled
    val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=train_batch_size, shuffle=True)

    return val_loader

# Train on validation data as well

In [None]:
def train(net, config, epochs=100, batch_size=64):
    """
    Continue training a model on the validation set

    Each batch is optimised with Adam on the mean squared error plus an
    explicit L2 penalty: config["l2"] times the sum of squares of all
    model parameters.
    :param net: Model to train; must expose a ``device`` attribute, to which
        the model and every batch are moved
    :param config: Mapping providing the learning rate under "lr" and the
        L2 penalty weight under "l2"
    :param epochs: Number of passes over the validation loader (default 100)
    :param batch_size: Batch size handed to prepare_data (default 64)
    :return: The trained model
    """
    device = net.device
    net = net.to(device)

    # Define loss and optimizer
    criterion = torch.nn.MSELoss(reduction='mean')
    optimizer = torch.optim.Adam(net.parameters(), lr=config["lr"])
    l2_weight = config["l2"]

    # Build the (shuffled) validation data loader that is used for training
    val_loader = prepare_data(
        input_cols=ins,
        output_cols=outs,
        train_batch_size=batch_size
    )

    # Train Network
    for _ in range(epochs):
        # specify that we are in training mode
        net.train()

        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients (from last iteration)
            optimizer.zero_grad()

            # Forward propagation
            outputs = net(inputs)

            # Compute cost function: batch MSE plus weighted L2 penalty
            batch_mse = criterion(outputs, labels)
            reg_loss = sum(param.pow(2).sum() for param in net.parameters())
            cost = batch_mse + l2_weight * reg_loss

            # Backward propagation to compute gradient
            cost.backward()

            # Update parameters using gradient
            optimizer.step()
    return net

# Analyse ray tune logs

In [None]:
# Walk every run directory under path_to_run_results. For each experiment in a
# run, reload every trial from its last checkpoint, train it further on the
# validation set (see train), evaluate it with test_model, and write two CSV
# files into the run directory: the per-trial results and the configs of the
# best trial of each experiment.
path_to_run_results = "/home/knut/Documents/project/UnseededRun_results"
list_runs = [f.path for f in os.scandir(path_to_run_results) if f.is_dir()]
# NOTE(review): maxval is hard-coded; presumably runs * experiments * trials
# (10 * 15 * 100) -- confirm it matches the directory contents.
bar = progressbar.ProgressBar(maxval=10*15*100, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()
prog = 0
for path_to_run in list_runs:
    # One dict per experiment, mapping trial number -> value returned by test_model
    data = []
    path_to_csv = os.path.join(path_to_run, "train_val_results.csv")
    path_to_config_csv = os.path.join(path_to_run, "train_val_config.csv")

    # Experiments are the sub-directories of the run, ordered by the text
    # after the last underscore in their path (string comparison, not numeric)
    list_experiments = [f.path for f in os.scandir(path_to_run) if f.is_dir()]
    list_experiments.sort(key=lambda x: x.split("_")[-1])

    for experient_number, path_to_experiment in enumerate(list_experiments):
        # print(f"Run: {path_to_run} -- Experiment: {experient_number} -- {path_to_experiment}")
        experiment_data = {}
        # Reload the best trial of the experiment (according to metric/mode)
        # from its highest-numbered checkpoint directory, train and evaluate it.
        # NOTE(review): best_trial_mse is not read again anywhere in this cell,
        # so this extra training run looks redundant -- confirm before removing.
        best_trial_analysis = Analysis(path_to_experiment, default_metric=metric, default_mode=mode)
        best_trial_config = best_trial_analysis.get_best_config(metric=metric, mode=mode)
        best_trial_logdir = best_trial_analysis.get_best_logdir(metric=metric, mode=mode)
        list_best_trial_checkpoints = [f.path for f in os.scandir(best_trial_logdir) if f.is_dir()]
        list_best_trial_checkpoints.sort(key=lambda x: int(x.split("_")[-1]))
        best_trial_checkpoint_path = os.path.join(list_best_trial_checkpoints[-1], "checkpoint")
        best_trial_model = config_to_model(config=best_trial_config, checkpoint_path=best_trial_checkpoint_path)
        best_trial_model = train(best_trial_model, best_trial_config)
        best_trial_mse = test_model(model=best_trial_model, batch_size=64)

        # Trials are the sub-directories of the experiment, ordered by the
        # integer in the fifth underscore-separated field of their full path.
        # NOTE(review): the index 4 is applied to the absolute path, so it
        # depends on how many underscores the parent directories contain.
        list_experiment_trials = [f.path for f in os.scandir(path_to_experiment) if f.is_dir()]
        list_experiment_trials.sort(key=lambda x: int(x.split("_")[4]))
        
        for trial_number, path_to_trial in enumerate(list_experiment_trials):
            # Use the checkpoint directory with the highest numeric suffix
            list_trial_checkpoints = [f.path for f in os.scandir(path_to_trial) if f.is_dir()]
            list_trial_checkpoints.sort(key=lambda x: int(x.split("_")[-1]))

            trial_checkpoint_path = os.path.join(list_trial_checkpoints[-1], "checkpoint")

            # The Analysis is rooted at the trial directory itself, so the
            # "best" config requested here is read from this trial's logs
            trial_analysis = Analysis(path_to_trial, default_metric=metric, default_mode=mode)
            trial_config = trial_analysis.get_best_config(metric=metric, mode=mode)

            # Rebuild the model, train it on the validation set and evaluate it
            trial_model = config_to_model(config=trial_config, checkpoint_path=trial_checkpoint_path)
            trial_model = train(trial_model, trial_config)
            trial_mse = test_model(model=trial_model, batch_size=64)
            experiment_data[trial_number] = trial_mse
            
            # Advance the progress bar by one processed trial
            bar.update(prog+1)
            prog += 1
        
        sorted_experiment_data = dict(sorted(experiment_data.items()))
        data.append(sorted_experiment_data)
    
    # Convert into pandas dataframe
    # (one row per experiment, one column per trial number)
    df = pd.DataFrame(data)

    # Find best trial based on test mse
    # Store the config of the best model as csv file along with test mse and trial number
    data = []
    # NOTE(review): column_names is never read in this cell, and value_of_min
    # is only referenced by the commented-out assert below.
    column_names = 1
    # Column label (trial number) and value of the minimum in each row
    col_index_of_min = df.idxmin(axis=1)
    value_of_min = df.min(axis=1)
    for i in range(len(col_index_of_min)):
        # Rebuild the experiment directory name from its index, zero-padded to
        # three digits. The prefix is "rs_" when the two characters before the
        # last character of the run path are "rs", otherwise "xp_".
        # NOTE(review): confirm that the [-3:-1] slice matches the run naming.
        if (path_to_run[-3:-1] == "rs"):
            path = os.path.join(path_to_run, "rs_" + str(i).rjust(3, "0"))
        else:
            path = os.path.join(path_to_run, "xp_" + str(i).rjust(3, "0"))
        # Same trial ordering as above, so the trial number indexes the list
        list_experiment_trials = [f.path for f in os.scandir(path) if f.is_dir()]
        list_experiment_trials.sort(key=lambda x: int(x.split("_")[4]))
        path_to_best_test_mse = list_experiment_trials[col_index_of_min[i]]

        # Reload, train and evaluate the selected trial once more.
        # NOTE(review): this is a fresh training run, so trial_mse need not
        # equal value_of_min[i]; presumably that is why the assert is disabled.
        list_trial_checkpoints = [f.path for f in os.scandir(path_to_best_test_mse) if f.is_dir()]
        list_trial_checkpoints.sort(key=lambda x: int(x.split("_")[-1]))
        trial_checkpoint_path = os.path.join(list_trial_checkpoints[-1], "checkpoint")
        trial_analysis = Analysis(path_to_best_test_mse, default_metric=metric, default_mode=mode)
        trial_config = trial_analysis.get_best_config(metric=metric, mode=mode)
        trial_model = config_to_model(config=trial_config, checkpoint_path=trial_checkpoint_path)
        trial_model = train(trial_model, trial_config)
        trial_mse = test_model(model=trial_model, batch_size=64)
        # assert (trial_mse == value_of_min[i])

        # Attach the result and the trial number to the config before storing it
        trial_config["mse"] = trial_mse
        trial_config["trial number"] = col_index_of_min[i]

        data.append(trial_config)

    config_df = pd.DataFrame(data)

    # Add mean and variance to test results
    # "best" is the row-wise minimum; mean, var and std are all computed per
    # column before any of them is appended as a row, so they only cover the
    # experiment rows
    df["best"] = df.min(axis=1)
    mean = df.mean(axis=0)
    var = df.var(axis=0)
    std = df.std(axis=0)
    df.loc["mean"] = mean
    df.loc["var"] = var
    df.loc["std"] = std

    # Store dataframes as csv
    df.to_csv(path_to_csv)
    config_df.to_csv(path_to_config_csv)

bar.finish()