## Setup of QUEENS-based models, schedulers, and drivers

In [11]:
# Locations of the external solver inputs, the solver executables and the result folders
from pathlib import Path

## Input file templates handed to the external models
lf_input_file_template = Path("./external_models/lf_input_template.json")
lf_adjoint_input_file_template = Path("./external_models/lf_adjoint_input_template.json")
hf_input_file_template = Path("./external_models/hf_input_template.json")

## Model executables
## NOTE(review): the high-fidelity executable path is identical to the low-fidelity one;
## presumably the fidelity is selected through the input template -- confirm.
lf_model_path = Path("./external_models/darcy")
lf_adjoint_model_path = Path("./external_models/darcy_adjoint")
hf_model_path = Path("./external_models/darcy")

## Output directories: one root folder with one sub-folder per phase
output_dir_path = Path("./output")
output_dir_path_initial_training = output_dir_path.joinpath("initial_training_phase")
output_dir_path_inference = output_dir_path.joinpath("inference_phase")

## Fail early if anything is missing. The entries are checked in the listed order and the
## first missing path raises an AssertionError carrying the message next to it.
required_paths = (
    (lf_input_file_template, "Low-fidelity input file template does not exist."),
    (
        lf_adjoint_input_file_template,
        "Low-fidelity adjoint input file template does not exist.",
    ),
    (hf_input_file_template, "High-fidelity input file template does not exist."),
    (lf_model_path, "Low-fidelity model executable does not exist."),
    (lf_adjoint_model_path, "Low-fidelity adjoint model executable does not exist."),
    (hf_model_path, "High-fidelity model executable does not exist."),
    (output_dir_path, "Output directory does not exist."),
    (
        output_dir_path_initial_training,
        "Output directory for initial training phase does not exist.",
    ),
    (output_dir_path_inference, "Output directory for inference phase does not exist."),
)
for required_path, missing_message in required_paths:
    assert required_path.exists(), missing_message

In [25]:
# Initial training phase of BMFIA
## Import all necessary modules from QUEENS to setup models, drivers and schedulers
from queens.global_settings import GlobalSettings
from queens.data_processors.numpy_file import NumpyFile as NumpyDataProc
from queens.drivers.mpi import Mpi as MpiDriver
from queens.schedulers.local import Local as LocalScheduler
from queens.models.simulation import Simulation as SimulationModel
from queens.parameters import Parameters
from queens.distributions.mean_field_normal import MeanFieldNormal
from bmfia.bmfia_iterator import BmfiaIterator

## Global settings of the initial training phase (experiment name and output folder)
experiment_name = "bmfia_initial_training_phase"
global_settings_initial = GlobalSettings(
    experiment_name=experiment_name,
    output_dir=output_dir_path_initial_training,
)

## Parameter definition shared by both models: a 1000-dimensional mean-field normal
## with mean 0 and variance 1
x_vec = MeanFieldNormal(
    mean=0, variance=1, dimension=1000
)  # TODO: exchange this for actual GMRF distribution
parameters = Parameters(x_vec=x_vec)

## Data processors: each model gets its own processor instance that reads the numpy
## file whose name ends in "_sol.npy" (per the original note, the velocity field is
## additionally stored there)
lf_data_processor, hf_data_processor = (
    NumpyDataProc(
        file_name_identifier="_sol.npy",
        file_options_dict={"delete_field_data": False},
    )
    for _ in range(2)
)

## Drivers: the two fidelities differ only in input template, executable and
## data processor; all remaining options are identical
shared_driver_options = {
    "files_to_copy": None,
    "gradient_data_processor": None,
    "mpi_cmd": "/usr/bin/mpirun --bind-to none",
}
mpi_driver_lf = MpiDriver(
    parameters,
    lf_input_file_template,
    lf_model_path,
    data_processor=lf_data_processor,
    **shared_driver_options,
)
mpi_driver_hf = MpiDriver(
    parameters,
    hf_input_file_template,
    hf_model_path,
    data_processor=hf_data_processor,
    **shared_driver_options,
)


+-----------------------------------------------------------------------------------------------+
|                                        MeanFieldNormal                                        |
|- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
| self      : <queens.distributions.mean_field_normal.MeanFieldNormal object at 0x7fdcc47bbd90> |
| mean      : 0                                                                                 |
| variance  : 1                                                                                 |
| dimension : 1000                                                                              |
+-----------------------------------------------------------------------------------------------+


+-----------------------------------------------------------------------------------------------+
|                                        MeanFieldNormal                                        |
|- - - - - - - - 


+--------------------------------------------------------------------------------------------------------+
|                                               NumpyFile                                                |
|- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - |
| self                          : <queens.data_processors.numpy_file.NumpyFile object at 0x7fdcac75ffd0> |
| file_name_identifier          : '_sol.npy'                                                             |
| file_options_dict             : {'delete_field_data': False}                                           |
| files_to_be_deleted_regex_lst : None                                                                   |
+--------------------------------------------------------------------------------------------------------+


+--------------------------------------------------------------------------------------------------------+
|                                 

In [26]:
# Schedulers, simulation models and the iterator are created inside the global-settings
# context so that everything is closed properly once the block is left.

with global_settings_initial:

    ## Local schedulers, one per fidelity, with identical settings:
    ## up to 6 jobs in parallel, each on a single processor
    ## NOTE(review): both schedulers are given the same experiment name -- confirm
    ## that sharing it between the two fidelities is intended.
    scheduler_options = {
        "num_jobs": 6,
        "num_procs": 1,
        "restart_workers": False,
        "verbose": True,
    }
    local_scheduler_lf = LocalScheduler(experiment_name, **scheduler_options)
    local_scheduler_hf = LocalScheduler(experiment_name, **scheduler_options)

    ## QUEENS simulation models: pair each driver with the scheduler of its fidelity
    lf_model = SimulationModel(driver=mpi_driver_lf, scheduler=local_scheduler_lf)
    hf_model = SimulationModel(driver=mpi_driver_hf, scheduler=local_scheduler_hf)

    ## BMFIA iterator
    ## NOTE(review): constructed without arguments and not yet connected to the models
    ## above -- presumably still to be configured; confirm.
    bmfia_iterator = BmfiaIterator()

                                                                                  
                                                                                  
                                                                                  
                                                                                  
                                        *                                         
                                        *                                         
                                        *                                         
                                        *                                         
                                      * | *                                       
                                      * | *                                       
                                      * | *                                       
                                      * | *                                       
    

2025-08-26 20:21:10,167 - distributed.scheduler - INFO - State start
2025-08-26 20:21:10,171 - distributed.scheduler - INFO -   Scheduler at:     tcp://127.0.0.1:43611
2025-08-26 20:21:10,171 - distributed.scheduler - INFO -   dashboard at:  http://127.0.0.1:8787/status
2025-08-26 20:21:10,172 - distributed.scheduler - INFO - Registering Worker plugin shuffle
2025-08-26 20:21:10,193 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:46677'
2025-08-26 20:21:10,195 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:46023'
2025-08-26 20:21:10,198 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:45585'
2025-08-26 20:21:10,202 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:41415'
2025-08-26 20:21:10,226 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:35033'
2025-08-26 20:21:10,243 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:39219'
2025-08-26 20:21:10,613 - distri

To view the Dask dashboard open this link in your browser: http://127.0.0.1:8787/status
To view the Dask dashboard open this link in your browser: http://127.0.0.1:8787/status
To view the Dask dashboard open this link in your browser: http://127.0.0.1:8787/status
To view the Dask dashboard open this link in your browser: http://127.0.0.1:8787/status

+-------------------------------------------------------------------------------+
|                                     Local                                     |
|- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
| self            : <queens.schedulers.local.Local object at 0x7fdcc44bd0d0>    |
| experiment_name : 'bmfia_initial_training_phase'                              |
| num_jobs        : 6                                                           |
| num_procs       : 1                                                           |
| restart_workers : False                                                

Perhaps you already have a cluster running?
Hosting the HTTP server on port 32897 instead
2025-08-26 20:21:10,807 - distributed.scheduler - INFO - State start
2025-08-26 20:21:10,812 - distributed.scheduler - INFO -   Scheduler at:     tcp://127.0.0.1:45537
2025-08-26 20:21:10,813 - distributed.scheduler - INFO -   dashboard at:  http://127.0.0.1:32897/status
2025-08-26 20:21:10,814 - distributed.scheduler - INFO - Registering Worker plugin shuffle
2025-08-26 20:21:10,837 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:46223'
2025-08-26 20:21:10,838 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:39909'
2025-08-26 20:21:10,842 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:39337'
2025-08-26 20:21:10,846 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:34051'
2025-08-26 20:21:10,851 - distributed.nanny - INFO -         Start Nanny at: 'tcp://127.0.0.1:40101'
2025-08-26 20:21:10,856 - distributed.nann

To view the Dask dashboard open this link in your browser: http://127.0.0.1:32897/status
To view the Dask dashboard open this link in your browser: http://127.0.0.1:32897/status
To view the Dask dashboard open this link in your browser: http://127.0.0.1:32897/status
To view the Dask dashboard open this link in your browser: http://127.0.0.1:32897/status

+-------------------------------------------------------------------------------+
|                                  Simulation                                   |
|- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -|
| self      : <queens.models.simulation.Simulation object at 0x7fdcc41a50d0>    |
| scheduler : <queens.schedulers.local.Local object at 0x7fdcc703c910>          |
| driver    : <queens.drivers.mpi.Mpi object at 0x7fdcc442a0d0>                 |
+-------------------------------------------------------------------------------+


+------------------------------------------------------------------

2025-08-26 20:21:11,397 - distributed.scheduler - INFO - Retire worker addresses (stimulus_id='retire-workers-1756232471.3970103') (0, 1, 2, 3, 4, 5)
2025-08-26 20:21:11,398 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:46677'. Reason: nanny-close
2025-08-26 20:21:11,399 - distributed.nanny - INFO - Nanny asking worker to close. Reason: nanny-close
2025-08-26 20:21:11,400 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:46023'. Reason: nanny-close
2025-08-26 20:21:11,401 - distributed.worker - INFO - Stopping worker at tcp://127.0.0.1:46123. Reason: nanny-close
2025-08-26 20:21:11,401 - distributed.worker - INFO - Removing Worker plugin shuffle
2025-08-26 20:21:11,403 - distributed.nanny - INFO - Nanny asking worker to close. Reason: nanny-close
2025-08-26 20:21:11,403 - distributed.core - INFO - Connection to tcp://127.0.0.1:43611 has been closed.
2025-08-26 20:21:11,404 - distributed.nanny - INFO - Closing Nanny at 'tcp://127.0.0.1:45585'. Reason: nanny