# ParallelRunStep with a little touch of HyperDrive

Connect to the AzureML Workspace:

In [13]:
from azureml.core import Workspace

# Number of input files taken from the dataset for training; later cells also
# use it to size the mini-batch, the process count and the HyperDrive clusters
n_files = 2

# Load the AzureML workspace from the local configuration
ws = Workspace.from_config()

# Default datastore of the workspace (the PRS output is written there later on)
default_dstore = ws.get_default_datastore()

Get the file dataset from the cleaned CSV files located in the `/data/hydroqc/clean` folder:

In [2]:
from azureml.core.dataset import Dataset

# NOTE: the 'clean_fds' dataset was created in the hd-pipeline-clean notebook,
# so that notebook has to be run before this one
clean_fds_name = 'clean_fds'
clean_fds = Dataset.get_by_name(workspace=ws, name=clean_fds_name)

Create the Conda environment for the ParallelRunStep:

In [3]:
%%writefile ../pipelines/hyperdrive/src/train-env.yml
# Conda specification of the environment used by the ParallelRunStep.
# NOTE(review): no package version is pinned below, so the resolved
# environment can change between builds - confirm this is intended.
name: train-env
channels:
    - conda-forge
    - defaults
dependencies:
    - python=3.8
    - numpy
    - pandas
    - holidays
    - matplotlib
    - scikit-learn
    - pip
    # Packages installed through pip rather than conda
    - pip:
        - tensorflow
        - azureml-sdk          

Overwriting ../pipelines/hyperdrive/src/train-env.yml


Create an AzureML environment from the Conda YAML file:

In [4]:
# Create an environment
# NOTE: Register the environment into Workspace
from azureml.core import Environment

# `from_conda_specification` is a static factory, so call it on the class
# directly instead of first building a throw-away `Environment(name=...)`
# instance whose only purpose was to reach the method.
train_env = Environment.from_conda_specification(
    name='train-env',
    file_path='../pipelines/hyperdrive/src/train-env.yml')

Create (or reuse, if it already exists) the compute cluster for the ParallelRunStep (PRS):

In [5]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the CPU cluster that runs the ParallelRunStep
# NOTE: The name needs to be 16 characters or fewer
cluster_name = "prs-cluster"

# Reuse the cluster when the workspace already has it, provision it otherwise
try:
    prs_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # The lookup failed: describe the cluster to provision
    provisioning_options = dict(
        vm_size='STANDARD_D15_V2',
        min_nodes=0,
        max_nodes=5,
        vm_priority='dedicated',
        idle_seconds_before_scaledown=2400,
        admin_username=None,
        admin_user_password=None,
        admin_user_ssh_key=None,
        vnet_resourcegroup_name=None,
        vnet_name=None,
        subnet_name=None,
        description='PRS Cluster',
        remote_login_port_public_access='NotSpecified',
        identity_type=None,
        identity_id=None,
    )
    compute_config = AmlCompute.provisioning_configuration(**provisioning_options)

    prs_cluster = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config)

    # Block until provisioning is done, without streaming the provisioning logs
    prs_cluster.wait_for_completion(show_output=False)
else:
    print('Found existing "{}" cluster. Use it.'.format(cluster_name))

# NOTE: Uncomment the next line to delete the compute cluster
# ws.compute_targets['prs-cluster'].delete()

Found existing "prs-cluster" cluster. Use it.


Specify the details of the ParallelRunStep in the ParallelRunConfig object:

In [6]:
from azureml.pipeline.steps import ParallelRunConfig

# One worker process per input file
# NOTE: Python uses 1 core per process
processes_per_node = n_files

# Number of compute nodes the step runs on
node_count = 1

# Timeout of a single run() invocation, in seconds (100800 s = 28 h)
# NOTE: HyperDrive for the CNN takes approximately 1h
timeout = 100800

parallel_run_config = ParallelRunConfig(
    source_directory='../pipelines/hyperdrive/src',
    entry_script='prs-hyperdrive.py',
    # Passed as a string; here the mini-batch size equals the number of input files
    mini_batch_size=str(n_files),
    run_invocation_timeout=timeout,
    error_threshold=1,
    output_action="append_row",
    environment=train_env,
    process_count_per_node=processes_per_node,
    # Reuse the `prs_cluster` handle from the cluster-creation cell instead of
    # looking a hard-coded name up again through `ws.compute_targets`
    compute_target=prs_cluster,
    node_count=node_count,
    run_max_try=3,
    logging_level='DEBUG',
    # Specify the filename for the PRS output
    append_row_file_name='prs-train.txt')

Create a ParallelRunStep from the ParallelRunConfig object:

In [7]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import ParallelRunStep

# Define the input for the PRS training step: the first `n_files` files of the
# cleaned dataset, exposed to the step under the name 'clean_fds'
input_dataset = clean_fds.take(count=n_files).as_named_input(name='clean_fds')

# Define the output for the PRS training step
# (plain string literals: the former f-prefixes had no placeholders)
prs_output_dir = OutputFileDatasetConfig(
    name='train_prs_output',
    # Write output to default datastore
    destination=(default_dstore, '/data/hydroqc/prs/train/output'),
    source=None)

# Define the PRS step
parallel_run_step = ParallelRunStep(
    name="train_prs",
    parallel_run_config=parallel_run_config,
    # Mount the input files on the compute under /tmp/<input name>/
    inputs=[input_dataset.as_mount(path_on_compute=f'/tmp/{input_dataset.name}/')],
    output=prs_output_dir,
    # Always execute the step rather than reusing the output of a previous run
    allow_reuse=False,
    arguments=None)

Create one compute cluster per input file for the HyperDrive runs (1 file ==> 1 cluster):

In [8]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# One CPU cluster per input file, named hd-cluster-1 ... hd-cluster-<n_files>
# NOTE: Each name needs to be 16 characters or fewer
for file_idx in range(1, n_files + 1):
    cluster_name = f"hd-cluster-{file_idx}"

    # Reuse the cluster when the workspace already has it, provision it otherwise
    try:
        hd_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    except ComputeTargetException:
        # The lookup failed: describe the cluster to provision
        provisioning_options = dict(
            vm_size='STANDARD_D15_V2',
            min_nodes=0,
            max_nodes=5,
            vm_priority='dedicated',
            idle_seconds_before_scaledown=2400,
            admin_username=None,
            admin_user_password=None,
            admin_user_ssh_key=None,
            vnet_resourcegroup_name=None,
            vnet_name=None,
            subnet_name=None,
            description='HyperDrive Cluster',
            remote_login_port_public_access='NotSpecified',
            identity_type=None,
            identity_id=None,
        )
        compute_config = AmlCompute.provisioning_configuration(**provisioning_options)

        hd_cluster = ComputeTarget.create(
            workspace=ws,
            name=cluster_name,
            provisioning_configuration=compute_config)

        # Block until this cluster is provisioned before handling the next one
        hd_cluster.wait_for_completion(show_output=False)
    else:
        print('Found existing "{}" cluster. Use it.'.format(cluster_name))

# NOTE: To delete the compute clusters (they are named hd-cluster-<i>)
# for i in range(1, n_files + 1):
#     ws.compute_targets[f'hd-cluster-{i}'].delete()

Found existing "hd-cluster-1" cluster. Use it.
Found existing "hd-cluster-2" cluster. Use it.


In [9]:
from azureml.core import Experiment
from azureml.widgets import RunDetails
from azureml.pipeline.core import Pipeline

# Experiment under which the pipeline run is tracked
experiment = Experiment(workspace=ws, name='prs-train-hyperdrive-cpu')

# Single-step pipeline that holds the PRS training step
pipeline = Pipeline(workspace=ws, steps=[parallel_run_step])

# Submit the pipeline, recording the number of files as a tag on the run
# NOTE: submit() returns an azureml.pipeline.core.run.PipelineRun
run_tags = {'Files': str(n_files)}
run = experiment.submit(pipeline, tags=run_tags)

# Display the interactive logs of the pipeline run
pipeline_widget = RunDetails(run_instance=run)
pipeline_widget.show()

Created step train_prs [9a0a0941][2b2dac2a-f43d-428a-a988-c10772578bc4], (This step will run and generate new outputs)
Submitted PipelineRun 1faf8f6f-65f7-4209-804a-d385f6bd16aa
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1faf8f6f-65f7-4209-804a-d385f6bd16aa?wsid=/subscriptions/d71e4214-ad22-4df0-8289-acbc0d88408d/resourcegroups/mlops-RG/workspaces/mlops-AML-WS&tid=72f988bf-86f1-41af-91ab-2d7cd011db47


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

# HyperDrive Runs Details

## HyperDrive Run #1

In [11]:
# Direct children of the first 'train_prs' step run
# NOTE(review): assumes the step run and its children already exist when this
# cell executes (the pipeline is submitted without waiting) - confirm
train_step_run = run.find_step_run(name='train_prs')[0]
hd_runs = list(train_step_run.get_children(recursive=False))

# Display the widget of the first child run
first_hd_widget = RunDetails(run_instance=hd_runs[0])
first_hd_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

## HyperDrive Run #2

In [12]:
# Display the widget of the second child run of the 'train_prs' step
second_hd_widget = RunDetails(run_instance=hd_runs[1])
second_hd_widget.show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

For each completed HyperDrive run, we select the best child run with respect to our primary metric (loss) and register its model.

## Register the best model from the HyperDrive Runs

In [None]:
# NOTE(review): this whole cell is commented out, so no model is registered
# when the notebook runs. It presumably has to wait until the HyperDrive runs
# have completed before being enabled - confirm.
# NOTE(review): model_framework_version is hard-coded to '2.6' while
# train-env.yml leaves tensorflow unpinned - confirm that they match.
# from azureml.core.model import Model

# for hd_run in list(run.find_step_run(name='train_prs')[0].get_children(recursive=False)):
#     # Get the best model from HyperDrive run
#     best_run = hd_run.get_best_run_by_primary_metric()

#     # Select the tags from the HyperDrive run
#     model_tag = {k: best_run.get_tags()[k] for k in ['model', 'lclid', 'id']}

#     # Register the best model with tags
#     best_run.register_model(model_name=f'TK-CNN-{model_tag["lclid"]}', model_path='models/', model_framework=Model.Framework.TENSORFLOW, model_framework_version='2.6', tags=model_tag)