# ParallelRunStep with a little touch of HyperDrive

Connect to the AzureML Workspace:

In [10]:
from azureml.core import Workspace

# Number of input files to process: 1 file per model
# (a full-size run used n_files = 50)
n_files = 2

# Load the workspace from its saved configuration file.
# The configuration can be written once with something like:
#     ws = Workspace.get(
#                         name='mlgeek-ws',
#                         subscription_id='<subscription-id>',
#                         resource_group='mlgeek-rg',
#                         location='eastus')
#     ws.write_config(path='../')
ws = Workspace.from_config()

# Default datastore of the workspace
default_dstore = ws.get_default_datastore()

Get the file dataset from the cleaned CSV files located in the `/data/hydroqc/clean` folder:

In [11]:
from azureml.core.dataset import Dataset

# Look up the dataset registered under the name 'clean_fds'
# NOTE: it is created by the hd-pipeline-clean notebook, which must have run first
clean_fds = Dataset.get_by_name(name='clean_fds', workspace=ws)

Create the Conda environment for the HyperDrive runs:

In [12]:
%%writefile ../pipelines/hyperdrive/src/train-gpu-env.yml
# Conda specification of the 'train-gpu-env' environment.
# This cell writes it to ../pipelines/hyperdrive/src/train-gpu-env.yml.
name: train-gpu-env
channels:
    - conda-forge
    - defaults
dependencies:
    - python=3.8
    - numpy
    - pandas
    - holidays
    - matplotlib
    - scikit-learn
    - pip
    # The packages below are installed through pip
    # NOTE(review): neither pip package is version-pinned; consider pinning for reproducibility
    - pip:
        - azureml-sdk
        - tensorflow-gpu

Overwriting ../pipelines/hyperdrive/src/train-gpu-env.yml


Create an AzureML environment from the Conda YAML file:

In [13]:
# Create the environment used by the ParallelRunStep (PRS)
# NOTE: The PRS doesn't need a GPU, so it is built from 'train-env.yml'
# NOTE(review): this notebook only writes 'train-gpu-env.yml'; 'train-env.yml' is
#               presumably created elsewhere — confirm the file exists.
from azureml.core import Environment

# from_conda_specification is a factory: call it on the class directly instead of
# on a throwaway Environment instance.
train_env = Environment.from_conda_specification(
    name='train-env',
    file_path='../pipelines/hyperdrive/src/train-env.yml')

Create the compute instance that runs the ParallelRunStep (PRS):

In [14]:
import os
from azureml.core.compute import ComputeInstance
from azureml.core.compute_target import ComputeTargetException

# Name of the PRS compute instance (ci)
# NOTE: The name should be unique across the Azure region
#       and 16 characters or less
ci_name = 'prs-ci'

# Settings used only when the instance has to be provisioned
prs_ci_settings = dict(
    vm_size='STANDARD_F64S_V2',
    # ssh_public_access=True,
    ssh_public_access=False,
    vnet_resourcegroup_name=None,
    vnet_name=None,
    subnet_name=None,
    description='PRS Compute Instance',
    assigned_user_object_id=None,
    assigned_user_tenant_id=None)

try:
    # Reuse the instance when it already exists in the workspace
    compute_instance = ComputeInstance(workspace=ws, name=ci_name)
except ComputeTargetException:
    # Lookup failed: provision a new instance and wait for it to complete
    compute_config = ComputeInstance.provisioning_configuration(**prs_ci_settings)

    compute_instance = ComputeInstance.create(
        workspace=ws,
        name=ci_name,
        provisioning_configuration=compute_config)

    compute_instance.wait_for_completion(show_output=True)
else:
    print('Found existing "{}" compute instance. Use it.'.format(ci_name))

Found existing "prs-ci" compute instance. Use it.


Specify the details of the ParallelRunStep in the ParallelRunConfig object:

In [15]:
from azureml.pipeline.steps import ParallelRunConfig

# NOTE: Python uses 1 core per process, so one process is started per input file
processes_per_node = n_files

# Number of nodes used in the compute target
# NOTE: For a compute instance, node_count = 1
node_count = 1

# Timeout passed as run_invocation_timeout: 28 h expressed in seconds
# NOTE: HyperDrive for the CNN takes approximately 1h
timeout = 28 * 60 * 60

parallel_run_config = ParallelRunConfig(
    # What to run
    source_directory='../pipelines/hyperdrive/src',
    entry_script='prs-hyperdrive-gpu.py',
    environment=train_env,
    # Where to run it
    compute_target=ws.compute_targets['prs-ci'],
    node_count=node_count,
    process_count_per_node=processes_per_node,
    # How the input is batched
    # NOTE(review): with mini_batch_size == n_files, all input files presumably land
    #               in a single mini-batch — confirm this is intended
    mini_batch_size=str(n_files),
    # Failure handling
    run_invocation_timeout=timeout,
    run_max_try=3,
    error_threshold=1,
    logging_level='DEBUG',
    # Output handling: rows are appended to a single file with the name below
    output_action="append_row",
    append_row_file_name='prs-train.txt')

Create a ParallelRunStep from the ParallelRunConfig object:

In [16]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import ParallelRunStep

# Define the input for the PRS training step: n_files files taken from clean_fds
input_dataset = clean_fds.take(count=n_files).as_named_input(name='clean_fds')

# Define the output for the PRS training step
prs_output_dir = OutputFileDatasetConfig(
    name='train_prs_output',
    # Write PRS output to default datastore
    destination=(default_dstore, '/data/hydroqc/prs/train/output'),
    source=None)

# Define the PRS step
parallel_run_step = ParallelRunStep(
    name='train_prs',
    parallel_run_config=parallel_run_config,
    # Mount the input on the compute under /tmp/<input name>/
    inputs=[input_dataset.as_mount(path_on_compute=f'/tmp/{input_dataset.name}/')],
    output=prs_output_dir,
    # Never reuse the output of a previous run of this step
    allow_reuse=False,
    arguments=None)

Create one compute instance per HyperDrive run (1 file ==> 1 compute instance):

In [17]:
# Settings used only when a HyperDrive instance has to be provisioned
hd_ci_settings = dict(
    # vm_size='STANDARD_NC24RS_V3',
    vm_size='Standard_NC6',
    ssh_public_access=False,
    vnet_resourcegroup_name=None,
    vnet_name=None,
    subnet_name=None,
    description='HyperDrive Compute Instance',
    assigned_user_object_id=None,
    assigned_user_tenant_id=None)

# One compute instance (ci) per input file, named hd<i>-K80-gpu-ci
# NOTE: The name should be unique across the Azure region
#       and 16 characters or less
for i in range(1, n_files + 1):
    ci_name = f'hd{i}-K80-gpu-ci'

    try:
        # Reuse the instance when it already exists in the workspace
        compute_instance = ComputeInstance(workspace=ws, name=ci_name)
    except ComputeTargetException:
        # Lookup failed: provision a new instance and wait for it to complete
        compute_config = ComputeInstance.provisioning_configuration(**hd_ci_settings)

        compute_instance = ComputeInstance.create(
            workspace=ws,
            name=ci_name,
            provisioning_configuration=compute_config)

        compute_instance.wait_for_completion(show_output=True)
    else:
        print('Found existing "{}" compute instance. Use it.'.format(ci_name))

Found existing "hd1-K80-gpu-ci" compute instance. Use it.
Found existing "hd2-K80-gpu-ci" compute instance. Use it.


## Start the compute instances!

In [18]:
# Take one snapshot of the workspace compute targets instead of re-reading
# ws.compute_targets for every lookup
compute_targets = ws.compute_targets

# Names of the HyperDrive compute instances (one per input file)
gpu_cis = [f'hd{i}-K80-gpu-ci' for i in range(1, n_files + 1)]

# Start the PRS compute instance first, then the HyperDrive compute instances
for ci in ['prs-ci'] + gpu_cis:
    target = compute_targets[ci]
    if target.get_status().state != 'Running':
        # Wait for the start to complete so that the pipeline submitted in the
        # next cell does not target an instance that is still starting.
        # NOTE: instances are started one after the other, which is slower
        #       than firing every start request at once
        target.start(wait_for_completion=True, show_output=True)

In [19]:
from azureml.core import Experiment
from azureml.widgets import RunDetails
from azureml.pipeline.core import Pipeline

# Pipeline made of the single PRS training step
pipeline = Pipeline(workspace=ws, steps=[parallel_run_step])

# Experiment the pipeline run is submitted to
experiment = Experiment(workspace=ws, name='prs-hyperdrive-gpu')

# Submit the pipeline, tagged with the number of input files
# NOTE: Returns an azureml.pipeline.core.run.PipelineRun
run_tags = {'Files': str(n_files)}
pipeline_run = experiment.submit(pipeline, tags=run_tags)

# Show the interactive widget of the pipeline run
RunDetails(run_instance=pipeline_run).show()

Created step train_prs [37c9fc25][941a3dcc-98e7-4ae0-89a7-14692e437246], (This step will run and generate new outputs)
Submitted PipelineRun b5d7b91a-ef73-4f3b-986f-bbcae798d736
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/b5d7b91a-ef73-4f3b-986f-bbcae798d736?wsid=/subscriptions/d71e4214-ad22-4df0-8289-acbc0d88408d/resourcegroups/mlops-RG/workspaces/mlops-AML-WS&tid=72f988bf-86f1-41af-91ab-2d7cd011db47


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

## HyperDrive Run Details

In [21]:
from azureml.train.hyperdrive import HyperDriveRun

# First step run named 'train_prs' in the submitted pipeline run
train_step_run = pipeline_run.find_step_run(name='train_prs')[0]

# Direct children of the PRS training step (recursive=False)
child_runs = list(train_step_run.get_children(recursive=False))

# Wrap every child run id in a HyperDriveRun object
hd_runs = [HyperDriveRun(experiment=experiment, run_id=run.id) for run in child_runs]

In [22]:
# Show the interactive widget for the first HyperDrive run only
first_hd_run = hd_runs[0]
RunDetails(run_instance=first_hd_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [None]:
# # Print the run details for each HyperDrive run
# # NOTE: Large output in the notebook!
# for (i, hd_run) in enumerate(hd_runs):
#     print(f'HyperDrive #{i + 1}:')
#     print(len(f'HyperDrive #{i + 1}:')*'-')
#     RunDetails(run_instance=hd_run).show()

## Register the best model from the HyperDrive Runs

In [None]:
# from azureml.core.model import Model

# for hd_run in hd_runs:
#     # Get the best model from HyperDrive run
#     best_run = hd_run.get_best_run_by_primary_metric()

#     # Select the tags from the HyperDrive run
#     model_tag = {k: best_run.get_tags()[k] for k in ['model', 'lclid', 'id']}

#     # Register the best model with tags
#     best_run.register_model(
#                 model_name=f'TFKeras-K80-CNN-{model_tag["lclid"]}', 
#                 model_path='models/', 
#                 model_framework=Model.Framework.TENSORFLOW, 
#                 model_framework_version='2.6', 
#                 tags=model_tag)

## Stop all compute instances!

In [None]:
# Take one snapshot of the workspace compute targets instead of re-reading
# ws.compute_targets on every loop iteration
compute_targets = ws.compute_targets

# Every compute target whose name contains 'gpu-ci'
gpu_cis = [ci for ci in compute_targets if 'gpu-ci' in ci]

# Stop each of them unless it is already stopped
for ci in gpu_cis:
    gpu_target = compute_targets[ci]
    if gpu_target.get_status().state != 'Stopped':
        gpu_target.stop()

In [None]:
# Stop the PRS compute instance unless it is already stopped
# (same state guard as the one used when stopping the GPU compute instances)
prs_ci = ws.compute_targets['prs-ci']
if prs_ci.get_status().state != 'Stopped':
    prs_ci.stop()