In [1]:
from azureml.core import Dataset, Datastore, Workspace
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core.environment import Environment
from azureml.core.experiment import Experiment
from azureml.core.runconfig import RunConfiguration
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep, PythonScriptStep
from datetime import datetime
import toml

## Load configuration

In [2]:
# Load the general settings and the secrets from their TOML files.
# TOML documents are UTF-8 by specification, so the encoding is given explicitly
# instead of relying on the platform's locale default (which would mis-decode
# non-ASCII values on systems whose default is not UTF-8).
with open('config.toml', 'r', encoding='utf-8') as f:
    config = toml.load(f)
with open('secrets.toml', 'r', encoding='utf-8') as f:
    secrets = toml.load(f)
# Merge both into one lookup; on duplicate keys the value from secrets.toml wins.
config = {**config, **secrets}

## Connect to workspace

In [3]:
def connect_to_workspace(subscription_id: str, resource_group: str, aml_workspace_name: str, tenant_id: str = None) \
        -> Workspace:
    """Open a workspace using interactive login.

    Args:
        subscription_id: First positional argument handed to ``Workspace``.
        resource_group: Second positional argument handed to ``Workspace``.
        aml_workspace_name: Third positional argument handed to ``Workspace``.
        tenant_id: Forwarded to ``InteractiveLoginAuthentication``; ``None`` by default.

    Returns:
        The ``Workspace`` constructed with the interactive authentication object.
    """
    return Workspace(
        subscription_id,
        resource_group,
        aml_workspace_name,
        auth=InteractiveLoginAuthentication(tenant_id=tenant_id),
    )

In [4]:
# Connect interactively; every identifier is read from the merged configuration.
ws = connect_to_workspace(
    subscription_id=config['subscription_id'],
    resource_group=config['resource_group'],
    aml_workspace_name=config['aml_workspace'],
    tenant_id=config['tenant_id'],
)

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


## Define some variables

In [5]:
# Name of the experiment the pipeline is submitted to.
experiment_name = 'parallelization_tutorial'
# Number of processes per node used by the parallel step.
jobs_per_node_cpu = 8
# Compute target, looked up by name among the workspace's compute targets.
compute_target_cpu = ws.compute_targets['kmi-cmpclstr-cpu']

## Create an execution environment (only necessary once)

In [6]:
# One-time setup, intentionally left commented out so that "Run All" does not
# re-create the environment. Uncomment and run this cell once if the
# 'tutorial-environment' retrieved in the next section does not exist yet.
#
# The helper clones a base environment under a new name, adds the given conda
# and pip packages, registers the result in the workspace and, if requested,
# builds it and waits for the build to finish.
#
# def create_environment(workspace, name, base_environment, conda_dependencies, pip_dependencies, wait_for_completion=True):
#     env = Environment.get(workspace=workspace, name=base_environment).clone(name)
#     conda_dep = env.python.conda_dependencies
#     for dep in conda_dependencies:
#         conda_dep.add_conda_package(dep)
#     for dep in pip_dependencies:
#         conda_dep.add_pip_package(dep)
#     env.python.conda_dependencies=conda_dep
#     env.register(workspace=workspace)
#     if wait_for_completion:
#         env.build(workspace).wait_for_completion()

# name = 'tutorial-environment'
# base_environment = 'AzureML-Minimal'
# conda_dependencies = ['numpy', 'pandas']
# pip_dependencies = ['opencensus-ext-azure', 'parse', 'tqdm']
# create_environment(ws, name, base_environment, conda_dependencies, pip_dependencies)

## Get a handle to the execution environment (if it already exists)

In [7]:
# Fetch the existing environment from the workspace by its name.
environment = Environment.get(ws, 'tutorial-environment')

## Configure environment variables

In [8]:
# Assign the environment variables of the execution environment; this replaces
# whatever mapping was set on the object before.
environment.environment_variables = dict(
    APPLICATIONINSIGHTS_CONNECTION_STRING=config['app_insights_connection_string'],
)

## Get a handle to the datastore

In [9]:
# Datastore used as the destination of the step outputs, looked up by name.
datastore_name = 'tutorial_parallelization'
datastore = Datastore(workspace=ws, name=datastore_name)

## Define the partitioning step

In [10]:
# run configuration: the script runs inside the execution environment
run_configuration = RunConfiguration()
run_configuration.environment = environment

# input: the raw dataset, fetched by name from the workspace
ds_raw = Dataset.get_by_name(ws, name='tutorial-parallelization-raw')

# output: written below 'partitioned/{run-id}' on the datastore and
# registered under the given name once the step completes
ds_partitioned = OutputFileDatasetConfig(
    destination=(datastore, 'partitioned/{run-id}')
).register_on_complete(name='tutorial-parallelization-partitioned')

# step: runs partition_step.py with the raw dataset mounted as input and the
# mounted output location passed via --output-dir
partition_step = PythonScriptStep(
    script_name='partition_step.py',
    source_directory='.',
    name='partition-step',
    arguments=['--output-dir', ds_partitioned.as_mount()],
    inputs=[ds_raw.as_named_input('ds_raw').as_mount()],
    compute_target=compute_target_cpu,
    runconfig=run_configuration,
    allow_reuse=True,
)

## Define the parallelized processing step

In [11]:
# output: written below 'processed/{run-id}' on the datastore and registered
# under the given name once the step completes
ds_processed = OutputFileDatasetConfig(
    destination=(datastore, 'processed/{run-id}')
).register_on_complete(name='tutorial-parallelization-processed')

# parallel run configuration: processing_step.py is the entry script; the node
# count is taken from the cluster's maximum scale setting
parallel_run_config = ParallelRunConfig(
    entry_script='processing_step.py',
    source_directory='.',
    environment=environment,
    compute_target=compute_target_cpu,
    process_count_per_node=jobs_per_node_cpu,
    node_count=compute_target_cpu.get_status().scale_settings.maximum_node_count,
    mini_batch_size=1,
    error_threshold=0,
    output_action='summary_only',
    run_invocation_timeout=300,
    run_max_try=1,
)

# step: consumes the partitioned output of the previous step as mounted input
processing_step = ParallelRunStep(
    name='processing-step',
    arguments=['--output-dir', ds_processed.as_mount()],
    inputs=[ds_partitioned.as_input('ds_partitioned').as_mount()],
    parallel_run_config=parallel_run_config,
    allow_reuse=True,
)

## Define the aggregation step

In [12]:
# run configuration: the script runs inside the execution environment
run_configuration = RunConfiguration()
run_configuration.environment = environment

# output: written below 'aggregated/{run-id}' on the datastore and registered
# under the given name once the step completes
ds_aggregated = OutputFileDatasetConfig(
    destination=(datastore, 'aggregated/{run-id}')
).register_on_complete(name='tutorial-parallelization-aggregated')

# step: runs aggregation_step.py with the processed output mounted as input
aggregation_step = PythonScriptStep(
    script_name='aggregation_step.py',
    source_directory='.',
    name='aggregation-step',
    arguments=['--output-dir', ds_aggregated.as_mount()],
    inputs=[ds_processed.as_input('ds_processed').as_mount()],
    compute_target=compute_target_cpu,
    runconfig=run_configuration,
    allow_reuse=True,
)

## Define the final pipeline and experiment

In [13]:
# Assemble the three steps into one pipeline and get the experiment it will be
# submitted to.
pipeline = Pipeline(
    workspace=ws,
    steps=[
        partition_step,
        processing_step,
        aggregation_step,
    ],
)
experiment = Experiment(workspace=ws, name=experiment_name)

## Submit the experiment and wait for completion

In [None]:
# Submit the pipeline and block until the run has finished, printing a
# timestamp before submission and after completion.
print(f'Experiment submitted at {datetime.now():%Y-%m-%d %H:%M:%S}')
run = experiment.submit(pipeline)
run.wait_for_completion(show_output=False)
print(f'Experiment terminated at {datetime.now():%Y-%m-%d %H:%M:%S}')

Experiment submitted at 2021-01-26 14:55:37
Created step partition-step [db86f90f][9b0c8590-ae13-4e4b-a713-04bec01e3f5d], (This step will run and generate new outputs)Created step processing-step [25c4275d][e9993f05-199f-435e-88ca-5356ffff11e4], (This step will run and generate new outputs)

Created step aggregation-step [a019c7e2][8844e403-dcb0-46a2-978c-258d21998ab3], (This step will run and generate new outputs)
Submitted PipelineRun 8e766113-3415-4d6d-9d5d-24c46ba67ade
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/parallelization_tutorial/runs/8e766113-3415-4d6d-9d5d-24c46ba67ade?wsid=/subscriptions/c22bcbea-3647-4c9e-96b6-6a734c897619/resourcegroups/RG-EUR-WW-DEV-TAGDATAAIRMC/workspaces/ml-rmc
PipelineRunId: 8e766113-3415-4d6d-9d5d-24c46ba67ade
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/parallelization_tutorial/runs/8e766113-3415-4d6d-9d5d-24c46ba67ade?wsid=/subscriptions/c22bcbea-3647-4c9e-96b6-6a734c897619/resourcegroups/RG-