In [None]:
from azureml.core import Workspace

# Load the Azure ML workspace from a local configuration file; the file is
# expected to carry the subscription_id, resource_group and workspace_name keys.
ws = Workspace.from_config()
# Echo the workspace object as a quick sanity check that the lookup worked.
print(ws)

{
    "subscription_id": "17a92c38-4328-4a79-b356-bc50e119219f",
    "resource_group": "rg-e-idc-ml-batch",
    "workspace_name": "ml-e-idc-batch"
}

In [None]:
from azureml.core import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

# Identity used for the workspace call; this triggers an interactive login.
auth = InteractiveLoginAuthentication()

# Workspace coordinates.
subscription_id = "17a92c38-4328-4a79-b356-bc50e119219f"  # you should be owner or contributor
resource_group = "rg-e-idc-ml-batch"  # you should be owner or contributor
workspace_name = "ml-e-idc-batch"  # your workspace name
workspace_region = "westus2"  # your region

# exist_ok=True: reuse the workspace if it already exists instead of failing.
ws = Workspace.create(
    name=workspace_name,
    subscription_id=subscription_id,
    resource_group=resource_group,
    location=workspace_region,
    auth=auth,
    exist_ok=True,
)


If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


Performing interactive authentication. Please follow the instructions on the terminal.




In [None]:
import os

from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException

# Cluster settings. Each value can be overridden through an environment variable.
# choose a name for your cluster
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "batchcluster")
# Environment variables are always strings, so coerce the node counts to int
# before handing them to the provisioning configuration.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 1))

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    # Reuse the compute target that is already registered under this name.
    compute_target = ws.compute_targets[compute_name]
    if compute_target and isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # The name is taken by something that is not an AmlCompute cluster;
        # surface that instead of continuing silently.
        print('warning: compute target ' + compute_name + ' exists but is not an AmlCompute cluster.')
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

In [None]:
import os
from getpass import getpass

from azureml.core import Datastore
from azureml.core import Workspace

# SECURITY: never embed the storage account key in the notebook. Read it from
# the IMAGES_STORAGE_ACCOUNT_KEY environment variable, or prompt for it without
# echoing when the variable is not set. Any key that was previously committed
# in this notebook must be treated as compromised and rotated.
storage_account_key = os.environ.get("IMAGES_STORAGE_ACCOUNT_KEY") or getpass(
    "Account key for storage account 'saeidcfnc01images': "
)

# Register the "images" blob container of the storage account as a datastore
# named "storage_images" in the workspace.
image_datastore = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name="storage_images",
    container_name="images",
    account_name="saeidcfnc01images",
    account_key=storage_account_key,
)
print(image_datastore)

In [None]:
from azureml.core.dataset import Dataset

# Reference to the 'images/' path inside the registered image datastore.
path_on_datastore = image_datastore.path('images/')

# Build a file dataset over that path; validate=False skips the up-front
# validation of the path.
input_images_ds = Dataset.File.from_files(
    path=path_on_datastore,
    validate=False,
)

print(input_images_ds)

In [None]:
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import PipelineParameter

# Expose the input dataset as a pipeline parameter so that it can be replaced
# at submission time; input_images_ds is the default.
pipeline_param = PipelineParameter(
    name="images_param",
    default_value=input_images_ds,
)

# Wrap the parameter in a consumption config and request mount delivery.
input_images_ds_consumption = DatasetConsumptionConfig(
    "images_param_config",
    pipeline_param,
).as_mount()

print(input_images_ds_consumption)

In [None]:
from azureml.pipeline.core import Pipeline, PipelineData

# Pipeline output named "inferences", stored on the image datastore.
output_dir = PipelineData(
    name="inferences",
    datastore=image_datastore,
)
print(output_dir)

In [None]:
from azureml.pipeline.core import Pipeline, PipelineData,PipelineParameter
from azureml.core.dataset import Dataset

# Directory that holds the step scripts.
scripts_folder = "scripts"

# Tunable pipeline parameters with their default values.
queue_length = PipelineParameter(name="queue_length", default_value=5)
num_node = PipelineParameter(name="num_node", default_value=2)
core_per_node = PipelineParameter(name="core_per_node", default_value=3)

# Source directory for the landing folder (currently disabled). It would be used to read files from:
#datastore_sourcefiles_dir = Dataset.File.from_files((image_datastore, "/")).as_named_input("datastore_sourcefiles_dir")

# Output location for the mapping file (mapping.csv).
mapping_output_dir = PipelineData(
    name="grouping_output",
    datastore=image_datastore,
    output_path_on_compute="grouping_output",
)
# The same output viewed as a dataset parsed from delimited files.
mapping_output_ds = mapping_output_dir.as_dataset().parse_delimited_files()

# Output location for the final (preprocessed) file.
preprocess_output_dir = PipelineData(
    name="preprocess_output",
    datastore=image_datastore,
    output_path_on_compute="preprocess_output",
)

print(mapping_output_ds)

In [None]:
# Environment for the mapping/distributing step
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Fresh run configuration for the step.
mapping_run_config = RunConfiguration()
mapping_env = mapping_run_config.environment

# Run inside Docker, based on the default CPU image.
mapping_env.docker.enabled = True
mapping_env.docker.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed: a conda environment is created inside the
# Docker image from the conda dependencies below.
mapping_env.python.user_managed_dependencies = False
mapping_pip_packages = ['azure-servicebus', 'azureml-defaults', 'pandas', 'redis']
mapping_env.python.conda_dependencies = CondaDependencies.create(pip_packages=mapping_pip_packages)


# Mapping/grouping step
from azureml.pipeline.steps import PythonScriptStep

# Command-line arguments handed to distributing.py.
distributing_arguments = [
    "--queue_length", queue_length,
    "--num_node", num_node,
    "--core_per_node", core_per_node,
    "--grouping_output", mapping_output_ds,
]

distributing_step = PythonScriptStep(
    name="distributing_step",
    script_name="distributing.py",
    arguments=distributing_arguments,
    outputs=[mapping_output_ds],
    compute_target=compute_target,
    source_directory=scripts_folder,
    runconfig=mapping_run_config,
    allow_reuse=False,  # never reuse results from a previous run of this step
)

print(distributing_step)

In [None]:

# Preprocessing step
from azureml.pipeline.core import PipelineParameter,StepSequence
from azureml.pipeline.steps import ParallelRunStep, ParallelRunConfig
from azureml.core import Environment

# Docker-based environment for the preprocessing entry script.
preprocess_pip_packages = [
    "azureml-defaults",
    "azureml-dataset-runtime[fuse,pandas]",
    "xlrd",
    "azure-servicebus",
    "azure-storage-blob",
    "redis",
]
preprocess_conda_deps = CondaDependencies.create(pip_packages=preprocess_pip_packages)
preprocess_env = Environment(name="preprocess_environment")
preprocess_env.python.conda_dependencies = preprocess_conda_deps
preprocess_env.docker.enabled = True
preprocess_env.docker.base_image = DEFAULT_CPU_IMAGE

# Batch size and per-node process count are exposed as pipeline parameters.
preprocess_batch_size = PipelineParameter(name="preprocess_batch_size_param", default_value="1")
preprocess_process_count = PipelineParameter(name="process_count_param", default_value=2)

# process_count_per_node should be set to the number of cores per node or higher to maximize parallelism
preprocess_parallel_run_config = ParallelRunConfig(
    source_directory=scripts_folder,
    entry_script="preprocessing.py",
    mini_batch_size=preprocess_batch_size,
    error_threshold=1,
    output_action="summary_only",
    append_row_file_name="outputs.txt",
    environment=preprocess_env,
    compute_target=compute_target,
    process_count_per_node=preprocess_process_count,
    node_count=num_node,
)

# The datastore_sourcefiles_dir argument and side input are currently disabled:
#   arguments=["--datastore_sourcefiles_dir", datastore_sourcefiles_dir, ...]
#   side_inputs=[datastore_sourcefiles_dir]
preprocess_parallelrun_step = ParallelRunStep(
    name="preprocess",
    arguments=["--preprocess_output_dir", preprocess_output_dir],
    parallel_run_config=preprocess_parallel_run_config,
    inputs=[mapping_output_ds],
    output=preprocess_output_dir,
    allow_reuse=False,  # never reuse results from a previous run of this step
)

print(preprocess_parallelrun_step)

In [None]:
from azureml.core import Experiment

# Assemble the two steps into one pipeline.
pipeline = Pipeline(
    workspace=ws,
    steps=[distributing_step, preprocess_parallelrun_step],
)

# Submit the pipeline under the 'preprocessing_example' experiment.
experiment = Experiment(ws, 'preprocessing_example')
pipeline_run = experiment.submit(pipeline)

In [None]:
from azureml.widgets import RunDetails

# Show the notebook widget for the submitted pipeline run.
run_details_widget = RunDetails(pipeline_run)
run_details_widget.show()

In [None]:
# Publish the pipeline behind this run so it can be invoked again later.
# The version is passed as a string, matching the documented parameter type.
pipeline_run.publish_pipeline(
    name="preprocessing_example",
    description="preprocessing_example pipeline",
    version="1.0",
)
