## Log on to the workspace

In [1]:
from azureml.core import Workspace

# Load the workspace described by the local Azure ML config file.
ws = Workspace.from_config()

# Echo the identifying details of the workspace, one per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

Workspace name: ws01ent
Azure region: westus2
Subscription id: 0e9bace8-7a81-4922-83b5-d995ff706507
Resource group: azureml


In [2]:
from azureml.core import Workspace
from azureml.core.authentication import InteractiveLoginAuthentication

# Coordinates of the target workspace.
subscription_id = "0e9bace8-7a81-4922-83b5-d995ff706507"  # you should be owner or contributor
resource_group = "azureml"  # you should be owner or contributor
workspace_name = "ws01ent"  # your workspace name
workspace_region = "westus2"  # your region

# Interactive login against an explicit tenant.
auth = InteractiveLoginAuthentication(tenant_id='72f988bf-86f1-41af-91ab-2d7cd011db47')

# exist_ok=True is passed so the call is meant to tolerate a workspace that already exists.
ws = Workspace.create(
    name=workspace_name,
    subscription_id=subscription_id,
    resource_group=resource_group,
    location=workspace_region,
    auth=auth,
    exist_ok=True,
)


## Prepare data

In [64]:
from azure.servicebus import QueueClient,ServiceBusClient
import json
import os
# The Service Bus connection string is kept as a secret in the workspace's default key vault.
keyvault = ws.get_default_keyvault()
con_str = keyvault.get_secret('servicebusconstr')

# Connect to the "landing" queue.
sb_client = ServiceBusClient.from_connection_string(con_str)
queue_client = sb_client.get_queue("landing")

# Fetch one batch (at most 1000 messages, waiting up to 30 seconds) and print the URL
# carried by each message.
with queue_client.get_receiver(prefetch=1000) as queue_receiver:
    messages = queue_receiver.fetch_next(timeout=30, max_batch_size=1000)
    for message in messages:
        # The message body is a JSON document; the file URL lives under data.url.
        json_content = json.loads(str(message))
        url = json_content['data']['url']

        # Second-to-last path segment is the source folder, last one is the file name.
        url_segments = url.split("/")
        folder_name = url_segments[-2]
        file_name = os.path.basename(url)

        # Disabled on purpose in this exploratory cell (kept for reference):
        #   message.complete()
        #   file_path=os.environ.get('AZUREML_DATAREFERENCE_datastore_sourcefiles_dir')+"/"+folder_name+"/"+file_name

        print(url)

https://adlsdatalakegen6.blob.core.windows.net/landing/source1/datafile1.csv
https://adlsdatalakegen6.blob.core.windows.net/landing/source1/datafile2.csv
https://adlsdatalakegen6.blob.core.windows.net/landing/source1/datafile3.csv
https://adlsdatalakegen6.blob.core.windows.net/landing/source1/datafile4.csv
https://adlsdatalakegen6.blob.core.windows.net/landing/source1/datafile5.csv
https://adlsdatalakegen6.blob.core.windows.net/landing/source2/datafile2.csv
https://adlsdatalakegen6.blob.core.windows.net/landing/source2/datafile3.csv
https://adlsdatalakegen6.blob.core.windows.net/landing/source2/datafile4.csv
https://adlsdatalakegen6.blob.core.windows.net/landing/source2/datafile1.csv
https://adlsdatalakegen6.blob.core.windows.net/landing/source2/datafile5.csv


In [3]:
from azureml.core.model import Model
import joblib
# Name of the registered model, and the local folder it is downloaded into.
model_name = "porto_seguro_safe_driver_model"
model_dir = "model"

# Look up the registered model in the workspace and download it locally.
model = Model(workspace=ws, name=model_name)
model.download(target_dir=model_dir, exist_ok=True)

# The model file is expected at <model_dir>/<model_name>; load it with joblib.
model_path = os.path.join(model_dir, model_name)
LGBM_MODEL = joblib.load(model_path)


In [62]:
import os

import pandas as pd

# Local smoke test: score one sample file with the downloaded model.
# The directory can be overridden through TEST_DATA_DIR so the cell is not tied to a
# single user's machine; the default keeps the original location.
test_data_dir = os.environ.get("TEST_DATA_DIR", "C:\\Users\\janguy\\Downloads")
test_file_name = "datafile3.csv"

test_df = pd.read_csv(os.path.join(test_data_dir, test_file_name))
# The 'id' column is dropped before the frame is handed to the model.
test_df = test_df.drop(['id'], axis=1)

output = LGBM_MODEL.predict(test_df)
# Label every score with the file that was actually scored (the label was previously
# hard-coded to a different file name than the one being read).
output = pd.DataFrame({"score": output, "source": [test_file_name] * len(output)})

### Create or Attach existing compute resource for Python steps
By using Azure Machine Learning Compute, a managed service, data scientists can train machine learning models on clusters of Azure virtual machines. Examples include VMs with GPU support. In this tutorial, you create Azure Machine Learning Compute as your training environment. The code below creates the compute clusters for you if they don't already exist in your workspace.

**Creation of compute takes approximately 5 minutes. If the AmlCompute with that name is already in your workspace the code will skip the creation process.**

In [4]:
import os
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.data.data_reference import DataReference

# Cluster settings, each overridable through an environment variable.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "worker-cpu")
# Environment variables are always strings, so the node counts are cast to int to keep
# the type the same whether the value comes from the environment or from the default.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 10))

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D12_V2")


if compute_name in ws.compute_targets:
    # Reuse the compute target that is already attached to the workspace.
    compute_target = ws.compute_targets[compute_name]
    if compute_target and isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # Previously this case was silent; the target is still used, but flag it.
        print('warning: compute target ' + compute_name + ' exists but is not an AmlCompute cluster.')
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

found compute target. just use it. worker-cpu


In [5]:
from azureml.core import Workspace
from azureml.core import Keyvault
import os

# Handle to the workspace's default key vault; other cells read secrets from it.
keyvault = ws.get_default_keyvault()
# One-time setup, kept disabled: stores the Service Bus connection string under the
# secret name 'servicebusconstr' that the queue-reading cell looks up.
# keyvault.set_secret(name="servicebusconstr", value = 'YOUR_SERVICE_BUS_CONNECTION_STRING')

In [6]:

# Register two blob containers of the same storage account as workspace datastores.
from azureml.core import Datastore

# Storage account name, and its access key read from the key vault.
account_name = 'adlsdatalakegen6'
key = keyvault.get_secret('adlsgen6key')

# Arguments shared by both registrations.
_blob_registration_args = dict(
    workspace=ws,
    account_name=account_name,
    account_key=key,
    create_if_not_exists=False,
)

# 'landing' container: source files.
datastore_sourcefiles = Datastore.register_azure_blob_container(
    datastore_name='adlsgen6landing',
    container_name='landing',
    **_blob_registration_args,
)

# 'process' container: pipeline outputs.
datastore_batch_score = Datastore.register_azure_blob_container(
    datastore_name='adlsgen6process',
    container_name='process',
    **_blob_registration_args,
)




### Intermediate/Output Data
Intermediate data (or output of a Step) is represented by [PipelineData](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-core/azureml.pipeline.core.pipelinedata?view=azure-ml-py) object. PipelineData can be produced by one step and consumed in another step by providing the PipelineData object as an output of one step and the input of one or more steps.

**Constructing PipelineData**
- name: [Required] Name of the data item within the pipeline graph
- datastore_name: Name of the Datastore to write this output to
- output_name: Name of the output
- output_mode: Specifies "upload" or "mount" modes for producing output (default: mount)
- output_path_on_compute: For "upload" mode, the path to which the module writes this output during execution
- output_overwrite: Flag to overwrite pre-existing data

In [8]:
from azureml.pipeline.core import Pipeline, PipelineData,PipelineParameter
from azureml.core.dataset import Dataset



# Folder that holds the step scripts.
scripts_folder = "scripts"

# Root of the landing datastore, exposed to the steps as a named file-dataset input.
# The source files are read from here.
_landing_root = (datastore_sourcefiles, "/")
datastore_sourcefiles_dir = Dataset.File.from_files(_landing_root).as_named_input("datastore_sourcefiles_dir")

# Output location for the mapping file (mapping.csv).
mapping_output_dir = PipelineData(
    name="grouping_output",
    datastore=datastore_batch_score,
    output_path_on_compute="grouping_output",
)
mapping_output_ds = mapping_output_dir.as_dataset()

# Output location for the final file.
batch_score_output_dir = PipelineData(
    name="batch_score_output",
    datastore=datastore_batch_score,
    output_path_on_compute="batch_score_output",
)





## Define parameters for the pipelines

In [9]:
# Pipeline parameters and their default values.
# All three are passed as command-line arguments to the distributing step; num_node is
# also used as the node_count of the ParallelRunConfig for the batch-scoring step.
queue_length = PipelineParameter(name="queue_length", default_value=5)
num_node = PipelineParameter(name="num_node", default_value=2)
core_per_node = PipelineParameter(name="core_per_node", default_value=3)


## Define environments and processing steps

In [65]:
#Environment for mapping/distributing step
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

# Run configuration for the mapping/distributing step.
mapping_run_config = RunConfiguration()
mapping_env = mapping_run_config.environment

# Run inside Docker, on top of the default CPU base image.
mapping_env.docker.enabled = True
mapping_env.docker.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed: a conda environment is built from the spec below.
mapping_env.python.user_managed_dependencies = False
mapping_env.python.conda_dependencies = CondaDependencies.create(
    pip_packages=['azure-servicebus', 'azureml-defaults', 'pandas']
)


# Mapping/grouping step
from azureml.pipeline.steps import PythonScriptStep

# Command-line arguments handed to distributing.py.
distributing_args = [
    "--queue_length", queue_length,
    "--num_node", num_node,
    "--core_per_node", core_per_node,
    "--grouping_output", mapping_output_ds,
]

distributing_step = PythonScriptStep(
    name="distributing_step",
    script_name="distributing.py",
    arguments=distributing_args,
    outputs=[mapping_output_ds],
    compute_target=compute_target,
    source_directory=scripts_folder,
    runconfig=mapping_run_config,
    allow_reuse=False,
)
    


In [66]:
# Environment and configuration for the batch-scoring step
from azureml.pipeline.core import PipelineParameter,StepSequence
from azureml.pipeline.steps import ParallelRunStep, ParallelRunConfig
from azureml.core import Environment

# Packages needed by batch_score.py: lightgbm from conda, the rest from pip.
batch_score_pip_packages = [
    "azureml-defaults",
    "azureml-dataset-runtime[fuse,pandas]",
    "azure-servicebus",
    'azure-storage-blob',
    'joblib',
]
batch_score_conda_deps = CondaDependencies.create(
    conda_packages=['lightgbm'],
    pip_packages=batch_score_pip_packages,
)

# Docker-based environment on the default CPU base image.
batch_score_env = Environment(name="batch_score_environment")
batch_score_env.python.conda_dependencies = batch_score_conda_deps
batch_score_env.docker.enabled = True
batch_score_env.docker.base_image = DEFAULT_CPU_IMAGE

# Tunable parameters of the parallel run itself.
batch_size_param = PipelineParameter(name="batch_score_batch_size_param", default_value="1")
# process_count_per_node should be set to the number of cores per node or higher to maximize parallelism
process_count_param = PipelineParameter(name="process_count_param", default_value=6)

batch_score_parallel_run_config = ParallelRunConfig(
    source_directory=scripts_folder,
    entry_script="batch_score.py",
    mini_batch_size=batch_size_param,
    error_threshold=1,
    output_action="append_row",
    append_row_file_name="outputs.txt",
    environment=batch_score_env,
    compute_target=compute_target,
    process_count_per_node=process_count_param,
    node_count=num_node,
)

# The step takes the mapping output as its input and the landing files as a side input.
batch_score_args = [
    "--datastore_sourcefiles_dir", datastore_sourcefiles_dir,
    "--batch_score_output_dir", batch_score_output_dir,
]
batch_score_parallelrun_step = ParallelRunStep(
    name="batchscore",
    arguments=batch_score_args,
    parallel_run_config=batch_score_parallel_run_config,
    side_inputs=[datastore_sourcefiles_dir],
    inputs=[mapping_output_ds],
    output=batch_score_output_dir,
    allow_reuse=False,
)

## Define pipeline and submit a run


In [67]:
from azureml.core import Experiment

# Assemble the two steps into a pipeline.
pipeline_steps = [distributing_step, batch_score_parallelrun_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

# Submit the pipeline under the 'preprocessing_example' experiment.
experiment = Experiment(workspace=ws, name='preprocessing_example')
pipeline_run = experiment.submit(pipeline)

Created step distributing_step [ada8ffb0][0defa60e-fb6a-4e0d-8b7e-6f84aefd769c], (This step will run and generate new outputs)Created step batchscore [9b505289][4044c18b-c5ad-4c9c-be77-a0d173ce15d7], (This step will run and generate new outputs)

Using data reference datastore_sourcefiles_dir_0 for StepId [886a3da7][f92e9235-3f6c-4b14-8c1c-0df1be1748fc], (Consumers of this data are eligible to reuse prior runs.)
Submitted PipelineRun 4239a1ed-4506-49d6-bbad-4f59235455af
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/preprocessing_example/runs/4239a1ed-4506-49d6-bbad-4f59235455af?wsid=/subscriptions/0e9bace8-7a81-4922-83b5-d995ff706507/resourcegroups/azureml/workspaces/ws01ent


In [34]:
# Cancels the pipeline run held in `pipeline_run`.
# NOTE(review): when the notebook is executed top to bottom, this cell runs right after the
# submit cell and before the monitoring/publish cells, so it would cancel the run that was
# just submitted — confirm this cell is meant to be run manually only.
pipeline_run.cancel()

In [68]:
from azureml.widgets import RunDetails
# Show the run-details widget for the submitted pipeline run.
RunDetails(pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

## Publish the pipeline to use with external applications


In [53]:
# Publish the pipeline behind this run so external applications can trigger it.
# `version` is documented as a string, so pass "1.0" rather than the float 1.0.
pipeline_run.publish_pipeline(name="preprocessing_example", description="preprocessing_example pipeline", version="1.0")

Name,Id,Status,Endpoint
preprocessing_example,56ee21ae-db27-4501-83c0-59ca70a0b9bb,Active,REST Endpoint
