In [None]:
#import required libraries
import os

import mlflow
from azure.ml import MLClient
from azure.ml.entities import AutoMLJob
from azure.ml.entities import CommandJob, Code, PipelineJob, Dataset, InputDatasetEntry

In [None]:
# Details of the AML workspace to connect to.
# Each value can be overridden through an environment variable so the notebook can be
# pointed at a different workspace without editing this cell; the defaults are the
# values originally used by this notebook.
subscription_id = os.environ.get('AZURE_SUBSCRIPTION_ID', '15ae9cb6-95c1-483d-a0e3-b1a1a3b06324')
resource_group = os.environ.get('AZURE_RESOURCE_GROUP', 'automldpv2priprev-resgrp')
workspace = os.environ.get('AZURE_ML_WORKSPACE', 'automldpv2priprev')

In [None]:
# Get a handle to the workspace; every later job submission goes through this client.
ml_client = MLClient(subscription_id, resource_group, workspace)
# Sanity check on the handle just created (the original asserted on an undefined name).
assert ml_client is not None

## Initialize MLFlow Client

The models and artifacts produced by AutoML can be accessed through the MLFlow interface. Initialize the MLFlow client here and set Azure ML as its tracking backend.

**Questions**

Q: Can we set this (the tracking URI) inside AutoML, given that nothing works at all without the MLFlow context being set?

Q: Do we need MLFlow client for job submissions?

In [None]:
# Open question: can this MLFlow initialization be done transparently by the MLClient
# instead of explicitly in the notebook?

# NOTE(review): no experiment name is defined in the earlier cells of this notebook.
# This placeholder reuses the pipeline description string -- replace it as needed.
experiment_name = 'automl-componentization-example'

################################################################################
# TODO: The API to get the tracking URI is not yet available on the Workspace object,
# so the v1 SDK Workspace is used here only to look the URI up.
from azureml.core import Workspace as WorkspaceV1
# Use the names defined in the workspace-details cell (`workspace`, `resource_group`).
ws = WorkspaceV1(workspace_name=workspace, resource_group=resource_group, subscription_id=subscription_id)
tracking_uri = ws.get_mlflow_tracking_uri()
del ws  # the v1 handle is only needed for the lookup above
################################################################################

# Point MLFlow at the workspace and select the experiment to log under.
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

print("\nCurrent tracking uri: {}".format(mlflow.get_tracking_uri()))

In [None]:
# Top-level pipeline inputs: the local data folder wrapped as a Dataset, plus the
# settings that the jobs reference through ${{inputs.<name>}} bindings.
my_local_single_dataset = Dataset(local_path="./data")

pipeline_job_inputs = dict(
    raw_dataset_from_pipeline=InputDatasetEntry(dataset=my_local_single_dataset),
    n_cross_validations_from_pipeline='5',
    task_from_pipeline='regression',
    optimization_metric_from_pipeline='normalized_root_mean_squared_error',
    target_column_name_from_pipeline='price',
)

**Questions:**

Q: Should numeric parameters be specified as numbers rather than as text (i.e. `5` instead of `'5'`)? In the original pipelines example it was:
pipeline_job_inputs = {'max_epocs_from_pipeline': '20', ...

Q: Should we define pipeline outputs here or at the end?...

# AutoFeaturization Job/Component

In [None]:
# Define the AutoFeaturizationJob (Component?)
# NOTE(review): AutoFeaturizationJob is not imported in the visible import cell -- confirm
# where it should come from before running this cell.

# Every input is bound to the pipeline-level input of the matching name.
autofeaturization_job_inputs = {
    'data': '${{inputs.raw_dataset_from_pipeline}}',  # the local data uploaded into a dataset
    'n_cross_validations': '${{inputs.n_cross_validations_from_pipeline}}',
    'task': '${{inputs.task_from_pipeline}}',
    'optimization_metric': '${{inputs.optimization_metric_from_pipeline}}',
    'target_column_name': '${{inputs.target_column_name_from_pipeline}}',
}

# Outputs are declared by name only; no explicit value is supplied for them here.
autofeaturization_job_outputs = {
    'featurized_training_data': None,
    'featurized_validation_data': None,
    'featurizer': None,
}

autofeaturization_job = AutoFeaturizationJob(
    inputs=autofeaturization_job_inputs,    # inputs to the job
    outputs=autofeaturization_job_outputs,  # outputs of the job
    # Hyphenated to agree with the pipeline-level compute ("cpu-cluster"); Azure ML
    # compute names do not allow underscores. Override with another compute if needed.
    compute='azureml:cpu-cluster',
)

# NOTE:

'featurized_training_data' and 'featurized_validation_data' would each contain a single dataset for a training/validation split, or multiple correlated datasets, one per CV fold (e.g. 5 correlated datasets each). The implementation is TBD (folders, list of objects?).

## Issues / Problems to be solved

Q: What "type" are 'featurized_training_data' and 'featurized_validation_data'?
   - List(TabularDataset)?
   - Path to a blob with "MLDatasetArtifact" (currently in spec only, Daniel-Sch) at rest with multiple folders one per dataset?
   - Other? 


Q: Do we need "environment" parameter for the AutoML-components? Or we use it internally only?


# AutoTrain Job/Component

In [None]:
# Define the AutoTrainJob to run in the pipeline
# NOTE(review): AutoTrainJob is not imported in the visible import cell -- confirm
# where it should come from before running this cell.

### train_cmd = "python train.py --training_data ${{inputs.training_data}} --test_data ${{outputs.test_data}} --model_output ${{outputs.model_output}}"

# Training/validation data are bound to the outputs of the 'autofeaturization-job' step;
# the remaining settings are bound to the pipeline-level inputs.
autotrain_job_inputs = {
    'data': '${{jobs.autofeaturization-job.outputs.featurized_training_data}}',
    'validation_data': '${{jobs.autofeaturization-job.outputs.featurized_validation_data}}',
    'task': '${{inputs.task_from_pipeline}}',
    'optimization_metric': '${{inputs.optimization_metric_from_pipeline}}',
    'target_column_name': '${{inputs.target_column_name_from_pipeline}}',
}

# Outputs are declared by name only; no explicit value is supplied for them here.
autotrain_job_outputs = {'best_model': None, 'validation_predictions': None}

autotrain_job = AutoTrainJob(
    inputs=autotrain_job_inputs,    # inputs to the job
    outputs=autotrain_job_outputs,  # outputs of the job
    # Hyphenated to agree with the pipeline-level compute ("cpu-cluster"); Azure ML
    # compute names do not allow underscores. Override with another compute if needed.
    compute='azureml:cpu-cluster',
)


# Define the Pipeline

In [None]:
# Assemble the pipeline from the two AutoML jobs defined in the cells above.

# Step name -> job; the step names are the ones used in ${{jobs.<name>...}} bindings.
pipeline_steps = {
    'autofeaturization-job': autofeaturization_job,
    'autotrain-job': autotrain_job,
}

pipeline_job = PipelineJob(
    description='automl-componentization-example',
    jobs=pipeline_steps,             # all the jobs in this pipeline
    inputs=pipeline_job_inputs,      # top-level inputs to the pipeline
    outputs=autotrain_job_outputs,   # the pipeline exposes the autotrain job's outputs
    compute="cpu-cluster",
)


# QUESTIONS on Pipeline Outputs

Q: Are the pipeline outputs right? How can we merge outputs from multiple jobs/components instead of exposing only the outputs of the autotrain job?



In [None]:
# Submit the pipeline job through the workspace client and keep the returned job object.
returned_job = ml_client.jobs.create_or_update(pipeline_job)

# Display the "Studio" service endpoint, i.e. the URL for the status of the job
# (shown as the cell output because it is the last expression).
studio_service = returned_job.services["Studio"]
studio_service.endpoint

# OTHER DEVELOPMENT EXPERIENCE CONSIDERATIONS

**Better way to define INPUTS/OUTPUTS?**: Inputs and outputs are currently defined with JSON-like dictionaries. That is a very loose way of doing it: there is no IntelliSense, and it is more error-prone than using object properties. Is there a better way to define the inputs/outputs than JSON?