In [1]:
import os

import pandas as pd
from azure.ai.ml import Input, MLClient, automl, command
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.entities import Data
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

# Load the local env file FIRST: os.getenv below only sees variables that are
# already in the process environment, so reading the SAS token before this
# call would yield None whenever it is defined only in python.env.
load_dotenv("python.env")

# specify the details of the workspace and assets created in the setup notebook
SUBSCRIPTION_ID = "e5615bfe-b43b-41ce-bccb-b78867c2ce63"
RESOURCE_GROUP = "rg-dp100-demo-001"
WORKSPACE_NAME = "mlw-dp100-demo"
STORAGE_ACCOUNT_NAME = "stdp100demo"
CONTAINER_NAME = "diabetesdatacontainer"
STORAGE_ACC_SAS_TOKEN = os.getenv("STORAGE_ACC_SAS_TOKEN")
# Single definition of the datastore name (an earlier duplicate assignment was
# immediately overwritten by this value, so it has been removed).
DATASTORE_NAME = "blob_diabetes_datastore_sas"
MLTABLE_DATA_ASSET_NAME = "mltable_diabetes_data_asset"
COMPUTE_CLUSTER = "vmcluster-ml-dev"

# get a handle to the workspace
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id=SUBSCRIPTION_ID,
    resource_group_name=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

In [2]:
# To ensure we are using the latest version of the data, resolve the asset by
# its "latest" label. This avoids listing every data asset in the workspace and
# indexing [0], which fails with an opaque IndexError when the asset is missing;
# a missing asset now surfaces as the SDK's own lookup error instead.
data_asset = ml_client.data.get(name=MLTABLE_DATA_ASSET_NAME, label="latest")

### 1. Using AutoML

- Important reference: https://learn.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-train?view=azureml-api-2&tabs=python
- In order to provide training data to AutoML in SDK v2 you need to upload it into the cloud through an MLTable.
- Requirements for loading data into an MLTable:
    - Data must be in tabular form.
    - The value to predict (the target column) must be present in the data.

In [3]:
# With the data asset resolved, build the reference string that an Input
# object needs. An Input requires two pieces of information:
#   - path: "azureml:<data_asset_name>:<data_asset_version>"
#   - type: "AssetTypes.<type>", one of URI_FILE, URI_FOLDER, MLTABLE
asset_name, asset_version = data_asset.name, data_asset.version
input_path = f"azureml:{asset_name}:{asset_version}"
input_path

'azureml:mltable_diabetes_data_asset:1'

In [4]:
# Wrap the asset reference in an Input for the AutoML job.
# Only the MLTABLE asset type is supported for automl.
my_training_data_input = Input(
    path=input_path,
    type=AssetTypes.MLTABLE,
)

In [5]:
# configure the classification job
# (`automl` is imported with the other dependencies in the first cell so the
# notebook's imports live in one place.)
classification_job = automl.classification(
    compute=COMPUTE_CLUSTER,
    experiment_name="auto-ml-diabetes-classification",
    training_data=my_training_data_input,
    target_column_name="Outcome",
    primary_metric="accuracy",
    n_cross_validations=5,
    enable_model_explainability=True,
)

# Bound the run: 60 minutes overall, 20 minutes per trial, at most 5 trials,
# with early termination enabled.
classification_job.set_limits(
    timeout_minutes=60,
    trial_timeout_minutes=20,
    max_trials=5,
    enable_early_termination=True,
)

In [6]:
# Submit the configured AutoML job to the workspace and keep the job object
# returned by the service for later inspection.
returned_job = ml_client.jobs.create_or_update(classification_job)

In [7]:
# AutoML job runs can be monitored in the Azure Machine Learning studio.
# The returned job carries a direct studio link; print it for convenience.
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)

Monitor your job at https://ml.azure.com/runs/wheat_leather_b703wm8yjl?wsid=/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo&tid=3c2288b7-c4ac-4ad8-a4f3-32a569108be3


### 2. Repeat with the MLTable asset where `colocated=False`

In [10]:
# configure the data input
# Resolve the newest version of the asset by label rather than listing every
# data asset in the workspace and indexing [0], which raises an opaque
# IndexError when the asset does not exist.
data_asset = ml_client.data.get(name="mltable_diabetes_data_asset2", label="latest")
input_path = f"azureml:{data_asset.name}:{data_asset.version}"
print(input_path)
my_training_data_input = Input(
    type=AssetTypes.MLTABLE,  # only MLTABLE is supported for automl
    path=input_path,
)

# configure the classification job
classification_job = automl.classification(
    compute=COMPUTE_CLUSTER,
    experiment_name="auto-ml-diabetes-classification",
    training_data=my_training_data_input,
    target_column_name="Outcome",
    primary_metric="accuracy",
    n_cross_validations=5,
    enable_model_explainability=True,
)

# Bound the run: 60 minutes overall, 20 minutes per trial, at most 5 trials,
# with early termination enabled.
classification_job.set_limits(
    timeout_minutes=60,
    trial_timeout_minutes=20,
    max_trials=5,
    enable_early_termination=True,
)

# submit the AutoML job and print the studio link for monitoring
returned_job = ml_client.jobs.create_or_update(classification_job)
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)

azureml:mltable_diabetes_data_asset2:1
Monitor your job at https://ml.azure.com/runs/sad_horse_mx7y6pxzx6?wsid=/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo&tid=3c2288b7-c4ac-4ad8-a4f3-32a569108be3


### 3. Repeat with MLTable asset where `colocated=False` + compute instance instead of cluster

- This will throw an error "User starting the run is not an owner or assigned user to the Compute Instance".
- This is because we are sending the jobs as a Service Principal. If the job is sent from the AML workspace, or from VSCode connected to the workspace, (under the logged-in user's credential), this will be fine.

In [11]:
# configure the data input
# Resolve the newest version of the asset by label rather than listing every
# data asset in the workspace and indexing [0], which raises an opaque
# IndexError when the asset does not exist.
data_asset = ml_client.data.get(name="mltable_diabetes_data_asset2", label="latest")
input_path = f"azureml:{data_asset.name}:{data_asset.version}"
print(input_path)
my_training_data_input = Input(
    type=AssetTypes.MLTABLE,  # only MLTABLE is supported for automl
    path=input_path,
)

# configure the classification job
# NOTE: this run deliberately targets the compute instance "vm-ml-dev" instead
# of COMPUTE_CLUSTER to demonstrate the ownership restriction described in the
# markdown above this cell.
classification_job = automl.classification(
    compute="vm-ml-dev",
    experiment_name="auto-ml-diabetes-classification",
    training_data=my_training_data_input,
    target_column_name="Outcome",
    primary_metric="accuracy",
    n_cross_validations=5,
    enable_model_explainability=True,
)

# Bound the run: 60 minutes overall, 20 minutes per trial, at most 5 trials,
# with early termination enabled.
classification_job.set_limits(
    timeout_minutes=60,
    trial_timeout_minutes=20,
    max_trials=5,
    enable_early_termination=True,
)

# submit the AutoML job and print the studio link for monitoring
returned_job = ml_client.jobs.create_or_update(classification_job)
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)

azureml:mltable_diabetes_data_asset2:1
Monitor your job at https://ml.azure.com/runs/bright_picture_cpln9xc7h2?wsid=/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo&tid=3c2288b7-c4ac-4ad8-a4f3-32a569108be3


### 4. Repeat with additional config to allow certain models only

In [12]:
# configure the data input
# Resolve the newest version of the asset by label rather than listing every
# data asset in the workspace and indexing [0], which raises an opaque
# IndexError when the asset does not exist.
data_asset = ml_client.data.get(name="mltable_diabetes_data_asset2", label="latest")
input_path = f"azureml:{data_asset.name}:{data_asset.version}"
print(input_path)
my_training_data_input = Input(
    type=AssetTypes.MLTABLE,  # only MLTABLE is supported for automl
    path=input_path,
)

# configure the classification job
classification_job = automl.classification(
    compute=COMPUTE_CLUSTER,
    experiment_name="auto-ml-diabetes-classification",
    training_data=my_training_data_input,
    target_column_name="Outcome",
    primary_metric="accuracy",
    n_cross_validations=5,
    enable_model_explainability=True,
)

# Bound the run: 60 minutes overall, 20 minutes per trial, at most 5 trials,
# with early termination enabled.
classification_job.set_limits(
    timeout_minutes=60,
    trial_timeout_minutes=20,
    max_trials=5,
    enable_early_termination=True,
)

# Training properties are optional; here they restrict the search to an
# explicit list of algorithms and turn on ONNX-compatible models plus the
# stack and vote ensembles.
classification_job.set_training(
    allowed_training_algorithms=[
        "SVM",
        "MultinomialNaiveBayes",
        "LogisticRegression",
        "LightGBM",
        "XGBoostClassifier",
    ],
    enable_onnx_compatible_models=True,
    enable_stack_ensemble=True,
    enable_vote_ensemble=True,
)

# submit the AutoML job and print the studio link for monitoring
returned_job = ml_client.jobs.create_or_update(classification_job)
aml_url = returned_job.studio_url
print("Monitor your job at", aml_url)

azureml:mltable_diabetes_data_asset2:1
Monitor your job at https://ml.azure.com/runs/silver_clock_yhft57nzyq?wsid=/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo&tid=3c2288b7-c4ac-4ad8-a4f3-32a569108be3
