In [6]:
from azure.ai.ml import command, Input, MLClient, UserIdentityConfiguration, ManagedIdentityConfiguration
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.ai.ml.dsl import pipeline
from dotenv import load_dotenv
import pandas as pd
import os

# Load the local env file FIRST: os.getenv() only sees variables that are
# already in the process environment, so reading the SAS token before this
# call yields None on a fresh kernel.
load_dotenv("../python.env")

# specify the details of your subscription
SUBSCRIPTION_ID = "e5615bfe-b43b-41ce-bccb-b78867c2ce63"
RESOURCE_GROUP = "rg-dp100-demo-001"
WORKSPACE_NAME = "mlw-dp100-demo"
DATASTORE_NAME = "blob_diabetes_datastore_sas"
URIFILE_DATA_ASSET_NAME = "urifile_diabetes_data_asset"
MLTABLE_DATA_ASSET_NAME = "mltable_diabetes_data_asset"
STORAGE_ACCOUNT_NAME = "stdp100demo"
CONTAINER_NAME = "diabetesdatacontainer"
# secret is read from the environment (populated by load_dotenv above)
STORAGE_ACC_SAS_TOKEN = os.getenv("STORAGE_ACC_SAS_TOKEN")
ENV_NAME = "diabetes-env"

# get a handle to the workspace
ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id=SUBSCRIPTION_ID,
    resource_group_name=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

In [2]:
# NOTE: leftover lowercase settings, kept commented out. The same values are
# defined as DATASTORE_NAME, URIFILE_DATA_ASSET_NAME and
# MLTABLE_DATA_ASSET_NAME in the configuration cell at the top of the notebook.
# datastore_name="blob_diabetes_datastore_sas"
# data_asset_name="urifile_diabetes_data_asset"
# mltable_data_asset_name="mltable_diabetes_data_asset"

### 1. Create a Datastore to link to blob storage

In [3]:
from azure.ai.ml.entities import SasTokenConfiguration
from azure.ai.ml.entities import AzureBlobDatastore


# A missing token means the env file was not loaded (or lacks the key).
# Stop here with a clear message instead of passing None as the SAS token.
if not STORAGE_ACC_SAS_TOKEN:
    raise ValueError(
        "STORAGE_ACC_SAS_TOKEN is not set; load it into the environment "
        "before creating the datastore."
    )

# Datastore definition: links the workspace to the blob container and
# authenticates with the SAS token read from the environment.
store_sas = AzureBlobDatastore(
    name=DATASTORE_NAME,
    description="Datastore for Diabetes data, created with SAS token authorization.",
    account_name=STORAGE_ACCOUNT_NAME,
    container_name=CONTAINER_NAME,
    credentials=SasTokenConfiguration(sas_token=STORAGE_ACC_SAS_TOKEN),
)

# register (or update) the datastore in the workspace; the result is the cell output
ml_client.create_or_update(store_sas)

AzureBlobDatastore({'type': <DatastoreType.AZURE_BLOB: 'AzureBlob'>, 'name': 'blob_diabetes_datastore_sas', 'description': 'Datastore for Diabetes data, created with SAS token authorization.', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/datastores/blob_diabetes_datastore_sas', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100', 'creation_context': None, 'serialize': <msrest.serialization.Serializer object at 0x0000025F8A81AAD0>, 'credentials': {'type': 'sas'}, 'container_name': 'diabetesdatacontainer', 'account_name': 'stdp100demo', 'endpoint': 'core.windows.net', 'protocol': 'https'})

In [7]:
# names of every datastore returned by the workspace listing
[ds.name for ds in ml_client.datastores.list()]

['blob_diabetes_datastore_sas',
 'blob_titanic_identity',
 'blob_titanic_account_identity',
 'blob_titanic_sas',
 'azureml_globaldatasets',
 'blobdatastore2',
 'blobdatastore',
 'workspaceartifactstore',
 'workspaceworkingdirectory',
 'workspaceblobstore',
 'workspacefilestore']

In [14]:
# fetch the datastore by its configured name and inspect its attributes
diabetes_datastore = ml_client.datastores.get(DATASTORE_NAME)
vars(diabetes_datastore)

{'_type': <DatastoreType.AZURE_BLOB: 'AzureBlob'>,
 'name': 'blob_diabetes_datastore_sas',
 'description': 'Datastore for Diabetes data, created with SAS token authorization.',
 'tags': {},
 'properties': {},
 'print_as_yaml': True,
 '_id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/datastores/blob_diabetes_datastore_sas',
 '_Resource__source_path': None,
 '_base_path': 'd:\\Repositories\\GitHub\\dp-100',
 '_creation_context': None,
 '_serialize': <msrest.serialization.Serializer at 0x25faeed3910>,
 'credentials': {'type': 'sas'},
 'container_name': 'diabetesdatacontainer',
 'account_name': 'stdp100demo',
 'endpoint': 'core.windows.net',
 'protocol': 'https'}

### 2. Create a URI File Data Asset from the Datastore

In [16]:
# datastore URI that points at diabetes.csv
urifile_data_asset_path = f'azureml://datastores/{diabetes_datastore.name}/paths/diabetes.csv'

# describe the URI file asset first, then register it in the workspace
my_data = Data(
    name=URIFILE_DATA_ASSET_NAME,
    description="Data asset created from datastore",
    type=AssetTypes.URI_FILE,
    path=urifile_data_asset_path,
)

ml_client.data.create_or_update(my_data)

Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_file', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'urifile_diabetes_data_asset', 'description': 'Data asset created from datastore', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/data/urifile_diabetes_data_asset/versions/1', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x0000025FAEED7E10>, 'serialize': <msrest.serialization.Serializer object at 0x0000025FAE8897D0>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo/datastores/blob_diabetes_datastore_

In [22]:
# (name, latest version) for every data asset in the workspace listing
[(item.name, item.latest_version) for item in ml_client.data.list()]

[('titanicdata', '1'),
 ('bike-rentals', '1'),
 ('titanic', '1'),
 ('dataset', '01b454e5'),
 ('titanic_data_asset_from_local', '1'),
 ('titanic_data_asset_from_blob', '1'),
 ('titanic_data_asset_from_datastore_sas', '1'),
 ('titanic_data_asset_from_datastore_identity', '3'),
 ('titanic_data_asset_from_datastore_account_identity', '1'),
 ('titanic_data_asset_from_storage_account', '3'),
 ('titanic_folder_data_asset_from_datastore_sas', '1'),
 ('urifile_diabetes_data_asset', '1')]

### 3. Test reading the data asset

In [20]:
# Resolve the newest version of the URI file data asset directly by name.
# Asking for the "latest" label avoids listing every data asset in the
# workspace and indexing into a possibly empty list, which raised an opaque
# IndexError when the asset had not been created yet.
latest_urifile_asset = ml_client.data.get(name=URIFILE_DATA_ASSET_NAME, label="latest")

# keep the (name, version) tuple shape that the following cells index into
data_asset_tuple = (latest_urifile_asset.name, latest_urifile_asset.version)
data_asset_tuple

('urifile_diabetes_data_asset', '1')

In [25]:
# unpack (name, version) and fetch exactly that version of the data asset
asset_name, asset_version = data_asset_tuple
data_asset = ml_client.data.get(name=asset_name, version=asset_version)

# load the file behind the asset path into pandas and preview two rows
df = pd.read_csv(data_asset.path)
df.head(2)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0


### 4. Create MLTable Data Asset

Build the MLTable from the existing URI file data asset.

In [21]:
import mltable

# Resolve the newest version of the URI file data asset directly by name
# instead of listing every asset and indexing [0], which fails with an opaque
# IndexError when the asset does not exist.
latest_urifile_asset = ml_client.data.get(name=URIFILE_DATA_ASSET_NAME, label="latest")

# keep the (name, version) tuple shape that the following cells index into
data_asset_tuple = (latest_urifile_asset.name, latest_urifile_asset.version)
data_asset_tuple

('urifile_diabetes_data_asset', '1')

In [22]:
# unpack (name, version) and fetch exactly that version of the data asset
asset_name, asset_version = data_asset_tuple
data_asset = ml_client.data.get(name=asset_name, version=asset_version)

In [23]:
# display the full azureml:// URI stored on the data asset
data_asset.path

'azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo/datastores/blob_diabetes_datastore_sas/paths/diabetes.csv'

In [25]:
# list the source file for the table, then build an MLTable that reads it
# as delimited text
paths = [{'file': data_asset.path}]
train_table = mltable.from_delimited_files(paths)

In [26]:
# display the MLTable definition (source paths and read transformations)
train_table

paths:
- file: azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo/datastores/blob_diabetes_datastore_sas/paths/diabetes.csv
transformations:
- read_delimited:
    delimiter: ','
    empty_as_string: false
    encoding: utf8
    header: all_files_same_headers
    include_path_column: false
    infer_column_types: true
    partition_size: 20971520
    path_column: Path
    support_multi_line: false
type: mltable

In [27]:
# preview the first 5 rows as loaded through the MLTable definition
train_table.show(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,True
1,1,85,66,29,0,26.6,0.351,31,False
2,8,183,64,0,0,23.3,0.672,32,True
3,1,89,66,23,94,28.1,0.167,21,False
4,0,137,40,35,168,43.1,2.288,33,True


In [28]:
# Persist the data loading steps as an MLTable file in cloud storage.
# The target is the diabetes_mltable folder on the datastore; it holds the
# file format and loading instructions.
# With colocated=True the data file is copied next to the MLTable file, so the
# folder contains both. That is convenient for small data files, because the
# data and its loading definition live in a single location.
colocated_mltable_uri = f"azureml://subscriptions/{SUBSCRIPTION_ID}/resourcegroups/{RESOURCE_GROUP}/workspaces/{WORKSPACE_NAME}/datastores/{DATASTORE_NAME}/paths/diabetes_mltable"

train_table.save(
    path=colocated_mltable_uri,
    colocated=True,
    show_progress=True,
    overwrite=True,
)

paths:
- file: azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo/datastores/blob_diabetes_datastore_sas/paths/diabetes.csv
transformations:
- read_delimited:
    delimiter: ','
    empty_as_string: false
    encoding: utf8
    header: all_files_same_headers
    include_path_column: false
    infer_column_types: true
    partition_size: 20971520
    path_column: Path
    support_multi_line: false
type: mltable

In [29]:
# The folder saved above holds both the data file and the MLTable yaml file,
# so it can back a new data asset of type MLTable.
my_path = f"azureml://subscriptions/{SUBSCRIPTION_ID}/resourcegroups/{RESOURCE_GROUP}/workspaces/{WORKSPACE_NAME}/datastores/{DATASTORE_NAME}/paths/diabetes_mltable"

# describe the MLTable asset first, then register it in the workspace
my_data = Data(
    name=MLTABLE_DATA_ASSET_NAME,
    description="MLTable data asset created from datastore",
    type=AssetTypes.MLTABLE,
    path=my_path,
)

ml_client.data.create_or_update(my_data)

Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': ['azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo/datastores/blob_diabetes_datastore_sas/paths/diabetes.csv'], 'type': 'mltable', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'mltable_diabetes_data_asset', 'description': 'MLTable data asset created from datastore', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/data/mltable_diabetes_data_asset/versions/1', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x000001FC9E6D0AD0>, 'serialize': <msrest.serialization.Serializer object at 0x000001FC9FD6EA10>, 'version': '1', 'lat

In [30]:
# list every data asset again; the MLTable asset just created should appear
[(item.name, item.latest_version) for item in ml_client.data.list()]

[('titanicdata', '1'),
 ('bike-rentals', '1'),
 ('titanic', '1'),
 ('dataset', '01b454e5'),
 ('titanic_data_asset_from_local', '1'),
 ('titanic_data_asset_from_blob', '1'),
 ('titanic_data_asset_from_datastore_sas', '1'),
 ('titanic_data_asset_from_datastore_identity', '3'),
 ('titanic_data_asset_from_datastore_account_identity', '1'),
 ('titanic_data_asset_from_storage_account', '3'),
 ('titanic_folder_data_asset_from_datastore_sas', '1'),
 ('urifile_diabetes_data_asset', '1'),
 ('mltable_diabetes_data_asset', '1')]

In [31]:
# With colocated=False the data file is NOT copied next to the MLTable file;
# the MLTable file only records the path to the data file.
referenced_mltable_uri = f"azureml://subscriptions/{SUBSCRIPTION_ID}/resourcegroups/{RESOURCE_GROUP}/workspaces/{WORKSPACE_NAME}/datastores/{DATASTORE_NAME}/paths/diabetes_mltable2"

train_table.save(
    path=referenced_mltable_uri,
    colocated=False,
    show_progress=True,
    overwrite=True,
)

paths:
- file: azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo/datastores/blob_diabetes_datastore_sas/paths/diabetes.csv
transformations:
- read_delimited:
    delimiter: ','
    empty_as_string: false
    encoding: utf8
    header: all_files_same_headers
    include_path_column: false
    infer_column_types: true
    partition_size: 20971520
    path_column: Path
    support_multi_line: false
type: mltable

In [32]:
# Register a second MLTable data asset from the new folder. Here the data file
# was not copied next to the MLTable file on the blob container.
my_path = f"azureml://subscriptions/{SUBSCRIPTION_ID}/resourcegroups/{RESOURCE_GROUP}/workspaces/{WORKSPACE_NAME}/datastores/{DATASTORE_NAME}/paths/diabetes_mltable2"

my_data = Data(
    name="mltable_diabetes_data_asset2",
    description="MLTable data asset created from datastore, with colocated=False, for testing purposes.",
    type=AssetTypes.MLTABLE,
    path=my_path,
)

ml_client.data.create_or_update(my_data)

Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': ['azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo/datastores/blob_diabetes_datastore_sas/paths/diabetes.csv'], 'type': 'mltable', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'mltable_diabetes_data_asset2', 'description': 'MLTable data asset created from datastore, with colocated=False, for testing purposes.', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/data/mltable_diabetes_data_asset2/versions/1', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x000001FC9FDF7750>, 'serialize': <msrest.serialization.Serializer obje

### 5. Create environment

In [7]:
from azure.ai.ml.entities import Environment


# Environment definition: a base Docker image plus the conda dependencies
# listed in the yaml file. The name comes from the ENV_NAME constant in the
# configuration cell, so the environment name is defined in one place only.
env_docker_conda = Environment(
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu22.04",
    conda_file="../conda_yamls/diabetes-env.yaml",
    name=ENV_NAME,
    description="Environment created for diabetes experiment.",
)

# register (or update) the environment; the result is the cell output
ml_client.environments.create_or_update(env_docker_conda)

Environment({'intellectual_property': None, 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'diabetes-env', 'description': 'Environment created for diabetes experiment.', 'tags': {}, 'properties': {'azureml.labels': 'latest'}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/environments/diabetes-env/versions/1', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100\\diabetes', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x000001E952C217D0>, 'serialize': <msrest.serialization.Serializer object at 0x000001E951759BD0>, 'version': '1', 'latest_version': None, 'conda_file': {'channels': ['anaconda', 'conda-forge'], 'dependencies': ['python=3.11.7', 'pip=21.3.1', {'pip': ['azureml-fsspec==1.3.0', 'mltable==1.6.0', 'azure-ai-ml == 1.12.1', 'pandas==2.1.4', 's

In [8]:
# confirm the environment is registered by printing (name, latest version)
# for every environment in the workspace listing
envs = ml_client.environments.list()
print([(item.name, item.latest_version) for item in envs])

[('diabetes-env', '1'), ('titanic-env', '5'), ('CliV2AnonymousEnvironment', '0'), ('pytorch-env', '1'), ('testenv-conda-002', '1'), ('testenv-conda', '1'), ('testenv', '1'), ('AzureML-AI-Studio-Development', '1'), ('AzureML-ACPT-pytorch-1.13-py38-cuda11.7-gpu', '10'), ('AzureML-ACPT-pytorch-1.12-py38-cuda11.6-gpu', '14'), ('AzureML-ACPT-pytorch-1.12-py39-cuda11.6-gpu', '14'), ('AzureML-ACPT-pytorch-1.11-py38-cuda11.5-gpu', '14'), ('AzureML-ACPT-pytorch-1.11-py38-cuda11.3-gpu', '17'), ('AzureML-responsibleai-0.21-ubuntu20.04-py38-cpu', '7'), ('AzureML-responsibleai-0.20-ubuntu20.04-py38-cpu', '9'), ('AzureML-tensorflow-2.5-ubuntu20.04-py38-cuda11-gpu', '27'), ('AzureML-tensorflow-2.6-ubuntu20.04-py38-cuda11-gpu', '26'), ('AzureML-tensorflow-2.7-ubuntu20.04-py38-cuda11-gpu', '26'), ('AzureML-sklearn-1.0-ubuntu20.04-py38-cpu', '36'), ('AzureML-pytorch-1.10-ubuntu18.04-py38-cuda11-gpu', '36'), ('AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu', '44'), ('AzureML-pytorch-1.8-ubuntu18.04-py37