In [24]:
from azure.ai.ml import command, Input, MLClient, UserIdentityConfiguration, ManagedIdentityConfiguration
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.ai.ml.dsl import pipeline
from dotenv import load_dotenv
import pandas as pd

# Workspace coordinates shared by every cell in this notebook
SUBSCRIPTION_ID = "e5615bfe-b43b-41ce-bccb-b78867c2ce63"
RESOURCE_GROUP = "rg-dp100-demo-001"
WORKSPACE_NAME = "mlw-dp100-demo"
DATASTORE_NAME = "blobdatastore2"

# Load python.env first, so its variables are in the process environment
# by the time the credential is created (presumably it carries the
# authentication settings -- confirm against the file).
load_dotenv("python.env")

# Handle to the workspace; all later cells talk to Azure ML through it
credential = DefaultAzureCredential()
ml_client = MLClient(
    credential,
    subscription_id=SUBSCRIPTION_ID,
    resource_group_name=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

In [7]:
# List the environments of the workspace and show each one's name together
# with its latest version
envs = ml_client.environments.list()
print([(item.name, item.latest_version) for item in envs])

# Fetch one specific, pinned version of the custom environment
env = ml_client.environments.get(version=2, name="titanic-env")

[('titanic-env', '2'), ('CliV2AnonymousEnvironment', '0'), ('pytorch-env', '1'), ('testenv-conda-002', '1'), ('testenv-conda', '1'), ('testenv', '1'), ('AzureML-AI-Studio-Development', '1'), ('AzureML-ACPT-pytorch-1.13-py38-cuda11.7-gpu', '10'), ('AzureML-ACPT-pytorch-1.12-py38-cuda11.6-gpu', '14'), ('AzureML-ACPT-pytorch-1.12-py39-cuda11.6-gpu', '14'), ('AzureML-ACPT-pytorch-1.11-py38-cuda11.5-gpu', '14'), ('AzureML-ACPT-pytorch-1.11-py38-cuda11.3-gpu', '17'), ('AzureML-responsibleai-0.21-ubuntu20.04-py38-cpu', '7'), ('AzureML-responsibleai-0.20-ubuntu20.04-py38-cpu', '9'), ('AzureML-tensorflow-2.5-ubuntu20.04-py38-cuda11-gpu', '27'), ('AzureML-tensorflow-2.6-ubuntu20.04-py38-cuda11-gpu', '26'), ('AzureML-tensorflow-2.7-ubuntu20.04-py38-cuda11-gpu', '26'), ('AzureML-sklearn-1.0-ubuntu20.04-py38-cpu', '36'), ('AzureML-pytorch-1.10-ubuntu18.04-py38-cuda11-gpu', '36'), ('AzureML-pytorch-1.9-ubuntu18.04-py37-cuda11-gpu', '44'), ('AzureML-pytorch-1.8-ubuntu18.04-py37-cuda11-gpu', '43'), ('

### 1. Try Reading the Data from Registered Datastore

- Main Reference: 
    - Using `command()`: https://learn.microsoft.com/en-us/azure/machine-learning/how-to-read-write-data-v2?view=azureml-api-2&tabs=python
    - Using Pandas: https://learn.microsoft.com/en-us/azure/machine-learning/tutorial-pipeline-python-sdk?view=azureml-api-2 
- The path to datastore must follow this format - note the "paths" constant: `azureml://datastores/<data_store_name>/paths/<subfolder/file.extension>`
- Note that the datastore was set up to connect to a specific container, so the container name is already treated as the root folder and must not be included in the path above. If the container name is included, or any other wrong path is provided, a StreamNotFound error will be thrown, indicating that the data could not be found at the (wrong) path.
- The compute target must be a compute cluster; if a compute instance is used instead, a `UserError` about not being the owner of the compute is raised (reason unknown).
- For simplicity, if using a custom environment, use the `Environment` instance as the input to the command environment argument, instead of an address.

In [28]:
# Settings for the command job that reads a file from the registered datastore.
#
# Short-form datastore URI. It must match the pattern
#   azureml://datastores/[a-zA-Z0-9_]+/paths/.*
# The datastore name comes from DATASTORE_NAME so it is configured in a single
# place, consistent with the long-form URIs built elsewhere in this notebook.
datastore_path = f"azureml://datastores/{DATASTORE_NAME}/paths/titanic_train.csv"

# The input is a single file, mounted read-only on the compute
data_type = AssetTypes.URI_FILE
mode = InputOutputModes.RO_MOUNT

# Identity configuration handed to the job (managed identity)
identity = ManagedIdentityConfiguration()

# Environment the job runs in; note this rebinds `env` from the earlier cell
env = ml_client.environments.get(name="testenv", version=1)

In [29]:
# The job has a single input: the datastore file, using the asset type and
# access mode chosen in the previous cell
inputs = dict(
    input_data=Input(path=datastore_path, type=data_type, mode=mode),
)

# `head` prints the first 10 lines of the file, which is enough to prove that
# the job can reach and read the data
job = command(
    inputs=inputs,
    command="head ${{inputs.input_data}}",
    compute="vmcluster-ml-dev",
    environment=env,
    identity=identity,
)

# Submit the job; the returned job object is displayed as the cell output
ml_client.jobs.create_or_update(job)



Experiment,Name,Type,Status,Details Page
dp-100,olive_jackal_lhmsy9phql,command,Starting,Link to Azure Machine Learning studio


In [30]:
# The data can also be read straight from the datastore with pandas by using
# the long-form URI (subscription / resource group / workspace / datastore).
# NOTE(review): pandas needs a filesystem handler for the azureml:// scheme
# (presumably azureml-fsspec) -- confirm it is installed in the kernel.
PATH = 'titanic.csv'
uri = (
    f'azureml://subscriptions/{SUBSCRIPTION_ID}'
    f'/resourcegroups/{RESOURCE_GROUP}'
    f'/workspaces/{WORKSPACE_NAME}'
    f'/datastores/{DATASTORE_NAME}'
    f'/paths/{PATH}'
)
df = pd.read_csv(uri)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


### Creating the 1st Component - data_prep

In [31]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Custom environment registered in the workspace, pinned to one version.
# NOTE(review): the environment listing shown earlier in this notebook reports
# '2' as the latest version of titanic-env -- confirm that version 3 exists.
pipeline_job_env = ml_client.environments.get(version=3, name="titanic-env")

# Folder that holds the component's source code (data_prep.py)
data_prep_src_dir = "./src/components"

# Long-form datastore URI of the raw data file
PATH = 'titanic.csv'
uri = (
    f'azureml://subscriptions/{SUBSCRIPTION_ID}'
    f'/resourcegroups/{RESOURCE_GROUP}'
    f'/workspaces/{WORKSPACE_NAME}'
    f'/datastores/{DATASTORE_NAME}'
    f'/paths/{PATH}'
)

# First pipeline step: runs data_prep.py, which receives the input data and the
# test/train ratio and is given two output folders (train and test) to fill.
data_prep_component = command(
    name="data_prep_titanic_survival",
    display_name="Data preparation for training",
    description="reads input, split the input to train and test",
    inputs=dict(
        data=Input(type="uri_folder"),
        test_train_ratio=Input(type="number"),
    ),
    outputs={
        "train_data": Output(type="uri_folder", mode="rw_mount"),
        "test_data": Output(type="uri_folder", mode="rw_mount"),
    },
    # Source folder uploaded together with the component
    code=data_prep_src_dir,
    # The backslashes join the lines below into one single-line shell command
    command="""python data_prep.py \
            --data ${{inputs.data}} --test_train_ratio ${{inputs.test_train_ratio}} \
            --train_data ${{outputs.train_data}} --test_data ${{outputs.test_data}} \
            """,
    # The environment is referenced as "<name>:<version>"
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
)

In [32]:
# Optional: register (create or update) the component in the workspace.
#
# `command()` returns a builder object whose `.component` attribute holds the
# component definition. After the first run of this cell, `data_prep_component`
# is rebound to the registered component, which presumably has no `.component`
# attribute. Falling back to the object itself keeps the cell safe to re-run
# without restarting the kernel.
component_to_register = getattr(data_prep_component, "component", data_prep_component)
data_prep_component = ml_client.create_or_update(component_to_register)

# Confirm the registration and show the version assigned by the workspace
print(
    f"Component {data_prep_component.name} with Version {data_prep_component.version} is registered"
)

[32mUploading components (0.0 MBs): 100%|##########| 1547/1547 [00:00<00:00, 19791.03it/s]
[39m



Component data_prep_titanic_survival with Version 1 is registered


### Create component 2: training (using yaml definition)