In [1]:
from azure.ai.ml import command, Input, MLClient, UserIdentityConfiguration, ManagedIdentityConfiguration
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.ai.ml.dsl import pipeline
from dotenv import load_dotenv
import pandas as pd
import os

# Coordinates of the Azure ML workspace this notebook works against
SUBSCRIPTION_ID = "e5615bfe-b43b-41ce-bccb-b78867c2ce63"
RESOURCE_GROUP = "rg-dp100-demo-001"
WORKSPACE_NAME = "mlw-dp100-demo"
DATASTORE_NAME = "blobdatastore2"

# Load variables from python.env into the process environment before the
# credential is constructed (presumably the service principal settings that
# DefaultAzureCredential picks up -- confirm against python.env)
load_dotenv("python.env")

# Get a handle to the workspace
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=SUBSCRIPTION_ID,
    resource_group_name=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

### Method 1: using the default identity

- The default identity is the one the MLClient resolved in the step above.
- Recall that the MLClient is using the "pythonapp" service principal (SP), which has the Contributor role on the resource group and can therefore create and manage resources inside the RG, such as the storage account below.

- However, the datastore created with the registered app identity cannot browse data in the UI.
- I tried assigning the security group various storage account roles, but nothing changed: Storage Blob Reader/Contributor/Owner, Admin Owner, Reader and Data Access.
- As a result, the data asset created via this datastore cannot access the actual data on blob storage.

In [2]:
from azure.ai.ml.entities import AzureBlobDatastore

# Identity-based datastore definition: no `credentials` argument is passed,
# so no secret (SAS token / account key) is stored with the datastore.
store_id = AzureBlobDatastore(
    account_name="stdp100demo",
    container_name="datacontainer",
    name="blob_titanic_identity",
    description="Datastore for Titanic data, created with identity based authorization",
)

# Register (or update) the datastore in the workspace; the returned entity is
# the cell's displayed output.
ml_client.create_or_update(store_id)

AzureBlobDatastore({'type': <DatastoreType.AZURE_BLOB: 'AzureBlob'>, 'name': 'blob_titanic_identity', 'description': 'Datastore for Titanic data, created with identity based authorization', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/datastores/blob_titanic_identity', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100', 'creation_context': None, 'serialize': <msrest.serialization.Serializer object at 0x00000142A39A7F10>, 'credentials': <azure.ai.ml.entities._credentials.NoneCredentialConfiguration object at 0x00000142A3723DD0>, 'container_name': 'datacontainer', 'account_name': 'stdp100demo', 'endpoint': 'core.windows.net', 'protocol': 'https'})

### Method 2: using the SAS token signed by one of the access keys

Unlike the issue above, using SAS is smooth. The datastore can browse the blob container from the UI, and the data asset created via this datastore can also see the data and parse it into a pandas DataFrame.

In [6]:
# Prerequisite: create a SAS token for the storage account with appropriate
# permissions and expose it as AZURE_STORAGE_SAS_TOKEN -- either in python.env
# (loaded with load_dotenv in the setup cell) or in the process environment.
#
# SECURITY: never paste the token into the notebook. A SAS token is a bearer
# credential; any token that was ever committed in source should be treated as
# compromised and invalidated by rotating the account key that signed it.

from azure.ai.ml.entities import SasTokenConfiguration

# Fail fast with a clear message instead of registering a datastore with an
# empty/missing credential.
sas_token = os.environ.get("AZURE_STORAGE_SAS_TOKEN")
if not sas_token:
    raise RuntimeError(
        "AZURE_STORAGE_SAS_TOKEN is not set. Add it to python.env or export it "
        "before running this cell."
    )

store_sas = AzureBlobDatastore(
    name="blob_titanic_sas",
    description="Datastore for Titanic data, created with SAS token authorization.",
    account_name="stdp100demo",
    container_name="datacontainer",
    credentials=SasTokenConfiguration(sas_token=sas_token),
)

# Register (or update) the datastore in the workspace; the returned entity is
# the cell's displayed output.
ml_client.create_or_update(store_sas)

AzureBlobDatastore({'type': <DatastoreType.AZURE_BLOB: 'AzureBlob'>, 'name': 'blob_titanic_sas', 'description': 'Datastore for Titanic data, created with SAS token authorization.', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/datastores/blob_titanic_sas', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100', 'creation_context': None, 'serialize': <msrest.serialization.Serializer object at 0x0000020144D3F1D0>, 'credentials': {'type': 'sas'}, 'container_name': 'datacontainer', 'account_name': 'stdp100demo', 'endpoint': 'core.windows.net', 'protocol': 'https'})

### Method 3: using the account key directly (a.k.a. Shared Key authorization, recommended)

### List datastores in the workspace

In [2]:
# Names of all datastores registered in the workspace
[datastore.name for datastore in ml_client.datastores.list()]

['blob_titanic_identity',
 'blob_titanic_account_identity',
 'blob_titanic_sas',
 'azureml_globaldatasets',
 'blobdatastore2',
 'blobdatastore',
 'workspaceartifactstore',
 'workspaceworkingdirectory',
 'workspaceblobstore',
 'workspacefilestore']

### List data assets in the workspace

In [10]:
# (name, latest version) for every data asset registered in the workspace
[(data_asset.name, data_asset.latest_version) for data_asset in ml_client.data.list()]

[('titanicdata', '1'),
 ('bike-rentals', '1'),
 ('titanic', '1'),
 ('dataset', '01b454e5'),
 ('titanic_data_asset_from_local', '1'),
 ('titanic_data_asset_from_blob', '1'),
 ('titanic_data_asset_from_datastore_sas', '1'),
 ('titanic_data_asset_from_datastore_identity', '3')]

In [8]:
# Materialize the listing so it can be indexed
assets = list(ml_client.data.list())

# Inspect the raw attributes of the last asset returned by the listing
asset = assets[-1]
vars(asset)

{'_skip_validation': False,
 '_mltable_schema_url': None,
 '_referenced_uris': None,
 'type': 'uri_file',
 '_is_anonymous': False,
 '_auto_increment_version': True,
 'auto_delete_setting': None,
 'name': 'titanic_data_asset_from_datastore_identity',
 'description': None,
 'tags': {},
 'properties': {},
 'print_as_yaml': True,
 '_id': None,
 '_Resource__source_path': None,
 '_base_path': 'd:\\Repositories\\GitHub\\dp-100',
 '_creation_context': <azure.ai.ml.entities._system_data.SystemData at 0x18bfc8efc50>,
 '_serialize': <msrest.serialization.Serializer at 0x18bfc903050>,
 '_version': None,
 'latest_version': '3',
 '_path': None,
 'datastore': None}

### Getting the actual data via data asset

In [2]:
# Read the data asset that was registered through the SAS-authorized datastore
data_asset = ml_client.data.get(
    name="titanic_data_asset_from_datastore_sas",
    version="1",
)
# Parse the file the asset points to as CSV
df = pd.read_csv(filepath_or_buffer=data_asset.path)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [3]:
# Read the data asset that was registered through the account-key-authorized datastore
data_asset = ml_client.data.get(
    name="titanic_data_asset_from_datastore_account_identity",
    version="1",
)
# Parse the file the asset points to as CSV
df = pd.read_csv(filepath_or_buffer=data_asset.path)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C
