In [28]:
from azure.ai.ml import command, Input, MLClient, UserIdentityConfiguration, ManagedIdentityConfiguration
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes, InputOutputModes
from azure.ai.ml.dsl import pipeline
from dotenv import load_dotenv
import pandas as pd
import os

# Workspace coordinates: the subscription, resource group and workspace to connect to,
# plus a datastore name kept alongside them
SUBSCRIPTION_ID = "e5615bfe-b43b-41ce-bccb-b78867c2ce63"
RESOURCE_GROUP = "rg-dp100-demo-001"
WORKSPACE_NAME = "mlw-dp100-demo"
DATASTORE_NAME = "blobdatastore2"

# Load variables from the local env file before the credential is built
# (presumably the settings DefaultAzureCredential picks up -- TODO confirm)
load_dotenv("python.env")

# Workspace-scoped client handle used by the cells below
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=SUBSCRIPTION_ID,
    resource_group_name=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

### 1. URI File Data Asset

Supported paths:
- Local: `./<path>`
- Azure Blob Storage:
     `wasbs://<containername>@<accountname>.blob.core.windows.net/<path_to_data>/`
    
- Azure Data Lake Storage (Gen 2):
    `abfss://<file_system>@<account_name>.dfs.core.windows.net/<folder>/<file>`
    
- Datastore:
    `azureml://datastores/<datastore_name>/paths/<folder>/<file>`

In [2]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


# Register a local CSV as a URI-file data asset.
# The SDK uploads the file as part of create_or_update (see the upload progress in the output).
data_asset_local = './data/titanic.csv'

my_data = Data(
    name="titanic_data_asset_from_local",
    description="Data asset created from local file",
    type=AssetTypes.URI_FILE,
    path=data_asset_local,
)

# last expression: the returned Data entity is displayed as the cell output
ml_client.data.create_or_update(my_data)

[32mUploading titanic.csv[32m (< 1 MB): 100%|##########| 60.3k/60.3k [00:00<00:00, 241kB/s]
[39m



Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_file', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'titanic_data_asset_from_local', 'description': 'Data asset created from local file', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/data/titanic_data_asset_from_local/versions/1', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x0000023895D71590>, 'serialize': <msrest.serialization.Serializer object at 0x0000023895D7DBD0>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo/datastores/workspaceblobstore/

In [18]:
# Register a file that already lives in a datastore as a URI-file data asset.
# The datastore referenced here (blob_titanic_sas) is authorized via SAS token.
data_asset_datastore = 'azureml://datastores/blob_titanic_sas/paths/titanic.csv'

my_data = Data(
    name="titanic_data_asset_from_datastore_sas",
    description="Data asset created from datastore",
    type=AssetTypes.URI_FILE,
    path=data_asset_datastore,
)

# last expression: the returned Data entity is displayed as the cell output
ml_client.data.create_or_update(my_data)

Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_file', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'titanic_data_asset_from_datastore_sas', 'description': 'Data asset created from datastore', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/data/titanic_data_asset_from_datastore_sas/versions/1', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x0000023897453990>, 'serialize': <msrest.serialization.Serializer object at 0x0000023897452110>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo/datastores/blob

In [4]:
# Register a blob directly by its wasbs:// URI, without going through a datastore.
# Caveat: this approach runs into the same "not authorized to access the storage
# account" issue, which shows up with both the SDK and the UI.
#
# Alternative URI form left for reference (not used):
#   'wasbs://stdp100demo.blob.core.windows.net/datacontainer/titanic.csv'
data_asset_blob = 'wasbs://datacontainer@stdp100demo.blob.core.windows.net/titanic.csv'

my_data = Data(
    name="titanic_data_asset_from_storage_account",
    description="Data asset created from storage account directly",
    type=AssetTypes.URI_FILE,
    path=data_asset_blob,
)

# last expression: the returned Data entity is displayed as the cell output
ml_client.data.create_or_update(my_data)

Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_file', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'titanic_data_asset_from_storage_account', 'description': 'Data asset created from storage account directly', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/data/titanic_data_asset_from_storage_account/versions/3', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x0000020421F24F90>, 'serialize': <msrest.serialization.Serializer object at 0x0000020421FB75D0>, 'version': '3', 'latest_version': None, 'path': 'wasbs://datacontainer@stdp100demo.blob.core.windows.net/titanic.csv', 'datastore': None})

In [8]:
# A registered data asset exposes the URI of its underlying data via `.path`;
# fetch version 1 of the SAS-backed file asset and read that URI with pandas.
titanic_data = ml_client.data.get(
    name="titanic_data_asset_from_datastore_sas",
    version=1,
)
file_path = titanic_data.path
print(f"Data asset file URI: {file_path}")

# preview the first rows as the cell output
pd.read_csv(file_path).head()

Data asset file URI: azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo/datastores/blob_titanic_sas/paths/titanic.csv


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### 2. URI Folder Data Asset

In [7]:
# Register a whole folder in a datastore as a URI-folder data asset.
# The datastore referenced here (blob_titanic_sas) is authorized via SAS token.
data_asset_datastore = 'azureml://datastores/blob_titanic_sas/paths/train_test_split/'

my_data = Data(
    name="titanic_folder_data_asset_from_datastore_sas",
    description="Data asset created from datastore",
    type=AssetTypes.URI_FOLDER,
    path=data_asset_datastore,
)

# last expression: the returned Data entity is displayed as the cell output
ml_client.data.create_or_update(my_data)

Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_folder', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'titanic_folder_data_asset_from_datastore_sas', 'description': 'Data asset created from datastore', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/data/titanic_folder_data_asset_from_datastore_sas/versions/1', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x0000020421F84450>, 'serialize': <msrest.serialization.Serializer object at 0x0000020427998DD0>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo

In [32]:
# Fetch version 1 of the URI-folder asset and take the URI stored on it.
titanic_folder_data = ml_client.data.get(
    name="titanic_folder_data_asset_from_datastore_sas",
    version=1,
)
folder_path = titanic_folder_data.path

# Datastore URI as copied from the AML UI:
# select the datastore, browse it, click the "..." button, then "Copy URI".
datastore_uri = 'azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp100-demo/datastores/blob_titanic_sas/paths/train_test_split/'

# Expected to be True: the asset's URI should be the same as the datastore URI
# the asset was created from.
folder_path == datastore_uri

In [36]:
# Datastore URIs implement the Filesystem Spec (fsspec) protocol, so the folder
# behind the data asset can be browsed and read with file-system style calls.
from azureml.fsspec import AzureMachineLearningFileSystem

# instantiate the file system from the data asset URI
fs = AzureMachineLearningFileSystem(folder_path)

# fs.ls() lists the entries under that URI as path strings; read each one into
# a dataframe and preview it
for file in fs.ls():
    print(file)
    print(type(file))
    # the context manager closes the remote file handle once pandas has parsed it
    with fs.open(file) as handle:
        df = pd.read_csv(handle)
    print(df.head(2))

train_test_split/titanic_test.csv
<class 'str'>
   PassengerId  Survived  Pclass                                 Name     Sex  \
0           10         1       2  Nasser, Mrs. Nicholas (Adele Achem)  female   
1           12         1       1             Bonnell, Miss. Elizabeth  female   

    Age  SibSp  Parch  Ticket     Fare Cabin Embarked  
0  14.0      1      0  237736  30.0708   NaN        C  
1  58.0      0      0  113783  26.5500  C103        S  
train_test_split/titanic_train.csv
<class 'str'>
   PassengerId  Survived  Pclass                   Name   Sex  Age  SibSp  \
0          496         0       3  Yousseff, Mr. Gerious  male  NaN      0   
1          649         0       3     Willey, Mr. Edward  male  NaN      0   

   Parch         Ticket     Fare Cabin Embarked  
0      0           2627  14.4583   NaN        C  
1      0  S.O./P.P. 751   7.5500   NaN        S  


In [37]:
# Same walk over the folder, but only entries whose name ends in .csv are opened
for file in fs.ls():
    if file.endswith('.csv'):
        print(file)
        print(type(file))
        # the context manager closes the remote file handle once pandas has parsed it
        with fs.open(file) as handle:
            df = pd.read_csv(handle)
        print(df.head(2))

train_test_split/titanic_test.csv
<class 'str'>
   PassengerId  Survived  Pclass                                 Name     Sex  \
0           10         1       2  Nasser, Mrs. Nicholas (Adele Achem)  female   
1           12         1       1             Bonnell, Miss. Elizabeth  female   

    Age  SibSp  Parch  Ticket     Fare Cabin Embarked  
0  14.0      1      0  237736  30.0708   NaN        C  
1  58.0      0      0  113783  26.5500  C103        S  
train_test_split/titanic_train.csv
<class 'str'>
   PassengerId  Survived  Pclass                   Name   Sex  Age  SibSp  \
0          496         0       3  Yousseff, Mr. Gerious  male  NaN      0   
1          649         0       3     Willey, Mr. Edward  male  NaN      0   

   Parch         Ticket     Fare Cabin Embarked  
0      0           2627  14.4583   NaN        C  
1      0  S.O./P.P. 751   7.5500   NaN        S  


In [38]:
# Alternative to fsspec: let the mltable package materialize the URI folder
# into a single pandas dataframe.
import mltable

titanic_folder_data = ml_client.data.get(
    name="titanic_folder_data_asset_from_datastore_sas",
    version=1,
)
folder_path = titanic_folder_data.path

# the folder URI is handed to mltable as a {'folder': <uri>} entry
path = {'folder': folder_path}

tbl = mltable.from_delimited_files(paths=[path])
df = tbl.to_pandas_dataframe()

# NOTE(review): in the captured output `Survived` comes back as True/False here,
# whereas pd.read_csv showed 0/1 -- mltable appears to infer column types.
print(df.shape)
df.head(2)

(891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,10,True,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
1,12,True,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.55,C103,S


### 3. MLTable Data Asset

- We need to include an `MLTable` file in the same folder as the data we want to read. The `MLTable` file specifies the paths of the data to read and how to read it.
- If the `MLTable` file is missing, the SDK emits the warning "Unable to access MLTable metadata at path".
- In that case the data asset is still created, but it cannot be read.

```yaml
type: mltable
paths:
  - pattern: ./*.csv
transformations:
  - read_delimited:
      delimiter: ','
      encoding: ascii
      header: all_files_same_headers
```

In [39]:
# Register an MLTable-typed data asset from a datastore path.
# NOTE(review): this path is a single CSV file, and create_or_update reports
# "Unable to access MLTable metadata at path ..." for it (see the cell output).
# This looks like the demonstration of that warning -- confirm it is intended.
data_asset_datastore = 'azureml://datastores/blob_titanic_sas/paths/titanic.csv'

my_data = Data(
    name="titanic_mltable_data_asset_from_datastore_sas",
    description="MLTable data asset created from datastore",
    type=AssetTypes.MLTABLE,
    path=data_asset_datastore,
)

# last expression: the returned Data entity is displayed as the cell output
ml_client.data.create_or_update(my_data)

Unable to access MLTable metadata at path azureml://datastores/blob_titanic_sas/paths/titanic.csv


Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'mltable', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'titanic_mltable_data_asset_from_datastore_sas', 'description': 'MLTable data asset created from datastore', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourceGroups/rg-dp100-demo-001/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-demo/data/titanic_mltable_data_asset_from_datastore_sas/versions/1', 'Resource__source_path': None, 'base_path': 'd:\\Repositories\\GitHub\\dp-100', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x0000020401A11A50>, 'serialize': <msrest.serialization.Serializer object at 0x0000020427F98210>, 'version': '1', 'latest_version': None, 'path': 'azureml://subscriptions/e5615bfe-b43b-41ce-bccb-b78867c2ce63/resourcegroups/rg-dp100-demo-001/workspaces/mlw-dp1