## 1. Install dependencies
Before you start: if you are running this notebook in Azure Machine Learning studio, or running it locally for the first time, install the following packages.

In [None]:
! pip install azure-ai-ml==1.23.1
! pip install azure-identity==1.19.0

## 2. Connect to the Azure Machine Learning workspace

Before we dive into the code, you'll need to connect to your workspace. The workspace is the top-level resource for Azure Machine Learning, providing a centralized place to work with all the artifacts you create when you use Azure Machine Learning.

We are using `DefaultAzureCredential` to get access to the workspace. `DefaultAzureCredential` should be capable of handling most scenarios. If you want to learn more about other available credentials, see the [set up authentication doc](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-setup-authentication?tabs=sdk) and the [azure-identity reference doc](https://learn.microsoft.com/en-us/python/api/azure-identity/azure.identity?view=azure-python).

In [None]:
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from azure.identity import DefaultAzureCredential
import mltable
import pandas as pd
import os

# Do not truncate long cell values (such as URIs) when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

# Credential used to authenticate against the workspace.
credential = DefaultAzureCredential()

# Workspace client created through MLClient.from_config with that credential.
ml_client = MLClient.from_config(credential=credential)

## 3. Process exports
The Azure Machine Learning feature "Data Labeling" can be used to label data and export the created labels.  
For now, the export needs to be done manually in each project, with the following settings:  
Asset type: Labeled  
Export Format: Azure ML dataset  
Include these details in the export output: Labeler details

Once the exports have completed successfully, the code below finds the latest exported dataset of each project and combines them.


In [None]:
# Labeling projects whose exports are combined.
labeling_projects_to_combine = [
    "project_based_on_data_from_notebook",
    "eco_labeling_1",
]

# Name and description of the combined dataset.
new_dataset_name = "combined_dataset"
new_dataset_description = (
    f"This is the combined dataset of the projects: {', '.join(labeling_projects_to_combine)}"
)

In [None]:
# Collect, per labeling project, every data asset whose name contains the project name.
# NOTE(review): this is a substring match, so an asset of a different project whose name
# merely contains the project name (e.g. "eco_labeling_1" inside "eco_labeling_10") is
# picked up as well - confirm the export naming scheme before tightening this.
candidates_per_project = {p: [] for p in labeling_projects_to_combine}

for d in ml_client.data.list():
    for p in labeling_projects_to_combine:
        if p in d.name:
            candidates_per_project[p].append(d)

# Fail early with a readable message instead of an IndexError when a project has no export.
projects_without_export = [p for p, datasets in candidates_per_project.items() if not datasets]
if projects_without_export:
    raise ValueError(
        "No exported dataset found for labeling project(s): "
        + ", ".join(projects_without_export)
    )

# get latest export per project: the asset whose name sorts last
# (presumably the export names end in a sortable timestamp - TODO confirm)
projects_with_datasets = {
    p: max(datasets, key=lambda x: x.name)
    for p, datasets in candidates_per_project.items()
}

projects_with_datasets

In [None]:
# Load the selected export of every project into a pandas DataFrame.
list_of_df = []
for dataset in projects_with_datasets.values():
    # Prefer the version reported by the listing (`latest_version`) when it is available;
    # fall back to "1", the value that was hard-coded before.
    version = getattr(dataset, "latest_version", None) or "1"
    data_asset = ml_client.data.get(dataset.name, version=version)

    # "azureml:/" + the asset id forms the URI that mltable.load is given.
    tbl = mltable.load(f"azureml:/{data_asset.id}")
    list_of_df.append(tbl.to_pandas_dataframe())

In [None]:
# Stack the per-project frames. ignore_index=True gives the combined frame a fresh
# 0..n-1 index instead of repeating the row labels of every source frame.
df = pd.concat(list_of_df, ignore_index=True)
df.head(5)

In [None]:
# Convert the stream-info objects in the image_url column back into plain URI strings like:
# azureml://subscriptions/<subscription-id>/resourcegroups/<resource-group>/workspaces/<workspace>/datastores/workspaceblobstore/paths/odFridgeObjects/images/85.jpg


def _stream_info_to_uri(stream_info):
    """Build the azureml:// URI string for one image_url entry.

    The entry is expected to expose `arguments` (with the keys "subscription",
    "resourceGroup", "workspaceName" and "datastoreName") and `resource_id`.
    Values that are already strings are returned unchanged, so the cell can be re-run.
    """
    if isinstance(stream_info, str):
        return stream_info

    args = stream_info.arguments
    datastore = args["datastoreName"]

    # Drop the datastore name only when it really is a prefix of resource_id.
    # str.lstrip() takes a *set of characters*, not a prefix, so it could also strip
    # leading characters that belong to the path itself.
    path = stream_info.resource_id
    if path.startswith(datastore):
        path = path[len(datastore):]

    return (
        "azureml://subscriptions/" + args["subscription"]
        + "/resourcegroups/" + args["resourceGroup"]
        + "/workspaces/" + args["workspaceName"]
        + "/datastores/" + datastore
        + "/paths" + path
    )


df["image_url"] = df["image_url"].apply(_stream_info_to_uri).astype(str)

In [None]:
# Optional: labels to keep. Leave the list empty to skip the filtering step.
labels_subset = [
    "beverage",
]

def filter_labels(row, labels_subset):
    """Keep only the label entries of a row whose 'label' value is in `labels_subset`.

    Args:
        row: Mapping-like row providing 'label' (an iterable of dicts that each have a
            'label' key) and 'label_confidence' (indexable by position).
        labels_subset: Collection of label names to keep.

    Returns:
        pd.Series with two elements: the list of kept label entries and the list of
        confidences found at the same positions.
    """
    # Remember position and entry of every match so both lists stay in step.
    matches = [
        (position, entry)
        for position, entry in enumerate(row['label'])
        if entry['label'] in labels_subset
    ]

    kept_labels = [entry for _, entry in matches]
    kept_confidence = [row['label_confidence'][position] for position, _ in matches]

    return pd.Series([kept_labels, kept_confidence])


if labels_subset:
    # Append the filter note only once, so that re-running this cell does not keep
    # extending the description.
    filter_note = " filtered for the following labels: " + ", ".join(labels_subset)
    if not new_dataset_description.endswith(filter_note):
        new_dataset_description = new_dataset_description + filter_note

    # Replace both columns with their filtered counterparts, row by row.
    df[['label', 'label_confidence']] = df.apply(filter_labels, labels_subset=labels_subset, axis=1)

df.label.head(5)

In [None]:
# Preview the first five confidence entries.
df["label_confidence"].head()

In [None]:
# Write the combined frame as a JSON Lines file into a folder named after the new dataset.
folder = new_dataset_name
os.makedirs(folder, exist_ok=True)

filename = "labeledDatapoints_1.jsonl"
filename_with_folder = folder + "/" + filename

# One JSON record per line; turn the "\/" sequences in the JSON output back into plain "/".
jsonl_content = df.to_json(orient='records', lines=True, force_ascii=False).replace("\\/", "/")

# force_ascii=False lets non-ASCII characters through unescaped, so pin the file encoding
# instead of relying on the platform default.
with open(filename_with_folder, "w", encoding="utf-8") as f:
    f.write(jsonl_content)

In [None]:
# Location of the JSON Lines file that backs the table
paths = [{"file": filename_with_folder}]

# Build the MLTable from that file and convert the image_url column to stream_info
tbl = mltable.from_json_lines_files(paths=paths)
tbl = tbl.convert_column_types({"image_url": "stream_info"})

# Save the MLTable definition into the dataset folder
tbl.save(folder)

# Data asset pointing at the folder, typed as MLTABLE
my_data = Data(
    name=new_dataset_name,
    description=new_dataset_description,
    type=AssetTypes.MLTABLE,
    path=folder,
)

# Create (or update) the data asset in the workspace
ml_client.data.create_or_update(my_data)