In [1]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# ml_client = MLClient(subscription_id="f804f2da-c27b-45ac-bf80-16d4d331776d",
#                     workspace_name="mlw-mlopsv2clas-505prod",
#                     resource_group_name="rg-mlopsv2clas-505prod",
#                     credential=DefaultAzureCredential()
#                     )
# ml_client = MLClient(subscription_id = "e62983d6-29cb-4435-b8d2-b19887c7a735",
#                      resource_group_name = "mltable_PoC",
#                      workspace_name = "mltable_poc",
#                     credential=DefaultAzureCredential()
#                     )

# --subscription f804f2da-c27b-45ac-bf80-16d4d331776d --resource-group rg-mltable-profiler --workspace-name mlw-mltable-profiler

# Handle to the Azure ML workspace used by the rest of this notebook.
# Authentication is delegated to DefaultAzureCredential.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="f804f2da-c27b-45ac-bf80-16d4d331776d",
    resource_group_name="rg-mltable-profiler",
    workspace_name="mlw-mltable-profiler",
)
print(ml_client)

Class WorkspaceHubOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7f84591de610>,
         subscription_id=f804f2da-c27b-45ac-bf80-16d4d331776d,
         resource_group_name=rg-mltable-profiler,
         workspace_name=mlw-mltable-profiler)


In [2]:
# Print the name of every data asset in the workspace whose type is "mltable";
# assets of any other type are skipped.
datasets = ml_client.data.list()
for dataset in datasets:
    if dataset.type != "mltable":
        continue
    print(dataset.name)

In [3]:
import glob
import re
import os

def _collect_keyed_files(root_dir, filename_pattern):
    """Finds the files under root_dir whose base name matches filename_pattern.

    Args:
        root_dir (str): directory that is searched recursively.
        filename_pattern (str): regular expression applied with `match` (i.e.
            anchored at the start only) to each file's base name; its first
            capture group is used as the file's key.

    Returns:
        (matched, not_matching): `matched` is a list of (key, file_path) tuples
        in sorted path order, `not_matching` is the number of files whose base
        name did not match the pattern.
    """
    compiled = re.compile(filename_pattern)
    matched = []
    not_matching = 0

    # sorted() makes the result independent of the file system's listing order
    for file_path in sorted(glob.glob(os.path.join(root_dir, "**", "*"), recursive=True)):
        if os.path.isdir(file_path):
            # "**/*" also yields sub-directories; they are containers, not candidates
            continue
        found = compiled.match(os.path.basename(file_path))
        if found:
            matched.append((found.group(1), file_path))
        else:
            # keep some stats
            not_matching += 1

    return matched, not_matching


def build_pair_list(images_dir, masks_dir, 
                    images_filename_pattern, masks_filename_pattern):
    """Builds the list of image/mask pairs that share the same key.

    Both directories are searched recursively. A file's key is the first
    capture group of the corresponding pattern matched against the file's
    base name, so each pattern must define at least one group. An image is
    paired with the mask that has the same key; images without a mask are
    reported with a printed message and left out. If several masks share a
    key, the last one in sorted path order is used.

    A summary of the parsing statistics is printed before returning.

    Args:
        images_dir (str): root directory of the images.
        masks_dir (str): root directory of the masks.
        images_filename_pattern (str): regex for image base names.
        masks_filename_pattern (str): regex for mask base names.

    Returns:
        image_masks_pairs (List[tuple(str, str, str)]): one
        (key, image_path, mask_path) tuple per matched image, ordered by
        image path.
    """
    masks_found, masks_not_matching = _collect_keyed_files(
        masks_dir, masks_filename_pattern
    )
    images_found, images_not_matching = _collect_keyed_files(
        images_dir, images_filename_pattern
    )
    masks_by_key = dict(masks_found)  # turn list of tuples into a map

    parsing_stats = {
        "masks_not_matching": masks_not_matching,
        "images_not_matching": images_not_matching,
        "images_without_masks": 0,
    }

    # now match images and masks
    image_masks_pairs = []  # list of (key, image_path, mask_path) tuples
    for image_key, image_path in images_found:
        if image_key in masks_by_key:
            image_masks_pairs.append((image_key, image_path, masks_by_key[image_key]))
        else:
            print(
                f"Image {image_path} doesn't have a corresponding mask."
            )
            # keep some stats
            parsing_stats["images_without_masks"] += 1

    parsing_stats["found_pairs"] = len(image_masks_pairs)

    print(f"Finished parsing images/masks paths: {parsing_stats}")

    return image_masks_pairs

In [4]:
# Pair each image under ./data/images/ with the trimap under
# ./data/annotations/trimaps/ that shares its key (the captured file-name part).
image_masks_pairs = build_pair_list(
    images_dir="./data/images/",
    masks_dir="./data/annotations/trimaps/",
    images_filename_pattern="(.*)\\.jpg",
    masks_filename_pattern="(.*)\\.png",
)
# Preview the first five (key, image path, mask path) entries.
image_masks_pairs[:5]

Finished parsing images/masks paths: {'masks_not_matching': 0, 'images_not_matching': 3, 'images_without_masks': 0, 'found_pairs': 7390}


[('Abyssinian_1',
  './data/images/Abyssinian_1.jpg',
  './data/annotations/trimaps/Abyssinian_1.png'),
 ('Abyssinian_10',
  './data/images/Abyssinian_10.jpg',
  './data/annotations/trimaps/Abyssinian_10.png'),
 ('Abyssinian_100',
  './data/images/Abyssinian_100.jpg',
  './data/annotations/trimaps/Abyssinian_100.png'),
 ('Abyssinian_101',
  './data/images/Abyssinian_101.jpg',
  './data/annotations/trimaps/Abyssinian_101.png'),
 ('Abyssinian_102',
  './data/images/Abyssinian_102.jpg',
  './data/annotations/trimaps/Abyssinian_102.png')]

In [5]:
import json
import os

dataset_parent_dir = "."
# NOTE: these must be plain strings -- a trailing comma after the value would
# silently turn it into a 1-tuple.
images_dir = "./data/images/"
masks_dir = "./data/annotations/trimaps/"
images_filename_pattern = "(.*)\\.jpg"
masks_filename_pattern = "(.*)\\.png"
images_ds = "pet_images"
dataset_dir = "./data"

# The JSONL annotation file is written inside its MLTable folder
training_mltable_path = os.path.join(dataset_parent_dir, "mltable-folder")

# First, let's create the folder if it doesn't exist
os.makedirs(training_mltable_path, exist_ok=True)

# Path to the annotation file
train_annotations_file = os.path.join(training_mltable_path, "image_mask.jsonl")


index = 0

# Resolve the data asset BEFORE opening the output file, so that a failed
# lookup does not leave an empty/truncated JSONL file behind.
uri_folder_data_asset = ml_client.data.get(name=images_ds, version="1")
print(uri_folder_data_asset)

# The asset path may already end with "/"; strip it so that the URLs built
# below do not contain an empty "//" path segment.
asset_root = uri_folder_data_asset.path.rstrip("/")
images_url_root = asset_root + "/images/"
masks_url_root = asset_root + "/annotations/trimaps/"


def _relative_url_part(file_path, root_dir):
    """Returns file_path relative to root_dir, using "/" as the separator.

    The result keeps the file extension and any sub-directory, so it can be
    appended to the URL of the folder that mirrors root_dir.
    """
    return os.path.relpath(file_path, root_dir).replace(os.sep, "/")


# Write one JSON line per image/mask pair.
# Assumes the asset folder mirrors the local layout (images/ and
# annotations/trimaps/ below the asset root) and that the paths in
# image_masks_pairs live under images_dir / masks_dir -- TODO confirm.
with open(train_annotations_file, "w", encoding="utf-8") as jsonl:
    for _image_key, image, mask in image_masks_pairs:
        json_line = {
            # use the real file names (with extension), not the bare key
            "image_url": images_url_root + _relative_url_part(image, images_dir),
            "mask_url": masks_url_root + _relative_url_part(mask, masks_dir),
        }

        jsonl.write(json.dumps(json_line) + "\n")

creation_context:
  created_at: '2023-09-14T06:15:04.361758+00:00'
  created_by: Ali Bina
  created_by_type: User
  last_modified_at: '2023-09-14T06:15:04.370483+00:00'
description: OXFORD-IIIT PET Dataset
id: /subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourceGroups/rg-mltable-profiler/providers/Microsoft.MachineLearningServices/workspaces/mlw-mltable-profiler/data/pet_images/versions/1
name: pet_images
path: azureml://subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourcegroups/rg-mltable-profiler/workspaces/mlw-mltable-profiler/datastores/workspaceblobstore/paths/LocalUpload/e96e8f00a277573a37b4ac32dec5a077/data/
properties: {}
tags: {}
type: uri_folder
version: '1'

