In [None]:
Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

In [None]:
# Load required python packages
from azureml.core import Workspace, Run, Model, Dataset
from azureml.data import OutputFileDatasetConfig
from datetime import datetime
from easydict import EasyDict as edict
import numpy as np
import pandas as pd 
from azureml.core import Experiment
import os

In [None]:
# Print the path of the running Python interpreter to confirm that the kernel
# was started from the expected conda env ("automl-eunk")
import sys
print(f"{sys.executable}")

In [None]:
# Workspace coordinates -- replace the placeholders with your own values
subscription_id = '<my-subscription-id>'
resource_group = '<my-resource-group>'
workspace_name = '<my-workspace-name>'

# Build the workspace handle used by the rest of the notebook
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

In [None]:
# Get all datasets for the current workspace.
# Kept as the cell's last expression so the notebook renders the result.
ws.datasets

In [None]:
# Wrap the workspace's datasets in an EasyDict
ed_datasets = edict(ws.datasets)

# Collect the mapping's keys (the dataset names) into a list and show it
datasets_list = [*ed_datasets.keys()]
datasets_list

In [None]:
# Build one record per dataset version (its tags plus identifying fields) and
# load all records into a pandas DataFrame so tags can be searched like columns.
ds_list = []
for _dataset in datasets_list:
    # Called without a version this returns the latest one; its version number
    # is used as the upper bound when walking every version of the dataset.
    ds = Dataset.get_by_name(ws, _dataset)
    for j in range(1, ds.version + 1):
        vds = Dataset.get_by_name(ws, _dataset, version=j)
        # Copy the tags so that adding the identifying fields below does not
        # mutate the tag dict held by the dataset object itself.
        ds_dict = dict(vds.tags or {})
        ds_dict["dataset_id"] = vds.id
        ds_dict["dataset_name"] = vds.name
        ds_dict["dataset_version"] = vds.version
        ds_list.append(ds_dict)
# Tags that a given version does not define show up as NaN in its row
df_dataset = pd.DataFrame(ds_list)

In [None]:
# Filter df_dataset by exact tag values and display the matching rows
def filter_dataset_using_tags(**taglist):
    """Display the rows of ``df_dataset`` whose columns equal the given values.

    Each keyword names a column of the module-level ``df_dataset`` and gives
    the value that column must equal; all conditions are combined with AND.
    Values are compared as text, so ``channels=2`` and ``channels='2'`` select
    the same rows. Calling with no keywords displays every row.

    Args:
        **taglist: column name -> required value.

    Raises:
        KeyError: if a keyword does not name a column of ``df_dataset``.
    """
    matches = df_dataset
    for column, value in taglist.items():
        # Boolean-mask filtering instead of a string-built query: tag names or
        # values containing quotes, spaces or dashes cannot break (or alter)
        # the filter expression, and an empty taglist needs no special case.
        matches = matches[matches[column] == str(value)]
    display(matches)

In [None]:
# Search tags depends on various condition.
# Case 1 - datasets whose labeler_ver tag is '0.1'
taglist = dict(labeler_ver='0.1')
filter_dataset_using_tags(**taglist)

In [None]:
# Case 2 - datasets whose labeler_ver and augmentor_ver tags are both '0.1'
taglist = dict(labeler_ver='0.1', augmentor_ver='0.1')
filter_dataset_using_tags(**taglist)

In [None]:
# Case 3 - look a dataset up by its dataset_name
taglist = dict(dataset_name='word_nn_train_positive_sample_pcm')
filter_dataset_using_tags(**taglist)

In [None]:
# Case 4 - datasets whose channels tag is 2 or more.
# The original compared against the text '2', which orders values
# lexicographically ('10' sorts before '2') and would drop a channels value of 10.
# Convert to numbers first (assumes channels holds numeric text -- TODO confirm);
# missing or non-numeric values become NaN and never match.
condition = pd.to_numeric(df_dataset['channels'], errors='coerce') >= 2
df_dataset[condition]

In [None]:
# Case 5 - list the rows of df_dataset that have a pitch_ratio tag
df_dataset[~df_dataset.pitch_ratio.isna()]

In [None]:
# Case 6 - list the rows of df_dataset that have both a pitch_ratio and a volume_ratio tag
df_dataset[~(df_dataset.pitch_ratio.isna() | df_dataset.volume_ratio.isna())]