In [None]:
# repo_dir = "Repos"   # Set this to be where your github repos are located.
# %load_ext autoreload
# %autoreload 2

# # Update the load path so python can find modules for the model
# import sys
# from pathlib import Path
# sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))
# sys.path.insert(0, str(Path.home() / repo_dir / "deriva-ml"))

In [None]:
# Prerequisites
from eye_ai.eye_ai import EyeAI
from deriva_ml import DatasetBag, Workflow, ExecutionConfiguration
from deriva_ml import MLVocab as vc

In [None]:
# Login: make sure we hold credentials for the Eye-AI host before touching the catalog.
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin

host = "www.eye-ai.org"
catalog_id = "eye-ai"

gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    # Not logged in to this host yet, so run the login flow.
    gnl.login(
        [host],
        no_local_server=True,
        no_browser=True,
        refresh_tokens=True,
        update_bdbag_keychain=True,
    )
    print("Login Successful")
else:
    print("You are already logged in.")

In [None]:
# Directories handed to EyeAI as its cache and working locations.
cache_dir = '/data'
working_dir = 'data'

# EyeAI handle used by the later cells to talk to the catalog.
EA = EyeAI(
    hostname=host,
    catalog_id=catalog_id,
    cache_dir=cache_dir,
    working_dir=working_dir,
)

# Configuration

Set up an execution for dataset creation.  Note that this configuration will download all of the assets associated
with the dataset.  If you only need the metadata, then set `download_assets` to `False`.

In [None]:
# --- Tunable inputs -------------------------------------------------------
# RID of the source dataset to pull into this execution, if any.
source_dataset = '2-7K8W'

# When True the source dataset is listed in the execution configuration;
# when False the configuration is created with an empty dataset list.
# NOTE(review): with an empty list no bag is requested, so the later
# `execution.bag_paths[0]` lookup will have nothing to index -- confirm how a
# metadata-only download is meant to be requested.
download_assets = True

# --- Workflow ---------------------------------------------------------------
# Make sure the workflow-type term referenced by the Workflow below exists.
EA.add_term(vc.workflow_type, "Test Workflow", description="A test Workflow for new DM")

test_workflow = Workflow(
    name="Dataset creation template",
    url="https://github.com/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/templates/template_dataset.ipynb",
    workflow_type="Test Workflow",
)

# --- Execution --------------------------------------------------------------
config = ExecutionConfiguration(
    datasets=[source_dataset] if download_assets else [],
    workflow=test_workflow,
    description="Template instance of a dataset partitioned workflow",
)

# Create the execution from the configuration.
execution = EA.create_execution(config)

In [None]:
# Print the execution created above as a quick sanity check.
print(execution)

# Create DatasetBag

All of the bags in the execution spec are automatically downloaded, so we just need to get the path where they are located from the
execution object (`execution.bag_paths`).

In [None]:
# The execution exposes one path per downloaded bag. If the configuration
# listed no datasets (e.g. download_assets was set to False) there is nothing
# to open, so fail with an explicit message instead of a bare IndexError.
if not execution.bag_paths:
    raise ValueError(
        "execution.bag_paths is empty: no dataset bag was downloaded for this execution. "
        "Make sure the ExecutionConfiguration lists at least one dataset "
        "(download_assets must be True)."
    )

# Open the first (and, in this template, only) downloaded bag.
ds_bag = DatasetBag(execution.bag_paths[0])

Now that we have a handle to the downloaded dataset, let's get the list of subjects in the dataset, so we can subset them to
make a new dataset.  Once we have done that, we can compute whatever subset we want.

If you don't want subjects, just generate the list of RIDs of whatever objects you need.

In [None]:
# Load the Subject table from the downloaded bag.
subject_df = ds_bag.get_table_as_dataframe('Subject')

# Replace the slices below with your own selection logic.  Whatever you do,
# each partition must end up as a list of Subject RIDs.
subject_rids = subject_df["RID"].tolist()
training_rids = subject_rids[:2]     # first two subjects
test_rids = subject_rids[2:4]        # next two subjects
validation_rids = subject_rids[4:]   # everything that remains

In [None]:
# Display the Subject table so you can inspect which subjects are available.
subject_df

# Create dataset

We will create a dataset for each of the partitions, and one dataset to represent the complete set of data we have.

In [None]:
# Parent dataset that will hold the three partitions as nested datasets.
# (Description typo "partioning" corrected: this text is stored with the dataset.)
partitioned_dataset = execution.create_dataset(
    ['LAC'],
    description='A multimodal training dataset with partitioning',
)

# One dataset per partition; the second list entry names the partition.
training_dataset = execution.create_dataset(
    ['LAC', 'Training'],
    description='A multimodal training dataset',
)
test_dataset = execution.create_dataset(
    ['LAC', 'Testing'],
    description='A multimodal test dataset',
)
validation_dataset = execution.create_dataset(
    ['LAC', 'Validation'],
    description='A multimodal validation dataset',
)

# Add subjects into the new dataset

In [None]:
# Attach each list of Subject RIDs to its partition dataset, in the order
# training, test, validation.
for partition_rid, partition_members in (
    (training_dataset, training_rids),
    (test_dataset, test_rids),
    (validation_dataset, validation_rids),
):
    EA.add_dataset_members(dataset_rid=partition_rid, members=partition_members)

# Add subdatasets to a dataset

In [None]:
# Nest the three partition datasets under the parent dataset.
EA.add_dataset_members(
    dataset_rid=partitioned_dataset,
    members=[
        training_dataset,
        test_dataset,
        validation_dataset,
    ],
)

# Upload results

The datasets have already been uploaded to the catalog.  However, we want to record any metadata about the execution, hence we need to do this last step.

In [None]:
# Upload the execution's outputs to the catalog so this run is recorded.
# NOTE(review): clean_folder=True presumably removes the local working files
# once the upload finishes -- confirm against the deriva-ml documentation.
execution.upload_execution_outputs(clean_folder=True)