In [5]:
repo_dir = "Repos"   # Set this to be where your github repos are located.
%load_ext autoreload
%autoreload 2

# Update the load path so python can find modules for the model
import sys
from pathlib import Path
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Prerequisites
import json
import os
from eye_ai.eye_ai import EyeAI

import pandas as pd
from pathlib import Path, PurePath
import logging

from deriva_ml import DatasetBag, Workflow, ExecutionConfiguration
from deriva_ml import MLVocab as vc
# Reset the root logger (force=True replaces any handlers installed earlier)
# so INFO-level records are shown with timestamp and level.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    force=True,
)

In [7]:
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
catalog_id = "eye-ai" #@param
host = 'www.eye-ai.org'

# Authenticate against the host through Globus; only start the login flow
# when no existing session is reported for this host.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    gnl.login(
        [host],
        no_local_server=True,
        no_browser=True,
        refresh_tokens=True,
        update_bdbag_keychain=True,
    )
    print("Login Successful")
else:
    print("You are already logged in.")

You are already logged in.


In [8]:
# The cache and the working directory both point at /data.
cache_dir = working_dir = '/data'
EA = EyeAI(
    hostname=host,
    catalog_id=catalog_id,
    cache_dir=cache_dir,
    working_dir=working_dir,
)



In [None]:
# RID of the dataset that will be split.
source_dataset = '2-277G'

# NOTE(review): this flag is not read by the configuration below. The
# conditional form that used it is kept here for reference only:
#     datasets=[source_dataset] if download_assets else [],
# so the source dataset is always requested regardless of this value.
download_assets = True

# The workflow type named below presumably has to exist in the catalog
# vocabulary already; the disabled call shows how it could be added — confirm.
# EA.add_term(vc.workflow_type, "Create Dataset Workflow", description="A workflow to test creating a new dataset in eyeAI")
workflow_instance = Workflow(
    workflow_type="Create Dataset Workflow",
    name="Dataset splitter creation",
    url="https://github.com/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/VGG19_Huy/VGG19_DATA_SPLIT.ipynb",
)

# Describe the execution: which dataset it reads and which workflow it runs.
config = ExecutionConfiguration(
    description="Splitting the original dataset.",
    workflow=workflow_instance,
    datasets=[source_dataset],
)

# Register the execution with the catalog.
execution = EA.create_execution(config)

In [None]:
# Show the execution object returned by EA.create_execution.
print(execution)

In [None]:
# Wrap the first dataset path reported by the execution in a DatasetBag.
# Only one dataset was requested in the configuration, hence index 0.
ds_bag = DatasetBag(execution.dataset_paths[0])

In [None]:
# Display the tables available in the bag.
ds_bag.list_tables()

In [None]:
# Display the bag's 'Image' table.
ds_bag.get_table_as_dataframe('Image')

In [None]:
# Run EyeAI's filter_angle_2 on the bag and display the result.
# NOTE(review): presumably this keeps only the angle-2 images — confirm in eye_ai.
angle_2_df =  EA.filter_angle_2(ds_bag)
angle_2_df

In [None]:
# Collect the RID column of the filtered table as a plain list and report its size.
image_rids = angle_2_df["RID"].tolist()
print(len(image_rids))

In [None]:
import random, copy
# <100 images = 10 sets
# 100 to 999 = 5 sets
# 1000 to 2999 = 3 sets
# >=3000 = 1 set

def range_split(images):
    """Return how many subsets to create for a given subset size.

    Args:
        images: Number of images per subset.

    Returns:
        10 for sizes below 100, 5 for 100-999, 3 for 1000-2999, and 1 otherwise.
    """
    # (exclusive upper bound on the subset size, number of subsets), ascending.
    tiers = ((100, 10), (1000, 5), (3000, 3))
    for upper_bound, set_count in tiers:
        if images < upper_bound:
            return set_count
    return 1

def split_dataset(image_rids, num_image_split=None, seed=None):
    """Cut the image RIDs into disjoint random subsets for several subset sizes.

    For each subset size ``n`` in ``num_image_split`` a freshly shuffled copy
    of ``image_rids`` is sliced into up to ``range_split(n)`` consecutive,
    non-overlapping subsets of ``n`` items. Each size uses its own shuffle, so
    subsets of different sizes may share images.

    Args:
        image_rids: Iterable of image RIDs. It is copied; the caller's object
            is never reordered, so re-running the call is safe.
        num_image_split: Subset sizes to generate. Defaults to
            ``[10, 200, 500, 1000, 2000, 3000]``.
        seed: Optional seed for a private ``random.Random`` instance, making
            the split reproducible. When ``None`` the module-level ``random``
            state is used.

    Returns:
        dict mapping each subset size to a list of subsets (lists of RIDs).
        If there are not enough images, the last subset of a size may be
        shorter than requested and subsets that would be empty are left out;
        a warning is logged whenever that happens.
    """
    if num_image_split is None:
        num_image_split = [10, 200, 500, 1000, 2000, 3000]
    # A private generator keeps a seeded split from disturbing global state.
    rng = random.Random(seed) if seed is not None else random

    res = {}
    for num_images in num_image_split:
        # Shuffle a copy so the caller's list keeps its order.
        pool = list(image_rids)
        rng.shuffle(pool)

        num_split = range_split(num_images)
        current_subset_sets = []
        for index in range(num_split):
            start = index * num_images
            subset = pool[start:start + num_images]
            if not subset:
                # Pool exhausted: do not emit empty subsets.
                break
            current_subset_sets.append(subset)

        assigned = sum(len(subset) for subset in current_subset_sets)
        if assigned < num_split * num_images:
            logging.warning(
                "Requested %d subset(s) of %d images but only %d images are "
                "available; produced %d subset(s) holding %d images in total.",
                num_split, num_images, len(pool),
                len(current_subset_sets), assigned,
            )
        res[num_images] = current_subset_sets
    return res

sets = split_dataset(image_rids)

In [None]:
# For every subset size: create one master dataset, create one training
# dataset per subset and fill it with that subset's images, then attach the
# training datasets to the master dataset.
for subset_size, subsets in sets.items():
    master_dataset = execution.create_dataset(
        ['LAC'],
        description=f'The VGG19 master dataset consists of multiple sub-datasets, with each dataset containing {subset_size} images.',
    )
    training_sets = []
    for set_number, member_rids in enumerate(subsets, start=1):
        training_dataset = execution.create_dataset(
            ['LAC', 'Training'],
            description=f'A VGG19 training dataset of {subset_size} images, No {set_number}',
        )
        EA.add_dataset_members(dataset_rid=training_dataset, members=member_rids)
        training_sets.append(training_dataset)
    EA.add_dataset_members(dataset_rid=master_dataset, members=training_sets)

In [None]:
# Upload the outputs of this execution.
# NOTE(review): clean_folder=True presumably removes the local working files afterwards — confirm.
execution.upload_execution_outputs(clean_folder=True)