In [None]:
repo_dir = "Repos"   # Set this to be where your github repos are located.
%load_ext autoreload
%autoreload 2

# Update the load path so python can find modules for the model
import sys
from pathlib import Path
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))

In [None]:
# Prerequisites
import json
import os
from eye_ai.eye_ai import EyeAI

import pandas as pd
from pathlib import Path, PurePath
import logging

from deriva_ml import DatasetBag, Workflow, ExecutionConfiguration, DatasetVersion
from deriva_ml import MLVocab as vc
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [None]:
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
catalog_id = "eye-ai" #@param
host = 'www.eye-ai.org'


# Log in to the host through Globus only when there is no active session yet.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    gnl.login([host], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)
    print("Login Successful")
else:
    print("You are already logged in.")

In [None]:
# The cache directory and the working directory both point at /data.
cache_dir = working_dir = '/data'
EA = EyeAI(
    hostname=host,
    catalog_id=catalog_id,
    cache_dir=cache_dir,
    working_dir=working_dir,
)

In [None]:
# RIDs of the datasets this execution works on.
# NOTE(review): the `# N` markers appear to close each group of already
# existing N-image subsets, and the final RID is the source dataset that is
# later used as `ds_bag_train` -- confirm against the catalog.
datasets = [
    '4-N9XE',
    '4-NAPT',
    '4-NBG6',
    '4-NC9J',
    '4-ND2Y',  # 200
    '4-NDWY',
    '4-NFVT',
    '4-NHTP',
    '4-NKSJ',
    '4-NNRE',  # 500
    '4-NQQY',
    '4-NVNA',
    '4-NZJP',  # 1000
    '2-277G',
]

# Set to False if you only need the metadata from the bag, and not the assets.
# This flag used to be defined but never read (materialize was hard-coded to
# True); it now drives `materialize`, and is True so that what gets downloaded
# is unchanged.
download_assets = True

# One download spec per dataset, pinned to the version the catalog currently reports.
to_be_download = [
    {
        'rid': dataset,
        'materialize': download_assets,
        'version': EA.dataset_version(dataset_rid=dataset),
    }
    for dataset in datasets
]


# Uncomment on first use if the workflow type term does not exist in the catalog yet.
# EA.add_term(vc.workflow_type, "Create Dataset Workflow", description="A workflow to test creating a new dataset in eyeAI")
# Workflow instance
workflow_instance = EA.create_workflow(
    name="Dataset splitter creation",
    workflow_type="Create Dataset Workflow"
)

# Configuration instance.
config = ExecutionConfiguration(
    datasets=to_be_download,
    workflow=workflow_instance,
    description="Splitting the original dataset.")

# Initialize execution
execution = EA.create_execution(config)

In [None]:
# Print the execution object's string form as a quick check of what was created.
print(execution)

In [None]:
# Every entry of `datasets` except the last is an existing subset; the last one
# is the source dataset to sample from. The positions are derived from
# `datasets` instead of hard-coding 13 so that editing the list keeps this cell
# correct.
# Assumes execution.datasets is in the same order as `datasets` (the original
# hard-coded indices relied on this too).
num_existing_subsets = len(datasets) - 1
ds_bag_list = [execution.datasets[i] for i in range(num_existing_subsets)]
ds_bag_train = execution.datasets[num_existing_subsets]

In [None]:
# Run EyeAI.filter_angle_2 on the source bag; the result is joined on its 'RID'
# column with the diagnoses further down.
# NOTE(review): presumably keeps only the angle-2 images -- confirm in eye_ai.
angle_2_df =  EA.filter_angle_2(ds_bag_train)
angle_2_df

In [None]:
# Load every Image_Diagnosis row of the source bag, then keep only the rows
# tagged as the initial diagnosis.
all_image_diag = ds_bag_train.get_table_as_dataframe('Image_Diagnosis')
is_initial_diagnosis = all_image_diag['Diagnosis_Tag'] == 'Initial Diagnosis'
image_diag = all_image_diag[is_initial_diagnosis]
image_diag

In [None]:
# RIDs listed in the "ID" column of this CSV are removed from the candidate pool.
# NOTE(review): the file name suggests these are images without a visible optic
# disc -- confirm with whoever produced the file.
train_excluded_df = pd.read_csv("~/train_no_optic_disc_image_ids.csv")
train_excluded = train_excluded_df["ID"].tolist()

In [None]:
# Attach each selected image's initial diagnosis (inner join: images without one are dropped).
merged_df = angle_2_df.merge(image_diag, left_on='RID', right_on='Image', how='inner')

# Keep only the columns needed for splitting; 'RID_x' is the image RID from angle_2_df.
image_labels = merged_df[['Filename', 'Diagnosis_Image', 'RID_x']].rename(columns={'RID_x': 'RID'})

# Drop the images listed in the exclusion file.
not_excluded = ~image_labels["RID"].isin(train_excluded)
df_filtered = image_labels[not_excluded]
df_filtered

In [None]:
# Summarise the label column: total rows, per-label counts and the distinct labels.
diagnosis_column = df_filtered['Diagnosis_Image']

total_count = len(diagnosis_column)
print(f"Total values in Diagnosis_Image: {total_count}")

value_counts = diagnosis_column.value_counts()
print("\nCounts of each unique value in Diagnosis_Image:")
print(value_counts)

diagnosis_values = diagnosis_column.unique()
print("Values in Diagnosis_Image column:")
print(diagnosis_values)

In [None]:
# Display the filtered frame once more before it is split.
df_filtered

In [None]:
# Image RIDs already used by the existing subsets, bucketed by subset size.
# split_dataset() reads this mapping to keep new subsets disjoint from them.
rids_in_other_ds = {200: [], 500: [], 1000: []}

for ds_bag_item in ds_bag_list:
    images = ds_bag_item.get_table_as_dataframe("Image")["RID"].tolist()
    # Subsets are built as n + n RIDs (see split_dataset), so the bucket is half
    # the image count. Floor division keeps the key an int, matching the
    # int(len/2) bucketing used when the bags themselves are grouped, instead
    # of producing float keys such as 200.0 or 100.5.
    key = len(images) // 2
    rids_in_other_ds.setdefault(key, []).extend(images)

In [None]:
import random, copy


def range_split(images):
    """Return how many subsets to generate for a subset size of ``images``.

    Sizes below 100 get 10 subsets, below 1000 get 3, below 3000 get 2,
    and anything larger gets a single subset.
    """
    if images < 100:
        return 10
    if images < 1000:
        return 3
    if images < 3000:
        return 2
    return 1

def split_dataset(df, num_image_split=(200, 500, 1000), excluded_rids=None,
                  reserved_rids=None, seed=None):
    """Draw balanced random training subsets from ``df``.

    For every size ``n`` in ``num_image_split`` this builds ``range_split(n)``
    subsets. Each subset is a list of ``n`` 'Suspected Glaucoma' RIDs followed
    by ``n`` 'No Glaucoma' RIDs (fewer if the candidate pool is smaller than ``n``).

    Args:
        df: DataFrame with 'RID' and 'Diagnosis_Image' columns.
        num_image_split: Per-diagnosis subset sizes to generate.
        excluded_rids: RIDs that must never be sampled. Defaults to the
            notebook-level ``train_excluded``.
        reserved_rids: Mapping ``size -> RIDs`` already used by existing
            datasets of that size; they are not sampled for that size.
            Defaults to the notebook-level ``rids_in_other_ds``.
        seed: Optional seed for reproducible sampling. When None the global
            ``random`` state is used, as before.

    Returns:
        dict mapping each size to its list of subsets (lists of RIDs).
    """
    # Fall back to the notebook globals so that `split_dataset(df)` keeps working.
    if excluded_rids is None:
        excluded_rids = train_excluded
    if reserved_rids is None:
        reserved_rids = rids_in_other_ds
    rng = random if seed is None else random.Random(seed)

    # Sets give O(1) membership tests instead of scanning a list per RID.
    excluded = set(excluded_rids)

    def _candidates(label):
        """RIDs in df carrying the diagnosis `label`, minus the excluded ones."""
        rids = df[df['Diagnosis_Image'] == label]['RID'].tolist()
        return [rid for rid in rids if rid not in excluded]

    def _shuffled(rids):
        """Shuffled copy of `rids`; the input list is left untouched."""
        shuffled = list(rids)
        rng.shuffle(shuffled)
        return shuffled

    suspected_glaucoma = _candidates('Suspected Glaucoma')
    no_glaucoma = _candidates('No Glaucoma')

    res = {}
    for num_images in num_image_split:
        num_split = range_split(num_images)

        # Candidates for this size: everything not already used by an existing
        # dataset of the same size.
        reserved = set(reserved_rids.get(num_images, ()))
        pool_suspected = [rid for rid in suspected_glaucoma if rid not in reserved]
        pool_no = [rid for rid in no_glaucoma if rid not in reserved]
        print(f"{num_images}: {len(pool_suspected)} 'Suspected Glaucoma' and "
              f"{len(pool_no)} 'No Glaucoma' candidate RIDs")

        remaining_suspected = _shuffled(pool_suspected)
        remaining_no = _shuffled(pool_no)

        current_subset_sets = []
        for _ in range(num_split):
            if len(remaining_suspected) < num_images or len(remaining_no) < num_images:
                # Not enough unused RIDs left: start over from the *filtered*
                # pool. Refilling from the unfiltered lists (as the code used
                # to do) would bring back RIDs reserved by existing datasets.
                logging.warning(
                    "Not enough unused RIDs for another subset of %s; reusing "
                    "RIDs from earlier subsets of this run.", num_images)
                remaining_suspected = _shuffled(pool_suspected)
                remaining_no = _shuffled(pool_no)
            current_subset_sets.append(
                remaining_suspected[:num_images] + remaining_no[:num_images])
            remaining_suspected = remaining_suspected[num_images:]
            remaining_no = remaining_no[num_images:]

        res[num_images] = current_subset_sets
    return res

# Build the new subsets from the filtered frame: {subset size: [list of RIDs per subset]}.
sets = split_dataset(df_filtered)

In [None]:
# Sanity report per subset size: are all sampled RIDs distinct across the
# subsets, how many subsets were built, and how long each one is.
for subset_size, subsets in sets.items():
    sampled_rids = [rid for subset in subsets for rid in subset]
    print("Unique?", len(set(sampled_rids)) == len(sampled_rids))
    print(subset_size)
    print(len(subsets))
    print("Length dataset")
    for subset in subsets:
        print(len(subset))

In [None]:
# Group the existing subset bags by subset size (half their image count).
data = {
    200: [],
    500: [],
    1000: [],
}

for ds_bag in ds_bag_list:
    df_image_len = len(ds_bag.get_table_as_dataframe('Image')['RID'].tolist()) // 2
    # setdefault gives an unexpected size its own bucket. Previously such a bag
    # was appended to whatever list the previous iteration had used (or raised
    # NameError on the first iteration), and that list was then aliased under
    # two keys.
    data.setdefault(df_image_len, []).append(ds_bag)

In [None]:
# RID of the parent dataset, per subset size, that the newly created subsets are added to.
ds_bag_master = {
    200: '4-N9X6',
    500: '4-NDWP',
    1000: '4-NQQP',
}

In [None]:
# Create one catalog dataset per sampled subset and attach it to the parent
# dataset of its size. The context manager's value is not bound: the previous
# `as exec` shadowed the builtin `exec` for the rest of the notebook and was
# never used.
with execution.execute():
    for key, value in sets.items():
        master_dataset = ds_bag_master[key]
        training_sets = []
        # Number used in the description of the first new subset.
        # NOTE(review): presumably continues after the existing subsets (5 for
        # 200 and 500, 3 for 1000, per the `datasets` list) -- confirm.
        val = 6 if key in (200, 500) else 4
        for i, item in enumerate(value, start=val):
            training_dataset = execution.create_dataset(
                ['LAC', 'Training'],
                description=f'A training dataset of {key} images for each diagnosis, No {i}')
            EA.add_dataset_members(dataset_rid=training_dataset, members=item)
            training_sets.append(training_dataset)
        # Register all subsets of this size as members of their parent dataset.
        EA.add_dataset_members(dataset_rid=master_dataset, members=training_sets)

In [None]:
# Upload the outputs recorded by this execution.
# NOTE(review): clean_folder=True presumably removes the local working files
# afterwards -- confirm in the deriva-ml documentation.
execution.upload_execution_outputs(clean_folder=True)