# Connect Eye-AI and Load Libraries

In [15]:
repo_dir = "Repos"   # Set this to be where your github repos are located.
%load_ext autoreload
%autoreload 2

# # Update the load path so python can find modules for the model
import sys
from pathlib import Path
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-exec"))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
# Prerequisites
import json
import os

# EyeAI, Deriva, VGG19
from deriva_ml import DatasetSpec, DatasetBag, Workflow, ExecutionConfiguration, VersionPart
from deriva_ml import MLVocab as vc
from eye_ai.eye_ai import EyeAI

# ML Analytics
import pandas as pd
import numpy as np

# Other Utilities
from pathlib import Path, PurePath
import logging
from datetime import datetime

# Send INFO-and-above records through the root logger with a timestamped format.
# force=True replaces any handlers already installed on the root logger.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)

In [17]:
# Login
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin

host = 'www.eye-ai.org'
#host = 'dev.eye-ai.org' #for dev testing
catalog_id = "eye-ai"

# Only start the login flow when no session for this host exists yet.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    gnl.login(
        [host],
        no_local_server=True,
        no_browser=True,
        refresh_tokens=True,
        update_bdbag_keychain=True,
    )
    print("Login Successful")
else:
    print("You are already logged in.")

2025-07-03 08:19:47,621 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-07-03 08:19:47,622 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2025-07-03 08:19:47,624 - INFO - Setting up RefreshTokenAuthorizer with auth_client=[instance:140220945086400]
2025-07-03 08:19:47,624 - INFO - Setting up a RenewingAuthorizer. It will use an auth type of Bearer and can handle 401s.
2025-07-03 08:19:47,625 - INFO - RenewingAuthorizer will start by using access_token with hash "e5b617d200a9e9a145ca7fb531df7efae087ba12052804b426fac09533479771"
2025-07-03 08:19:47,625 - INFO - Executing token refresh without client credentials
2025-07-03 08:19:47,626 - INFO - Fetching new token from Globus Auth
2025-07-03 08:19:48,069 - INFO - request done (success)
2025-07-03 08:19:48,070 - INFO - RenewingAuthorizer.

You are already logged in.


# Configuration

In [18]:
# The download cache and the working directory both live under /data.
cache_dir = working_dir = '/data'
EA = EyeAI(
    hostname=host,
    catalog_id=catalog_id,
    cache_dir=cache_dir,
    working_dir=working_dir,
)

2025-07-03 08:19:53,501 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-07-03 08:19:53,501 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


In [None]:


#ml_instance.increment_dataset_version(dataset_rid='2-N93J', component= VersionPart.patch, description='Update to latest deriva-ml schema')

In [19]:
datasets = [ "4-S42W" ] # for new LAC balanced
#datasets = [ '2-C9PR', '4-YQVM' ] # for USC healthy/glaucoma
#datasets = [ "2-N93J" ] # for new LAC balanced

# One download spec per dataset RID, using the version returned by
# EA.dataset_version for that RID.
data_to_download = [
    {
        'rid': dataset_rid,
        'materialize': False,
        'version': EA.dataset_version(dataset_rid=dataset_rid),
    }
    for dataset_rid in datasets
]

# Register the workflow that this notebook run is recorded under.
workflow_spec = Workflow(
    name="Make Dataset by KB",
    url="https://github.com/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/Sandbox_KB/Make_Dataset.ipynb",
    workflow_type="Test Workflow",
)
dataset_workflow = EA.add_workflow(workflow_spec)

config = ExecutionConfiguration(
    datasets=data_to_download,
    workflow=dataset_workflow,
    description="Instance of making a dataset; in this case, creating an angle-2 subset of 4-S42W",
)

execution = EA.create_execution(config)

2025-07-03 08:20:40,432 - INFO - Materialize bag 4-S42W... 
2025-07-03 08:20:40,542 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-07-03 08:20:40,543 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2025-07-03 08:20:42,236 - INFO - Using cached bag for  4-S42W Version:0.2.1
2025-07-03 08:20:42,236 - INFO - Loading /data/4-S42W_2c92df824a3b41ec944c1a5dce5c71fd97f72b0fbcbe53d3a229327621274dfd/Dataset_4-S42W
2025-07-03 08:20:42,986 - INFO - Creating new database for dataset: 4-S42W in /data/kb_766/EyeAI_working/4-S42W@33E-APAZ-TTM4.db
2025-07-03 08:20:43,048 - INFO - Downloading assets ...
2025-07-03 08:20:43,333 - INFO - Initialize status finished.


In [20]:
# Print the execution's string form (directories, RIDs, configuration) as a sanity check.
print(execution)

caching_dir: /data
_working_dir: /data/kb_766/EyeAI_working
execution_rid: 4-Z6K4
workflow_rid: 4-Z5Y4
asset_paths: {}
configuration: datasets=[DatasetSpec(rid='4-S42W', materialize=False, version=DatasetVersion(major=0, minor=2, patch=1))] assets=[] workflow='4-Z5Y4' parameters={} description='Instance of making a dataset; in this case, creating an angle-2 subset of 4-S42W' argv=['/home/kb_766/.conda/envs/my-tensorflow-conda/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/kb_766/.local/share/jupyter/runtime/kernel-9b90ba4d-9b2b-4f49-b3ee-c52e2a09d828.json']


# Get Pertinent Datasets

In [21]:
# First dataset attached to the execution (the only one for the current `datasets` list).
ds_bag = execution.datasets[0]

In [None]:
# Function to update column names
pd.options.mode.copy_on_write = True
def updateCols(df, cols, colDict):
    """Select ``cols`` from ``df`` and rename them according to ``colDict``.

    Args:
        df: DataFrame to take the columns from; it is not modified.
        cols: List of column names to keep. Updated IN PLACE: every entry that
            is a key of ``colDict`` is replaced by its new name, so the same
            list can be extended and passed to the next call.
        colDict: Mapping of old column name -> new column name. Keys that are
            not in ``cols`` are ignored.

    Returns:
        A new DataFrame holding only the selected columns, with renames applied.
    """
    # Select with the old names first, then rename on the resulting frame.
    # The non-inplace rename never touches the caller's frame and does not
    # rely on the copy-on-write option being enabled.
    renamed = df[cols].rename(columns=colDict)
    # Mirror the renames in the caller's list, keeping each name's position.
    for old_name in set(cols).intersection(colDict):
        cols[cols.index(old_name)] = colDict[old_name]
    return renamed

# For USC Multimodal dataset healthy/glaucoma

In [None]:
# For building the USC dataset for healthy/glaucoma
# NOTE(review): this indexes two datasets, so `datasets` presumably has to list
# two RIDs (as in the commented-out USC option) before this cell can run — confirm.
ds_bag_OG = execution.datasets[0]
ds_bag_healthy = execution.datasets[1]

# For building the new LAC dataset with matching
# ds_bag = execution.datasets[0]
# diagsTall = EA.image_tall(ds_bag, 'Initial Diagnosis')

In [None]:
# For building the USC dataset for healthy/glaucoma

# Merge datasets to create master dataframe

modalities = EA.extract_modality(ds_bag_OG)

# Laterality rows are the base (right join); attach each image's Observation.
masterDF = (
    ds_bag_OG.get_table_as_dataframe('Image')[['RID', 'Observation']]
    .merge(
        ds_bag_OG.get_table_as_dataframe('Execution_Image_Fundus_Laterality')[['Image', 'Image_Side']],
        left_on='RID',
        right_on='Image',
        how='right',
    )
    .drop(columns='RID')
)

# Attach the Subject of each Observation, again keeping every existing row.
masterDF = (
    ds_bag_OG.get_table_as_dataframe('Observation')[['RID', 'Subject']]
    .merge(
        masterDF,
        left_on='RID',
        right_on='Observation',
        how='right',
    )
    .drop(columns='RID')
)

# Look up the clinic record for each (subject, eye side) pair.
masterDF = masterDF.merge(
    EA.multimodal_wide(ds_bag_OG)[['RID_Subject', 'Image_Side', 'RID_Clinic']],
    left_on=['Subject', 'Image_Side'],
    right_on=['RID_Subject', 'Image_Side'],
    how='left',
).drop(columns='RID_Subject')

# Keep only rows that found a clinic record.
masterDF = masterDF[masterDF['RID_Clinic'].notna()]

# Bring in the clinical fields used for case selection.
masterDF = masterDF.merge(
    modalities['Clinic'][['RID_Clinic', 'CDR', 'Condition_Label']],
    on='RID_Clinic',
    how='left',
)

# For LAC dataset with matching

In [None]:
# For building the new LAC dataset with matching

# Merge datasets to create master dataframe
# Used for unifying all data in order to make a matched dataset

# Build the tall diagnosis table inside this cell: `diagsTall` was previously
# only defined in commented-out code, so a fresh kernel hit a NameError here.
diagsTall = EA.image_tall(ds_bag, 'Initial Diagnosis')

# `cols` is the running list of columns to keep; updateCols renames entries in
# it in place, so it always holds the current (post-rename) column names.
cols = ['Subject_RID', 'Image_RID', 'Image_Side', 'Diagnosis_Image']
colDict = {}
masterDF = updateCols(diagsTall, cols, colDict)

# Add subject-level attributes.
masterDF = pd.merge( masterDF,
        ds_bag.get_table_as_dataframe('Subject'),
        left_on = 'Subject_RID',
        right_on = 'RID',
        how = 'left')

cols.extend(['Subject_ID','Subject_Gender','Subject_Ethnicity'])
colDict = {'Subject_ID':'EyePacs_ID'}
masterDF = updateCols(masterDF, cols, colDict)

# Add the Observation each image belongs to; colliding Image columns get a
# '_right' suffix and are dropped by the column selection below.
masterDF = pd.merge( masterDF,
        ds_bag.get_table_as_dataframe('Image'),
        left_on = 'Image_RID',
        right_on = 'RID',
        suffixes = ('', '_right'),
        how = 'left')

cols.extend(['Observation'])
colDict = {'Observation':'Observation_RID'}
masterDF = updateCols(masterDF, cols, colDict)

# Add observation-level attributes (encounter date and age).
masterDF = pd.merge( masterDF,
        ds_bag.get_table_as_dataframe('Observation'),
        left_on = 'Observation_RID',
        right_on = 'RID',
        how = 'left')

cols.extend(['date_of_encounter', 'Age'])
masterDF = updateCols(masterDF, cols, colDict)

# One row per (subject, diagnosis) and per (subject, diagnosis, eye side).
patientDF = masterDF.drop_duplicates(subset=['Subject_RID', 'Diagnosis_Image']).reset_index() # patient level
eyeDF = masterDF.drop_duplicates(subset=['Subject_RID', 'Diagnosis_Image', 'Image_Side']).reset_index() # eye level

# Select Test Set

# For USC Multimodal dataset healthy/glaucoma

In [None]:
# For building the USC dataset for healthy/glaucoma

# Select cases - 170 total, take 150
# Filters are applied one after another: glaucoma labels, then non-blank CDR,
# then CDR above 0.6.
caseDF = (
    masterDF
    .loc[lambda frame: frame['Condition_Label'].isin(['POAG', 'PACG'])]
    .loc[lambda frame: frame['CDR'] != '']
    .loc[lambda frame: frame['CDR'] > 0.6]
)
testDF = caseDF.sample(n=150, random_state=42, replace=False)
testImage_RIDS = testDF['Image']

# Select controls
control_image_rids = EA.filter_angle_2(ds_bag_healthy).sample(n=150, random_state=42, replace=False)['RID']
testImage_RIDS = pd.concat([testImage_RIDS, control_image_rids], ignore_index=True)

testImage_RIDS

In [None]:
# Condition labels of the master-table rows whose image is in the test selection.
masterDF.loc[masterDF['Image'].isin(testImage_RIDS), 'Condition_Label']

# For LAC dataset with matching

In [None]:
# For building the new LAC dataset with matching
# Select cases


# Target total size ~600 images, so ~300 patients
# Select ~150 case patients

# Drop "Indian subcontinent origin" because there are so few, also none in original test set
# ('Latin American' and blank ethnicity are removed by the same filter).
excluded_ethnicities = ['Latin American', 'Indian subcontinent origin', '']
caseDF = patientDF.loc[patientDF['Diagnosis_Image'] == 'Suspected Glaucoma']
caseDF = caseDF.loc[~caseDF['Subject_Ethnicity'].isin(excluded_ethnicities)]

# Drop cases if too many, for African, Asian, ethnicity not specified:
# remove the first N rows (in frame order) of each listed group.
for ethnicity, n_to_drop in (
    ('African Descent', 45),
    ('Asian', 22),
    ('ethnicity not specified', 81),
):
    surplus_rows = caseDF.loc[caseDF['Subject_Ethnicity'] == ethnicity].iloc[0:n_to_drop]
    caseDF = caseDF.drop(surplus_rows.index)

# Remaining number of case patients per ethnicity.
caseKey, caseCount = np.unique(caseDF['Subject_Ethnicity'], return_counts=True)
caseCounts = dict(zip(caseKey, caseCount))

In [None]:
# For building the new LAC dataset with matching
# Select controls
#
# For every ethnicity among the controls, pick `cntrlRatio` controls per case,
# matching as tightly as the available controls allow:
#   ethnicity + gender + 10-year age bin  ->  ethnicity + gender  ->  ethnicity.
# `needMore` carries the number of controls still owed from a tighter level
# to the next looser one.


cntrlRatio = 1  # controls per case (1 means one control for each case)
bins = list(range(0, 101, 10))  # age bins

cntrlDF = patientDF[ patientDF['Diagnosis_Image'] == 'No Glaucoma']
cntrlDF = cntrlDF[ ~ cntrlDF['Subject_Ethnicity'].isin(['Latin American', 'Indian subcontinent origin', 'Multi-racial', '']) ]

cntrlKey, cntrlCount = np.unique( cntrlDF['Subject_Ethnicity'], return_counts=True )
cntrlCounts = dict( zip( cntrlKey, cntrlCount ))

cntrlRIDs = []
for e in pd.unique( cntrlDF['Subject_Ethnicity'] ):
    needMore = 0
    # .get(e, 0): an ethnicity that only occurs among controls has no cases to
    # match, so it contributes no controls instead of raising KeyError.
    if cntrlCounts[e] > (cntrlRatio * caseCounts.get(e, 0)):
        # Enough controls to try gender matching
        for s in pd.unique( caseDF[ caseDF['Subject_Ethnicity'] == e ].loc[:,'Subject_Gender']):
            tempCaseDF = caseDF[ (caseDF['Subject_Ethnicity'] == e) &  (caseDF['Subject_Gender'] == s) ]
            tempCntrlDF = cntrlDF[ (cntrlDF['Subject_Ethnicity'] == e) & (cntrlDF['Subject_Gender'] == s) ]

            if tempCntrlDF.shape[0] > (cntrlRatio * tempCaseDF.shape[0]):
                # Enough to try age matching
                i, c = np.unique( pd.cut(tempCaseDF['Age'], bins = bins), return_counts=True )
                tCounts = dict( zip( i, c ))
                cntrlAges = pd.cut(tempCntrlDF['Age'], bins = bins )
                for ind in i:
                    # Positions (within tempCntrlDF) of controls in this age bin
                    cntrlMatch = np.where( cntrlAges == ind )[0]
                    if len(cntrlMatch) < ( cntrlRatio * tCounts[ind] ):
                        # Not enough controls for this age bin matching, take all of them
                        cntrlRIDs.extend( tempCntrlDF.loc[ cntrlAges.index[cntrlMatch]].loc[:,'Subject_RID'] )
                        needMore = needMore + ( ( cntrlRatio * tCounts[ind] ) - len(cntrlMatch) )
                    else:
                        # More than enough controls for this age bin matching, only take enough
                        cntrlRIDs.extend( tempCntrlDF.loc[ cntrlAges.index[cntrlMatch]].loc[:,'Subject_RID'].iloc[0:( cntrlRatio * tCounts[ind] )] )

                if needMore > 0:
                    # Not enough were age matched, take more gender matched.
                    # Candidates are taken in frame order (not from an unordered
                    # set) so the same controls are picked on every run.
                    alreadyChosen = set(cntrlRIDs)
                    spareRIDs = [ r for r in dict.fromkeys(tempCntrlDF['Subject_RID']) if r not in alreadyChosen ]
                    cntrlRIDs.extend( spareRIDs[0:needMore] )
                    needMore = 0

            else:
                # Not enough for gender + age matching in this gender, do ethnicity matching
                # First take all for that gender, then use needMore
                cntrlRIDs.extend(tempCntrlDF['Subject_RID'])
                needMore = needMore + ( (cntrlRatio * tempCaseDF.shape[0]) - tempCntrlDF.shape[0] )

        if needMore > 0:
            # Not enough were gender matched, take more ethnicity matched.
            # The pool must be the CONTROLS of this ethnicity (this used to read
            # from caseDF, which put case subjects into the control list).
            alreadyChosen = set(cntrlRIDs)
            ethnicityPool = cntrlDF[ cntrlDF['Subject_Ethnicity'] == e ].loc[:,'Subject_RID']
            spareRIDs = [ r for r in dict.fromkeys(ethnicityPool) if r not in alreadyChosen ]
            cntrlRIDs.extend( spareRIDs[0:needMore] )
            needMore = 0

    else:
        # Not enough controls for ethnicity + gender + age matching, take all of them
        cntrlRIDs.extend( cntrlDF[ cntrlDF['Subject_Ethnicity'] == e ].loc[:,'Subject_RID'] )

In [None]:
# Put it all together in new TEST SET
# All master-table rows belonging to a selected control or case subject.
test_subject_rids = cntrlRIDs + list(caseDF['Subject_RID'])
testDF = masterDF.loc[masterDF['Subject_RID'].isin(test_subject_rids)]

# Create Dataset

In [None]:


# LAC variant (disabled):
# test_dataset = execution.create_dataset(['LAC', 'Test'], description='A race/gender/age matched test dataset')
# EA.add_dataset_members( dataset_rid = test_dataset, members = testDF['Subject_RID'])

# USC variant: create the dataset, then attach the selected image RIDs to it.
test_dataset = execution.create_dataset(
    ['USC', 'Test'],
    description='A test dataset for photograph interpretation for referable glaucoma - UPDATED TO ANGLE 2',
)
EA.add_dataset_members(
    dataset_rid=test_dataset,
    members=testImage_RIDS,
)


# Angle-2 Subset

In [26]:
# Build a new dataset from the angle-2 images of ds_bag and nest it under the
# source dataset.
# NOTE(review): the saved output of this cell shows a ValidationError where
# `members` was the bare string '4-Z6K8'; the calls below pass a column and a
# list, so that output may be stale — re-run to confirm.
angle2_image = EA.filter_angle_2(ds_bag)
angle2_dataset = execution.create_dataset(
    ['USC', 'Test'],
    description='A test dataset of images for photograph interpretation for referable glaucoma - FILTERED TO ANGLE 2',
)
# Images go into the new dataset ...
EA.add_dataset_members(
    dataset_rid=angle2_dataset,
    members=angle2_image['RID'],
)
# ... and the new dataset becomes a member of the first configured dataset.
EA.add_dataset_members(
    dataset_rid=datasets[0],
    members=[angle2_dataset],
)

ValidationError: 1 validation error for Dataset.add_dataset_members
members
  Input should be a valid list [type=list_type, input_value='4-Z6K8', input_type=str]
    For further information visit https://errors.pydantic.dev/2.10/v/list_type