# Connect Eye-AI and Load Libraries

In [1]:
%load_ext autoreload
%autoreload 2

# Put the local eye-ai checkouts at the front of the module search path.
import sys
from pathlib import Path

# Inserted one after another at position 0, so "eye-ai-exec" ends up ahead of
# "eye-ai-ml" on sys.path.
for _repo_name in ("eye-ai-ml", "eye-ai-exec"):
    sys.path.insert(0, str(Path.home() / _repo_name))

In [2]:
# Prerequisites
import json
import os

# EyeAI, Deriva, VGG19
from deriva_ml import DatasetSpec, DatasetBag, Workflow, ExecutionConfiguration, VersionPart
from deriva_ml import MLVocab as vc
from eye_ai.eye_ai import EyeAI

# ML Analytics
import pandas as pd
import numpy as np

# Other Utilities
from pathlib import Path, PurePath
import logging
from datetime import datetime

# Root logger at INFO with timestamped messages. force=True removes any handlers
# already attached to the root logger, so this format applies even if logging
# was configured earlier in the kernel session.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [3]:
# Login
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin

# Server and catalog used by every later cell.
host = 'www.eye-ai.org'
#host = 'dev.eye-ai.org' #for dev testing
catalog_id = "eye-ai"

gnl = GlobusNativeLogin(host=host)
_already_logged_in = gnl.is_logged_in([host])
if not _already_logged_in:
    # No active session for this host: start the login flow with the flags below.
    gnl.login(
        [host],
        no_local_server=True,
        no_browser=True,
        refresh_tokens=True,
        update_bdbag_keychain=True,
    )
    print("Login Successful")
else:
    print("You are already logged in.")

2025-06-16 15:20:25,696 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-06-16 15:20:25,697 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


You are already logged in.


# Configuration

In [4]:
# The cache directory and the working directory handed to EyeAI are the same path.
cache_dir = working_dir = '/data'
EA = EyeAI(
    hostname=host,
    catalog_id=catalog_id,
    cache_dir=cache_dir,
    working_dir=working_dir,
)

2025-06-16 15:20:27,388 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-06-16 15:20:27,388 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


In [8]:


#ml_instance.increment_dataset_version(dataset_rid='2-N93J', component= VersionPart.patch, description='Update to latest deriva-ml schema')

DatasetVersion(major=2, minor=0, patch=1)

In [8]:

# Dataset RIDs requested for this execution.
datasets = [ '2-C9PR', '4-YQVM' ] # for USC healthy/glaucoma
#datasets = [ "2-N93J" ] # for new LAC balanced

# One spec dict per dataset: not materialized, pinned to the version returned
# by EA.dataset_version for that RID.
data_to_download = []
for dataset in datasets:
    ds_dict = dict(
        rid=dataset,
        materialize=False,
        version=EA.dataset_version(dataset_rid=dataset),
    )
    data_to_download.append(ds_dict)

# Register the workflow that the execution below is attached to.
_workflow_spec = Workflow(
    name="Make Dataset by KB",
    url="https://github.com/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/Sandbox_KB/Make_Dataset.ipynb",
    workflow_type="Test Workflow",
)
dataset_workflow = EA.add_workflow(_workflow_spec)

config = ExecutionConfiguration(
    description="Instance of making a dataset",
    workflow=dataset_workflow,
    datasets=data_to_download,
)

execution = EA.create_execution(config)

2025-06-16 15:30:08,963 - INFO - Materialize bag 2-C9PR... 
2025-06-16 15:30:09,175 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-06-16 15:30:09,175 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2025-06-16 15:30:10,740 - INFO - Creating new MINID for dataset 2-C9PR
2025-06-16 15:30:11,788 - INFO - Downloading dataset minid for catalog: 2-C9PR@2.6.0
2025-06-16 15:30:11,790 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-06-16 15:30:11,790 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2025-06-16 15:30:11,828 - INFO - Processing export config file: /tmp/tmpfzypxftb/download_spec.j

In [5]:
# Print the execution's string form; the recorded output lists its cache and
# working directories, execution/workflow RIDs, asset paths and configuration.
print(execution)

caching_dir: /home/kb_766/deriva-ml/cache
_working_dir: /home/kb_766/deriva-ml/DerivaML_working
execution_rid: 4-S42C
workflow_rid: 4-M4TT
asset_paths: [PosixPath('/home/kb_766/deriva-ml/DerivaML_working/4-S42C/asset/predictions_results.csv')]
configuration: datasets=[DatasetSpec(rid='2-N93J', materialize=False, version=DatasetVersion(major=2, minor=0, patch=1))] assets=['2-C8JM'] workflow='4-M4TT' description='Instance of linking VGG19 predictions to patient-level data'


# Get Pertinent Datasets

In [25]:
# Enable pandas copy-on-write mode globally; the setting applies to every cell
# run afterwards in this kernel session.
pd.options.mode.copy_on_write = True
# Function to update column names
def updateCols(df, cols, colDict):
    """Select ``cols`` from ``df`` and rename them according to ``colDict``.

    Args:
        df: DataFrame that contains every column named in ``cols``.
        cols: List of column names to keep. It is updated IN PLACE so that, on
            return, it holds the post-rename names (callers extend this same
            list before the next call).
        colDict: Mapping of old column name -> new column name. Keys that are
            not in ``cols`` have no effect.

    Returns:
        A new DataFrame holding only the selected columns, renamed. ``df``
        itself is left untouched.
    """
    # Non-inplace rename: does not depend on the global copy-on-write option
    # and never emits SettingWithCopyWarning.
    renamed = df[cols].rename(columns=colDict)
    # Map every entry exactly once (the same rule rename() applies), so `cols`
    # always mirrors `renamed.columns`, even for chained mappings such as
    # {'a': 'b', 'b': 'c'} or repeated names. Slice assignment keeps the
    # caller's list object.
    cols[:] = [colDict.get(name, name) for name in cols]
    return renamed

# For USC Multimodal dataset healthy/glaucoma

In [24]:
# For building the USC dataset for healthy/glaucoma
# Bags are picked by position. The configuration cell lists '2-C9PR' first and
# '4-YQVM' second; presumably execution.datasets keeps that order (TODO confirm).
ds_bag_OG = execution.datasets[0]
ds_bag_healthy = execution.datasets[1]

# For building the new LAC dataset with matching
# NOTE(review): the LAC cells further down read `ds_bag` and `diagsTall`, which
# are only assigned by the two commented-out lines below; uncomment them (with
# the LAC dataset RID selected in the configuration cell) to run the LAC path.
# `exec` looks like it should be `execution`, the only execution object
# created in this notebook -- confirm before uncommenting.
# ds_bag = exec.datasets[0]
# diagsTall = EA.image_tall(ds_bag, 'Initial Diagnosis')

In [101]:
# For building the USC dataset for healthy/glaucoma

# Build the master dataframe, one row per laterality record, by joining
# laterality -> image -> observation -> clinic link -> clinic fields.

modalities = EA.extract_modality(ds_bag_OG)

# Image -> Observation lookup, joined onto every laterality row.
image_to_observation = ds_bag_OG.get_table_as_dataframe('Image')[[ 'RID', 'Observation' ]]
image_laterality = ds_bag_OG.get_table_as_dataframe('Execution_Image_Fundus_Laterality')[[ 'Image', 'Image_Side' ]]
masterDF = image_to_observation.merge(
    image_laterality,
    how = 'right',
    left_on = 'RID',
    right_on = 'Image',
).drop( columns = 'RID' )

# Observation -> Subject lookup, joined onto the rows built so far.
observation_to_subject = ds_bag_OG.get_table_as_dataframe('Observation')[[ 'RID', 'Subject' ]]
masterDF = observation_to_subject.merge(
    masterDF,
    how = 'right',
    left_on = 'RID',
    right_on = 'Observation',
).drop( columns = 'RID' )

# Attach the clinic record RID that matches each (subject, eye side) pair.
clinic_links = EA.multimodal_wide(ds_bag_OG)[[ 'RID_Subject', 'Image_Side', 'RID_Clinic' ]]
masterDF = masterDF.merge(
    clinic_links,
    how = 'left',
    left_on = [ 'Subject', 'Image_Side' ],
    right_on = [ 'RID_Subject', 'Image_Side' ],
).drop( columns = 'RID_Subject' )

# Keep only rows that found a clinic record.
has_clinic_record = masterDF['RID_Clinic'].notna()
masterDF = masterDF[ has_clinic_record ]

# Bring in the clinical measurements for each clinic record.
clinic_fields = modalities['Clinic'][[ 'RID_Clinic', 'CDR', 'Condition_Label' ]]
masterDF = masterDF.merge(
    clinic_fields,
    how = 'left',
    on = 'RID_Clinic',
)

# For LAC dataset with matching

In [23]:
# For building the new LAC dataset with matching

# Merge datasets to create master dataframe
# Used for unifying all data in order to make a matched dataset
#
# NOTE(review): this cell needs `diagsTall` and `ds_bag`, which in this notebook
# are only assigned in commented-out lines; run as-is it stops with the
# NameError recorded below. It also rebinds `masterDF`, the same name the USC
# cell above uses, so the USC and LAC paths cannot both be live in one session.


# `cols` tracks the columns to keep. updateCols() edits the list in place, so
# after each call it holds the renamed column names.
cols = ['Subject_RID', 'Image_RID', 'Image_Side', 'Diagnosis_Image']
colDict = {}
masterDF = updateCols(diagsTall, cols, colDict)

# Attach subject attributes.
masterDF = pd.merge( masterDF,
        ds_bag.get_table_as_dataframe('Subject'),
        left_on = 'Subject_RID',
        right_on = 'RID',
        how = 'left')

cols.extend(['Subject_ID','Subject_Gender','Subject_Ethnicity'])
colDict = {'Subject_ID':'EyePacs_ID'}
masterDF = updateCols(masterDF, cols, colDict)

# Attach image attributes; clashing column names from the Image table get the
# '_right' suffix and are dropped by the column selection that follows.
masterDF = pd.merge( masterDF,
        ds_bag.get_table_as_dataframe('Image'),
        left_on = 'Image_RID',
        right_on = 'RID',
        suffixes = ('', '_right'),
        how = 'left')

cols.extend(['Observation'])
colDict = {'Observation':'Observation_RID'}
masterDF = updateCols(masterDF, cols, colDict)

# Attach observation attributes.
masterDF = pd.merge( masterDF,
        ds_bag.get_table_as_dataframe('Observation'),
        left_on = 'Observation_RID',
        right_on = 'RID',
        how = 'left')

cols.extend(['date_of_encounter', 'Age'])
# colDict still holds the 'Observation' mapping here; `cols` no longer contains
# 'Observation', so this call only selects columns and renames nothing.
masterDF = updateCols(masterDF, cols, colDict)

# reset_index() is called without drop=True, so each frame gains an 'index'
# column holding the original masterDF row labels.
patientDF = masterDF.drop_duplicates(subset=['Subject_RID', 'Diagnosis_Image']).reset_index() # patient level
eyeDF = masterDF.drop_duplicates(subset=['Subject_RID', 'Diagnosis_Image', 'Image_Side']).reset_index() # eye level

NameError: name 'diagsTall' is not defined

# Select Test Set

# For USC Multimodal dataset healthy/glaucoma

In [145]:
masterDF

Unnamed: 0,Subject,Observation,Image,Image_Side,RID_Clinic,CDR,Condition_Label
0,2-7MEG,2-7STC,2-BDZG,Right,4-2RA4,0.6,POAG
1,2-7MEG,2-7STC,2-BMVC,Left,4-2RA2,0.85,POAG
2,2-7P4G,2-7XDA,2-BHEA,Right,4-2XHA,0.99,POAG
3,2-7P4G,2-7XDA,2-BWWG,Left,4-2XH8,0.9,POAG
4,2-7KYC,2-7RRC,2-C0V0,Right,4-2PQC,0.9,POAG
...,...,...,...,...,...,...,...
328,2-7NCY,2-7VVG,2-C31E,Left,4-2V9C,,POAG
329,2-7MYG,2-7TWM,2-BMZ2,Right,4-2SW8,0.9,POAG
330,2-7MYG,2-7TWM,2-BH2M,Left,4-2SW6,0.85,POAG
331,2-7NZ2,2-7X1C,2-BYD0,Right,4-2X02,0.7,POAG


In [151]:
# For building the USC dataset for healthy/glaucoma

# Select cases - 170 total, take 150
# CDR is coerced to numeric before thresholding: blank strings, missing values
# and any other non-numeric entries become NaN and so fail the > 0.6 test. The
# earlier form filtered out '' and then compared the raw column, which raises
# on any string that is left. The CDR column kept in caseDF is unchanged.
cdr_numeric = pd.to_numeric( masterDF['CDR'], errors = 'coerce' )
is_glaucoma = masterDF['Condition_Label'].isin( ['POAG', 'PACG'] )
caseDF = masterDF[ is_glaucoma & ( cdr_numeric > 0.6 ) ]

# Fixed seed so the same 150 case images are drawn on every run.
testDF = caseDF.sample( n = 150 , random_state = 42, replace = False )
testImage_RIDS = testDF['Image']

# Select controls
# 150 images drawn, with the same seed, from the rows returned by EA.filter_angle_2
# for the healthy bag; their RIDs are appended after the case image RIDs.
controlImage_RIDS = EA.filter_angle_2( ds_bag_healthy ).sample( n = 150 , random_state = 42, replace = False )['RID']
testImage_RIDS = pd.concat( [ testImage_RIDS, controlImage_RIDS ], ignore_index=True )

testImage_RIDS

0      2-C0QJ
1      2-BXQA
2      2-C16W
3      2-BJ0R
4      2-BS4A
        ...  
295    4-YRPE
296    4-YRY2
297    4-YRT2
298    4-YS06
299    4-YS4E
Length: 300, dtype: object

In [155]:
masterDF[ masterDF['Image'].isin(testImage_RIDS) ]['Condition_Label']

1      POAG
3      POAG
4      POAG
5      POAG
6      POAG
       ... 
323    POAG
324    POAG
329    POAG
330    POAG
331    POAG
Name: Condition_Label, Length: 150, dtype: object

# For LAC dataset with matching

In [106]:
# For building the new LAC dataset with matching
# Select cases


# Target total size ~600 images, so ~300 patients
# Select ~150 case patients

# Drop "Indian subcontinent origin" because there are so few, also none in original test set.
# The same filter also removes "Latin American" and blank ethnicity values.
# `~` is the boolean-mask negation operator (unary minus is arithmetic negation).
caseDF = patientDF[ patientDF['Diagnosis_Image'] == 'Suspected Glaucoma']
caseDF = caseDF[ ~ caseDF['Subject_Ethnicity'].isin(['Latin American', 'Indian subcontinent origin', '']) ]

# Drop cases if too many, for African, Asian, ethnicity not specified
# (the first N rows of each group, in dataframe order, are removed)
caseDF = caseDF.drop( caseDF[ caseDF['Subject_Ethnicity'] == 'African Descent'].iloc[0:45].index )
caseDF = caseDF.drop( caseDF[ caseDF['Subject_Ethnicity'] == 'Asian'].iloc[0:22].index )
caseDF = caseDF.drop( caseDF[ caseDF['Subject_Ethnicity'] == 'ethnicity not specified'].iloc[0:81].index )

# Number of remaining case patients per ethnicity, used when matching controls.
caseKey, caseCount = np.unique( caseDF['Subject_Ethnicity'], return_counts=True )
caseCounts = dict( zip( caseKey, caseCount ))

In [107]:
# For building the new LAC dataset with matching
# Select controls
#
# For each ethnicity, controls are matched to cases as tightly as the data
# allows: ethnicity + gender + 10-year age bin first, then ethnicity + gender,
# then ethnicity only. The chosen control subject RIDs accumulate in cntrlRIDs.


def _take_unused(candidates, chosen, limit):
    """Return up to ``limit`` distinct values of ``candidates`` not already in ``chosen``.

    Candidates are scanned in the order given, so the picks are the same on
    every run (slicing a ``set`` difference is not: string hashing, and with
    it set iteration order, varies between Python processes).
    """
    seen = set(chosen)
    picked = []
    for rid in candidates:
        if len(picked) >= limit:
            break
        if rid not in seen:
            seen.add(rid)
            picked.append(rid)
    return picked


cntrlRatio = 1  # controls selected per case (1 means one control for each case)
bins = list(range(0, 101, 10))  # age bins

cntrlDF = patientDF[ patientDF['Diagnosis_Image'] == 'No Glaucoma']
# `~` is the boolean-mask negation operator (unary minus is arithmetic negation).
cntrlDF = cntrlDF[ ~ cntrlDF['Subject_Ethnicity'].isin(['Latin American', 'Indian subcontinent origin', 'Multi-racial', '']) ]

cntrlKey, cntrlCount = np.unique( cntrlDF['Subject_Ethnicity'], return_counts=True )
cntrlCounts = dict( zip( cntrlKey, cntrlCount ))

cntrlRIDs = []
for e in pd.unique( cntrlDF['Subject_Ethnicity'] ):
    needMore = 0  # controls still owed for this ethnicity after tighter matching fell short
    # An ethnicity with controls but no cases counts as zero cases (no KeyError);
    # it then contributes no controls because the gender loop below is empty.
    if cntrlCounts[e] > (cntrlRatio * caseCounts.get(e, 0)):
        # Enough controls to try gender matching
        for s in pd.unique( caseDF[ caseDF['Subject_Ethnicity'] == e ].loc[:,'Subject_Gender']):
            tempCaseDF = caseDF[ (caseDF['Subject_Ethnicity'] == e) &  (caseDF['Subject_Gender'] == s) ]
            tempCntrlDF = cntrlDF[ (cntrlDF['Subject_Ethnicity'] == e) & (cntrlDF['Subject_Gender'] == s) ]

            if tempCntrlDF.shape[0] > (cntrlRatio * tempCaseDF.shape[0]):
                # Enough to try age matching
                i, c = np.unique( pd.cut(tempCaseDF['Age'], bins = bins), return_counts=True )
                tCounts = dict( zip( i, c ))
                cntrlAges = pd.cut(tempCntrlDF['Age'], bins = bins )
                for ind in i:
                    # Positions (not labels) of controls whose age falls in this bin
                    cntrlMatch = np.where( cntrlAges == ind )[0]
                    if len(cntrlMatch) < ( cntrlRatio * tCounts[ind] ):
                        # Not enough controls for this age bin matching, take all of them
                        cntrlRIDs.extend( tempCntrlDF.loc[ cntrlAges.index[cntrlMatch]].loc[:,'Subject_RID'] )
                        needMore = needMore + ( ( cntrlRatio * tCounts[ind] ) - len(cntrlMatch) )
                    else:
                        # More than enough controls for this age bin matching, only take enough
                        cntrlRIDs.extend( tempCntrlDF.loc[ cntrlAges.index[cntrlMatch]].loc[:,'Subject_RID'].iloc[0:( cntrlRatio * tCounts[ind] )] )

                if needMore > 0:
                    # Not enough were age matched, take more gender matched
                    # (in dataframe order, skipping controls already chosen)
                    cntrlRIDs.extend( _take_unused( tempCntrlDF['Subject_RID'], cntrlRIDs, needMore ) )
                    needMore = 0

            else:
                # Not enough for gender + age matching in this gender, do ethnicity matching
                # First take all for that gender, then use needMore
                cntrlRIDs.extend(tempCntrlDF['Subject_RID'])
                needMore = needMore + ( (cntrlRatio * tempCaseDF.shape[0]) - tempCntrlDF.shape[0] )

        if needMore > 0:
            # Not enough were gender matched, take more ethnicity matched.
            # The top-up is drawn from the CONTROL pool of this ethnicity; drawing
            # from caseDF would only add subjects that are already cases.
            cntrlRIDs.extend( _take_unused( cntrlDF[ cntrlDF['Subject_Ethnicity'] == e ].loc[:,'Subject_RID'], cntrlRIDs, needMore ) )
            needMore = 0

    else:
        # Not enough controls for ethnicity + gender + age matching, take all of them
        cntrlRIDs.extend( cntrlDF[ cntrlDF['Subject_Ethnicity'] == e ].loc[:,'Subject_RID'] )

In [117]:
# Put it all together in new TEST SET
# Subjects in the test set: the selected controls followed by every case subject.
_test_subject_rids = cntrlRIDs + caseDF['Subject_RID'].tolist()
_in_test_set = masterDF['Subject_RID'].isin( _test_subject_rids )
testDF = masterDF[ _in_test_set ]

5       2-CDB4
6       2-CDB4
41      2-CDCE
42      2-CDCE
81      2-CDDW
         ...  
6968    2-CN8P
6969    2-CN8P
6985    2-CN9A
6986    2-CN9A
6987    2-CN9A
Name: Subject_RID, Length: 645, dtype: object

# Create Dataset

In [152]:


# test_dataset = execution.create_dataset(['LAC', 'Test'], description='A race/gender/age matched test dataset')
# EA.add_dataset_members( dataset_rid = test_dataset, members = testDF['Subject_RID'])

# Create the USC test dataset through this execution, then register the
# selected image RIDs as its members.
_usc_description = 'A test dataset for photograph interpretation for referable glaucoma - UPDATED TO ANGLE 2'
test_dataset = execution.create_dataset(
    ['USC', 'Test'],
    description=_usc_description,
)
EA.add_dataset_members(
    dataset_rid=test_dataset,
    members=testImage_RIDS,
)
