# Connect Eye-AI and Load Libraries

In [259]:
# repo_dir = "Repos"   # Set this to be where your github repos are located.
# %load_ext autoreload
# %autoreload 2

# # Update the load path so python can find modules for the model
# import sys
# from pathlib import Path
# sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))
# sys.path.insert(0, str(Path.home() / repo_dir / "deriva-ml"))

In [260]:
# Prerequisites
import json
import os
from eye_ai.eye_ai import EyeAI

import pandas as pd
from pathlib import Path, PurePath
import logging

from deriva_ml import DatasetBag, Workflow, ExecutionConfiguration
from deriva_ml import MLVocab as vc
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [342]:
# Login
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
host = 'www.eye-ai.org'
# host = 'dev.eye-ai.org' for dev testing
catalog_id = "eye-ai"

gnl = GlobusNativeLogin(host=host)
if gnl.is_logged_in([host]):
    print("You are already logged in.")
else:
    gnl.login([host], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)
    print("Login Successful")

2025-02-06 08:02:50,493 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-02-06 08:02:50,494 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


You are already logged in.


In [343]:
cache_dir = '/data'
working_dir = '/data'
EA = EyeAI(hostname = host, catalog_id = catalog_id, cache_dir= cache_dir, working_dir=working_dir)

2025-02-06 08:02:52,756 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-02-06 08:02:52,757 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


# Configuration

In [344]:
# RID of source dataset, if any.
source_dataset = '2-277M'

# EA.add_term(vc.workflow_type, "Test Workflow", description="A test Workflow for new DM")

# Workflow instance
preds_workflow = Workflow(
    name="LAC data template",
    url="https://github.com/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/Sandbox_KB/Get_VGGPreds.ipynb",
    workflow_type="Diagnosis_Analysis"
)

# Configuration instance.
config = ExecutionConfiguration(
    datasets = [{'rid':source_dataset, 'materialize':False}],
    # Materialize set to False if you only need the metadata from the bag, and not the assets.
    assets = ['2-C8JM'],
    workflow = preds_workflow,
    description = "Instance of linking VGG19 predictions to patient-level data")

# Initialize execution
execution = EA.create_execution(config)


2025-02-06 08:02:57,808 - INFO - Configuration validation successful!
2025-02-06 08:02:58,797 - INFO - Initializing downloader: GenericDownloader v1.7.5 [Python 3.10.13, Linux-5.10.210-201.852.amzn2.x86_64-x86_64-with-glibc2.26]
2025-02-06 08:02:58,800 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-02-06 08:02:58,801 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2025-02-06 08:02:58,802 - INFO - Validating credentials for host: www.eye-ai.org
2025-02-06 08:02:58,838 - INFO - Creating bag directory: /tmp/tmpahepa4ja/Dataset_2-277M
2025-02-06 08:02:58,843 - INFO - Creating bag for directory /tmp/tmpahepa4ja/Dataset_2-277M
2025-02-06 08:02:58,843 - INFO - Creating data directory
2025-02-06 08:02:58,853 - INFO - Moving /tmp/tmpahepa4ja/Dataset_2-277M/tmpchdgruly to data
2025-02-06 08:02

In [403]:
print(execution)

caching_dir: /data
working_dir: /data/kb_766/EyeAI_working
execution_rid: 4-M4XR
workflow_rid: 4-M4TT
dataset_paths: [PosixPath('/data/2-277M_3d5894a55b711ed128a6c813c5f41e51b996cf02138edeec784add3445e16d10/Dataset_2-277M')]
asset_paths: [PosixPath('/data/kb_766/EyeAI_working/4-M4XR/asset/predictions_results.csv')]
configuration: datasets=[DatasetSpec(rid='2-277M', materialize=False)] assets=['2-C8JM'] workflow=Workflow(name='LAC data template', url='https://github.com/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/Sandbox_KB/Get_VGGPreds.ipynb', workflow_type='Diagnosis_Analysis', version=None, description=None) description='Instance of linking VGG19 predictions to patient-level data'


# Get Pertinent Datasets

In [347]:
ds_bag = DatasetBag(execution.dataset_paths[0])

In [556]:
# Get expert consensus diagnosis
# experts = ['Benjamin Xu', 'Brandon Wong', 'Van Nguyen']
# dxExpertOG = EA.image_tall(ds_bag, 'AI_glaucomasuspect_test')
# dxExpertOG = dxExpertOG[ dxExpertOG['Full_Name'].isin(experts) ]

ridStore = []
dxStore = []
cDxStore = []
cdrStore = []
for id in list(dxExpertOG['Image_RID'].unique()):
    ridStore.append(id)
    dxTemp = dxExpertOG[ dxExpertOG['Image_RID'] == id ]
    dxCDR = dxTemp[dxTemp['Cup_Disk_Ratio'].apply(type) == float] 
    if len(dxCDR) > 0:
        cdrStore.append( round( dxCDR['Cup_Disk_Ratio'].sum() / len(dxCDR['Cup_Disk_Ratio']), 1 ) )
        if (dxTemp['Diagnosis_Image'] == 'Suspected Glaucoma').sum() > 1:
            dxStore.append('Suspected Glaucoma')
            cDxStore.append( (dxTemp['Diagnosis_Image'] == 'Suspected Glaucoma').sum() )
        else:
            dxStore.append('No Glaucoma')
            cDxStore.append( (dxTemp['Diagnosis_Image'] == 'No Glaucoma').sum() )
    else:
        cdrStore.append('')
        dxStore.append('Not Graded, Bad Quality')
        cDxStore.append(3)

dxExpert = pd.DataFrame({'RID_Image':ridStore, 'Diagnosis_Image_Expert':dxStore, 'Diagnosis_Image_Expert_Count':cDxStore, 'CDR_Expert':cdrStore})

In [563]:
ds_bag.get_table_as_dataframe('Observation').columns

eye-ai:Observation


Index(['RID', 'RCT', 'RMT', 'RCB', 'RMB', 'Observation_ID', 'Subject', 'hba1c',
       'glaucoma_hx', 'visual_acuity_right', 'visual_acuity_left',
       'date_of_encounter', 'reviewed_date', 'provider', 'consultant',
       'dr_level', 'consult_id', 'site_mrn', 'assessment_and_recommendation',
       'additional_comments', 'return_time_frame',
       'referral_status_time_frame', 'Subject_hypertension',
       'Subject_insulin_dependent', 'Subject_pregnant', 'Subject_cataract',
       'Subject_maculopathy', 'Subject_other', 'Subject_image_quality'],
      dtype='object')

In [577]:
# Function to update column names
pd.options.mode.copy_on_write = True
def updateCols(df, cols, colDict):
    df = df[cols]
    df.rename( columns = colDict, inplace = True )
    for c in set(cols).intersection( set(colDict) ): cols[cols.index(c)] = colDict.get(c)
    return df

cols = ['Image', 'Diagnosis_Image_Optom', 'Diagnosis_Image_CNN']
colDict = {'Image':'RID_Image', 'Observation':'RID_Observation', 'Subject':'RID_Subject'}

# Build up diagnosis DF for Optom and CNN
diags = ds_bag.get_table_as_dataframe('Image_Diagnosis')
diags = pd.merge( diags[diags['Execution'] == '2-C6E0'],
                   diags[diags['Diagnosis_Tag'] == 'Initial Diagnosis'],
                   on = 'Image', how = 'left', suffixes = ['_CNN', '_Optom'])

diags = updateCols( diags, cols, colDict )
del(cols[0])
cols[:0] = ['RID_Image', 'Diagnosis_Image_Expert', 'Diagnosis_Image_Expert_Count', 'CDR_Expert']

# Merge onto diagnosis DF for Expert
diags = pd.merge( dxExpert, diags, on = 'RID_Image', how = 'left' )

# Link to image data
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Image'),
                  diags,
                  left_on = 'RID', right_on = 'RID_Image', 
                  how = 'right')

cols[:0] = ['Observation', 'Image_Side']
linkdDF = updateCols( linkdDF, cols, colDict )

# Link to observation data
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Observation'),
                   linkdDF,
                   left_on = 'RID', right_on = 'RID_Observation', 
                   how = 'right')

cols[:0] = ['Subject', 'site_mrn', 'date_of_encounter','hba1c', 'dr_level', 'glaucoma_hx', 'Subject_image_quality']
linkdDF = updateCols( linkdDF, cols, colDict )

# Link to subject data
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Subject'),
                   linkdDF,
                   left_on = 'RID', right_on = 'RID_Subject', 
                   how = 'right')

cols[:0] = ['RID_Subject', 'site_mrn', 'Subject_Gender', 'Subject_Ethnicity']
del(cols[4]) # remove duplicated RID_Subject
del(cols[4]) # and site_mrn
linkdDF = updateCols( linkdDF, cols, colDict )

eye-ai:Image_Diagnosis
eye-ai:Image
eye-ai:Observation
eye-ai:Subject


In [579]:
# Get Predictions from Execution 2-C6E0 (VGG19 on test set)
preds = pd.read_csv(execution.asset_paths[0])

# Get RID Image from Filename
preds['Filename'] = preds['Filename'].apply(lambda x: x.split("_")[3].split(".")[0])

# Link back to full DF
linkdDF = pd.merge( linkdDF,
                   preds[['Filename', 'Probability Score']],
                   left_on = 'RID_Image', right_on = 'Filename', 
                   how = 'left')

cols.append('Probability Score')
colDict = {'Probability Score':'Diagnosis_CNN_Prob'}
linkdDF = updateCols( linkdDF, cols, colDict )

linkdDF

Unnamed: 0,RID_Subject,site_mrn,Subject_Gender,Subject_Ethnicity,date_of_encounter,hba1c,dr_level,glaucoma_hx,Subject_image_quality,RID_Observation,Image_Side,RID_Image,Diagnosis_Image_Expert,Diagnosis_Image_Expert_Count,CDR_Expert,Diagnosis_Image_Optom,Diagnosis_Image_CNN,Diagnosis_CNN_Prob,Diagnosis_CNN_Prob.1
0,6RDP,100132639,F,Other,2019-09-08,7.1,Mild nonproliferative diabetic retinopathy,No,Adequate,76M4,Left,9ZYM,Suspected Glaucoma,3,0.7,Suspected Glaucoma,Suspected Glaucoma,0.982755,0.982755
1,6RDP,100132639,F,Other,2019-09-08,7.1,Mild nonproliferative diabetic retinopathy,No,Adequate,76M4,Right,9ZYR,Suspected Glaucoma,3,0.6,Suspected Glaucoma,Suspected Glaucoma,0.983176,0.983176
2,6RE0,100001258,M,Asian,2021-07-18,6.9,Mild nonproliferative diabetic retinopathy,No,Adequate,76MG,Left,AAZ6,Suspected Glaucoma,3,0.7,Suspected Glaucoma,Suspected Glaucoma,0.780545,0.780545
3,6RE0,100001258,M,Asian,2021-07-18,6.9,Mild nonproliferative diabetic retinopathy,No,Adequate,76MG,Right,AAZC,Suspected Glaucoma,3,0.8,Suspected Glaucoma,Suspected Glaucoma,0.907014,0.907014
4,6REA,101417587,M,African Descent,2022-06-01,0.0,No apparent diabetic retinopathy,No,Adequate,76MT,Right,AF76,No Glaucoma,3,0.3,Suspected Glaucoma,No Glaucoma,0.047736,0.047736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,76J0,100239661,M,Latin American,2021-07-05,,No apparent diabetic retinopathy,No,Adequate,7NV4,Right,AANW,Suspected Glaucoma,3,0.7,Suspected Glaucoma,Suspected Glaucoma,0.637393,0.637393
996,76J2,100171233,M,Latin American,2020-02-10,8.1,Mild nonproliferative diabetic retinopathy,No,Excellent,7NV6,Right,9CWT,No Glaucoma,3,0.2,No Glaucoma,No Glaucoma,0.002526,0.002526
997,76J2,100171233,M,Latin American,2020-02-10,8.1,Mild nonproliferative diabetic retinopathy,No,Excellent,7NV6,Left,9CWW,No Glaucoma,3,0.2,No Glaucoma,No Glaucoma,0.010679,0.010679
998,76KC,100118650,F,Latin American,2017-02-26,9.7,DR not determined,No,Insufficient for Full Interpretation,7NWG,Left,7R0C,Suspected Glaucoma,2,0.5,No Glaucoma,Suspected Glaucoma,0.876154,0.876154


In [599]:
# Make a subject level DF
def getMaxDx(dxList):
    if (dxList == 'Suspected Glaucoma').sum() > 0:
        return 'Suspected Glaucoma'
    else:
        return 'No Glaucoma'



'Suspected Glaucoma'

# Calculate Parity Metrics

In [529]:
# Define functions
def glcRate(xTab):
    return (xTab.iloc[0,1] + xTab.iloc[1,1]) / xTab.to_numpy().sum()
    
def accuracy(xTab):
    return (xTab.iloc[0,0] + xTab.iloc[1,1]) / xTab.to_numpy().sum()

def tpr(xTab):
    return xTab.iloc[1,1] / (xTab.iloc[1,1] + xTab.iloc[0,1])

def tnr(xTab):
    return xTab.iloc[0,0] / (xTab.iloc[1,0] + xTab.iloc[0,0])

def fpr(xTab):
    return xTab.iloc[1,0] / (xTab.iloc[1,0] + xTab.iloc[0,0])

def fnr(xTab):
    return xTab.iloc[0,1] / (xTab.iloc[0,1] + xTab.iloc[1,1])

def getParityMetrics(matrixList):
    vals = { 'n':{}, 'glcRate':{}, 'accuracy':{}, 'tpr':{}, 'tnr':{}, 'fpr':{}, 'fnr':{} }
    for e in matrixList.keys():
        vals['n'][e] = matrixList[e].to_numpy().sum()
        vals['glcRate'][e] = glcRate( matrixList[e] )
        vals['accuracy'][e] = accuracy( matrixList[e] )
        vals['tpr'][e] = tpr( matrixList[e] )
        vals['tnr'][e] = tnr( matrixList[e] )
        vals['fpr'][e] = fpr( matrixList[e] )
        vals['fnr'][e] = fnr( matrixList[e] )
    return pd.DataFrame.from_dict(vals).transpose().loc[:,['All', 'Latin American', 'African Descent', 'Asian', 'Caucasian', 'ethnicity not specified', 'Other']]

# Make confusion matrices CNN vs. Optom/Trained labels
matrixList = {}
matrixList['All'] = pd.crosstab( linkdDF['Diagnosis_Image_CNN'], linkdDF['Diagnosis_Image_Optom'] )
for e in pd.unique( linkdDF['Subject_Ethnicity'] ):
    matrixList[e] = pd.crosstab( linkdDF[ linkdDF['Subject_Ethnicity'] == e ]['Diagnosis_Image_CNN'], linkdDF[ linkdDF['Subject_Ethnicity'] == e ]['Diagnosis_Image_Optom'] )

# Save off all parity metrics
getParityMetrics( matrixList )

Unnamed: 0,All,Latin American,African Descent,Asian,Caucasian,ethnicity not specified,Other
n,1000.0,706.0,86.0,52.0,20.0,106.0,30.0
glcRate,0.5,0.467422,0.72093,0.461538,0.4,0.54717,0.6
accuracy,0.807,0.787535,0.848837,0.807692,0.7,0.877358,0.966667
tpr,0.738,0.709091,0.806452,0.708333,0.375,0.827586,0.944444
tnr,0.876,0.856383,0.958333,0.892857,0.916667,0.9375,1.0
fpr,0.124,0.143617,0.041667,0.107143,0.083333,0.0625,0.0
fnr,0.262,0.290909,0.193548,0.291667,0.625,0.172414,0.055556


In [532]:
# Make confusion matrices CNN vs. Expert labels
expertGradedDF = linkdDF[linkdDF['Diagnosis_Image_Expert'] != 'Not Graded, Bad Quality']

matrixList = {}
matrixList['All'] = pd.crosstab( expertGradedDF['Diagnosis_Image_CNN'], expertGradedDF['Diagnosis_Image_Expert'] )
for e in pd.unique( linkdDF['Subject_Ethnicity'] ):
    matrixList[e] = pd.crosstab( expertGradedDF[ expertGradedDF['Subject_Ethnicity'] == e ]['Diagnosis_Image_CNN'], expertGradedDF[ expertGradedDF['Subject_Ethnicity'] == e ]['Diagnosis_Image_Expert'] )

getParityMetrics( matrixList )

Unnamed: 0,All,Latin American,African Descent,Asian,Caucasian,ethnicity not specified,Other
n,983.0,696.0,86.0,49.0,20.0,102.0,30.0
glcRate,0.387589,0.362069,0.569767,0.346939,0.15,0.431373,0.533333
accuracy,0.836216,0.823276,0.860465,0.857143,0.95,0.872549,0.833333
tpr,0.853018,0.825397,0.897959,0.882353,1.0,0.931818,0.875
tnr,0.825581,0.822072,0.810811,0.84375,0.941176,0.827586,0.785714
fpr,0.174419,0.177928,0.189189,0.15625,0.058824,0.172414,0.214286
fnr,0.146982,0.174603,0.102041,0.117647,0.0,0.068182,0.125


In [533]:
# Make confusion matrices Optom vs. Expert labels
matrixList = {}
matrixList['All'] = pd.crosstab( expertGradedDF['Diagnosis_Image_Optom'], expertGradedDF['Diagnosis_Image_Expert'] )
for e in pd.unique( linkdDF['Subject_Ethnicity'] ):
    matrixList[e] = pd.crosstab( expertGradedDF[ expertGradedDF['Subject_Ethnicity'] == e ]['Diagnosis_Image_Optom'], expertGradedDF[ expertGradedDF['Subject_Ethnicity'] == e ]['Diagnosis_Image_Expert'] )

getParityMetrics( matrixList )

Unnamed: 0,All,Latin American,African Descent,Asian,Caucasian,ethnicity not specified,Other
n,983.0,696.0,86.0,49.0,20.0,102.0,30.0
glcRate,0.387589,0.362069,0.569767,0.346939,0.15,0.431373,0.533333
accuracy,0.76704,0.741379,0.77907,0.836735,0.75,0.872549,0.866667
tpr,0.850394,0.793651,0.938776,0.941176,1.0,1.0,0.9375
tnr,0.714286,0.711712,0.567568,0.78125,0.705882,0.775862,0.785714
fpr,0.285714,0.288288,0.432432,0.21875,0.294118,0.224138,0.214286
fnr,0.149606,0.206349,0.061224,0.058824,0.0,0.0,0.0625


In [351]:



# Space to stop autoscroll




# Upload Results

In [568]:
# crete asset path
asset_type_name = "Diagnosis_Analysis"
asset_path = execution.execution_asset_path(asset_type_name)

# save assets to asset_path
linkdDF.to_csv(asset_path/'SubjectToVGG19.csv', index=False)
#parityMetrics.to_csv(asset_path/'ParityMetrics.csv', index=False)

# upload assets to catalog
execution.upload_execution_outputs(clean_folder=True)

2025-02-07 11:16:06,651 - INFO - Initializing uploader: GenericUploader v1.7.5 [Python 3.10.13, Linux-5.10.210-201.852.amzn2.x86_64-x86_64-with-glibc2.26]
2025-02-07 11:16:06,652 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-02-07 11:16:06,653 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2025-02-07 11:16:06,695 - INFO - Scanning files in directory [/data/kb_766/EyeAI_working/deriva-ml/execution/4-M4XR/execution-asset]...
2025-02-07 11:16:06,696 - INFO - Including file: [/data/kb_766/EyeAI_working/deriva-ml/execution/4-M4XR/execution-asset/Diagnosis_Analysis/SubjectToVGG19.csv].
2025-02-07 11:16:06,697 - INFO - Processing: [/data/kb_766/EyeAI_working/deriva-ml/execution/4-M4XR/execution-asset/Diagnosis_Analysis/SubjectToVGG19.csv]
2025-02-07 11:16:06,698 - INFO - Computed metadata

{'Diagnosis_Analysis/SubjectToVGG19.csv': FileUploadState(state=<UploadState.success: 0>, status='Complete', result={'URL': '/hatrac/execution_asset/170c13d8038be9bfee8d8e5b9c2d2d22.SubjectToVGG19.csv:9PO4_cp0z0vg_TT9FaorpAYcL7CHQ644', 'RID': '4-M51E', 'RCT': '2025-02-07T19:16:07.109964+00:00', 'RMT': '2025-02-07T19:16:07.109964+00:00', 'RCB': 'https://auth.globus.org/6022643c-876c-4a47-bafa-5b9fac2c7782', 'RMB': 'https://auth.globus.org/6022643c-876c-4a47-bafa-5b9fac2c7782', 'Filename': 'SubjectToVGG19.csv', 'Description': None, 'Length': 183854, 'MD5': '170c13d8038be9bfee8d8e5b9c2d2d22', 'Execution_Asset_Type': 'Diagnosis_Analysis'}, rid='4-M51E')}