In [259]:
# repo_dir = "Repos"   # Set this to be where your github repos are located.
# %load_ext autoreload
# %autoreload 2

# # Update the load path so python can find modules for the model
# import sys
# from pathlib import Path
# sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))
# sys.path.insert(0, str(Path.home() / repo_dir / "deriva-ml"))

In [260]:
# Prerequisites
import json
import os
from eye_ai.eye_ai import EyeAI

import pandas as pd
from pathlib import Path, PurePath
import logging

from deriva_ml import DatasetBag, Workflow, ExecutionConfiguration
from deriva_ml import MLVocab as vc
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [261]:
# Login
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
host = 'www.eye-ai.org'
# host = 'dev.eye-ai.org' for dev testing
catalog_id = "eye-ai"

gnl = GlobusNativeLogin(host=host)
if gnl.is_logged_in([host]):
    print("You are already logged in.")
else:
    gnl.login([host], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)
    print("Login Successful")

2025-02-05 16:24:22,617 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-02-05 16:24:22,618 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


You are already logged in.


In [262]:
cache_dir = '/data'
working_dir = '/data'
EA = EyeAI(hostname = host, catalog_id = catalog_id, cache_dir= cache_dir, working_dir=working_dir)

2025-02-05 16:24:23,327 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-02-05 16:24:23,328 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


# Configuration

In [263]:
# RID of source dataset, if any.
source_dataset = '2-277M'

# EA.add_term(vc.workflow_type, "Test Workflow", description="A test Workflow for new DM")

# Workflow instance
preds_workflow = Workflow(
    name="LAC data template",
    url="https://github.com/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/Sandbox_KB/Get_VGGPreds.ipynb",
    workflow_type="Diagnosis_Analysis"
)

# Configuration instance.
config = ExecutionConfiguration(
    datasets = [{'rid':source_dataset, 'materialize':False}],
    # Materialize set to False if you only need the metadata from the bag, and not the assets.
    assets = ['2-C8JM'],
    workflow = preds_workflow,
    description = "Instance of linking VGG19 predictions to patient-level data")

# Initialize execution
execution = EA.create_execution(config)


2025-02-05 16:24:25,126 - INFO - Configuration validation successful!
2025-02-05 16:24:26,047 - INFO - Initializing downloader: GenericDownloader v1.7.5 [Python 3.10.13, Linux-5.10.210-201.852.amzn2.x86_64-x86_64-with-glibc2.26]
2025-02-05 16:24:26,050 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-02-05 16:24:26,050 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2025-02-05 16:24:26,052 - INFO - Validating credentials for host: www.eye-ai.org
2025-02-05 16:24:26,086 - INFO - Creating bag directory: /tmp/tmp1g1xfaex/Dataset_2-277M
2025-02-05 16:24:26,088 - INFO - Creating bag for directory /tmp/tmp1g1xfaex/Dataset_2-277M
2025-02-05 16:24:26,089 - INFO - Creating data directory
2025-02-05 16:24:26,089 - INFO - Moving /tmp/tmp1g1xfaex/Dataset_2-277M/tmpq5kpud_f to data
2025-02-05 16:24

In [264]:
print(execution)

caching_dir: /data
working_dir: /data/kb_766/EyeAI_working
execution_rid: 4-M4WJ
workflow_rid: 4-M4TT
dataset_paths: [PosixPath('/data/2-277M_30e333e8c5102b3da0db96eab8566027459f01ad23d696b921494b609b34ccac/Dataset_2-277M')]
asset_paths: [PosixPath('/data/kb_766/EyeAI_working/4-M4WJ/asset/predictions_results.csv')]
configuration: datasets=[DatasetSpec(rid='2-277M', materialize=False)] assets=['2-C8JM'] workflow=Workflow(name='LAC data template', url='https://github.com/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/Sandbox_KB/Get_VGGPreds.ipynb', workflow_type='Diagnosis_Analysis', version=None, description=None) description='Instance of linking VGG19 predictions to patient-level data'


# Get Pertinent Datasets

In [265]:
ds_bag = DatasetBag(execution.dataset_paths[0])

In [266]:
# Function to update column names
pd.options.mode.copy_on_write = True
def updateCols(df, cols, colDict):
    df = df[cols]
    df.rename( columns = colDict, inplace = True )
    for c in set(cols).intersection( set(colDict) ): cols[cols.index(c)] = colDict.get(c)
    return df

cols = ['Image', 'RID_Optom', 'Diagnosis_Image_Optom', 'RID_CNN', 'Diagnosis_Image_CNN']

# Build up diagnosis DF
diagCNN = ds_bag.get_table_as_dataframe('Image_Diagnosis')
diagCNN = pd.merge( diagCNN[diagCNN['Execution'] == '2-C6E0'],
                   diagCNN[diagCNN['Diagnosis_Tag'] == 'Initial Diagnosis'],
                   on = 'Image', how = 'left', suffixes = ['_CNN', '_Optom'])

colDict = {'RID_CNN':'RID_Diagnosis_CNN', 'RID_Optom':'RID_Diagnosis_Optom', 'Image':'RID_Image'}
diagCNN = updateCols( diagCNN, cols, colDict )

# Link to image data
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Image'),
                  diagCNN,
                  left_on = 'RID', right_on = 'RID_Image', 
                  how = 'right')

cols[:0] = ['Observation', 'Image_Side']
colDict = {'Observation':'RID_Observation'}
linkdDF = updateCols( linkdDF, cols, colDict )

# Link to observation data
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Observation'),
                   linkdDF,
                   left_on = 'RID', right_on = 'RID_Observation', 
                   how = 'right')

cols[:0] = ['Subject', 'hba1c', 'glaucoma_hx', 'Subject_image_quality']
colDict = {'Subject':'RID_Subject'}
linkdDF = updateCols( linkdDF, cols, colDict )

# Link to subject data
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Subject'),
                   linkdDF,
                   left_on = 'RID', right_on = 'RID_Subject', 
                   how = 'right')

cols[:0] = ['RID_Subject', 'Subject_Gender', 'Subject_Ethnicity']
del(cols[3]) # remove duplicated RID_Subject
linkdDF = updateCols( linkdDF, cols, colDict )

linkdDF

eye-ai:Image_Diagnosis
eye-ai:Image
eye-ai:Observation
eye-ai:Subject


Unnamed: 0,RID_Subject,Subject_Gender,Subject_Ethnicity,hba1c,glaucoma_hx,Subject_image_quality,RID_Observation,Image_Side,RID_Image,RID_Diagnosis_Optom,Diagnosis_Image_Optom,RID_Diagnosis_CNN,Diagnosis_Image_CNN
0,70GG,F,Latin American,7.8,No,Adequate,7FHY,Left,7P0A,4-92KC,No Glaucoma,3-X96E,No Glaucoma
1,70GG,F,Latin American,7.8,No,Adequate,7FHY,Right,7P0C,4-92KE,No Glaucoma,3-X96G,No Glaucoma
2,6YWM,M,Latin American,8.9,No,Adequate,7DT0,Left,7P9A,4-92YG,No Glaucoma,3-X96J,No Glaucoma
3,6YWM,M,Latin American,8.9,No,Adequate,7DT0,Right,7P9J,4-92YR,No Glaucoma,3-X96M,No Glaucoma
4,6VP8,F,Caucasian,7.1,No,Adequate,7AA0,Left,7PE8,4-934E,No Glaucoma,3-X96P,Suspected Glaucoma
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1089,72A6,F,ethnicity not specified,8.7,No,Adequate,7HEP,Left,AH5P,4-8ZEA,Suspected Glaucoma,3-XBAG,Suspected Glaucoma
1090,74R4,M,Latin American,8.4,No,Adequate,7KZW,Right,AH6P,4-8ZFA,Suspected Glaucoma,3-XBAJ,No Glaucoma
1091,74R4,M,Latin American,8.4,No,Adequate,7KZW,Left,AH6R,4-8ZFC,Suspected Glaucoma,3-XBAM,Suspected Glaucoma
1092,7438,F,African Descent,11.7,No,Adequate,7KAA,Right,AHCR,4-8ZMW,Suspected Glaucoma,3-XBAP,Suspected Glaucoma


In [269]:
# Get Predictions from Execution 2-C6E0 (VGG19 on test set)
preds = pd.read_csv(execution.asset_paths[0])

# Get RID Image from Filename
preds['Filename'] = preds['Filename'].apply(lambda x: x.split("_")[3].split(".")[0])

# Link back to full DF
linkdDF = pd.merge( linkdDF,
                   preds[['Filename', 'Probability Score']],
                   left_on = 'RID_Image', right_on = 'Filename', 
                   how = 'left')

cols.append('Probability Score')
colDict = {'Probability Score':'Diagnosis_CNN_Prob'}
linkdDF = updateCols( linkdDF, cols, colDict )

linkdDF

Unnamed: 0,RID_Subject,Subject_Gender,Subject_Ethnicity,hba1c,glaucoma_hx,Subject_image_quality,RID_Observation,Image_Side,RID_Image,RID_Diagnosis_Optom,Diagnosis_Image_Optom,RID_Diagnosis_CNN,Diagnosis_Image_CNN,Diagnosis_CNN_Prob,Diagnosis_CNN_Prob.1
0,70GG,F,Latin American,7.8,No,Adequate,7FHY,Left,7P0A,4-92KC,No Glaucoma,3-X96E,No Glaucoma,0.005275,0.005275
1,70GG,F,Latin American,7.8,No,Adequate,7FHY,Right,7P0C,4-92KE,No Glaucoma,3-X96G,No Glaucoma,0.195267,0.195267
2,6YWM,M,Latin American,8.9,No,Adequate,7DT0,Left,7P9A,4-92YG,No Glaucoma,3-X96J,No Glaucoma,0.012258,0.012258
3,6YWM,M,Latin American,8.9,No,Adequate,7DT0,Right,7P9J,4-92YR,No Glaucoma,3-X96M,No Glaucoma,0.060034,0.060034
4,6VP8,F,Caucasian,7.1,No,Adequate,7AA0,Left,7PE8,4-934E,No Glaucoma,3-X96P,Suspected Glaucoma,0.663840,0.663840
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1089,72A6,F,ethnicity not specified,8.7,No,Adequate,7HEP,Left,AH5P,4-8ZEA,Suspected Glaucoma,3-XBAG,Suspected Glaucoma,0.994912,0.994912
1090,74R4,M,Latin American,8.4,No,Adequate,7KZW,Right,AH6P,4-8ZFA,Suspected Glaucoma,3-XBAJ,No Glaucoma,0.399114,0.399114
1091,74R4,M,Latin American,8.4,No,Adequate,7KZW,Left,AH6R,4-8ZFC,Suspected Glaucoma,3-XBAM,Suspected Glaucoma,0.933033,0.933033
1092,7438,F,African Descent,11.7,No,Adequate,7KAA,Right,AHCR,4-8ZMW,Suspected Glaucoma,3-XBAP,Suspected Glaucoma,0.981390,0.981390


In [300]:
# Define functions
def accuracy(xTab):
    return (xTab.iloc[0,0] + xTab.iloc[1,1]) / xTab.to_numpy().sum()

def tpr(xTab):
    return xTab.iloc[1,1] / (xTab.iloc[1,1] + xTab.iloc[0,1])

def tnr(xTab):
    return xTab.iloc[0,0] / (xTab.iloc[1,0] + xTab.iloc[0,0])

def fpr(xTab):
    return xTab.iloc[1,0] / (xTab.iloc[1,0] + xTab.iloc[0,0])

def fnr(xTab):
    return xTab.iloc[0,1] / (xTab.iloc[0,1] + xTab.iloc[1,1])

# Make confusion matrices
matrixList = {}
for e in pd.unique( linkdDF['Subject_Ethnicity'] ):
    matrixList[e] = pd.crosstab( linkdDF[ linkdDF['Subject_Ethnicity'] == e ]['Diagnosis_Image_CNN'], linkdDF[ linkdDF['Subject_Ethnicity'] == e ]['Diagnosis_Image_Optom'] )

# Save off all parity metrics
vals = { 'n':{}, 'accuracy':{}, 'tpr':{}, 'tnr':{}, 'fpr':{}, 'fnr':{} }

for e in matrixList.keys():
    vals['n'][e] = matrixList[e].to_numpy().sum()
    vals['accuracy'][e] = accuracy( matrixList[e] )
    vals['tpr'][e] = tpr( matrixList[e] )
    vals['tnr'][e] = tnr( matrixList[e] )
    vals['fpr'][e] = fpr( matrixList[e] )
    vals['fnr'][e] = fnr( matrixList[e] )

parityMetrics = pd.DataFrame.from_dict(vals).transpose()
parityMetrics

Unnamed: 0,Latin American,Caucasian,Asian,ethnicity not specified,Other,African Descent
n,778.0,24.0,56.0,112.0,34.0,90.0
accuracy,0.791774,0.708333,0.821429,0.848214,0.970588,0.855556
tpr,0.726316,0.4,0.75,0.78125,0.95,0.818182
tnr,0.854271,0.928571,0.892857,0.9375,1.0,0.958333
fpr,0.145729,0.071429,0.107143,0.0625,0.0,0.041667
fnr,0.273684,0.6,0.25,0.21875,0.05,0.181818


In [None]:



# Space to stop autoscroll




In [None]:
with execution.execute() as exec:
    print("An ML Execution.") 

# Upload results

In [None]:
# crete asset path
asset_type_name = ""
asset_path = execution.execution_asset_path(asset_type_name)
# save assets to asset_path


# upload assets to catalog
execution.upload_execution_outputs(clean_folder=True)