# Connect Eye-AI and Load Libraries

In [1]:
# Directory under the user's home folder that holds the local git checkouts.
repo_dir = "Repos"   # Set this to be where your github repos are located.
# Re-import edited modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

# Update the load path so python can find modules for the model
import sys
from pathlib import Path
# Inserted at position 0 so these checkouts are searched before anything else on sys.path.
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))
sys.path.insert(0, str(Path.home() / repo_dir / "deriva-ml"))

In [2]:
# Prerequisites
import json
import os
from eye_ai.eye_ai import EyeAI

import pandas as pd
import numpy as np
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
from pathlib import Path, PurePath
import logging

#from deriva_ml import DatasetBag, Workflow, ExecutionConfiguration
from deriva_ml import DerivaML, Workflow, ExecutionConfiguration, VersionPart
from deriva_ml import MLVocab as vc
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [9]:
# Login: authenticate against the Eye-AI host through Globus, unless a session already exists.
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin

catalog_id = "eye-ai"
host = 'www.eye-ai.org'
#host = 'dev.eye-ai.org' #for dev testing

gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    # No usable session for this host yet: run the login flow with the
    # local-server and browser options switched off.
    gnl.login(
        [host],
        no_local_server=True,
        no_browser=True,
        refresh_tokens=True,
        update_bdbag_keychain=True,
    )
    print("Login Successful")
else:
    print("You are already logged in.")

2025-09-15 10:10:51,366 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-09-15 10:10:51,380 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2025-09-15 10:10:51,393 - INFO - Setting up RefreshTokenAuthorizer with auth_client=[instance:140292188711440]
2025-09-15 10:10:51,394 - INFO - Setting up a RenewingAuthorizer. It will use an auth type of Bearer and can handle 401s.
2025-09-15 10:10:51,396 - INFO - RenewingAuthorizer will start by using access_token with hash "e33ef7a3348b9005ac42cf9fecd81c2548fb1126913f3d767c810a77f8468aad"
2025-09-15 10:10:51,397 - INFO - Executing token refresh without client credentials
2025-09-15 10:10:51,398 - INFO - Fetching new token from Globus Auth
2025-09-15 10:10:51,924 - INFO - request done (success)
2025-09-15 10:10:51,925 - INFO - RenewingAuthorizer.

You are already logged in.


# Configuration

In [10]:
# Cache and working directories handed to the EyeAI client.
# NOTE(review): the separate DerivaML instance created in a later cell is not given
# these directories; the printed execution reports paths under
# /home/kb_766/deriva-ml/DerivaML_working instead — confirm that is intended.
cache_dir = '/data'
working_dir = '/data'
# EA is the EyeAI client used below for workflow registration and image_tall().
EA = EyeAI(hostname = host, catalog_id = catalog_id, cache_dir= cache_dir, working_dir=working_dir)

2025-09-15 10:10:56,705 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-09-15 10:10:56,705 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>


In [59]:

# Dataset RID analysed in this run; previously used RIDs are kept in the trailing comment.
source_dataset = '4-Z6K8' # "2-277G"  #"2-N93J"
#asset_RID = ["2-C8JM"]
# Reuse the catalog_id chosen in the login cell so the host/catalog pair is defined in
# exactly one place (it is the same value that is passed to EyeAI above).
ml_instance = DerivaML(host, catalog_id=catalog_id)

#ml_instance.increment_dataset_version(dataset_rid='4-Z6K8', component= VersionPart.patch, description='Update to latest deriva-ml schema')

# Workflow record describing this notebook; the value returned by add_workflow is what
# the execution configuration references (printed later as the workflow RID).
preds_workflow = EA.add_workflow(
    Workflow(
        name="LAC data template",
        url="https://github.com/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/Sandbox_KB/Get_VGGPreds.ipynb",
        workflow_type="Test Workflow",
    )
)

# One dataset, pinned to the version the catalog currently reports for it.
# "materialize": False is passed through as-is; presumably it skips downloading the
# image assets — TODO confirm against deriva-ml.
config = ExecutionConfiguration(
    datasets=[
        {
            "rid": source_dataset,
            "materialize": False,
            "version": ml_instance.dataset_version(source_dataset),
        }
    ],
    #assets=asset_RID,
    workflow=preds_workflow,
    description="Instance of linking VGG19 predictions to patient-level data",
)

# NOTE(review): `exec` shadows the Python builtin of the same name. The name is kept
# because later cells (dataset access, asset paths, upload) refer to it.
exec = ml_instance.create_execution(config)

2025-09-15 11:10:01,149 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-09-15 11:10:01,150 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2025-09-15 11:10:03,451 - INFO - Materialize bag 4-Z6K8... 
2025-09-15 11:10:03,575 - INFO - Creating client of type <class 'globus_sdk.services.auth.client.native_client.NativeAppAuthClient'> for service "auth"
2025-09-15 11:10:03,575 - INFO - Finished initializing AuthLoginClient. client_id='8ef15ba9-2b4a-469c-a163-7fd910c9d111', type(authorizer)=<class 'globus_sdk.authorizers.base.NullAuthorizer'>
2025-09-15 11:10:05,162 - INFO - Creating new MINID for dataset 4-Z6K8
2025-09-15 11:10:06,336 - INFO - Downloading dataset minid for catalog: 4-Z6K8@0.5.2
2025-09-15 11:10:06,337 - INFO - Creating client of type <class 'globus_sdk.services.auth.client

In [60]:
# Print the execution summary: working/cache directories, execution and workflow RIDs,
# asset paths and the resolved configuration.
print(exec)

caching_dir: /home/kb_766/deriva-ml/DerivaML_working/cache
_working_dir: /home/kb_766/deriva-ml/DerivaML_working
execution_rid: 5-50W4
workflow_rid: 4-M4TT
asset_paths: {}
configuration: datasets=[DatasetSpec(rid='4-Z6K8', materialize=False, version=DatasetVersion(major=0, minor=5, patch=2))] assets=[] workflow='4-M4TT' parameters={} description='Instance of linking VGG19 predictions to patient-level data' argv=['/home/kb_766/.conda/envs/my-tensorflow-conda/lib/python3.10/site-packages/ipykernel_launcher.py', '-f', '/home/kb_766/.local/share/jupyter/runtime/kernel-46e911a8-ebc8-4d68-857b-2a6a11a15e06.json']


# Work with Data

In [61]:
#Get Data
# The execution was configured with a single dataset, so index 0 is that dataset's bag.
ds_bag = exec.datasets[0]
# Earlier exploratory calls, kept for reference:
# imageDF = ds_bag.get_table_as_dataframe('Image_Diagnosis')
# angle2DF = EA.filter_angle_2( ds_bag )
# trainDF = EA.image_tall(ds_bag, 'Initial Diagnosis')

In [76]:
# Get expert consensus diagnosis
# Only grades entered by these three graders are used for the consensus.
experts = ['Benjamin Xu', 'Brandon Wong', 'Van Nguyen']
dxExpertOG = EA.image_tall(ds_bag, 'AI_glaucomasuspect_test')
dxExpertOG = dxExpertOG[ dxExpertOG['Full_Name'].isin(experts) ]

# Remove images graded Unknown by any grader (indicates unable to grade).
# A boolean mask builds a new frame instead of dropping rows in place by index label:
# it removes exactly the rows of the affected images (even if index labels repeat) and
# does not mutate a filtered slice.
unknownRIDs = dxExpertOG.loc[dxExpertOG['Diagnosis_Image'] == 'Unknown', 'Image_RID']
dxExpertOG = dxExpertOG[ ~dxExpertOG['Image_RID'].isin(unknownRIDs) ]

# Parallel per-image accumulators; each becomes one column of dxExpert.
ridStore = []
dxStore = []
cDxStore = []
cdrStore = []
byxStore = []
bwStore = []
vnStore = []

# `imageRID` rather than `id`, so the builtin id() is not shadowed by this cell.
for imageRID in dxExpertOG['Image_RID'].unique():
    ridStore.append(imageRID)
    dxTemp = dxExpertOG[ dxExpertOG['Image_RID'] == imageRID ]

    # Individual grade of each expert for this image. `.iloc[0]` raises IndexError when
    # an expert has no row for the image, i.e. every image is expected to carry all
    # three grades.
    byxStore.append( dxTemp.loc[ dxTemp['Full_Name'] == 'Benjamin Xu', 'Diagnosis_Image' ].iloc[0] )
    bwStore.append( dxTemp.loc[ dxTemp['Full_Name'] == 'Brandon Wong', 'Diagnosis_Image' ].iloc[0] )
    vnStore.append( dxTemp.loc[ dxTemp['Full_Name'] == 'Van Nguyen', 'Diagnosis_Image' ].iloc[0] )

    # Rows whose cup-to-disc ratio is stored as a float.
    # NOTE(review): NaN is also a float; if missing ratios are stored as NaN they are
    # counted in the denominator below — confirm how the column is encoded.
    dxCDR = dxTemp[dxTemp['Cup_Disk_Ratio'].apply(type) == float]
    if len(dxCDR) > 0:
        # Mean ratio over the float-typed rows, rounded to one decimal.
        cdrStore.append( round( dxCDR['Cup_Disk_Ratio'].sum() / len(dxCDR['Cup_Disk_Ratio']), 1 ) )
        # Majority vote: two or more 'Suspected Glaucoma' grades decide the label; the
        # stored count is the number of grades agreeing with the chosen label.
        nSuspect = (dxTemp['Diagnosis_Image'] == 'Suspected Glaucoma').sum()
        if nSuspect > 1:
            dxStore.append('Suspected Glaucoma')
            cDxStore.append( nSuspect )
        else:
            dxStore.append('No Glaucoma')
            cDxStore.append( (dxTemp['Diagnosis_Image'] == 'No Glaucoma').sum() )
    else:
        # No float ratio recorded by any expert: treated as an ungradable image.
        cdrStore.append('')
        dxStore.append('Not Graded, Bad Quality')
        cDxStore.append(3)

dxExpert = pd.DataFrame({
    'RID_Image': ridStore,
    'Diagnosis_Image_Expert': dxStore,
    'Diagnosis_Image_Expert_Count': cDxStore,
    'Diagnosis_BYX': byxStore,
    'Diagnosis_BW': bwStore,
    'Diagnosis_VN': vnStore,
    'CDR_Expert': cdrStore,
})

In [87]:

# Get Predictions from Execution 5-50TW (currently sitting in my working directory) (VGG19 on test set 4-Z6K8)
# NOTE(review): absolute, user-specific path — this cell only runs on the machine that
# holds this working directory.
preds = pd.read_csv('/data/kb_766/EyeAI_working/5-50TW/asset/VGG19_TrueCrop_Model_4-Z6K8_Sep_12_2025_predictions_results.csv')

# The image RID is the last underscore-separated token of the file name, without the
# extension (e.g. 'No_Glaucoma/Cropped_2-DCKR.JPG' -> '2-DCKR'). Taking the last token
# instead of a fixed position keeps this independent of how many underscores the class
# folder or file prefix contains.
preds['Image'] = preds['Filename'].apply(lambda x: x.rsplit("_", 1)[-1].split(".")[0])

# NOTE(review): 'Prediction' is a 0/1 code in this file, while the catalog diagnoses it
# is later compared with are text labels — confirm whether it should be mapped to
# 'No Glaucoma' / 'Suspected Glaucoma' before any label comparison.
preds['Diagnosis_Image'] = preds['Prediction']
preds

Unnamed: 0,Filename,True Label,Prediction,Probability Score,Image,Diagnosis_Image
0,No_Glaucoma/Cropped_2-DCKR.JPG,0.0,1,0.513919,2-DCKR,1
1,No_Glaucoma/Cropped_2-DCKY.JPG,0.0,0,0.414791,2-DCKY,0
2,No_Glaucoma/Cropped_2-DCN0.JPG,0.0,1,0.501660,2-DCN0,1
3,No_Glaucoma/Cropped_2-DCN2.JPG,0.0,1,0.778332,2-DCN2,1
4,No_Glaucoma/Cropped_2-DCPP.JPG,0.0,0,0.039846,2-DCPP,0
...,...,...,...,...,...,...
651,Suspected_Glaucoma/Cropped_2-DC70.JPG,1.0,1,0.948401,2-DC70,1
652,Suspected_Glaucoma/Cropped_2-DC8A.JPG,1.0,0,0.490659,2-DC8A,0
653,Suspected_Glaucoma/Cropped_2-DC8C.JPG,1.0,0,0.132261,2-DC8C,0
654,Suspected_Glaucoma/Cropped_2-DCE0.JPG,1.0,1,0.970269,2-DCE0,1


In [81]:
# Inspect the raw Image_Diagnosis table from the bag (it holds both the
# 'Initial Diagnosis' rows and the 'AI_glaucomasuspect_test' grades).
ds_bag.get_table_as_dataframe('Image_Diagnosis')

Unnamed: 0,RID,RCT,RMT,RCB,RMB,Execution,Image,Feature_Name,Diagnosis_Image,Image_Quality,Diagnosis_Tag,Diagnosis_Status,Cup_Disk_Ratio,Comments,Process
0,4-5426,2024-11-27 23:34:14.681507+00,2025-07-01 23:47:58.365627+00,https://auth.globus.org/3769492a-b197-4063-952...,https://auth.globus.org/3769492a-b197-4063-952...,4-53ZE,2-CZD6,Diagnosis,Suspected Glaucoma,,Initial Diagnosis,,,,2-CCCJ
1,4-542A,2024-11-27 23:34:14.681507+00,2025-07-01 23:47:58.365627+00,https://auth.globus.org/3769492a-b197-4063-952...,https://auth.globus.org/3769492a-b197-4063-952...,4-53ZE,2-CZDA,Diagnosis,Suspected Glaucoma,,Initial Diagnosis,,,,2-CCCJ
2,4-543C,2024-11-27 23:34:14.681507+00,2025-07-01 23:47:58.365627+00,https://auth.globus.org/3769492a-b197-4063-952...,https://auth.globus.org/3769492a-b197-4063-952...,4-53ZE,2-CZEC,Diagnosis,Suspected Glaucoma,,Initial Diagnosis,,,,2-CCCJ
3,4-5448,2024-11-27 23:34:14.681507+00,2025-07-01 23:47:58.365627+00,https://auth.globus.org/3769492a-b197-4063-952...,https://auth.globus.org/3769492a-b197-4063-952...,4-53ZE,2-CZF8,Diagnosis,Suspected Glaucoma,,Initial Diagnosis,,,,2-CCCJ
4,4-544A,2024-11-27 23:34:14.681507+00,2025-07-01 23:47:58.365627+00,https://auth.globus.org/3769492a-b197-4063-952...,https://auth.globus.org/3769492a-b197-4063-952...,4-53ZE,2-CZFA,Diagnosis,Suspected Glaucoma,,Initial Diagnosis,,,,2-CCCJ
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3425,5-50PJ,2025-09-10 13:55:10.688384+00,2025-09-10 13:55:10.688384+00,https://auth.globus.org/fb1b3a7f-f953-418d-83e...,https://auth.globus.org/fb1b3a7f-f953-418d-83e...,5-2ET8,2-E63M,Diagnosis,Suspected Glaucoma,Good,AI_glaucomasuspect_test,Validated,0.7,,
3426,5-50PM,2025-09-10 13:55:18.174012+00,2025-09-10 13:55:18.174012+00,https://auth.globus.org/fb1b3a7f-f953-418d-83e...,https://auth.globus.org/fb1b3a7f-f953-418d-83e...,5-2ET8,2-DC8A,Diagnosis,No Glaucoma,Good,AI_glaucomasuspect_test,Validated,0.5,,
3427,5-50S6,2025-09-11 06:18:39.705875+00,2025-09-11 06:18:40.28431+00,https://auth.globus.org/fb1b3a7f-f953-418d-83e...,https://auth.globus.org/fb1b3a7f-f953-418d-83e...,5-2ET8,2-D9PR,Diagnosis,Unknown,Unknown,AI_glaucomasuspect_test,Validated,0.1,,
3428,5-50S8,2025-09-11 06:18:47.452494+00,2025-09-11 06:18:47.452494+00,https://auth.globus.org/fb1b3a7f-f953-418d-83e...,https://auth.globus.org/fb1b3a7f-f953-418d-83e...,5-2ET8,2-D8S4,Diagnosis,Unknown,Unknown,AI_glaucomasuspect_test,Validated,0.1,,


In [98]:
# Function to update column names
pd.options.mode.copy_on_write = True
def updateCols(df, cols, colDict):
    """Select `cols` from `df`, rename them through `colDict`, and keep `cols` in sync.

    Parameters:
        df: source DataFrame; it is not modified.
        cols: list of column names to keep. The list is updated IN PLACE: for every
            name that has an entry in `colDict`, its first occurrence is replaced by
            the new name, so callers can keep extending the same list.
        colDict: mapping of old column name -> new column name. Keys that are not
            in `cols` are ignored.

    Returns:
        A new DataFrame containing only the selected columns, renamed.
    """
    selected = df[cols].rename(columns=colDict)
    # Decide which names to rename before touching the list, one entry per distinct name.
    for oldName in dict.fromkeys(name for name in cols if name in colDict):
        cols[cols.index(oldName)] = colDict[oldName]
    return selected

# `cols` is the running list of columns to keep. updateCols() renames entries of this
# list in place, and the cell keeps prepending to it as more tables are joined, so the
# statements below must run in this exact order.
cols = ['Image', 'Diagnosis_Image_Optom', 'Diagnosis_Image_CNN', 'Probability Score']
# Old column name -> new column name, applied by every updateCols() call below.
colDict = {'Image':'RID_Image', 'Observation':'RID_Observation', 'Subject':'RID_Subject', 'Probability Score':'Diagnosis_CNN_Prob'}

# Build up diagnosis DF for Optom and CNN

# Code block for 2-277M test set with 2-C6EO predictions (saved in catalog)
# diags = ds_bag.get_table_as_dataframe('Image_Diagnosis')
# diags = pd.merge( diags[diags['Execution'] == '2-C6E0'],
#                    diags[diags['Diagnosis_Tag'] == 'Initial Diagnosis'],
#                    on = 'Image', how = 'left', suffixes = ['_CNN', '_Optom'])

# Code block for 4-Z6K8 test set with 5-50TW predictions (taken from my WD, to be saved as execution)
# Left join keeps every prediction row. Columns present on both sides (such as
# Diagnosis_Image) get the '_CNN' suffix from preds and '_Optom' from the catalog rows.
diags = ds_bag.get_table_as_dataframe('Image_Diagnosis')
diags = pd.merge( preds,
                 diags[diags['Diagnosis_Tag'] == 'Initial Diagnosis'],
                 on = 'Image', how = 'left', suffixes = ['_CNN', '_Optom'])

diags = updateCols( diags, cols, colDict )

# Swap the leading 'RID_Image' entry for the full set of expert columns (which
# includes RID_Image again) so the list matches the frame produced by the next merge.
del(cols[0])
cols[:0] = ['RID_Image', 'Diagnosis_Image_Expert', 'Diagnosis_Image_Expert_Count', 'Diagnosis_BYX', 'Diagnosis_BW', 'Diagnosis_VN', 'CDR_Expert']

# Merge onto diagnosis DF for Expert
# dxExpert is the left table, so only images with an expert consensus row are kept.
diags = pd.merge( dxExpert, diags, on = 'RID_Image', how = 'left' )

# Link to image data
# how='right' keeps exactly the rows of `diags`.
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Image'),
                  diags,
                  left_on = 'RID', right_on = 'RID_Image', 
                  how = 'right')

cols[:0] = ['Observation', 'Image_Side']
linkdDF = updateCols( linkdDF, cols, colDict )

# Link to observation data
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Observation'),
                   linkdDF,
                   left_on = 'RID', right_on = 'RID_Observation', 
                   how = 'right')

cols[:0] = ['Subject', 'date_of_encounter', 'Age', 'hba1c', 'dr_level', 'glaucoma_hx', 'consultant', 'Subject_image_quality']  # removed site_mrn
linkdDF = updateCols( linkdDF, cols, colDict )

# Link to subject data
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Subject'),
                   linkdDF,
                   left_on = 'RID', right_on = 'RID_Subject', 
                   how = 'right')

# 'RID_Subject' is now in the list twice (prepended here, and left over from the
# previous rename of 'Subject'); the second occurrence is removed before selecting.
cols[:0] = ['RID_Subject', 'Subject_Gender', 'Subject_Ethnicity']  # removed site_mrn
del(cols[ np.where( np.array(cols)=='RID_Subject' )[0][1] ]) # remove duplicated RID_Subject
linkdDF = updateCols( linkdDF, cols, colDict )

Unnamed: 0,RID_Subject,Subject_Gender,Subject_Ethnicity,date_of_encounter,Age,hba1c,dr_level,glaucoma_hx,consultant,Subject_image_quality,...,RID_Image,Diagnosis_Image_Expert,Diagnosis_Image_Expert_Count,Diagnosis_BYX,Diagnosis_BW,Diagnosis_VN,CDR_Expert,Diagnosis_Image_Optom,Diagnosis_Image_CNN,Diagnosis_CNN_Prob
0,2-CDB4,M,African Descent,2024-02-28,64,7.5,No apparent diabetic retinopathy,No,Naro Babaian Marukian,Adequate,...,2-DCKR,No Glaucoma,3,No Glaucoma,No Glaucoma,No Glaucoma,0.5,No Glaucoma,1,0.513919
1,2-CDB4,M,African Descent,2024-02-28,64,7.5,No apparent diabetic retinopathy,No,Naro Babaian Marukian,Adequate,...,2-DCKY,No Glaucoma,3,No Glaucoma,No Glaucoma,No Glaucoma,0.4,No Glaucoma,0,0.414791
2,2-CDCE,M,African Descent,2023-05-22,54,6.1,No apparent diabetic retinopathy,No,Sumavamsi Tiriveedhi,Good,...,2-E45C,No Glaucoma,2,No Glaucoma,No Glaucoma,Suspected Glaucoma,0.5,No Glaucoma,0,0.393440
3,2-CDCE,M,African Descent,2023-05-22,54,6.1,No apparent diabetic retinopathy,No,Sumavamsi Tiriveedhi,Good,...,2-E45M,Suspected Glaucoma,2,No Glaucoma,Suspected Glaucoma,Suspected Glaucoma,0.6,No Glaucoma,1,0.595273
4,2-CDDW,M,Caucasian,2023-06-06,52,10.8,No apparent diabetic retinopathy,No,Jessica Young,Adequate,...,2-DHET,No Glaucoma,2,No Glaucoma,No Glaucoma,Suspected Glaucoma,0.4,No Glaucoma,1,0.686498
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
637,2-CN8P,F,African Descent,2024-05-23,68,6.2,DR not determined,No,Alicia Liu,Adequate,...,2-D9NA,No Glaucoma,3,No Glaucoma,No Glaucoma,No Glaucoma,0.4,Suspected Glaucoma,1,0.965940
638,2-CN8P,F,African Descent,2024-05-23,68,6.2,DR not determined,No,Alicia Liu,Adequate,...,2-D9NC,No Glaucoma,3,No Glaucoma,No Glaucoma,No Glaucoma,0.3,Suspected Glaucoma,0,0.444845
639,2-CN9A,F,African Descent,2023-10-03,41,6.0,No apparent diabetic retinopathy,No,Tina Zheng,Adequate,...,2-D27E,No Glaucoma,2,No Glaucoma,No Glaucoma,Suspected Glaucoma,0.5,Suspected Glaucoma,1,0.891928
640,2-CN9A,F,African Descent,2023-10-03,41,6.0,No apparent diabetic retinopathy,No,Tina Zheng,Adequate,...,2-D27M,No Glaucoma,3,No Glaucoma,No Glaucoma,No Glaucoma,0.4,Suspected Glaucoma,0,0.144418


In [109]:
from sklearn.metrics import roc_auc_score

# Expert consensus encoded as 0/1. Any other label (for example
# 'Not Graded, Bad Quality') is not in the mapping and therefore becomes NaN.
y_true = linkdDF['Diagnosis_Image_Expert'].map({"No Glaucoma": 0, "Suspected Glaucoma": 1})
y_pred = linkdDF['Diagnosis_CNN_Prob']

# roc_auc_score rejects NaN inputs, so score only the rows that have both a binary
# expert label and a CNN probability, and report how many rows were left out.
scorable = y_true.notna() & y_pred.notna()
if not scorable.all():
    print("Excluded from AUROC (no binary expert label or no CNN probability):", int((~scorable).sum()))

auc = roc_auc_score(y_true[scorable].astype(int), y_pred[scorable])
print("AUROC:", auc)

AUROC: 0.8502722231217615


In [107]:
# NOTHING FURTHER NEEDED FOR 4-Z6K8 test set with 5-50TW predictions
# LINKDDF IS COMPLETE!

# Written relative to the notebook's current working directory, without the row index.
linkdDF.to_csv("ImagesVGG19For4Z6K8.csv", index=False)

In [None]:
# Get Predictions from Execution 2-C6E0 (VGG19 on test set 2-277M)
# NOTE(review): this cell belongs to the older 2-277M workflow. For the execution
# printed above, asset_paths is shown as {} (and no assets are configured), so the
# [0] lookup would fail — confirm this cell should be skipped for the 4-Z6K8 run.
preds = pd.read_csv(exec.asset_paths[0])

# Get RID Image from Filename
# Uses the 4th underscore-separated token, i.e. a different file-name layout from the
# 5-50TW predictions file, where the RID is the 3rd token.
preds['Filename'] = preds['Filename'].apply(lambda x: x.split("_")[3].split(".")[0])

# Link back to full DF
linkdDF = pd.merge( linkdDF,
                   preds[['Filename', 'Probability Score']],
                   left_on = 'RID_Image', right_on = 'Filename', 
                   how = 'left')

# updateCols() renames 'Probability Score' to 'Diagnosis_CNN_Prob' in both the frame
# and the running `cols` list.
# NOTE(review): `colDict` is overwritten here with a single-entry mapping, and `cols`
# may already contain 'Diagnosis_CNN_Prob' from the cell that builds linkdDF — verify
# before re-enabling this path.
cols.append('Probability Score')
colDict = {'Probability Score':'Diagnosis_CNN_Prob'}
linkdDF = updateCols( linkdDF, cols, colDict )

In [None]:
# Make a subject level DF
def getMaxDx(dxList):
    """Collapse several image-level diagnoses into one label by priority.

    Priority order: 'Suspected Glaucoma' if any entry equals it, otherwise
    'Not Graded, Bad Quality' if any entry equals it, otherwise 'No Glaucoma'.

    Parameters:
        dxList: pandas Series (or other object supporting elementwise `==` and
            `.sum()`) of diagnosis labels.

    Returns:
        One of the three label strings above. Entries that match none of the two
        checked labels (including an empty input) result in 'No Glaucoma'.
    """
    for label in ('Suspected Glaucoma', 'Not Graded, Bad Quality'):
        if (dxList == label).sum() > 0:
            return label
    return 'No Glaucoma'

def getMaxCDR(cdrList):
    """Return the largest numeric cup-to-disc ratio in `cdrList`.

    Parameters:
        cdrList: iterable of per-image CDR values. Non-numeric placeholders (such as
            the empty string used for ungraded images) and NaN are ignored.

    Returns:
        The maximum numeric value, or '' when `cdrList` holds no numeric value.
    """
    # Read the values from the argument itself rather than from a global frame, so the
    # result depends only on what the caller passes in. NaN is skipped because max()
    # gives order-dependent results when NaN is present.
    numeric = [x for x in cdrList if isinstance(x, (int, float)) and pd.notna(x)]
    return max(numeric) if numeric else ''

# Parallel per-subject accumulators; each becomes one column of dxSubjectDF.
idList = []
genderList = []
ethnicityList = []
dxExpertList = []
cdrMaxList = []
dxOptomList = []
dxCNNList = []
probCNNList = []

# One pass per distinct subject RID found in the image-level frame.
# NOTE(review): `id` shadows the Python builtin, and `tempDF` is a notebook-level name
# that later cells display and overwrite.
# NOTE(review): getMaxDx compares against text labels; in the 4-Z6K8 output shown
# earlier Diagnosis_Image_CNN holds 0/1, which would never match — confirm the CNN
# column is label-encoded before relying on the subject-level CNN diagnosis.
for id in pd.unique( linkdDF['RID_Subject'] ):
    tempDF = linkdDF[ linkdDF['RID_Subject'] == id ]
    idList.append(id)
    # Gender and ethnicity are taken from the subject's first image row.
    genderList.append( tempDF['Subject_Gender'].iloc[0] )
    ethnicityList.append( tempDF['Subject_Ethnicity'].iloc[0] )
    # Subject-level values are the "worst" of the subject's images: priority label
    # from getMaxDx, largest CDR, largest CNN probability.
    dxExpertList.append( getMaxDx( tempDF['Diagnosis_Image_Expert'] ) )
    cdrMaxList.append( getMaxCDR( tempDF['CDR_Expert'] ) )
    dxOptomList.append( getMaxDx( tempDF['Diagnosis_Image_Optom'] ) )
    dxCNNList.append( getMaxDx( tempDF['Diagnosis_Image_CNN'] ) )
    probCNNList.append( tempDF['Diagnosis_CNN_Prob'].max() )

dxSubjectDF = pd.DataFrame({'RID_Subject':idList, 'Subject_Gender':genderList, 'Subject_Ethnicity':ethnicityList, 'Diagnosis_Image_Expert':dxExpertList, 'CDR_Expert':cdrMaxList, 'Diagnosis_Image_Optom':dxOptomList, 'Diagnosis_Image_CNN':dxCNNList, 'Diagnosis_CNN_Prob':probCNNList})


In [None]:
# Inspection only: shows whatever `tempDF` currently holds (after the subject loop,
# the image rows of the last subject processed).
tempDF

In [None]:
# Number of subjects per ethnicity value (returns the distinct values and their counts).
np.unique( dxSubjectDF['Subject_Ethnicity'], return_counts=True )

# DEFINE Parity Metrics

In [None]:
# Define functions for Parity Metrics

def glcRate(xTab):
    """Share of all cases whose reference label sits in column position 1.

    `xTab` is a 2x2 table as built by pd.crosstab(predicted, actual): rows are
    predicted labels, columns are reference labels. Position 1 is taken to be the
    positive (glaucoma) class — this relies on the label sort order of the table.
    """
    referencePositive = xTab.iat[0, 1] + xTab.iat[1, 1]
    total = xTab.to_numpy().sum()
    return referencePositive / total

def predPosRate(xTab):
    """Share of all cases whose predicted label sits in row position 1.

    `xTab` is a 2x2 table as built by pd.crosstab(predicted, actual): rows are
    predicted labels, columns are reference labels. Position 1 is taken to be the
    positive class — this relies on the label sort order of the table.
    """
    predictedPositive = xTab.iat[1, 0] + xTab.iat[1, 1]
    total = xTab.to_numpy().sum()
    return predictedPositive / total
    
def accuracy(xTab):
    """Share of all cases on the diagonal of the table (prediction equals reference).

    `xTab` is a 2x2 table as built by pd.crosstab(predicted, actual): rows are
    predicted labels, columns are reference labels.
    """
    agreeing = xTab.iat[0, 0] + xTab.iat[1, 1]
    total = xTab.to_numpy().sum()
    return agreeing / total

def tpr(xTab):
    """True positive rate: TP / (TP + FN).

    `xTab` is a 2x2 table as built by pd.crosstab(predicted, actual): rows are
    predicted labels, columns are reference labels, position 1 is taken to be the
    positive class. TP is cell [1, 1]; FN is cell [0, 1].
    """
    truePos = xTab.iat[1, 1]
    falseNeg = xTab.iat[0, 1]
    return truePos / (truePos + falseNeg)

def tnr(xTab):
    """True negative rate: TN / (TN + FP).

    `xTab` is a 2x2 table as built by pd.crosstab(predicted, actual): rows are
    predicted labels, columns are reference labels, position 1 is taken to be the
    positive class. TN is cell [0, 0]; FP is cell [1, 0].
    """
    trueNeg = xTab.iat[0, 0]
    falsePos = xTab.iat[1, 0]
    return trueNeg / (falsePos + trueNeg)

def fpr(xTab):
    """False positive rate: FP / (FP + TN).

    `xTab` is a 2x2 table as built by pd.crosstab(predicted, actual): rows are
    predicted labels, columns are reference labels, position 1 is taken to be the
    positive class. FP is cell [1, 0]; TN is cell [0, 0].
    """
    falsePos = xTab.iat[1, 0]
    trueNeg = xTab.iat[0, 0]
    return falsePos / (falsePos + trueNeg)

def fnr(xTab):
    """False negative rate: FN / (FN + TP).

    `xTab` is a 2x2 table as built by pd.crosstab(predicted, actual): rows are
    predicted labels, columns are reference labels, position 1 is taken to be the
    positive class. FN is cell [0, 1]; TP is cell [1, 1].
    """
    falseNeg = xTab.iat[0, 1]
    truePos = xTab.iat[1, 1]
    return falseNeg / (falseNeg + truePos)

def getParityMetrics(matrixList):
    """Tabulate parity metrics for a set of pre-built confusion tables.

    Parameters:
        matrixList: dict mapping a group name to its 2x2 crosstab
            (predicted rows x reference columns).

    Returns:
        DataFrame with one row per metric (n, glcRate, accuracy, tpr, tnr, fpr, fnr)
        and one column per group, in the fixed ethnicity order listed below.

    Raises:
        KeyError: if any group named in the fixed column order is missing from
            `matrixList`.
    """
    metricFns = {
        'n': lambda tab: tab.to_numpy().sum(),
        'glcRate': glcRate,
        'accuracy': accuracy,
        'tpr': tpr,
        'tnr': tnr,
        'fpr': fpr,
        'fnr': fnr,
    }
    vals = {metric: {} for metric in metricFns}
    for group, tab in matrixList.items():
        for metric, fn in metricFns.items():
            vals[metric][group] = fn(tab)

    columnOrder = ['All', 'Latin American', 'African Descent', 'Asian', 'Caucasian', 'ethnicity not specified', 'Other']
    return pd.DataFrame.from_dict(vals).transpose().loc[:, columnOrder]

def getParityMetrics2(factorSeries, dxSeriesPred, dxSeriesActual):
    """Compute parity metrics overall and for every level of a grouping factor.

    Parameters:
        factorSeries: grouping value per case (e.g. ethnicity).
        dxSeriesPred: predicted diagnosis per case.
        dxSeriesActual: reference diagnosis per case.

    Returns:
        DataFrame with one row per metric (n, glcRate, predPosRate, accuracy, tpr,
        tnr, fpr, fnr) and one column per group: 'All' first, then each distinct
        value of `factorSeries` in order of first appearance.
    """
    frame = pd.DataFrame({ 'Factor': factorSeries, 'DxPred': dxSeriesPred, 'DxActual': dxSeriesActual })
    # Categorical labels plus dropna=False below keep every label present in the
    # crosstab of each group, even when a group never shows one of the labels.
    for labelCol in ('DxPred', 'DxActual'):
        frame[labelCol] = frame[labelCol].astype('category')

    # NOTE(review): the 'All' table is built from the raw series (no categorical
    # conversion, default dropna), unlike the per-group tables — confirm intended.
    tables = { 'All': pd.crosstab( dxSeriesPred, dxSeriesActual ) }
    for level in pd.unique( factorSeries ):
        subset = frame[ frame['Factor'] == level ]
        tables[level] = pd.crosstab( subset['DxPred'], subset['DxActual'], dropna=False )

    metricFns = {
        'n': lambda tab: tab.to_numpy().sum(),
        'glcRate': glcRate,
        'predPosRate': predPosRate,
        'accuracy': accuracy,
        'tpr': tpr,
        'tnr': tnr,
        'fpr': fpr,
        'fnr': fnr,
    }
    vals = {metric: {} for metric in metricFns}
    for group, tab in tables.items():
        for metric, fn in metricFns.items():
            vals[metric][group] = fn(tab)

    # Columns are left in insertion order (no fixed ethnicity ordering is applied here).
    return pd.DataFrame.from_dict(vals).transpose()

# Make Plots

In [None]:
# Subjects that have a numeric expert CDR ('' marks subjects without one).
# NOTE: this rebinds the notebook-level name `tempDF` that the subject loop also uses.
tempDF = dxSubjectDF[ dxSubjectDF['CDR_Expert'] != '' ]
# Parity metrics of CNN vs. optometrist labels, grouped by expert CDR value.
parityDF = getParityMetrics2( tempDF['CDR_Expert'], tempDF['Diagnosis_Image_CNN'], tempDF['Diagnosis_Image_Optom'] )


# SCATTER PLOTS FOR METRICS vs. CDR  -----

# test = parityDF.loc['accuracy'].to_frame()
# inds = [x for x in test.index if x != 'All']
# test = test.loc[inds]
# test['CDR'] = test.index
# test = test.sort_values( by='CDR' )
# # test.plot.scatter( x='CDR', y='tnr')
# test.plot.line( x='CDR', y='accuracy' )

# test = parityDF.loc['n'][1:10].to_frame()
# test['CDR'] = test.index


# HISTOGRAM BY CDR ALL  -----

#plt.bar(test['CDR'], test['n'], width=0.05, align='center')


# HISTOGRAM BY CDR GROUPED BY ETHNICITY  -----

# Assigning a column on a filtered frame; the notebook enables pandas copy-on-write
# in an earlier cell, which makes this assignment affect only tempDF.
tempDF['Subject_Ethnicity'] = tempDF['Subject_Ethnicity'].astype('category')
# One value_counts Series per distinct CDR value: ethnicity -> number of subjects.
cdrEthCounts = {}
for i in pd.unique( tempDF['CDR_Expert'] ):
    cdrEthCounts[i] = tempDF[ tempDF['CDR_Expert'] == i ]['Subject_Ethnicity'].value_counts()

# Assemble the per-CDR Series side by side (one column per CDR value), starting from
# the first CDR value and appending the rest one at a time.
test = cdrEthCounts[ list( cdrEthCounts )[0] ].to_frame(name=list( cdrEthCounts )[0])
for i in list( cdrEthCounts )[1:len(list( cdrEthCounts ))]:
    test = pd.concat([test, cdrEthCounts[i].to_frame(name=i)], axis=1)

# Flip so each row is a CDR value and each column an ethnicity, then order rows by CDR.
test = test.transpose()
test['CDR'] = test.index

test = test.sort_values( by='CDR' )
test

# HISTOGRAM BY CDR GROUPED BY ETHNICITY AND NORMALIZED -----

# testNorm = test.copy()

# for i in [x for x in test.columns if x != 'CDR']:
#     testNorm[i] = test[i] / test[i].sum()

# test.plot(x='CDR', kind='bar', stacked=False).legend(bbox_to_anchor=(1.0, 1.0))

# fig, ax = plt.subplots(figsize=(8,6))
# tempDF.groupby('Subject_Ethnicity').plot.scatter( x='CDR_Expert', y='Diagnosis_CNN_Prob', ax=ax, color=

In [None]:
# Subject-level calibration of the CNN probability against the optometrist label,
# using 5 probability bins.
prob_true, prob_pred = calibration_curve(
    y_true=dxSubjectDF['Diagnosis_Image_Optom'],
    y_prob=dxSubjectDF['Diagnosis_CNN_Prob'],
    pos_label='Suspected Glaucoma',
    n_bins=5,
)

fig, ax = plt.subplots()
ax.plot(prob_pred, prob_true)
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('True Probability')
# Dashed diagonal = perfect calibration reference.
ax.axline((0,0), slope=1, color='0.0', linestyle='--')
plt.show()

In [None]:
# Per-group calibration results, keyed by group name.
# NOTE: these rebind `prob_true` / `prob_pred` from arrays (previous cell) to dicts.
prob_true = {}
prob_pred = {}

# CALIBRATION CURVES GROUPED BY ETHNICITY

# Fixed list of ethnicity values; each group gets its own 10-bin calibration curve
# against the optometrist label.
# NOTE(review): a value in this list with no subjects yields an empty frame here —
# confirm every listed group is present in dxSubjectDF.
for e in ['Latin American', 'African Descent', 'ethnicity not specified', 'Asian', 'Other','Caucasian']:
    tempDF = dxSubjectDF[dxSubjectDF['Subject_Ethnicity'] == e]
    prob_true[e], prob_pred[e] = calibration_curve(y_true=tempDF['Diagnosis_Image_Optom'], y_prob=tempDF['Diagnosis_CNN_Prob'], pos_label='Suspected Glaucoma', n_bins=10)
    plt.plot(prob_pred[e],prob_true[e], label=e)


# CALIBRATION CURVES GROUPED BY GENDER

# for e in ['M', 'F']:
#     tempDF = dxSubjectDF[dxSubjectDF['Subject_Gender'] == e]
#     prob_true[e], prob_pred[e] = calibration_curve(y_true=tempDF['Diagnosis_Image_Optom'], y_prob=tempDF['Diagnosis_CNN_Prob'], pos_label='Suspected Glaucoma', n_bins=10)
#     plt.plot(prob_pred[e],prob_true[e], label=e)
    
plt.xlabel('Predicted Probability')
plt.ylabel('True Probability')
# Legend is anchored at the top-right corner of the axes.
plt.legend(bbox_to_anchor=(1.0, 1.0))
# Dashed diagonal = perfect calibration reference.
plt.axline((0,0), slope=1, color='0.0', linestyle='--')
plt.show()

# Calculate Parity Metrics at IMAGE Level

In [None]:
# Parity metrics for CNN vs. Optom
# Image level, grouped by ethnicity: CNN diagnosis is the prediction, the optometrist
# diagnosis is the reference.
getParityMetrics2( linkdDF['Subject_Ethnicity'], linkdDF['Diagnosis_Image_CNN'], linkdDF['Diagnosis_Image_Optom'] )

In [None]:
# Parity metrics for CNN vs. Expert labels
# Image level; images the experts could not grade are excluded first.
# NOTE: `expertGradedDF` is reused for the subject-level frame further down, so the
# cells that read it depend on which definition ran last.
expertGradedDF = linkdDF[linkdDF['Diagnosis_Image_Expert'] != 'Not Graded, Bad Quality']
getParityMetrics2( expertGradedDF['Subject_Ethnicity'], expertGradedDF['Diagnosis_Image_CNN'], expertGradedDF['Diagnosis_Image_Expert'] )

In [None]:
# Parity metrics for Optom vs. Expert labels
# Image level only if the image-level `expertGradedDF` cell directly above was the
# last one to define that name.
getParityMetrics2( expertGradedDF['Subject_Ethnicity'], expertGradedDF['Diagnosis_Image_Optom'], expertGradedDF['Diagnosis_Image_Expert'] )

# Calculate Parity Metrics at SUBJECT Level

In [None]:
# Parity metrics for CNN vs. Optom
# Subject level, grouped by ethnicity: CNN diagnosis is the prediction, the optometrist
# diagnosis is the reference.
getParityMetrics2( dxSubjectDF['Subject_Ethnicity'], dxSubjectDF['Diagnosis_Image_CNN'], dxSubjectDF['Diagnosis_Image_Optom'] )

In [None]:
# Parity metrics for CNN vs. Expert labels
# Subject level; subjects whose expert label is 'Not Graded, Bad Quality' are excluded.
# NOTE: this rebinds `expertGradedDF`, replacing the image-level frame of the same name.
expertGradedDF = dxSubjectDF[dxSubjectDF['Diagnosis_Image_Expert'] != 'Not Graded, Bad Quality']
getParityMetrics2( expertGradedDF['Subject_Ethnicity'], expertGradedDF['Diagnosis_Image_CNN'], expertGradedDF['Diagnosis_Image_Expert'] )

In [None]:
# Parity metrics for Optom vs. Expert labels
# Subject level only if the subject-level `expertGradedDF` cell directly above was the
# last one to define that name.
getParityMetrics2( expertGradedDF['Subject_Ethnicity'], expertGradedDF['Diagnosis_Image_Optom'], expertGradedDF['Diagnosis_Image_Expert'] )

In [None]:



# Space to stop autoscroll




In [None]:
# Number of images per value of the 'consultant' column (distinct values and their counts).
np.unique(linkdDF['consultant'], return_counts=True)

# Upload Results

In [None]:
# create asset path
# Directory provided by the execution for output assets of this asset type.
asset_type_name = "Diagnosis_Analysis"
asset_path = exec.execution_asset_path(asset_type_name)

# save assets to asset_path
# Only the image-level table is written; the subject-level and parity exports are disabled.
linkdDF.to_csv(asset_path/'ImagesToVGG19.csv', index=False)
#dxSubjectDF.to_csv(asset_path/'SubjectsToVGG19.csv', index=False)
#parityMetrics.to_csv(asset_path/'ParityMetrics.csv', index=False)

# upload assets to catalog
# NOTE(review): clean_folder=True presumably removes the local output folder after the
# upload — confirm against deriva-ml before relying on the local copy.
exec.upload_execution_outputs(clean_folder=True)