# Connect Eye-AI and Load Libraries

In [None]:
# repo_dir = "Repos"   # Set this to be where your github repos are located.
# %load_ext autoreload
# %autoreload 2

# # Update the load path so python can find modules for the model
# import sys
# from pathlib import Path
# sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))
# sys.path.insert(0, str(Path.home() / repo_dir / "deriva-ml"))

In [None]:
# Prerequisites
import json
import os
from eye_ai.eye_ai import EyeAI

import pandas as pd
import numpy as np
from sklearn.calibration import calibration_curve
import matplotlib.pyplot as plt
from pathlib import Path, PurePath
import logging

#from deriva_ml import DatasetBag, Workflow, ExecutionConfiguration
from deriva_ml import DerivaML, Workflow, ExecutionConfiguration, VersionPart
from deriva_ml import MLVocab as vc
# Send log records at INFO level and above to the notebook output with a timestamped format.
# force=True removes handlers already attached to the root logger, so re-running
# this cell re-applies the configuration instead of being silently ignored.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [None]:
# Login
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
# Server and catalog identifier reused by the configuration cells below.
host = 'www.eye-ai.org'
#host = 'dev.eye-ai.org' #for dev testing
catalog_id = "eye-ai"

# Call login() only when is_logged_in() reports no existing session for this host.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    gnl.login([host], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)
    print("Login Successful")
else:
    print("You are already logged in.")

# Configuration

In [None]:
# The cache directory and the working directory both point at /data.
# NOTE(review): this absolute path is machine-specific — adjust when running elsewhere.
cache_dir = '/data'
working_dir = '/data'
# EyeAI client bound to the host and catalog chosen in the login cell.
EA = EyeAI(hostname = host, catalog_id = catalog_id, cache_dir= cache_dir, working_dir=working_dir)

In [None]:

# RIDs of the inputs to this execution: one dataset and one asset
# (the asset is read as a predictions CSV further down).
source_dataset = "2-277G"  #"2-N93J"
asset_RID = ["2-C8JM"]
# Reuse `catalog_id` from the login cell instead of repeating the literal, so
# this client always targets the same catalog as `EA`.
ml_instance = DerivaML(host, catalog_id=catalog_id)

#ml_instance.increment_dataset_version(dataset_rid='2-277M', component= VersionPart.patch, description='Update to latest deriva-ml schema')

# Register the workflow record that this execution is attributed to.
preds_workflow = EA.add_workflow(
    Workflow(
        name="LAC data template",
        url="https://github.com/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/Sandbox_KB/Get_VGGPreds.ipynb",
        workflow_type="Test Workflow",
    )
)

# The dataset is pinned to the version the catalog currently reports for it;
# "materialize" is False for this dataset entry.
config = ExecutionConfiguration(
    datasets=[
        {
            "rid": source_dataset,
            "materialize": False,
            "version": ml_instance.dataset_version(source_dataset),
        }
    ],
    assets=asset_RID,
    workflow=preds_workflow,
    description="Instance of linking VGG19 predictions to patient-level data",
)

# NOTE: `exec` shadows the Python builtin of the same name; the name is kept
# because the cells below refer to it.
exec = ml_instance.create_execution(config)

In [None]:
# Show the execution object created in the previous cell as a quick sanity check.
print(exec)

In [None]:
# my test
# Scratch cell: pulls a few frames out of the execution's first dataset for inspection.


ds_bag = exec.datasets[0]
# Full 'Image_Diagnosis' table as a DataFrame.
imageDF = ds_bag.get_table_as_dataframe('Image_Diagnosis')
# NOTE(review): presumably the subset of images taken at angle 2 — confirm against EyeAI.filter_angle_2.
angle2DF = EA.filter_angle_2( ds_bag )
# Not referenced by any later cell visible in this notebook.
trainDF = EA.image_tall(ds_bag, 'Initial Diagnosis')


In [None]:
# Attach each row of angle2DF to its 'Initial Diagnosis' record; the left join
# keeps rows that have no such record.
initialDxDF = imageDF.loc[ imageDF['Diagnosis_Tag'] == 'Initial Diagnosis' ]
mergeDF = angle2DF.merge( initialDxDF, how = 'left', left_on = 'RID', right_on = 'Image' )

# Label distribution after the join.
mergeDF['Diagnosis_Image'].value_counts()

In [None]:
# (rows, columns) of the joined frame.
mergeDF.shape

In [None]:
# Label distribution among all 'Initial Diagnosis' records, before any join.
isInitialDx = imageDF['Diagnosis_Tag'] == 'Initial Diagnosis'
imageDF.loc[isInitialDx, 'Diagnosis_Image'].value_counts()


In [None]:
# First dataset attached to the execution; the cells below read their tables from it.
ds_bag = exec.datasets[0]

In [None]:
# Get expert consensus diagnosis
# For every image graded under the 'AI_glaucomasuspect_test' tag, collect the three
# experts' individual labels, a consensus label with its vote count, and the mean
# Cup_Disk_Ratio.
experts = ['Benjamin Xu', 'Brandon Wong', 'Van Nguyen']
dxExpertOG = EA.image_tall(ds_bag, 'AI_glaucomasuspect_test')
dxExpertOG = dxExpertOG[ dxExpertOG['Full_Name'].isin(experts) ]

# Parallel lists, one entry per image; assembled into dxExpert after the loop.
ridStore = []
dxStore = []
cDxStore = []
cdrStore = []
byxStore = []
bwStore = []
vnStore = []

for id in list(dxExpertOG['Image_RID'].unique()):
    ridStore.append(id)
    dxTemp = dxExpertOG[ dxExpertOG['Image_RID'] == id ]
    # Each expert's own label; .iloc[0] raises IndexError if that expert has no row for this image.
    byxStore.append( dxTemp[ dxTemp['Full_Name'] == 'Benjamin Xu' ].loc[:,'Diagnosis_Image'].iloc[0] )
    bwStore.append( dxTemp[ dxTemp['Full_Name'] == 'Brandon Wong' ].loc[:,'Diagnosis_Image'].iloc[0] )
    vnStore.append( dxTemp[ dxTemp['Full_Name'] == 'Van Nguyen' ].loc[:,'Diagnosis_Image'].iloc[0] )
    # Keep only the rows whose Cup_Disk_Ratio is a Python float.
    # NOTE(review): presumably ungraded rows carry a non-float placeholder — confirm against the data.
    dxCDR = dxTemp[dxTemp['Cup_Disk_Ratio'].apply(type) == float] 
    if len(dxCDR) > 0:
        # Mean of the float-valued ratios, rounded to one decimal.
        cdrStore.append( round( dxCDR['Cup_Disk_Ratio'].sum() / len(dxCDR['Cup_Disk_Ratio']), 1 ) )
        # Consensus is 'Suspected Glaucoma' when more than one row carries that label;
        # the stored count is the number of rows agreeing with the consensus.
        if (dxTemp['Diagnosis_Image'] == 'Suspected Glaucoma').sum() > 1:
            dxStore.append('Suspected Glaucoma')
            cDxStore.append( (dxTemp['Diagnosis_Image'] == 'Suspected Glaucoma').sum() )
        else:
            dxStore.append('No Glaucoma')
            cDxStore.append( (dxTemp['Diagnosis_Image'] == 'No Glaucoma').sum() )
    else:
        # No float ratio at all: the image is recorded as ungraded with a fixed count of 3
        # and an empty-string CDR placeholder.
        cdrStore.append('')
        dxStore.append('Not Graded, Bad Quality')
        cDxStore.append(3)

dxExpert = pd.DataFrame({'RID_Image':ridStore, 'Diagnosis_Image_Expert':dxStore, 'Diagnosis_Image_Expert_Count':cDxStore, 'Diagnosis_BYX':byxStore, 'Diagnosis_BW':bwStore, 'Diagnosis_VN':vnStore, 'CDR_Expert':cdrStore})

In [None]:
# Function to update column names
pd.options.mode.copy_on_write = True
def updateCols(df, cols, colDict):
    """Select `cols` from `df` and rename them according to `colDict`.

    Besides returning the renamed selection, this rewrites `cols` in place:
    every name that is also a key of `colDict` is replaced (first occurrence
    only) by its new name, so the caller's list keeps tracking the current
    column names.  `df` itself is left untouched.
    """
    selected = df[cols].rename( columns = colDict )
    renamed = set(cols).intersection( set(colDict) )
    for oldName in renamed:
        position = cols.index(oldName)
        cols[position] = colDict.get(oldName)
    return selected

# `cols` is the running list of columns to keep. updateCols() rewrites it in place
# when it renames columns, and this cell prepends new names before each join, so
# the statements below must run in exactly this order.
cols = ['Image', 'Diagnosis_Image_Optom', 'Diagnosis_Image_CNN']
colDict = {'Image':'RID_Image', 'Observation':'RID_Observation', 'Subject':'RID_Subject'}

# Build up diagnosis DF for Optom and CNN
# Left side: rows written by execution '2-C6E0' (columns get the '_CNN' suffix).
# Right side: 'Initial Diagnosis' rows (columns get the '_Optom' suffix).
diags = ds_bag.get_table_as_dataframe('Image_Diagnosis')
diags = pd.merge( diags[diags['Execution'] == '2-C6E0'],
                   diags[diags['Diagnosis_Tag'] == 'Initial Diagnosis'],
                   on = 'Image', how = 'left', suffixes = ['_CNN', '_Optom'])

diags = updateCols( diags, cols, colDict )
# cols[0] is now 'RID_Image'; drop it because it is re-added together with the expert columns.
del(cols[0])
cols[:0] = ['RID_Image', 'Diagnosis_Image_Expert', 'Diagnosis_Image_Expert_Count', 'Diagnosis_BYX', 'Diagnosis_BW', 'Diagnosis_VN', 'CDR_Expert']

# Merge onto diagnosis DF for Expert
# Left join from dxExpert: the result has one row per expert-graded image.
diags = pd.merge( dxExpert, diags, on = 'RID_Image', how = 'left' )

# Link to image data
# Right join keeps every row of `diags`, even when the Image table has no match.
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Image'),
                  diags,
                  left_on = 'RID', right_on = 'RID_Image', 
                  how = 'right')

cols[:0] = ['Observation', 'Image_Side']
linkdDF = updateCols( linkdDF, cols, colDict )

# Link to observation data
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Observation'),
                   linkdDF,
                   left_on = 'RID', right_on = 'RID_Observation', 
                   how = 'right')

cols[:0] = ['Subject', 'date_of_encounter', 'Age', 'hba1c', 'dr_level', 'glaucoma_hx', 'consultant', 'Subject_image_quality']  # removed site_mrn
linkdDF = updateCols( linkdDF, cols, colDict )

# Link to subject data
linkdDF = pd.merge( ds_bag.get_table_as_dataframe('Subject'),
                   linkdDF,
                   left_on = 'RID', right_on = 'RID_Subject', 
                   how = 'right')

cols[:0] = ['RID_Subject', 'Subject_Gender', 'Subject_Ethnicity']  # removed site_mrn
# 'RID_Subject' now appears twice in cols (just prepended, plus the renamed 'Subject');
# np.where finds both positions and [0][1] selects the second one for deletion.
del(cols[ np.where( np.array(cols)=='RID_Subject' )[0][1] ]) # remove duplicated RID_Subject
linkdDF = updateCols( linkdDF, cols, colDict )

In [None]:
# Get Predictions from Execution 2-C6E0 (VGG19 on test set)
# The first asset path of the execution is read as a CSV.
# NOTE: this cell appends to `cols` and rebinds `linkdDF`, so it is meant to be
# run once per kernel session, after the join cell above.
preds = pd.read_csv(exec.asset_paths[0])

# Get RID Image from Filename
# Takes the 4th '_'-separated token of the filename and drops everything from its first '.' on.
preds['Filename'] = preds['Filename'].apply(lambda x: x.split("_")[3].split(".")[0])

# Link back to full DF
# Left join keeps every image row, with or without a matching prediction.
linkdDF = pd.merge( linkdDF,
                   preds[['Filename', 'Probability Score']],
                   left_on = 'RID_Image', right_on = 'Filename', 
                   how = 'left')

# Keep the score under the name 'Diagnosis_CNN_Prob'; 'Filename' is not in `cols`
# and is therefore dropped by updateCols.
cols.append('Probability Score')
colDict = {'Probability Score':'Diagnosis_CNN_Prob'}
linkdDF = updateCols( linkdDF, cols, colDict )

In [None]:
# Make a subject level DF
def getMaxDx(dxList):
    """Collapse several labels into one by fixed precedence.

    Returns 'Suspected Glaucoma' if any entry has that label, otherwise
    'Not Graded, Bad Quality' if any entry has that one, otherwise
    'No Glaucoma'.  `dxList` must support elementwise `==` (e.g. a pandas
    Series).
    """
    for label in ('Suspected Glaucoma', 'Not Graded, Bad Quality'):
        if (dxList == label).sum() > 0:
            return label
    return 'No Glaucoma'

def getMaxCDR(cdrList):
    """Return the largest numeric entry of `cdrList`, or '' when there is none.

    Entries that are not int or float (such as the '' placeholder used for
    ungraded images) are ignored.
    """
    # Read from the argument rather than a notebook-level frame, so the result
    # depends only on what the caller passes in.
    numeric = [x for x in cdrList if isinstance(x, (int, float))]
    return max(numeric) if numeric else ''

# Parallel lists, one entry per subject; assembled into dxSubjectDF after the loop.
idList = []
genderList = []
ethnicityList = []
dxExpertList = []
cdrMaxList = []
dxOptomList = []
dxCNNList = []
probCNNList = []

for id in pd.unique( linkdDF['RID_Subject'] ):
    # All image rows of this subject. `tempDF` is a notebook-level name, so after
    # the loop it still holds the rows of the last subject processed.
    tempDF = linkdDF[ linkdDF['RID_Subject'] == id ]
    idList.append(id)
    # Demographics are taken from the subject's first row.
    genderList.append( tempDF['Subject_Gender'].iloc[0] )
    ethnicityList.append( tempDF['Subject_Ethnicity'].iloc[0] )
    # Labels and CDR are aggregated over the subject's images by getMaxDx / getMaxCDR.
    dxExpertList.append( getMaxDx( tempDF['Diagnosis_Image_Expert'] ) )
    cdrMaxList.append( getMaxCDR( tempDF['CDR_Expert'] ) )
    dxOptomList.append( getMaxDx( tempDF['Diagnosis_Image_Optom'] ) )
    dxCNNList.append( getMaxDx( tempDF['Diagnosis_Image_CNN'] ) )
    # Highest CNN probability among the subject's images.
    probCNNList.append( tempDF['Diagnosis_CNN_Prob'].max() )

dxSubjectDF = pd.DataFrame({'RID_Subject':idList, 'Subject_Gender':genderList, 'Subject_Ethnicity':ethnicityList, 'Diagnosis_Image_Expert':dxExpertList, 'CDR_Expert':cdrMaxList, 'Diagnosis_Image_Optom':dxOptomList, 'Diagnosis_Image_CNN':dxCNNList, 'Diagnosis_CNN_Prob':probCNNList})


In [None]:
# Debug view: `tempDF` as left behind by the subject loop in the previous cell,
# i.e. the image rows of the last subject processed.
tempDF

In [None]:
# Number of subjects per distinct Subject_Ethnicity value (dxSubjectDF has one row per subject).
np.unique( dxSubjectDF['Subject_Ethnicity'], return_counts=True )

##### DEFINE Parity Metrics

In [None]:
# Define functions for Parity Metrics

def glcRate(xTab):
    """Fraction of all counts that fall in column 1 of the 2x2 table `xTab`.

    The table is read positionally; the denominator is the sum of every cell.
    """
    columnOne = xTab.iloc[0,1] + xTab.iloc[1,1]
    total = xTab.to_numpy().sum()
    return columnOne / total

def predPosRate(xTab):
    """Fraction of all counts that fall in row 1 of the 2x2 table `xTab`.

    The table is read positionally; the denominator is the sum of every cell.
    """
    rowOne = xTab.iloc[1,0] + xTab.iloc[1,1]
    total = xTab.to_numpy().sum()
    return rowOne / total
    
def accuracy(xTab):
    """Fraction of all counts lying on the diagonal of the 2x2 table `xTab`."""
    diagonal = xTab.iloc[0,0] + xTab.iloc[1,1]
    total = xTab.to_numpy().sum()
    return diagonal / total

def tpr(xTab):
    """True-positive rate: cell [1,1] divided by the total of column 1.

    `xTab` is read positionally, with position 1 treated as the positive class.
    """
    truePos = xTab.iloc[1,1]
    falseNeg = xTab.iloc[0,1]
    return truePos / (truePos + falseNeg)

def tnr(xTab):
    """True-negative rate: cell [0,0] divided by the total of column 0.

    `xTab` is read positionally, with position 0 treated as the negative class.
    """
    trueNeg = xTab.iloc[0,0]
    falsePos = xTab.iloc[1,0]
    return trueNeg / (falsePos + trueNeg)

def fpr(xTab):
    """False-positive rate: cell [1,0] divided by the total of column 0.

    `xTab` is read positionally, with position 0 treated as the negative class.
    """
    falsePos = xTab.iloc[1,0]
    trueNeg = xTab.iloc[0,0]
    return falsePos / (falsePos + trueNeg)

def fnr(xTab):
    """False-negative rate: cell [0,1] divided by the total of column 1.

    `xTab` is read positionally, with position 1 treated as the positive class.
    """
    falseNeg = xTab.iloc[0,1]
    truePos = xTab.iloc[1,1]
    return falseNeg / (falseNeg + truePos)

def getParityMetrics(matrixList):
    """Tabulate parity metrics for a dict of 2x2 tables.

    `matrixList` maps a group name to its table.  The result has one row per
    metric (n, glcRate, accuracy, tpr, tnr, fpr, fnr) and one column per group,
    restricted to the fixed group order below; a group missing from
    `matrixList` makes the final `.loc` raise KeyError.
    """
    metricFns = {
        'n': lambda tab: tab.to_numpy().sum(),
        'glcRate': glcRate,
        'accuracy': accuracy,
        'tpr': tpr,
        'tnr': tnr,
        'fpr': fpr,
        'fnr': fnr,
    }
    vals = {
        metric: { group: fn(tab) for group, tab in matrixList.items() }
        for metric, fn in metricFns.items()
    }
    groupOrder = ['All', 'Latin American', 'African Descent', 'Asian', 'Caucasian', 'ethnicity not specified', 'Other']
    return pd.DataFrame.from_dict(vals).transpose().loc[:, groupOrder]

def getParityMetrics2(factorSeries, dxSeriesPred, dxSeriesActual):
    """Compute parity metrics overall and per level of a grouping factor.

    A cross-tabulation of predicted (rows) against actual (columns) labels is
    built once for all data under the key 'All', and once per distinct value of
    `factorSeries`.  The per-level tables use categorical labels with
    dropna=False.  Returns a frame with one row per metric (n, glcRate,
    predPosRate, accuracy, tpr, tnr, fpr, fnr) and one column per table.
    """
    frame = pd.DataFrame({ 'Factor': factorSeries, 'DxPred': dxSeriesPred, 'DxActual': dxSeriesActual })
    for labelCol in ('DxPred', 'DxActual'):
        frame[labelCol] = frame[labelCol].astype('category')

    matrixList = { 'All': pd.crosstab( dxSeriesPred, dxSeriesActual ) }
    for level in pd.unique( factorSeries ):
        subset = frame[ frame['Factor'] == level ]
        matrixList[level] = pd.crosstab( subset['DxPred'], subset['DxActual'], dropna=False )

    metricFns = {
        'n': lambda tab: tab.to_numpy().sum(),
        'glcRate': glcRate,
        'predPosRate': predPosRate,
        'accuracy': accuracy,
        'tpr': tpr,
        'tnr': tnr,
        'fpr': fpr,
        'fnr': fnr,
    }
    vals = {
        metric: { group: fn(tab) for group, tab in matrixList.items() }
        for metric, fn in metricFns.items()
    }
    return pd.DataFrame.from_dict(vals).transpose()#.loc[:,['All', 'Latin American', 'African Descent', 'Asian', 'Caucasian', 'ethnicity not specified', 'Other']]

# Make Plots

In [None]:
# Subjects that have a numeric expert CDR ('' is the placeholder getMaxCDR returns when there is none).
tempDF = dxSubjectDF[ dxSubjectDF['CDR_Expert'] != '' ]
# Parity metrics of CNN labels against Optom labels, grouped by the CDR value instead of ethnicity.
parityDF = getParityMetrics2( tempDF['CDR_Expert'], tempDF['Diagnosis_Image_CNN'], tempDF['Diagnosis_Image_Optom'] )


# SCATTER PLOTS FOR METRICS vs. CDR  -----

# test = parityDF.loc['accuracy'].to_frame()
# inds = [x for x in test.index if x != 'All']
# test = test.loc[inds]
# test['CDR'] = test.index
# test = test.sort_values( by='CDR' )
# # test.plot.scatter( x='CDR', y='tnr')
# test.plot.line( x='CDR', y='accuracy' )

# test = parityDF.loc['n'][1:10].to_frame()
# test['CDR'] = test.index


# HISTOGRAM BY CDR ALL  -----

#plt.bar(test['CDR'], test['n'], width=0.05, align='center')


# HISTOGRAM BY CDR GROUPED BY ETHNICITY  -----
# Builds the table of counts (rows = CDR value, columns = ethnicity); the plotting
# calls further down in this cell are commented out, so only the table is shown.

# Categorical dtype so value_counts() reports every ethnicity for each CDR value.
tempDF['Subject_Ethnicity'] = tempDF['Subject_Ethnicity'].astype('category')
# One value_counts() Series per distinct CDR value.
cdrEthCounts = {}
for i in pd.unique( tempDF['CDR_Expert'] ):
    cdrEthCounts[i] = tempDF[ tempDF['CDR_Expert'] == i ]['Subject_Ethnicity'].value_counts()

# Start from the first CDR value's counts, then append the others as extra columns.
test = cdrEthCounts[ list( cdrEthCounts )[0] ].to_frame(name=list( cdrEthCounts )[0])
for i in list( cdrEthCounts )[1:len(list( cdrEthCounts ))]:
    test = pd.concat([test, cdrEthCounts[i].to_frame(name=i)], axis=1)

# Flip so that each row is a CDR value, and expose that value as a 'CDR' column for sorting.
test = test.transpose()
test['CDR'] = test.index

test = test.sort_values( by='CDR' )
test

# HISTOGRAM BY CDR GROUPED BY ETHNICITY AND NORMALIZED -----

# testNorm = test.copy()

# for i in [x for x in test.columns if x != 'CDR']:
#     testNorm[i] = test[i] / test[i].sum()

# test.plot(x='CDR', kind='bar', stacked=False).legend(bbox_to_anchor=(1.0, 1.0))

# fig, ax = plt.subplots(figsize=(8,6))
# tempDF.groupby('Subject_Ethnicity').plot.scatter( x='CDR_Expert', y='Diagnosis_CNN_Prob', ax=ax, color=

In [None]:
# Calibration of the subject-level CNN probability against the Optom label,
# pooled over all subjects.
prob_true, prob_pred = calibration_curve(
    y_true=dxSubjectDF['Diagnosis_Image_Optom'],
    y_prob=dxSubjectDF['Diagnosis_CNN_Prob'],
    pos_label='Suspected Glaucoma',
    n_bins=5,
)
fig, ax = plt.subplots()
ax.plot(prob_pred, prob_true)
ax.set_xlabel('Predicted Probability')
ax.set_ylabel('True Probability')
# Dashed diagonal marks perfect calibration.
ax.axline((0,0), slope=1, color='0.0', linestyle='--')
plt.show()

In [None]:
# Per-group calibration results, keyed by group name.
prob_true = {}
prob_pred = {}

# CALIBRATION CURVES GROUPED BY ETHNICITY

# One curve per ethnicity, all drawn into the same figure.
# NOTE(review): the group names are hard-coded — confirm they match the values in Subject_Ethnicity.
for e in ['Latin American', 'African Descent', 'ethnicity not specified', 'Asian', 'Other','Caucasian']:
    # Rebinds the notebook-level `tempDF` to this group's subjects.
    tempDF = dxSubjectDF[dxSubjectDF['Subject_Ethnicity'] == e]
    prob_true[e], prob_pred[e] = calibration_curve(y_true=tempDF['Diagnosis_Image_Optom'], y_prob=tempDF['Diagnosis_CNN_Prob'], pos_label='Suspected Glaucoma', n_bins=10)
    plt.plot(prob_pred[e],prob_true[e], label=e)


# CALIBRATION CURVES GROUPED BY GENDER

# for e in ['M', 'F']:
#     tempDF = dxSubjectDF[dxSubjectDF['Subject_Gender'] == e]
#     prob_true[e], prob_pred[e] = calibration_curve(y_true=tempDF['Diagnosis_Image_Optom'], y_prob=tempDF['Diagnosis_CNN_Prob'], pos_label='Suspected Glaucoma', n_bins=10)
#     plt.plot(prob_pred[e],prob_true[e], label=e)
    
plt.xlabel('Predicted Probability')
plt.ylabel('True Probability')
# Legend is anchored at the upper-right corner of the axes.
plt.legend(bbox_to_anchor=(1.0, 1.0))
# Dashed diagonal marks perfect calibration.
plt.axline((0,0), slope=1, color='0.0', linestyle='--')
plt.show()

# Calculate Parity Metrics at IMAGE Level

In [None]:
# Parity metrics for CNN vs. Optom
# Image level: CNN labels are the predictions, Optom labels the reference; grouped by ethnicity.
getParityMetrics2( linkdDF['Subject_Ethnicity'], linkdDF['Diagnosis_Image_CNN'], linkdDF['Diagnosis_Image_Optom'] )

In [None]:
# Parity metrics for CNN vs. Expert labels
# Images the experts marked 'Not Graded, Bad Quality' are excluded first.
# `expertGradedDF` (image level here) is reused by the next cell.
expertGradedDF = linkdDF[linkdDF['Diagnosis_Image_Expert'] != 'Not Graded, Bad Quality']
getParityMetrics2( expertGradedDF['Subject_Ethnicity'], expertGradedDF['Diagnosis_Image_CNN'], expertGradedDF['Diagnosis_Image_Expert'] )

In [None]:
# Parity metrics for Optom vs. Expert labels
# Relies on the image-level `expertGradedDF` built in the previous cell.
getParityMetrics2( expertGradedDF['Subject_Ethnicity'], expertGradedDF['Diagnosis_Image_Optom'], expertGradedDF['Diagnosis_Image_Expert'] )

# Calculate Parity Metrics at SUBJECT Level

In [None]:
# Parity metrics for CNN vs. Optom
# Subject level: same comparison as the image-level one, on the per-subject frame.
getParityMetrics2( dxSubjectDF['Subject_Ethnicity'], dxSubjectDF['Diagnosis_Image_CNN'], dxSubjectDF['Diagnosis_Image_Optom'] )

In [None]:
# Parity metrics for CNN vs. Expert labels
# Rebinds `expertGradedDF` to the subject-level frame (it held image-level rows before);
# subjects whose expert label is 'Not Graded, Bad Quality' are excluded.
expertGradedDF = dxSubjectDF[dxSubjectDF['Diagnosis_Image_Expert'] != 'Not Graded, Bad Quality']
getParityMetrics2( expertGradedDF['Subject_Ethnicity'], expertGradedDF['Diagnosis_Image_CNN'], expertGradedDF['Diagnosis_Image_Expert'] )

In [None]:
# Parity metrics for Optom vs. Expert labels
# Relies on the subject-level `expertGradedDF` built in the previous cell.
getParityMetrics2( expertGradedDF['Subject_Ethnicity'], expertGradedDF['Diagnosis_Image_Optom'], expertGradedDF['Diagnosis_Image_Expert'] )

In [None]:



# Space to stop autoscroll




In [None]:
# Number of linkdDF rows per distinct `consultant` value.
np.unique(linkdDF['consultant'], return_counts=True)

# Upload Results

In [None]:
# create asset path
asset_type_name = "Diagnosis_Analysis"
asset_path = exec.execution_asset_path(asset_type_name)

# save assets to asset_path
# Only the image-level frame is written; the subject-level and parity exports below are disabled.
linkdDF.to_csv(asset_path/'ImagesToVGG19.csv', index=False)
#dxSubjectDF.to_csv(asset_path/'SubjectsToVGG19.csv', index=False)
#parityMetrics.to_csv(asset_path/'ParityMetrics.csv', index=False)

# upload assets to catalog
exec.upload_execution_outputs(clean_folder=True)