### Ingest chart diagnoses

In [None]:
import pandas as pd
import numpy as np

# Location of the chart-review export to ingest.
chart_path = "/home/yukim3003/chart_diagnosis_input-02-09-2025.csv"

# Parse the comma-separated export into a DataFrame.
# NOTE(review): the file is decoded as mac_roman - presumably it was saved from
# Excel on macOS; confirm before changing the encoding.
chart_df = pd.read_csv(
    chart_path,
    sep=",",
    encoding='mac_roman',
)

In [None]:
# Bare expression: rendered as the cell output so the loaded table can be inspected.
chart_df

In [None]:
# Quick subset checks; the counts in the trailing notes come from the author's run.
# Only the final expression of the cell is rendered as output.
dx_missing = chart_df['Chart diagnosis'].isna()
chart_df[dx_missing]  # none (I labeled all of these as "No diagnosis")

non_specialist = chart_df['Actual provider not a glaucoma specialist (X)'].notna()
chart_df[non_specialist]  # 174 rows

In [None]:
# (disabled) drop NaN chart diagnoses - not needed, since none are NaN (missing ones were labeled "No diagnosis")
# chart_df = chart_df.dropna(subset=['Chart diagnosis'])

In [None]:
# If the actual provider was not a glaucoma specialist (flag column is non-null),
# replace the chart diagnosis with the glaucoma provider's diagnosis.
non_specialist_col = 'Actual provider not a glaucoma specialist (X)'
chart_df['Chart diagnosis'] = np.where(
    chart_df[non_specialist_col].notna(),
    chart_df['Chart diagnosis for column M'],
    chart_df['Chart diagnosis'],
)

# If none available (4 rows), drop so only glaucoma providers are included.
# .copy() makes the filtered frame independent of the original, so adding
# columns to it afterwards cannot raise pandas' SettingWithCopyWarning.
chart_df = chart_df.dropna(subset=['Chart diagnosis']).copy()

# Record which provider's diagnosis ended up being used for each row.
# The flag is re-evaluated because rows were just dropped.
chart_df['Final provider diagnosis used'] = np.where(
    chart_df[non_specialist_col].notna(),
    chart_df['Glaucoma provider  for column M'],
    chart_df['Actual provider'],
)

In [None]:
# Re-display after the rows without a usable diagnosis were dropped.
chart_df # 1838 -> 1834 rows

In [None]:
## chart_label: collapse the detailed chart diagnosis into a simple
## GS vs POAG vs PACG vs Other Glaucoma vs Normal/No dx column

GS_chart_labels = ['OAG suspect', 'OHTN', 'PAC suspect', 'PAC', 'ANA (anatomic narrow angle)']
POAG_chart_labels = [
    'POAG, unspecified', 'mild POAG', 'mild/mod POAG', 'mod POAG', 'mod/severe POAG',
    'severe POAG', 'pseudoexfoliation OAG', 'pigmentary OAG', 'NTG/LTG (normal/low tension)',
]
PACG_chart_labels = [
    'PACG, unspecified', 'mild PACG', 'mild/mod PACG', 'mod PACG', 'mod/severe PACG', 'severe PACG',
]
# other OAG and other ACG largely secondary
Other_chart_labels = [
    'other OAG (freetext)', 'other ACG (freetext)', 'Other (freetext)',
    'MMG/CMG (mixed mechanism)', 'NVG (neovascular)',
]
Na_chart_labels = ['Normal', 'No diagnosis']

# Coarse label -> detailed diagnoses it absorbs. POAG and PACG stay separate
# categories; a diagnosis that appears in none of the lists passes through unchanged.
chart_label_groups = {
    'GS': GS_chart_labels,
    'POAG': POAG_chart_labels,
    'PACG': PACG_chart_labels,
    'Other Glaucoma': Other_chart_labels,
    'Normal or No dx': Na_chart_labels,
}

# Apply the groups one after another, in the order listed above.
coarse_label = chart_df['Chart diagnosis']
for group_name, group_members in chart_label_groups.items():
    coarse_label = coarse_label.replace(group_members, group_name)
chart_df['chart_label'] = coarse_label

In [None]:
## severity_chart_label: Suspect vs Mild vs Moderate vs Severe vs Indeterminate vs Normal/No dx
# if mild/mod, saved as Moderate
# if mod/severe, saved as Severe

GS_severity = ['OAG suspect', 'OHTN', 'PAC suspect', 'PAC', 'ANA (anatomic narrow angle)']
mild_severity = ['mild POAG', 'mild PACG']
mod_severity = ['mild/mod POAG', 'mod POAG', 'mild/mod PACG', 'mod PACG']
severe_severity = ['mod/severe POAG', 'mod/severe PACG', 'severe POAG', 'severe PACG']
indeterminate_severity = [
    'POAG, unspecified', 'pseudoexfoliation OAG', 'pigmentary OAG', 'NTG/LTG (normal/low tension)',
    'PACG, unspecified', 'other OAG (freetext)', 'other ACG (freetext)', 'Other (freetext)',
    'MMG/CMG (mixed mechanism)', 'NVG (neovascular)',
]
Na_chart_labels = ['Normal', 'No diagnosis']

# Severity label -> detailed diagnoses it absorbs. A diagnosis that appears in
# none of the lists passes through unchanged.
severity_groups = {
    'Suspect': GS_severity,
    'Mild': mild_severity,
    'Moderate': mod_severity,
    'Severe': severe_severity,
    'Indeterminate': indeterminate_severity,
    'Normal or No dx': Na_chart_labels,
}

# Apply the groups one after another, in the order listed above.
severity_label = chart_df['Chart diagnosis']
for severity_name, severity_members in severity_groups.items():
    severity_label = severity_label.replace(severity_members, severity_name)
chart_df['severity_chart_label'] = severity_label

In [None]:
# Display the table now that chart_label and severity_chart_label have been added.
chart_df # note patients don't all have both eyes represented

In [None]:
# Keep one row per subject-eye: when a (RID_Subject, Side) pair repeats,
# only its first occurrence is retained.
eye_key = ['RID_Subject', 'Side']
chart_df = chart_df.drop_duplicates(subset=eye_key)

In [None]:
# drop rows with Other or Na chart label - deferred: I'll do this later in my analysis doc, so this output still records that these labels were obtained
#chart_df = chart_df[chart_df['chart_label'].isin(['GS', 'POAG', 'PACG'])] #1834 -> 1700 rows

In [None]:
# Persist the labelled chart diagnoses; the row index is left out of the file.
chart_output_path = '/home/yukim3003/chart_diagnosis_output-05-28-2025.csv'
chart_df.to_csv(chart_output_path, index=False)

### Cross-tab chart dx based on more severe eye

In [None]:
# Restrict chart_df to the more severe eye of each subject.
severe_eye_path = "/data/yukim3003/EyeAI_working/Execution_Assets/Multimodal_Analysis/wide_full082024_severeeye.csv"

# The first CSV column is used as the index; Image_Side is renamed so that the
# eye column carries the same name ('Side') as in chart_df.
severe_eye = (
    pd.read_csv(severe_eye_path, index_col=0)
    .rename(columns={'Image_Side': 'Side'})
)

# Default (inner) merge: only subject-eyes present in both tables are kept.
chart_df_severe_eye = pd.merge(severe_eye, chart_df, on=["RID_Subject", "Side"])
chart_df_severe_eye # 603 rows that have chart diagnoses done so far; 919 rows total

In [None]:
def percentages(crosstab):
    """Annotate each count in a crosstab with its share of the column total.

    Args:
        crosstab: DataFrame of counts, e.g. the result of ``pd.crosstab``.

    Returns:
        DataFrame of strings with the same shape as ``crosstab``. Each cell is
        formatted as ``"<count> (<pct>%)"``, where ``pct`` is the count as a
        percentage of its column's sum, rounded to 2 decimals.
    """
    # Only column percentages are reported. A row-percentage variant used to be
    # computed here as well but was never used, so it has been removed.
    col_percentages = crosstab.div(crosstab.sum(axis=0), axis=1) * 100

    return crosstab.astype(str) + " (" + col_percentages.round(2).astype(str) + "%)"

In [None]:
# ICD-10 label (rows) against chart-derived label (columns), more-severe eye only.
crosstab = pd.crosstab(
    index=chart_df_severe_eye['ICD-10 Label'],
    columns=chart_df_severe_eye['chart_label'],
)
percentages(crosstab)

In [None]:
# Bare expression: rendered as the cell output for inspection.
chart_df

In [None]:
### Sanity check to confirm that the labels in the google sharepoint excel match the wide_multimodal_full.csv labels --> correct!
# percent of labels that stayed the same
#sum(chart_df_severe_eye['Condition_Label'] == chart_df_severe_eye['ICD-10 Label']) / len(chart_df_severe_eye) #--> =100%, yay!

In [None]:
# Same cross-tab as for the more severe eye, but over every row of chart_df.
crosstab = pd.crosstab(
    index=chart_df['ICD-10 Label'],
    columns=chart_df['chart_label'],
)
percentages(crosstab)

In [None]:
# Rows whose chart label is GS although the ICD-10 label is POAG.
chart_says_gs = chart_df['chart_label'] == 'GS'
icd_says_poag = chart_df['ICD-10 Label'] == 'POAG'
chart_df.loc[chart_says_gs & icd_says_poag]

# Compare chart diagnoses with test set grader diagnoses 05-28-2025

In [None]:
# Load the test-set grader diagnoses (comma-separated, decoded as mac_roman).
test_graded_path = '/home/yukim3003/test_set_grader_dx-05-28-2025.csv'
test_graded_df_pre = pd.read_csv(
    test_graded_path,
    sep=",",
    encoding='mac_roman',
)

In [None]:
# Row count of the grader file before any rows are dropped.
len(test_graded_df_pre)

In [None]:
# Author's note: eyes are dropped if any of CFP, HVF, or RNFL was missing or bad
# enough that Van/Kyle chose not to grade.
# In the data this shows up as a missing value in any of the columns below,
# so a row is kept only when all of them are filled in.
columns_to_check = ['CDR', 'Grade', 'Severity']
test_graded_df = test_graded_df_pre.dropna(subset=columns_to_check)

In [None]:
# Rename the eye column to 'Side' so it matches the merge key used with chart_df.
# Reassign instead of renaming in place: test_graded_df was derived from
# test_graded_df_pre via dropna, and in-place edits on such a derived frame can
# raise pandas' SettingWithCopyWarning.
test_graded_df = test_graded_df.rename(columns={'Eye': 'Side'})

In [None]:
# Chart-side columns to attach to each graded eye.
chart_columns = [
    'RID_Subject',
    'Side',
    'chart_label',
    'severity_chart_label',
    'Final provider diagnosis used',
]

# Right merge: every graded eye is kept, whether or not it has a chart row.
# The graders' Severity column is renamed to sit alongside severity_chart_label.
df_merged = (
    chart_df[chart_columns]
    .merge(test_graded_df, on=['RID_Subject', 'Side'], how='right')
    .rename(columns={'Severity': 'severity_testgrader_label'})
)

In [None]:
# which cases disagree on GS vs Glaucoma?
# Dichotomize the chart label explicitly rather than treating "anything that is
# not GS" as Glaucoma: df_merged comes from a right merge, so graded eyes with no
# chart row have a missing chart_label, and some eyes are charted as
# 'Normal or No dx'. Neither group should be counted as Glaucoma. They are kept
# as their own categories so they remain visible in the cross-tab.
df_merged['chart_glaucoma_label'] = (
    df_merged['chart_label']
    .replace(['POAG', 'PACG', 'Other Glaucoma'], 'Glaucoma')
    .fillna('No chart dx')
)
# NOTE(review): every grader severity other than 'Suspect' is treated as
# Glaucoma - this assumes the graders only used Suspect/Mild/Moderate/Severe; confirm.
df_merged['testgrader_glaucoma_label'] = np.where(
    df_merged['severity_testgrader_label'] == 'Suspect', 'GS', 'Glaucoma'
)

# of the cases both sources call Glaucoma, which ones disagree in severity?
# (the author's earlier run had 110 out of 229 eyes here, before non-glaucoma
# chart labels were excluded from the Glaucoma group)
both_glaucoma = (
    (df_merged['chart_glaucoma_label'] == 'Glaucoma')
    & (df_merged['testgrader_glaucoma_label'] == 'Glaucoma')
)
df_merged_bothglaucoma = df_merged[both_glaucoma]

In [None]:
# Share of eyes where chart and grader give the same GS-vs-Glaucoma label.
# Note the printed value is a fraction of all merged rows (0-1), not multiplied by 100.
labels_match = df_merged['chart_glaucoma_label'] == df_merged['testgrader_glaucoma_label']
agreement = sum(labels_match) / len(df_merged)
print("Percent agreement: " + str(agreement))

# Chart label (rows) against grader label (columns).
crosstab = pd.crosstab(
    index=df_merged['chart_glaucoma_label'],
    columns=df_merged['testgrader_glaucoma_label'],
)
percentages(crosstab)

In [None]:
# Severity per chart (rows) against severity per grader (columns), limited to
# eyes that both sources label as Glaucoma.
crosstab = pd.crosstab(
    index=df_merged_bothglaucoma['severity_chart_label'],
    columns=df_merged_bothglaucoma['severity_testgrader_label'],
)
percentages(crosstab)

In [None]:
# Severity agreement once chart rows labelled 'Indeterminate' are set aside.
has_determinate_severity = df_merged_bothglaucoma['severity_chart_label'].ne('Indeterminate')
df = df_merged_bothglaucoma[has_determinate_severity] # 62 rows
crosstab = pd.crosstab(
    index=df['severity_chart_label'],
    columns=df['severity_testgrader_label'],
)
percentages(crosstab)

In [None]:
# Bare expression: rendered as the cell output for inspection.
df_merged

In [None]:
# Generate document of patients for Xu/Do to adjudicate.
# Keep only the rows where the chart severity label differs from the grader
# severity label; rows where the two match are dropped.
# A missing chart severity never compares equal, so such rows are kept too.
severity_differs = df_merged['severity_chart_label'].ne(df_merged['severity_testgrader_label'])
df_disagreed = df_merged.loc[severity_differs]
len(df_disagreed)

In [None]:
# Write the eyes needing adjudication to disk; the row index is left out of the file.
adjudication_path = '/home/yukim3003/to_adjudicate_botheyes-05-28-2025.csv'
df_disagreed.to_csv(adjudication_path, index=False)