<a href="https://colab.research.google.com/github/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/VGG19_Diagnosis_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multimodal Initial analyses

In [None]:
# import sys
# IN_COLAB = 'google.colab' in sys.modules

# if IN_COLAB:
#     !pip install deriva
#     !pip install bdbag
#     !pip install --upgrade --force pydantic
#     !pip install git+https://github.com/informatics-isi-edu/deriva-ml git+https://github.com/informatics-isi-edu/eye-ai-ml

In [None]:
# Enable IPython's autoreload extension in mode 2 so edits to local packages
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Folder (relative to the home directory) that holds the local GitHub checkouts.
repo_dir = "Repos"
#%load_ext autoreload # autoreloads updates to local packages
#%autoreload 2

import sys
from pathlib import Path

# Prepend both local checkouts to the module search path in one step.
# "eye-ai-exec" is listed first so it ends up ahead of "eye-ai-ml", exactly as
# two successive insert(0, ...) calls would leave them.
sys.path[:0] = [
    str(Path.home() / repo_dir / checkout)
    for checkout in ("eye-ai-exec", "eye-ai-ml")
]

# Prerequisites
import json
import os
from eye_ai.eye_ai import EyeAI
from models.severe.severity_analysis import Severity

import pandas as pd
from pathlib import Path, PurePath
import logging
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt

from deriva_ml import DatasetSpec, ExecutionConfiguration, DerivaML, Workflow
from deriva_ml import MLVocab as vc
from deriva_ml.deriva_definitions import ColumnDefinition, BuiltinTypes
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)


# Login
from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
# Catalog coordinates used for authentication.
host = 'www.eye-ai.org'
catalog_id = "eye-ai"

# Start the interactive Globus login only when no session exists for this host.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    gnl.login([host], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)
    print("Login Successful")
else:
    print("You are already logged in.")

# Joining an absolute segment onto Path.home() discards the home prefix, so the
# previous `Path.home() / '/data'` already evaluated to `/data` on POSIX systems.
# Spell the location out directly so the code says what it actually does.
cache_dir = Path('/data')
working_dir = Path('/data')

# Analysis entry point; every EA.* call later in the notebook goes through it.
EA = Severity(cache_dir=cache_dir, working_dir=working_dir)
#EA = Severity(hostname = host, catalog_id = catalog_id, cache_dir= cache_dir, working_dir=working_dir) # was giving an error for some reason 1/29/25

In [None]:
# Notebook-wide pandas display limits: cap rows, keep a minimum when truncated,
# and never hide columns.
for _option_name, _option_value in (
    ('display.max_rows', 100),
    ('display.min_rows', 20),
    ('display.max_columns', None),
):
    pd.set_option(_option_name, _option_value)

##### Note: if I don't want to re-run the cells below, I have models saved as shown below — skip to the "Add age to table" cell.

In [None]:
# Directory used to save and reload intermediate outputs, so that the
# long-running execution does not have to be re-run to access them.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
asset_path = '/data/yukim3003/EyeAI_working/Execution_Assets/Multimodal_Analysis/'

# Optional shortcut (disabled): reload raw wide tables saved by an earlier run.
#loaded_train_path = asset_path + "wide_train_raw.csv"
#loaded_test_path = asset_path + "wide_test_raw.csv" # last updated Dec 6 2024
#wide_train_raw = pd.read_csv(loaded_train_path, index_col=0)
#wide_test_raw = pd.read_csv(loaded_test_path, index_col=0)

### New configuration

In [None]:
# RIDs of the training and test datasets.
train_dataset = '2-C9PP'
test_dataset = '2-C9PR'

# Version of each dataset as reported by EA.dataset_version.
new_v_train = EA.dataset_version(train_dataset)
new_v_test = EA.dataset_version(test_dataset)

# materialize=False: per the original note, use this when only the metadata
# from the bag is needed and not the assets.
config = ExecutionConfiguration(
    datasets=[
        DatasetSpec(rid=dataset_rid, version=dataset_version, materialize=False)
        for dataset_rid, dataset_version in (
            (train_dataset, new_v_train),
            (test_dataset, new_v_test),
        )
    ],
    assets=[],
    workflow='2-A51W',
    description="Multimodal GS v glaucoma workflow",
)

# Initialize execution
execution = EA.create_execution(config)


In [None]:
# attempt to download configuration record that didn't work
# configuration_record = EA.download_execution_configuration(configuration_rid='4-3RTJ')

Connect to the Eye-AI catalog. Configure it to store data in local cache and working directories. Initialize Eye-AI for the pending execution based on the provided configuration file.

In [None]:
# Show the execution object created from `config` above.
# old, before redoing EA : configuration_rid= "2-CCD4" # rid I created with my config containing minid for both train and test sets
print(execution)

In [None]:
# Same order as the DatasetSpec list in the configuration: train first, then test.
train_bag, test_bag = execution.datasets[0], execution.datasets[1]

# Generate multimodal wide table

In [None]:
# Build the wide multimodal table for each bag (train first, then test).
wide_train_raw, wide_test_raw = (
    EA.severity_analysis(bag) for bag in (train_bag, test_bag)
)

In [None]:
# Row counts of the raw train and test tables, one per line.
print(len(wide_train_raw), len(wide_test_raw), sep="\n")

In [None]:
# Stack the train and test tables and record where each row came from.
# The concat keys become the outer index level; resetting that level turns it
# into a column named 'level_0', which is then renamed to 'source'.
wide_full_raw = (
    pd.concat([wide_train_raw, wide_test_raw], keys=['train', 'test'])
    .reset_index(level=0)
    .rename(columns={'level_0': 'source'})
)
wide_full_raw

## 01-29-2025: Alt method using saved wide_multimodal_full.csv from Aug 2024 to debug

In [None]:
#wide_full = pd.read_csv(asset_path + "wide_multimodal_full.csv")
#wide_full = wide_full.rename(columns={'Side': 'Image_Side', 'Gender': 'Subject_Gender', 'Ethnicity': 'Subject_Ethnicity', 'Label': 'Condition_Label'})

# `wide_full` only exists when the two lines above are enabled (otherwise it is
# first assigned much later in the notebook), so printing it here raised
# NameError on a fresh kernel. Report the table that is actually built at this
# point and consumed by the next cell.
print(len(wide_full_raw))

In [None]:
# alt using saved wide_train_raw and wide_test_raw from 12-26-2024 execution
#wide_train_raw = pd.read_csv("/home/yukim3003/KY-Scratch/wide_train_raw-4-3R4P-122624.csv", index_col=0)
#wide_test_raw = pd.read_csv("/home/yukim3003/KY-Scratch/wide_test_raw-4-3R4R-122624.csv", index_col=0)
#len(wide_train_raw) + len(wide_test_raw)

## Add age to table

In [None]:
# Attach subject age: load the age table, rename its key column to match the
# wide table, and left-join on the subject RID.
age_path = "/data/yukim3003/EyeAI_working/Execution_Assets/Multimodal_Analysis/multimodal_subject_age.csv"
age_df = pd.read_csv(age_path).rename(columns={'RID': 'RID_Subject'})
#wide_train_raw = wide_train_raw.merge(age_df, on='RID_Subject', how='left')
#wide_test_raw = wide_test_raw.merge(age_df, on='RID_Subject', how='left')
wide_full_raw = pd.merge(wide_full_raw, age_df, how='left', on='RID_Subject')

In [None]:
# A notebook cell only displays its last expression, so the row count on the
# first line was silently discarded. Print both values explicitly.
print(len(wide_full_raw))                          # number of rows
print(len(wide_full_raw['RID_Subject'].unique()))  # number of distinct subjects

# set up features

In [None]:
# ---- Building blocks: one list of column names per measurement domain ----
demographic_fx = ['Subject_Gender', 'Subject_Ethnicity', 'Age']
clinic_fx = ['LogMAR_VA', 'IOP']  # 'Gonioscopy' - mostly NaN, not standardized annotation # CCT - mostly NaN
CDR_fx = ['CDR']
GHT = ['GHT']

# RNFL variants: per clock hour, per quadrant, and inferior/superior only.
RNFL_clockhr_fx = [
    'Clock_Hours_1', 'Clock_Hours_2', 'Clock_Hours_3', 'Clock_Hours_4',
    'Clock_Hours_5', 'Clock_Hours_6', 'Clock_Hours_7', 'Clock_Hours_8',
    'Clock_Hours_9', 'Clock_Hours_10', 'Clock_Hours_11', 'Clock_Hours_12',
]  # if I want to use each clock hour
RNFL_quad_fx = ['Quadrants_S', 'Quadrants_N', 'Quadrants_T', 'Quadrants_I']
RNFL_IS_fx = ['Quadrants_S', 'Quadrants_I']

# ---- Domain sets as redefined for AGS 02-19-2025 ----
exam_fx = [*clinic_fx, *CDR_fx]
# Average_C/D_Ratio - for RNFL-derived CDR
RNFL_fx = ['Average_RNFL_Thickness(μm)', *RNFL_IS_fx]
# 'PSD' - mostly NaN. I think PSD and PSD.1 columns should be merged to use this column if desired
HVF_fx = ['MD', 'VFI', *GHT]

# All Project Fx
all_fx_cols = [*demographic_fx, *exam_fx, *RNFL_fx, *HVF_fx]

### only keep rows that are not missing in ANY of all_fx_cols -- if not doing imputations

In [None]:
# Complete-case filter: drop every row that has a missing value in at least one
# of the columns listed in all_fx_cols, then report how many rows remain.
# NOTE: this overwrites wide_full_raw, so re-running earlier cells is required
# to get the unfiltered table back.

wide_full_raw= wide_full_raw.dropna(subset=all_fx_cols)
print(len(wide_full_raw))

# process chart labels

In [None]:
# Switch read by the next cell: True replaces Condition_Label with the chart
# label; False leaves the existing (ICD10) Condition_Label in place.
use_chart_labels=True # if false, uses ICD10 labels instead

In [None]:
if use_chart_labels:
    # Load the chart-review labels and align the eye-side column name with the wide table.
    chart_path = "/home/yukim3003/chart_diagnosis_output-05-28-2025.csv"
    chart_df = (
        pd.read_csv(chart_path)[['RID_Subject', 'Side', 'ICD-10 Label', 'chart_label']]
        .rename(columns={'Side': 'Image_Side'})
    )
    wide_full_raw = wide_full_raw.merge(chart_df, on=['RID_Subject', 'Image_Side'], how='left')
    #wide_train_raw = wide_train_raw.merge(chart_df, on=['RID_Subject', 'Image_Side'], how='inner')
    #wide_test_raw = wide_test_raw.merge(chart_df, on=['RID_Subject', 'Image_Side'], how='inner')

    # drop rows with undesired chart labels
    # .copy() makes the filtered frame independent, so the column assignments
    # below do not write to a slice (avoids SettingWithCopyWarning).
    wide_full_raw = wide_full_raw[wide_full_raw['chart_label'].isin(['GS', 'POAG', 'PACG'])].copy()

    # set Condition_Label as chart label, and create different ICD 10 label column
    wide_full_raw['ICD10_label_full'] = wide_full_raw['Condition_Label']
    wide_full_raw['Condition_Label'] = wide_full_raw['chart_label']

    ####DON'T DO THE FOLLOWING WHICH REPLACED "MISSING" CHART LABELS (REALLY JUST THOSE LABELED WITH OTHER/NORMAL/NO DIAGNOSIS) WITH ICD10 LABEL
    ## set Condition_Label to chart_label if not NAN, else just use original condition label
    #wide_full_raw['Condition_Label'] = wide_full_raw['chart_label'].fillna(wide_full_raw['Condition_Label'])
# else: by default Condition_Label is already ICD10 labels

# drop rows missing labels for analysis
# (subset is passed as a list: a bare string is only accepted by newer pandas releases)
wide_full_raw = wide_full_raw.dropna(subset=['Condition_Label'])

# drop duplicate rows (there are a few patients, eg 2-7P38, that are for some reason duplicated): 1267 rows -> 1263 rows
wide_full_raw = wide_full_raw.drop_duplicates(subset=['RID_Subject', 'Image_Side'], keep='first')
print(len(wide_full_raw))

In [None]:
# to generate file to obtain rows missing chart label
"""
# obtain rows missing chart label
num_missing_chart_label = wide_full_raw['chart_label'].isna().sum()
print("Number missing chart label: " + str(num_missing_chart_label))

# add MRN to table
mrn_path = "/data/yukim3003/EyeAI_working/Execution_Assets/Multimodal_Analysis/multimodal_subject_age_MRN.csv"
mrn_df = pd.read_csv(mrn_path, dtype={'MRN': str})
mrn_df.rename(columns={'RID': 'RID_Subject'}, inplace=True)
wide_full_raw_mrns = wide_full_raw.merge(mrn_df, on='RID_Subject', how='left')

missing_labels = wide_full_raw_mrns[wide_full_raw_mrns['chart_label'].isna()]
selected_cols = ['RID_Subject', 'MRN', 'date_of_encounter_Clinic', 'date_of_encounter_Fundus', 'Provider', 'Image_Side', 'Condition_Label']
excel=missing_labels[selected_cols]
excel_path = '/home/yukim3003/chart_diagnosis_input_04-12-2025.csv'
excel.to_csv(excel_path, index=False)
"""

In [None]:
# Number of distinct subjects remaining after label processing.
len(wide_full_raw['RID_Subject'].unique())

In [None]:
# save both eyes
#wide_test = wide_full_raw[wide_full_raw['source'] == 'test'].drop(columns='source').reset_index(drop=True)
#wide_test.to_csv('/home/yukim3003/wide_test_botheyes_04-18-2025.csv', index=False)
#len(wide_test['RID_Subject'].unique())



# Create new table with only more severe eye for each patient

In [None]:
# eye_ai.py:     def pick_severe_eye(self, df, rnfl_threshold, md_threshold):
# (if only one eye has a label, that's automatically the more severe eye)

In [None]:
# Both thresholds are set to 0 for this run.
rnfl_thresh = md_thresh = 0

# One row per subject: the eye selected by EA.pick_severe_eye.
wide_full_severe = EA.pick_severe_eye(wide_full_raw, rnfl_thresh, md_thresh)
#wide_train = EA.pick_severe_eye(wide_train_raw, rnfl_thresh, md_thresh)
#wide_test = EA.pick_severe_eye(wide_test_raw, rnfl_thresh, md_thresh)

In [None]:
# if choose to use better eye instead
# Anti-join on every column: rows of wide_full_raw with no exact match in
# wide_full_severe are flagged 'left_only' by the merge indicator.
df_diff = pd.merge(
    wide_full_raw,
    wide_full_severe,
    how='left',
    on=list(wide_full_raw.columns),
    indicator=True,
)
# less severe eye -- though technically the subjects that only have one eye included should also be included in the better eye analysis?
wide_full_better = df_diff.loc[df_diff['_merge'] == 'left_only'].drop(columns=['_merge'])

In [None]:
# Choose which per-subject table the rest of the notebook works on
# (here: the more severe eye) and display its row count.
wide_full = wide_full_severe
len(wide_full)

In [None]:
## for saving only -- generate a version of wide_full with severe eye only, before splitting into train/test
#wide_full = EA.pick_severe_eye(wide_full_raw, 0, 0)
#wide_full.to_csv(asset_path + 'wide_full082024_severeeye.csv')

# save more severe eye raw data
#wide_train.to_csv(asset_path + 'wide_train_fromfull082024_severeeye.csv') 
#wide_test.to_csv(asset_path + 'wide_test_fromfull082024_severeeye.csv')

#wide_train_nothresh = EA.pick_severe_eye(wide_train_raw, 0, 0)
#wide_test_nothresh = EA.pick_severe_eye(wide_test_raw, 0, 0)

# Show which subjects changed eyes by adding thresholds
#diff_values = wide_train.compare(wide_train_nothresh, align_axis=0, keep_shape=True, keep_equal=True) #keep_equal=False --> values that are equal are represented as NaN
#diff_values = diff_values.drop_duplicates(keep=False) # drop rows that have a duplicate
#print("# subjects where eye choice changed: %i" % (len(diff_values)/2))
#diff_values[['RID_Subject', 'Image_Side', 'Label', 'Average_RNFL_Thickness(μm)', 'MD', 'CDR']]

## train test split - after removing all NAs to get final dataset

#### Option 1: create my own 80-20 split from wide_full

In [None]:
#### DON'T DO THIS-- SHOULD BE SPLITTING BY SUBJECT, WHEREAS THIS SPLITS BY EYE
#wide_train, wide_test = train_test_split(wide_full, test_size=0.2, random_state=42)

# split using orig wide_train vs wide_test sets -- jk this would miss a bunch of patients though

# split by subject: all rows sharing a RID_Subject land on the same side of the split
from sklearn.model_selection import GroupShuffleSplit
splitter = GroupShuffleSplit(
    n_splits=2,
    test_size=0.20,
    random_state=42,
)
split = splitter.split(wide_full, groups=wide_full['RID_Subject'])
# Only the first of the generated splits is consumed.
train_inds, test_inds = next(split)

wide_train, wide_test = wide_full.iloc[train_inds], wide_full.iloc[test_inds]

print(len(wide_train), len(wide_test), sep="\n")

In [None]:
# save these so I can repeat my results in future
#wide_train.to_csv(asset_path + 'wide_train_botheyes_02-18-2025.csv')
#wide_test.to_csv(asset_path + 'wide_test_botheyes_02-18-2025.csv')

#### Option 2: Use labeled train vs test from EyeAI split in wide_full

In [None]:
# Recover the original EyeAI train/test partition from the 'source' column,
# then drop that helper column and renumber the rows.
wide_train = (
    wide_full.loc[lambda frame: frame['source'] == 'train']
    .drop(columns='source')
    .reset_index(drop=True)
)
wide_test = (
    wide_full.loc[lambda frame: frame['source'] == 'test']
    .drop(columns='source')
    .reset_index(drop=True)
)
print(len(wide_train), len(wide_test), sep="\n")

In [None]:
#wide_test.to_csv('/home/yukim3003/wide_test_bettereye_04-18-2025.csv', index=False)

### START HERE to use already generated data subsets ###

In [None]:
# Reload the previously saved subsets (more severe eye, and both eyes), then
# derive the less severe eye as "rows in both-eyes that are not in severe-eye".
wide_train_severe, wide_test_severe, wide_train_both, wide_test_both = (
    pd.read_csv(asset_path + saved_csv, index_col=0)
    for saved_csv in (
        'wide_train_severeeye_02-09-2025.csv',  # more severe eye
        'wide_test_severeeye_02-09-2025.csv',
        'wide_train_botheyes_02-18-2025.csv',   # both eyes
        'wide_test_botheyes_02-18-2025.csv',
    )
)

# Anti-join on all columns; 'left_only' rows are the less severe eye.
df_diff = pd.merge(wide_train_both, wide_train_severe, how='left', on=list(wide_train_both.columns), indicator=True)
wide_train_better = df_diff.loc[df_diff['_merge'] == 'left_only'].drop(columns=['_merge'])
df_diff = pd.merge(wide_test_both, wide_test_severe, how='left', on=list(wide_test_both.columns), indicator=True)
wide_test_better = df_diff.loc[df_diff['_merge'] == 'left_only'].drop(columns=['_merge'])



In [None]:
# Dataset used by every analysis below: the more-severe-eye subsets.
wide_train, wide_test = wide_train_severe, wide_test_severe

In [None]:
# Sanity check: number of training rows with no ICD10 label.
wide_train['ICD10_label_full'].isna().sum()

# GS vs Glaucoma

## Process data

In [None]:
# Configure and run the "GS vs glaucoma" analysis.
# The commented fx_cols lines below are leftovers from running one feature set
# at a time; the dictionary further down now drives all feature sets in one call.
## Adjust fx_cols depending on what variables I want to include in model
#fx_cols = demographic_fx
#fx_cols = exam_fx
#fx_cols = RNFL_fx
#fx_cols = HVF_fx
#fx_cols = RNFL_fx + HVF_fx

# for univariate or all 4 domains or ML
#fx_cols = all_fx_cols

# ICD-10
#fx_cols = ["ICD10_label_full"]

# OLD: Domain knowledge fx
#fx_cols = ['Age', 'Subject_Gender', 'LogMAR_VA', 'CDR'] + RNFL_fx + ['MD', 'GHT']  # Age, Subject_Gender, VA, CDR, avg RNFL, MD, GHT outside normal limits.
# Outcome definition passed through to EA.run_whole_analysis.
ymethod="all_glaucoma"

# Maps a model label to the feature columns used for that model.
# ALL is a required key if desiring to run univariate analyses, ridge, and elastic net
list_of_fx_cols_to_run = {
    'Demographics': demographic_fx,
    'Exam': exam_fx, 
    'OCT': RNFL_fx,
    'VF': HVF_fx,
    'OCT+VF': RNFL_fx + HVF_fx,
    'All Four Domains': all_fx_cols,
    'ALL': all_fx_cols,
    'ICD10': ["ICD10_label_full"],
}

# Returns the fitted models plus the overall train/test design matrices and
# targets that the counts cells below report on.
models_glaucoma, X_train_overall, X_test_overall, y_train_overall, y_test_overall = EA.run_whole_analysis(list_of_fx_cols_to_run, wide_train, wide_test, ymethod)

In [None]:
# Labels of the models produced by the glaucoma analysis.
models_glaucoma.keys()

## Counts / data info

In [None]:
# Train/test sizes and the combined sample size (reused by the next cell).
print("X train size: " + str(len(X_train_overall)))
print("X test size: " + str(len(X_test_overall)))
totaln = len(X_train_overall) + len(X_test_overall)
totaln

In [None]:
# Class balance. np.unique(..., return_counts=True) returns (labels, counts);
# the second print shows each class's fraction of the split (a proportion, not
# a value multiplied by 100).
counts = np.unique(y_train_overall, return_counts=True)
print("Train: " + str(counts[0]) + ": " + str(counts[1])) # #GS vs #Glaucoma
print("Percent GS vs Glaucoma in TRAIN:", counts[1] / sum(counts[1])) # percent

counts = np.unique(y_test_overall, return_counts=True)
print("Test: " + str(counts[0]) + ": " + str(counts[1])) # #GS vs #Glaucoma
print("Percent GS vs Glaucoma in TEST:", counts[1] / sum(counts[1])) # percent

# Relies on `totaln` from the previous cell; summing y assumes a 0/1 target — TODO confirm.
print("Percent Glaucoma in TOTAL:", (sum(y_train_overall) + sum(y_test_overall)) / totaln)


In [None]:
# Columns of the overall design matrix (used to pick the column names below).
X_train_overall.columns

In [None]:
def print_mean_fornumeric(col, dat1=None, dat2=None):
    """Print the pooled mean of a numeric column across two tables.

    Parameters
    ----------
    col : str
        Name of the column to summarise.
    dat1, dat2 : DataFrame, optional
        Tables to pool. When omitted they default to the notebook globals
        ``X_train_overall`` and ``X_test_overall`` *as they are at call time*.
        (Binding them in the signature froze whatever tables existed when the
        function was defined, so results went stale after a later analysis.)

    Prints ``Mean <col>: <value>`` rounded to 4 decimals; prints ``nan`` when
    both tables are empty instead of dividing by zero.
    """
    if dat1 is None:
        dat1 = X_train_overall
    if dat2 is None:
        dat2 = X_test_overall
    totaln = len(dat1) + len(dat2)
    if totaln == 0:
        # Empty subgroup: no mean to report.
        print("Mean " + col + ": nan")
        return
    sumcol = np.sum(dat1[col]) + np.sum(dat2[col])
    print("Mean " + col + ": " + str(round(sumcol/totaln, 4)))

# Numeric columns summarised in the descriptive tables; print the pooled
# train+test mean of each.
numeric_cols = ["Age", "LogMAR_VA", "IOP", "CDR", "MD", "VFI", "Average_RNFL_Thickness(μm)", "Quadrants_S", "Quadrants_I"]
for col in numeric_cols:
    print_mean_fornumeric(col)

In [None]:
def print_sd_fornumeric(col, dat1=None, dat2=None, ddof=0):
    """Print the pooled standard deviation of a numeric column across two tables.

    Parameters
    ----------
    col : str
        Name of the column to summarise.
    dat1, dat2 : DataFrame, optional
        Tables to pool. When omitted they default to the notebook globals
        ``X_train_overall`` and ``X_test_overall`` as they are at call time
        (signature defaults would freeze the tables present at definition time).
    ddof : int, default 0
        Delta degrees of freedom forwarded to ``np.std``. The default keeps the
        original population SD; pass ``ddof=1`` for the sample SD.

    Prints ``Std <col>: <value>`` rounded to 4 decimals; prints ``nan`` when
    there are no values.
    """
    if dat1 is None:
        dat1 = X_train_overall
    if dat2 is None:
        dat2 = X_test_overall
    all_v = dat1[col].tolist() + dat2[col].tolist()
    if not all_v:
        # Empty subgroup: avoid numpy's empty-slice warning.
        print("Std " + col + ": nan")
        return
    sd = np.std(all_v, ddof=ddof)
    print("Std " + col + ": " + str(round(sd, 4)))

# Pooled train+test standard deviation of every numeric column.
for col in numeric_cols:
    print_sd_fornumeric(col)

In [None]:
def print_n_forcatvar(col, dat1=None, dat2=None):
    """Print the pooled count and percentage for an indicator column.

    Parameters
    ----------
    col : str
        Name of the column to summarise; its values are summed, so it is
        expected to hold 0/1 (or boolean) indicators.
    dat1, dat2 : DataFrame, optional
        Tables to pool. When omitted they default to the notebook globals
        ``X_train_overall`` and ``X_test_overall`` as they are at call time
        (signature defaults would freeze the tables present at definition time).

    Prints ``<col>: <n> (<pct>%)`` with the percentage rounded to 2 decimals;
    prints ``nan%`` when both tables are empty instead of dividing by zero.
    """
    if dat1 is None:
        dat1 = X_train_overall
    if dat2 is None:
        dat2 = X_test_overall
    totaln = len(dat1) + len(dat2)
    n = sum(dat1[col]) + sum(dat2[col])
    if totaln == 0:
        # Empty subgroup: a percentage is undefined.
        print(col + ": " + str(n) + " (nan%)")
        return
    print(col + ": " + str(n) + " (" + str(round(n/totaln*100, 2)) + "%)")

# One-hot indicator columns reported as counts/percentages; print the pooled
# train+test count of each.
cat_cols = ['Subject_Gender_F', 'Subject_Gender_M', 'Subject_Ethnicity_African Descent', 'Subject_Ethnicity_Asian',
       'Subject_Ethnicity_Caucasian', 'Subject_Ethnicity_Latin American',
       'Subject_Ethnicity_Other', 'GHT_Borderline',
       'GHT_Outside Normal Limits', 'GHT_Within Normal Limits']
for col in cat_cols:
    print_n_forcatvar(col)

In [None]:
# Same descriptive statistics as above, split by outcome class.
# Truthy target values select the glaucoma rows; the inverse selects suspects.
train_mask = pd.Series(y_train_overall).astype(bool)
test_mask = pd.Series(y_test_overall).astype(bool)

X_train_glaucoma, X_train_GS = X_train_overall[train_mask], X_train_overall[~train_mask]
X_test_glaucoma, X_test_GS = X_test_overall[test_mask], X_test_overall[~test_mask]

print("GLAUCOMA")
for col in numeric_cols:
    print_mean_fornumeric(col, dat1=X_train_glaucoma, dat2=X_test_glaucoma)
for col in cat_cols:
    print_n_forcatvar(col, dat1=X_train_glaucoma, dat2=X_test_glaucoma)

print(" ")

print("SUSPECT")
for col in numeric_cols:
    print_mean_fornumeric(col, dat1=X_train_GS, dat2=X_test_GS)
for col in cat_cols:
    print_n_forcatvar(col, dat1=X_train_GS, dat2=X_test_GS)

In [None]:
# Missing-data check on the overall design matrices: how many rows contain at
# least one NaN, and what percentage of the split that represents.
# #NAN
### the number of rows with nan in any column will increase if I choose more features

# count number / percent of rows with nan value
num_rows_with_nan = X_train_overall.isnull().any(axis=1).sum()
print ("Number of train rows with any nan: %i" % num_rows_with_nan)

# Calculate the percentage of rows with NaN values
print ("Percent of train rows with any nan: %f" % ((num_rows_with_nan / len(X_train_overall)) * 100))

# count number / percent of rows with nan value (same variable reused for the test split)
num_rows_with_nan = X_test_overall.isnull().any(axis=1).sum()
print ("Number of test rows with any nan: %i" % num_rows_with_nan)

# Calculate the percentage of rows with NaN values
print ("Percent of test rows with any nan: %f" % ((num_rows_with_nan / len(X_test_overall)) * 100))

# Urgent glaucoma

moderate-to-severe glaucoma = MD<= -6 AND chart label of Glaucoma

mild/GS = MD > -6 OR chart label of GS

In [None]:
# Configure and run the "urgent glaucoma" analysis.
# NOTE: this rebinds X_train_overall / X_test_overall / y_*_overall, replacing
# the values returned by the earlier glaucoma analysis.
ymethod = "urgent_glaucoma"

# Redefine to not include HVF
all_nonVF_fx_cols = demographic_fx + exam_fx + RNFL_fx

# Maps a model label to the feature columns used for that model.
# ALL is a required key if desiring to run univariate analyses, ridge, and elastic net
list_of_fx_cols_to_run = {
    'Demographics': demographic_fx,
    'Exam': exam_fx, 
    'OCT': RNFL_fx,
    'Exam+OCT': exam_fx + RNFL_fx,
    'All Three Domains': all_nonVF_fx_cols,
    'ALL': all_nonVF_fx_cols,
    'for_counts_only': all_fx_cols, # included purely for counts/data info calcs below, so I can still calculate counts/stats for HVF measures # if this key isn't provided, uses ALL dataframes for counts
}

models_urgent, X_train_overall, X_test_overall, y_train_overall, y_test_overall = EA.run_whole_analysis(list_of_fx_cols_to_run, wide_train, wide_test, ymethod)

## Counts / data info

In [None]:
### note I interchangeably-ish used X_train/test_overall and wide_train/test, because they should be the same rows given I preemptively dropped all NAs, but they're formatted differently

# Train/test sizes and the combined sample size (reused by the next cell).
print("X train size: " + str(len(X_train_overall)))
print("X test size: " + str(len(X_test_overall)))
totaln = len(X_train_overall) + len(X_test_overall)
totaln

In [None]:
# Class balance for the urgent-glaucoma target. np.unique(..., return_counts=True)
# returns (labels, counts); the second print shows each class's fraction of the
# split (a proportion, not a value multiplied by 100).
counts = np.unique(y_train_overall, return_counts=True)
print("Train: " + str(counts[0]) + ": " + str(counts[1])) # #GS vs #Glaucoma
print("Percent mild-GS vs mod-severe in TRAIN:", counts[1] / sum(counts[1])) # percent

counts = np.unique(y_test_overall, return_counts=True)
print("Test: " + str(counts[0]) + ": " + str(counts[1])) # #GS vs #Glaucoma
print("Percent mild-GS vs mod-severe in TEST:", counts[1] / sum(counts[1])) # percent

# Relies on `totaln` from the previous cell; summing y assumes a 0/1 target — TODO confirm.
print("Percent mod-severe in TOTAL:", (sum(y_train_overall) + sum(y_test_overall)) / totaln)


In [None]:
# Columns of the overall design matrix for the urgent-glaucoma analysis.
X_train_overall.columns

In [None]:
def print_mean_fornumeric(col, dat1=None, dat2=None):
    """Print the pooled mean of a numeric column across two tables.

    Parameters
    ----------
    col : str
        Name of the column to summarise.
    dat1, dat2 : DataFrame, optional
        Tables to pool. When omitted they default to the notebook globals
        ``X_train_overall`` and ``X_test_overall`` *as they are at call time*.
        (Binding them in the signature froze whatever tables existed when the
        function was defined, so results went stale after a later analysis.)

    Prints ``Mean <col>: <value>`` rounded to 4 decimals; prints ``nan`` when
    both tables are empty instead of dividing by zero.
    """
    if dat1 is None:
        dat1 = X_train_overall
    if dat2 is None:
        dat2 = X_test_overall
    totaln = len(dat1) + len(dat2)
    if totaln == 0:
        # Empty subgroup: no mean to report.
        print("Mean " + col + ": nan")
        return
    sumcol = np.sum(dat1[col]) + np.sum(dat2[col])
    print("Mean " + col + ": " + str(round(sumcol/totaln, 4)))

# Numeric columns summarised in the descriptive tables; print the pooled
# train+test mean of each.
numeric_cols = ["Age", "LogMAR_VA", "IOP", "CDR", "MD", "VFI", "Average_RNFL_Thickness(μm)", "Quadrants_S", "Quadrants_I"]
for col in numeric_cols:
    print_mean_fornumeric(col)

def print_sd_fornumeric(col, dat1=None, dat2=None, ddof=0):
    """Print the pooled standard deviation of a numeric column across two tables.

    Parameters
    ----------
    col : str
        Name of the column to summarise.
    dat1, dat2 : DataFrame, optional
        Tables to pool. When omitted they default to the notebook globals
        ``X_train_overall`` and ``X_test_overall`` as they are at call time
        (signature defaults would freeze the tables present at definition time).
    ddof : int, default 0
        Delta degrees of freedom forwarded to ``np.std``. The default keeps the
        original population SD; pass ``ddof=1`` for the sample SD.

    Prints ``Std <col>: <value>`` rounded to 4 decimals; prints ``nan`` when
    there are no values.
    """
    if dat1 is None:
        dat1 = X_train_overall
    if dat2 is None:
        dat2 = X_test_overall
    all_v = dat1[col].tolist() + dat2[col].tolist()
    if not all_v:
        # Empty subgroup: avoid numpy's empty-slice warning.
        print("Std " + col + ": nan")
        return
    sd = np.std(all_v, ddof=ddof)
    print("Std " + col + ": " + str(round(sd, 4)))

# Pooled train+test standard deviation of every numeric column.
for col in numeric_cols:
    print_sd_fornumeric(col)

def print_n_forcatvar(col, dat1=None, dat2=None):
    """Print the pooled count and percentage for an indicator column.

    Parameters
    ----------
    col : str
        Name of the column to summarise; its values are summed, so it is
        expected to hold 0/1 (or boolean) indicators.
    dat1, dat2 : DataFrame, optional
        Tables to pool. When omitted they default to the notebook globals
        ``X_train_overall`` and ``X_test_overall`` as they are at call time
        (signature defaults would freeze the tables present at definition time).

    Prints ``<col>: <n> (<pct>%)`` with the percentage rounded to 2 decimals;
    prints ``nan%`` when both tables are empty instead of dividing by zero.
    """
    if dat1 is None:
        dat1 = X_train_overall
    if dat2 is None:
        dat2 = X_test_overall
    totaln = len(dat1) + len(dat2)
    n = sum(dat1[col]) + sum(dat2[col])
    if totaln == 0:
        # Empty subgroup: a percentage is undefined.
        print(col + ": " + str(n) + " (nan%)")
        return
    print(col + ": " + str(n) + " (" + str(round(n/totaln*100, 2)) + "%)")

# One-hot indicator columns reported as counts/percentages; print the pooled
# train+test count of each.
cat_cols = ['Subject_Gender_F', 'Subject_Gender_M', 'Subject_Ethnicity_African Descent', 'Subject_Ethnicity_Asian',
       'Subject_Ethnicity_Caucasian', 'Subject_Ethnicity_Latin American',
       'Subject_Ethnicity_Other', 'GHT_Borderline','GHT_Outside Normal Limits', 'GHT_Within Normal Limits']
for col in cat_cols:
    print_n_forcatvar(col)

In [None]:
# Same descriptive statistics as above, split by outcome class.
# Truthy target values select the mod-severe rows; the inverse selects mild-GS.
train_mask = pd.Series(y_train_overall).astype(bool)
test_mask = pd.Series(y_test_overall).astype(bool)

X_train_1, X_train_0 = X_train_overall[train_mask], X_train_overall[~train_mask]
X_test_1, X_test_0 = X_test_overall[test_mask], X_test_overall[~test_mask]

print("MOD-SEVERE")
for col in numeric_cols:
    print_mean_fornumeric(col, dat1=X_train_1, dat2=X_test_1)
for col in cat_cols:
    print_n_forcatvar(col, dat1=X_train_1, dat2=X_test_1)

print(" ")

print("MILD-GS")
for col in numeric_cols:
    print_mean_fornumeric(col, dat1=X_train_0, dat2=X_test_0)
for col in cat_cols:
    print_n_forcatvar(col, dat1=X_train_0, dat2=X_test_0)

In [None]:
# Descriptive statistics for GS-labelled eyes, split at MD = -6.
# The masks are built from wide_train / wide_test but applied to
# X_train_overall / X_test_overall, so this only works when both pairs of
# tables share the same row index — the author's assumption is noted below.
train_mask = (wide_train.Condition_Label=='GS') & (wide_train.MD<=-6)
test_mask = (wide_test.Condition_Label=='GS') & (wide_test.MD<=-6)

# since i removed all NAs, wide_train and X_train overall should match up
X_train_1 = X_train_overall[train_mask]
X_train_0 = X_train_overall[~train_mask]
X_test_1 = X_test_overall[test_mask]
X_test_0 = X_test_overall[~test_mask]

# Subgroup 1: chart label GS but MD at or below -6.
print("GS yet MD<=-6")
print("Number: %s" % str(len(X_train_1) + len(X_test_1)))
for col in numeric_cols:
    print_mean_fornumeric(col, dat1=X_train_1, dat2=X_test_1)
for col in cat_cols:
    print_n_forcatvar(col, dat1=X_train_1, dat2=X_test_1)

print(" ")

# Complement of subgroup 1 (everything else in the dataset).
print("NOT [GS yet MD<=-6] - not really useful info but i printed it anyways")
print("Number: %s" % str(len(X_train_0) + len(X_test_0)))
for col in numeric_cols:
    print_mean_fornumeric(col, dat1=X_train_0, dat2=X_test_0)
for col in cat_cols:
    print_n_forcatvar(col, dat1=X_train_0, dat2=X_test_0)

print(" ")

# Subgroup 2: chart label GS with MD above -6. The mask and X_*_1 / X_*_0
# variables are reused, overwriting the values from subgroup 1.
train_mask = (wide_train.Condition_Label=='GS') & (wide_train.MD>-6)
test_mask = (wide_test.Condition_Label=='GS') & (wide_test.MD>-6)

# since i removed all NAs, wide_train and X_train overall should match up
X_train_1 = X_train_overall[train_mask]
X_train_0 = X_train_overall[~train_mask]
X_test_1 = X_test_overall[test_mask]
X_test_0 = X_test_overall[~test_mask]

print("GS with MD> -6")
print("Number: %s" % str(len(X_train_1) + len(X_test_1)))
for col in numeric_cols:
    print_mean_fornumeric(col, dat1=X_train_1, dat2=X_test_1)
for col in cat_cols:
    print_n_forcatvar(col, dat1=X_train_1, dat2=X_test_1)


# PLOT multiple ROC curves
- current version of this code requires running above multiple times for each roc curve I want to plot, then saving them manually and adding to global dictionary before plotting combined ROC curve
- X_test and y_test have different #s for drop_NA bc drop_NA may drop diff # rows depending on which variables are included

In [None]:
#models = {} # model label name: (model, associated X_test, associated y_test)
# start with univarate models dict
#models ={**models, **models_univariate} ## don't overwrite models just in case already contains stuff

### No longer needed because variables are all named correctly now, and no longer renaming univariate labels
"""
# map model names
key_mapping = {
    #'Average_RNFL_Thickness(μm)': 'OCT',
    #'MD': 'VF',
    'ML Feature Selection (Elastic Net)': 'ML Elastic Net',
    'HVF': 'VF',
    'OCT+HVF': 'OCT+VF',
    'CDR+OCT+HVF': 'CDR+OCT+VF'
}
# Function to rename keys in a dictionary
def rename_keys(d, key_map):
    return {key_map.get(k, k): v for k, v in d.items()}
# Apply the renaming function to the dictionary
models = rename_keys(models, key_mapping)
"""

In [None]:
## how to combine 2 dictionaries
#all_models = {**models_univariate, **selected_models}

In [None]:
# select which models to plot
analysis = "urgent_glaucoma" # or "all_glaucoma"

#all_keys = ['Clinic Data', 'Demographics', CDR', 'OCT', 'HVF', 'OCT+HVF', 'CDR+OCT+HVF', 'All Four Domains', 'Domain Knowledge', 'All Features (Ridge)', 'ML Elastic Net'] # The keys you want

# Pick the model dictionary and the labels to plot for the chosen analysis.
# An unrecognised value fails loudly instead of silently reusing `models` /
# `wanted_keys` left over from a previous run of this cell.
if analysis == "all_glaucoma":
    models = models_glaucoma
    wanted_keys = ['Demographics', 'Exam', 'OCT', 'VF', 'OCT+VF', 'All Four Domains', 'ML Elastic Net'] # for models_glaucoma
elif analysis == "urgent_glaucoma":
    models = models_urgent
    wanted_keys = ['Demographics', 'Exam', 'OCT', 'Exam+OCT', 'All Three Domains', 'ML Elastic Net'] # for models_urgent
else:
    raise ValueError("analysis must be 'all_glaucoma' or 'urgent_glaucoma', got %r" % (analysis,))

# Keep the requested labels in plotting order; labels absent from `models` are skipped.
selected_models = {k: models[k] for k in wanted_keys if k in models}

In [None]:
# Labels that will actually be plotted (requested labels missing from `models` were skipped).
selected_models.keys()

In [None]:
figname = "Figure_1_chartlabel-04-21-2025-using_EyeAI_splits-urgentglaucoma-severeeye-nomild.png"

# One ROC curve per selected model, all on a single figure.
plt.figure(figsize=(9, 8))
for name in selected_models:
    # Each entry holds (model, associated X_test, associated y_test).
    m, xt, yt = selected_models[name]
    print (name)
    fpr, tpr, auc, optimal_idx, optimal_threshold = EA.compute_performance_youden(m, xt, yt, plot=False)
    #plt.plot(fpr, tpr, label="%s (AUC=%.3f, Youden's=%.3f)" % (name, auc, (tpr[optimal_idx] - fpr[optimal_idx])))
    plt.plot(fpr, tpr, label="%s (AUC=%.2f)" % (name, auc))
    #plt.scatter(fpr[optimal_idx], tpr[optimal_idx], marker='o', color='red')
    print ("")

# (The commented-out per-model copies of the loop body for "All Four Domains"
# and "ML Elastic Net" were removed; the loop above handles every selected model.)

# Chance diagonal, labels, legend.
plt.plot([0, 1], [0, 1], linestyle='--', color='black')
plt.title('ROC Curves')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right', fontsize=14)

# Save in the same cell that draws the figure (see the note in the next section).
plt.savefig("/home/yukim3003/" + figname, format="png", dpi=300)
#plt.show()

## Save a high quality plot - nvm this has to be in same cell as original plot creation to work

In [None]:
# NOTE(review): `configuration_records` is not defined anywhere in the visible
# notebook (it appears to come from an earlier version of the workflow), so this
# cell raises NameError on a fresh kernel until it is pointed at the current
# working directory — confirm the intended location.
fig_dir = configuration_records.working_dir/'Execution_Assets/Multimodal_Figures/'

# makedirs(..., exist_ok=True) creates missing parent folders and does not fail
# when the folder already exists, so the cell can be re-run safely.
os.makedirs(fig_dir, exist_ok=True)
fig_path = configuration_records.working_dir/'Execution_Assets/Multimodal_Figures/Figure_1.png'

# Save the plot with higher DPI
# (per the heading above, saving only works from the cell that created the plot)
plt.savefig(fig_path, dpi=300)

In [None]:
# Display where the figure was written.
fig_path

In [None]:
# workaround: write the current figure straight to the home directory.
# NOTE(review): per the heading above, savefig outside the plotting cell did
# not work for the author — verify the saved image is not blank.
plt.savefig("/home/yukim3003/Figure_1.png", format="png")

## Cache models

In [None]:
import pickle

# Cache each models dictionary to its own pickle file under asset_path.
# `cache_path` ends up pointing at the last file written (the urgent cache).
for cache_path, model_dict in (
    (asset_path + 'models_glaucoma_cache_04-21-2025.pkl', models_glaucoma),
    (asset_path + 'models_urgent_cache_04-21-2025.pkl', models_urgent),
):
    with open(cache_path, 'wb') as f:
        pickle.dump(model_dict, f)

In [None]:
# NOTE(review): this file name ('models_cache_04-12-2025.pkl') differs from the
# two files written by the caching cell above — confirm which cache is intended.
cache_path = asset_path + 'models_cache_04-12-2025.pkl'

# To load the cached models dictionary later
# (pickle.load can execute arbitrary code: only load files you created yourself)
with open(cache_path, 'rb') as f:
    cached_models = pickle.load(f)

## Access a specific saved model

In [None]:
# compute a specific model that is saved
# Each cache entry unpacks to (model, X_test, y_test); both performance reports are run on it.
name = "ML Elastic Net"
m, xt, yt = cached_models[name]
EA.compute_performance(m, xt, yt)
EA.compute_performance_youden(m, xt, yt, plot=True)

In [None]:
# Upload the execution's assets to the catalog.
# NOTE(review): `configuration_records` is not defined anywhere in the visible
# notebook; this call presumably needs the RID of the `execution` object
# created near the top — verify against the current API before running.
uploaded_assets = EA.execution_upload(configuration_records.execution_rid, False)

# OLD CODE NOT UPTODATE WITH CHANGES FOR ABOVE

# SKIP A-B-C BELOW -- I REMOVED ALL NAS EARLIER IN CODE NOW

## A) Simple imputation

In [None]:
# Imputation strategy forwarded to EA.simple_impute (see the options below).
strat = 'mean'
# NOTE: the following code imputes X_test based on the imputer fitted to X_train

"""
STRATEGIES
If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data.

If “median”, then replace missing values using the median along each column. Can only be used with numeric data.

If “most_frequent”, then replace missing using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned.

If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data.
"""

# simple imputation fitted to X_train, but also applied to X_test
# eye_ai.py: def simple_impute(self, X_train_keep_missing, X_test_keep_missing, strat = "mean"):
# `strat` was previously set but never passed, so changing it had no effect and
# the method's own default was always used.
X_train, X_test = EA.simple_impute(X_train_keep_missing, X_test_keep_missing, strat=strat)

# Targets are unchanged by imputation.
y_train = y_train_keep_missing
y_test = y_test_keep_missing

## B) Multiple imputations (10 imputations)

In [None]:
# good article on MCAR vs MAR vs MNAR and how to appropriately handle missing values in each case: https://datascience.stackexchange.com/questions/116622/what-should-you-do-with-nan-values

# return list of pandas dataframes, each containing 1 of 10 imputations
# eye_ai.py:     def mult_impute_missing(self, X, train_data=None):

In [None]:
# Multiple imputation of the training features (no train_data argument is passed here).
X_train_imputedsets = EA.mult_impute_missing(X_train_keep_missing) # list of 10 imputed X_trains

In [None]:
# Impute test data using model fit with training data, not with test data!
# (The test frame used to be passed as train_data, which contradicted this
# intent and leaked test-set information into the imputation.)
X_test_imputedsets = EA.mult_impute_missing(X_test_keep_missing, train_data=X_train_keep_missing)

In [None]:
# Imputation only touches the features, so the targets are used as-is.
y_train, y_test = y_train_keep_missing, y_test_keep_missing

## C) Drop NA
### DON'T use this with the univariate loop -- incorrectly drops rows (dropNA in univariate loop instead to drop only for the univariate variable in question)

In [None]:
# Complete-case alternative: discard every feature row that has any NaN ...
X_train, X_test = X_train_keep_missing.dropna(), X_test_keep_missing.dropna()
print(len(X_train), len(X_test), sep="\n")

# ... and keep only the targets whose index label survived in the features.
y_train = y_train_keep_missing.loc[y_train_keep_missing.index.isin(X_train.index)]
y_test = y_test_keep_missing.loc[y_test_keep_missing.index.isin(X_test.index)]

In [None]:
# transferred to eye_ai.py

In [None]:
"""
    def model_summary(self, model, X_train):
    def calc_stats(self, y_pred, y_test):
    def compute_performance(self, model, X_test, y_test):
    def compute_performance_youden(self, model, X_test, y_test, plot=True):
"""

# Multivariate Logistic Regression DROPNA or SIMPLEIMPUTER

In [None]:
# transferred to severity_analysis.py as methods univariate_analysis, multivariate_logreg, multivariate_ridge_elastic

# Multivariate Logistic Regression MULTIPLE IMPUTATIONS
### To check if what I did is best method: used mode of y_pred, and averaged prediction probabilities of each imputed model to determine AUC, and averaged p-values

### Normal logistic regression

In [None]:
# eye_ai.py: 
#     def compute_performance_mice(self, logreg_models, Xtest_finals, y_test):
#     def model_summary_mice(self, logreg_models, Xtrain_finals):

In [None]:
# how to do prediction after multiple imputation:
# https://github.com/amices/mice/issues/82
# https://stackoverflow.com/questions/68460923/how-to-do-the-prediction-after-multiple-imputation-with-mice-package

# LogisticRegression is not imported anywhere else in the visible notebook, so
# import it here to keep this cell runnable on a fresh kernel.
from sklearn.linear_model import LogisticRegression

# One fitted model and one pair of final design matrices per imputed dataset.
logreg_models = []
Xtrain_finals = []
Xtest_finals = []

# MUST DROP REFERENCE COLUMN FOR ONE-HOT-ENCODED VARIABLES
#chosen_ref_labels = ['GHT_Within Normal Limits', 'Subject_Gender_M', 'Subject_Ethnicity_Other']
chosen_ref_labels = ['GHT_Within Normal Limits','GHT_Borderline', 'Subject_Gender_M', 'Subject_Ethnicity_Other']
penalty=None#'l1', 'l2', 'elasticnet', or None
solver='saga' # 'lbfgs', 'saga' (only saga supports l1 and elasticnet)

# Fit one logistic regression per (train, test) imputation pair.
# NOTE: the loop rebinds the notebook-level X_train / X_test on every iteration.
for X_train, X_test in zip(X_train_imputedsets, X_test_imputedsets):
    # NORMAL LOGISTIC REGRESSION
    # Drop whichever reference columns are present in this imputed set.
    drop_cols = [x for x in X_train.columns if x in chosen_ref_labels]
    X_train_dropped = X_train.drop(columns=drop_cols)
    X_test_dropped = X_test.drop(columns=drop_cols)

    logreg = LogisticRegression(random_state=16, solver=solver, max_iter=1000, penalty=penalty)
    logreg.fit(X_train_dropped, y_train)
    logreg_models.append(logreg)

    Xtrain_finals.append(X_train_dropped)
    Xtest_finals.append(X_test_dropped)

# Pooled summary and performance across all imputed models.
EA.model_summary_mice(logreg_models, Xtrain_finals)
EA.compute_performance_mice(logreg_models, Xtest_finals, y_test)

# Alternative models

In [None]:
# don't have to onehotencode, but xgboost performs better if does
# keep dummy variables, don't drop ref label for decision trees

from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

# Features and labels are always derived from the *_keep_missing objects so the result does
# not depend on which earlier cell last rebound y_train / y_test.
drop_NA=True
if drop_NA:
    # Complete-case analysis: drop every row with a missing feature, then keep matching labels.
    x = X_train_keep_missing.dropna()
    x_t = X_test_keep_missing.dropna()

    y = y_train_keep_missing[y_train_keep_missing.index.isin(x.index)]
    y_t = y_test_keep_missing[y_test_keep_missing.index.isin(x_t.index)]
else:
    # Use the data unfiltered. Without this branch x / y were undefined (NameError below).
    # NOTE: estimators that do not accept NaN (e.g. SVC) will raise if missing values remain.
    x, x_t = X_train_keep_missing, X_test_keep_missing
    y, y_t = y_train_keep_missing, y_test_keep_missing

print(x.columns)

#model = BaggingClassifier(estimator=SVC(), n_estimators=10, random_state=0) # bagged SVC
#model=BaggingClassifier() # bagged decision trees (bc DecisionTree is default)
# probability=True enables predict_proba (slow); random_state makes the internally
# cross-validated probability estimates reproducible between runs.
model=SVC(probability=True, random_state=0)
clf = model.fit(x,y)

# define cross-validation evaluation procedure
k = 10
cv = RepeatedStratifiedKFold(n_splits=k, n_repeats=3, random_state=1)
# evaluate model (cross_val_score fits clones of `model`, so the fitted `clf` is left untouched)
scores = cross_val_score(model, x, y, scoring='roc_auc', cv=cv)
# summarize performance
# This AUC comes from k-fold CV on the TRAINING set, as opposed to the test-set AUC computed in
# compute_performance -- if it is clearly better than the test-set AUC, the model is probably overfit.
print('Mean AUC using %i-fold cross-validation: %.3f' % (k, mean(scores)))
print("")

# test performance
EA.compute_performance(clf, x_t, y_t)
EA.compute_performance_youden(clf, x_t, y_t)