<a href="https://colab.research.google.com/github/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/VGG19_Diagnosis_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multimodal Template

In [None]:
# import sys
# IN_COLAB = 'google.colab' in sys.modules

# if IN_COLAB:
#     !pip install deriva
#     !pip install bdbag
#     !pip install --upgrade --force pydantic
#     !pip install git+https://github.com/informatics-isi-edu/deriva-ml git+https://github.com/informatics-isi-edu/eye-ai-ml

In [None]:
repo_dir = "Repos"   # Set this to be where your github repos are located.
%load_ext autoreload
%autoreload 2

# Update the load path so python can find modules for the model
import sys
from pathlib import Path
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))

In [None]:
# Prerequisites

import json
import os
from eye_ai.eye_ai import EyeAI
import pandas as pd
from pathlib import Path, PurePath
import logging
# import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [None]:

from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
# Catalog coordinates reused by the cells below.
host = 'www.eye-ai.org'
catalog_id = "eye-ai" #@param

# Log in to the host through Globus unless this machine is already logged in.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    gnl.login(
        [host],
        no_local_server=True,
        no_browser=True,
        refresh_tokens=True,
        update_bdbag_keychain=True,
    )
    print("Login Successful")
else:
    print("You are already logged in.")

Connect to the Eye-AI catalog. Configure the local cache and working directories used to store data. Initialize Eye-AI for the pending execution based on the provided configuration record.

In [None]:
# Variables to configure the rest of the notebook.

# Both directories are passed to the EyeAI constructor when the catalog client is created.
cache_dir = '/data'        # Directory in which to cache materialized BDBags for datasets
working_dir = '/data'    # Directory in which to place output files for later upload.

# RID of the configuration record handed to EA.execution_init().
configuration_rid="2-C61G" # rid



In [None]:
EA = EyeAI(hostname = host, catalog_id = catalog_id, cache_dir= cache_dir, working_dir=working_dir)

In [None]:
# @title Initiate an Execution
configuration_records = EA.execution_init(configuration_rid=configuration_rid)
configuration_records.model_dump()

In [None]:
# View data
#
# Load each table of the first materialized dataset bag into its own DataFrame.
# The bag's data directory is resolved once instead of being rebuilt per read.
data_dir = configuration_records.bag_paths[0] / 'data'

subject = pd.read_csv(data_dir / 'Subject.csv')
observation = pd.read_csv(data_dir / 'Observation.csv')
clinic = pd.read_csv(data_dir / 'Clinical_Records.csv')
# association table between observation table and clinic record table
observation_clinic_asso = pd.read_csv(data_dir / 'Observation_Clinic_Asso.csv')
icd10 = pd.read_csv(data_dir / 'Clinic_ICD10.csv')
# association table between clinic record table and ICD10 code
icd10_asso = pd.read_csv(data_dir / 'Clinic_ICD_Asso.csv')

# Additional tables, left disabled:
# report = pd.read_csv(data_dir / 'Report.csv')
# RNFL_OCR = pd.read_csv(data_dir / 'RNFL_OCR.csv')
# HVF_OCR = pd.read_csv(data_dir / 'HVF_OCR.csv')

# A notebook cell renders only its final expression, so bare table names placed
# between the loads never produced any output. Show one compact size summary here;
# the individual tables are displayed by the cells that follow.
loaded_tables = {
    'subject': subject,
    'observation': observation,
    'clinic': clinic,
    'observation_clinic_asso': observation_clinic_asso,
    'icd10': icd10,
    'icd10_asso': icd10_asso,
}
pd.DataFrame(
    {
        table_name: {'rows': table.shape[0], 'columns': table.shape[1]}
        for table_name, table in loaded_tables.items()
    }
).T

In [None]:
subject


In [None]:
observation

In [None]:
# Join every observation to its subject: Subject.RID on the left against
# Observation.Subject on the right. Column names shared by both tables receive
# a per-table suffix.
subject_observation = subject.merge(
    observation,
    how='inner',
    left_on='RID',
    right_on='Subject',
    suffixes=('_subject_df', '_observation_df'),
)
subject_observation

In [None]:
# Attach the observation/clinical-record association rows to each observation.
subject_obs_clinic = subject_observation.merge(
    observation_clinic_asso,
    how='inner',
    left_on='RID_observation_df',
    right_on='Observation',
    suffixes=('_subject_observation_df', "_observation_clinic_asso_df"),
)

subject_obs_clinic

In [None]:
clinic

In [None]:
clinic['Condition_Label'].unique()

In [None]:
import numpy as np

# Create the first new column with condition names
# Keys are the Condition_Label values this notebook recognises; values are the
# short condition names that later filters compare against ('GS', 'POAG').
condition_map = {
    '2-C60J': 'GS',
    '2-C60M': 'POAG',
    '2-C60P': 'PACG'
}

# Series.map yields NaN for every Condition_Label that is not a key of condition_map.
clinic['Condition_Name'] = clinic['Condition_Label'].map(condition_map)

# Create the second new column with Glaucoma classification
def classify_glaucoma(condition):
    """Translate a short condition name into its coarse glaucoma class.

    'GS' gives 'Glaucoma suspects'; 'POAG' and 'PACG' give 'Glaucoma'; every
    other value, including a missing one, gives NaN.
    """
    if condition == 'GS':
        return 'Glaucoma suspects'
    if condition == 'POAG' or condition == 'PACG':
        return 'Glaucoma'
    return np.nan

clinic['Glaucoma_Classification'] = clinic['Condition_Name'].apply(classify_glaucoma)

clinic

In [None]:
clinic['Glaucoma_Classification'].unique()

In [None]:
# Bring in the clinical record referenced by each association row.
subject_obs_clinic_data = subject_obs_clinic.merge(
    clinic,
    how='inner',
    left_on='Clinical_Records',
    right_on='RID',
    suffixes=("_subject_obs_clinic_df", "_clinic_df"),
)

subject_obs_clinic_data

In [None]:
subject_obs_clinic_data['Glaucoma_Classification'].unique()

In [None]:
subject_obs_clinic_data['RID_subject_df'].nunique()

In [None]:

# Count the distinct RID_subject_df values within each Glaucoma_Classification.
# groupby drops NaN group keys by default, so unclassified rows are not counted.
unique_subject_counts = subject_obs_clinic_data.groupby('Glaucoma_Classification')['RID_subject_df'].nunique()

# Print the unique subject counts for each classification
print(unique_subject_counts)

In [None]:

# Same per-classification count, this time over distinct Subject_ID values.
# NOTE: this rebinds unique_subject_counts, replacing any earlier value of that name.
unique_subject_counts = subject_obs_clinic_data.groupby('Glaucoma_Classification')['Subject_ID'].nunique()

# Print the unique subject counts for each classification
print(unique_subject_counts)

# GS (Glaucoma suspects)

In [None]:
# All code for creating GS suspect entries

subject_obs_clinic_data_gs = subject_obs_clinic_data.copy(deep=True)

subject_obs_clinic_data_gs

In [None]:
# Keep only the GS rows whose IOP lies in the closed range [0, 21].
iop_in_range = subject_obs_clinic_data_gs['IOP'].between(0, 21)
is_gs = subject_obs_clinic_data_gs['Condition_Name'].eq('GS')
subject_obs_clinic_data_gs = subject_obs_clinic_data_gs[iop_in_range & is_gs]

subject_obs_clinic_data_gs

In [None]:
icd10_gs = icd10.copy(deep=True)

icd10_gs

In [None]:
# Restrict the ICD-10 table to the three codes used for this cohort.
gs_icd10_codes = ['H40.003', 'H40.013', 'H40.023']
has_gs_code = icd10_gs['ICD10'].isin(gs_icd10_codes)
icd10_gs = icd10_gs[has_gs_code]

icd10_gs

In [None]:
icd10_asso_gs = icd10_asso.copy(deep=True)

icd10_asso_gs

In [None]:
# Pair each retained ICD-10 row with the association rows that reference it.
# A left join keeps ICD-10 rows that have no association row.
icd10_and_icd10_asso_merged_df = icd10_gs.merge(
    icd10_asso_gs,
    how='left',
    left_on='RID',
    right_on='ICD10_Eye',
    suffixes=("_icd10_gs_df", "_icd10_asso_gs_df"),
)

icd10_and_icd10_asso_merged_df

In [None]:
# Inner-join the filtered GS rows to the ICD-10 associations, matching
# RID_clinic_df on the left against Clinical_Records on the right.
# No suffixes are passed, so pandas applies its default '_x'/'_y' suffixes to
# column names present in both frames.
# NOTE(review): icd10_and_icd10_asso_merged_df is re-bound further down in this
# notebook; re-running this cell out of order would join against that later frame.
final_merged_df_gs = pd.merge(subject_obs_clinic_data_gs, icd10_and_icd10_asso_merged_df, 
                           left_on='RID_clinic_df', right_on='Clinical_Records', how='inner')

# Display the result
# print(final_merged_df)

final_merged_df_gs

In [None]:
final_merged_df_gs.Clinical_Records_y.nunique()

In [None]:
final_merged_df_gs.Subject_ID

In [None]:
final_merged_df_gs.to_csv("final_merged_df_gs.csv", index=False)

## Another way to test GS

In [None]:
import pandas as pd
import numpy as np

# Load data
# Re-reads every table from the bag. This rebinds `clinic`, so the Condition_Name
# and Glaucoma_Classification columns added to it earlier are no longer on it
# after this point.
subject = pd.read_csv(configuration_records.bag_paths[0]/'data/Subject.csv')
observation = pd.read_csv(configuration_records.bag_paths[0]/'data/Observation.csv')
clinic = pd.read_csv(configuration_records.bag_paths[0]/'data/Clinical_Records.csv')
observation_clinic_asso = pd.read_csv(configuration_records.bag_paths[0]/'data/Observation_Clinic_Asso.csv')
icd10 = pd.read_csv(configuration_records.bag_paths[0]/'data/Clinic_ICD10.csv')
icd10_asso = pd.read_csv(configuration_records.bag_paths[0]/'data/Clinic_ICD_Asso.csv')

# Step 1: Apply initial filters to clinical records
# '2-C60J' is the Condition_Label that condition_map translates to 'GS'; the IOP
# bounds are inclusive on both ends.
clinic_filtered = clinic[
    (clinic['IOP'] >= 0) & 
    (clinic['IOP'] <= 21) & 
    (clinic['Condition_Label'] == '2-C60J')
]
print(f"Number of records with IOP 0-21 and GS condition: {len(clinic_filtered)}")

# Step 2: Prepare ICD-10 data
# Keep the ICD-10 rows for the codes of interest, then the association rows whose
# ICD10_Eye value is the RID of one of those rows.
valid_icd10 = ['H40.003', 'H40.013', 'H40.023']
icd10_filtered = icd10[icd10['ICD10'].isin(valid_icd10)]
icd10_asso_filtered = icd10_asso[icd10_asso['ICD10_Eye'].isin(icd10_filtered['RID'])]

# Step 3: Merge clinical records with ICD-10 associations
# No suffixes are given, so column names shared by both frames come out with the
# pandas defaults '_x' (clinical record) and '_y' (association).
merged_df = pd.merge(clinic_filtered, icd10_asso_filtered, 
                     left_on='RID', right_on='Clinical_Records', 
                     how='inner')

# Step 4: Add ICD-10 code to merged dataframe
# With suffixes=('', '_icd10') a clashing column keeps its name on the left and
# gains '_icd10' on the right.
# NOTE(review): if Clinic_ICD_Asso.csv also has a RID column, Step 3 already renamed
# both RIDs to RID_x/RID_y, so the 'RID' present after this merge would be the
# ICD-10 row's RID rather than the clinical record's - confirm against the data.
merged_df = pd.merge(merged_df, icd10_filtered[['RID', 'ICD10']], 
                     left_on='ICD10_Eye', right_on='RID', 
                     suffixes=('', '_icd10'))

# Step 5: Group by Clinical_Records and aggregate ICD10 codes
# One output row per Clinical_Records value: the first RID / IOP / Condition_Label
# seen in the group, and the group's distinct ICD-10 codes sorted and comma-joined.
grouped_df = merged_df.groupby('Clinical_Records').agg({
    'RID': 'first',
    'IOP': 'first',
    'Condition_Label': 'first',
    'ICD10': lambda x: ','.join(sorted(set(x)))
}).reset_index()

print(f"Number of unique clinical records: {len(grouped_df)}")

# Function to filter by specific ICD-10 codes
def filter_by_icd10(df, codes):
    """Return the rows of ``df`` whose ICD10 cell lists at least one of ``codes``.

    Args:
        df: DataFrame with an 'ICD10' column of comma-separated code strings.
        codes: Iterable of ICD-10 code strings to look for. A match requires a
            whole comma-separated entry to equal a code, not a substring.

    Returns:
        The matching rows of ``df`` with all of its columns.
    """
    # Materialize once so a one-shot iterable is not exhausted after the first row.
    wanted = tuple(codes)
    matches = df['ICD10'].apply(
        lambda joined: any(code in joined.split(',') for code in wanted)
    )
    # On an empty frame apply() returns a non-boolean Series, which DataFrame
    # indexing would read as a list of column labels instead of a row mask.
    # Forcing bool keeps the result an empty frame with the original columns.
    return df[matches.astype(bool)]

# Individual ICD-10 code filters
for code in valid_icd10:
    filtered = filter_by_icd10(grouped_df, [code])
    print(f"Number of unique clinical records for {code}: {len(filtered)}")

# Pairwise reports: records carrying at least one code of each pair. The pairs
# are listed explicitly to keep the order and wording of the printed lines.
for first_code, second_code in [
    ('H40.003', 'H40.013'),
    ('H40.023', 'H40.013'),
    ('H40.003', 'H40.023'),
]:
    combined_filtered = filter_by_icd10(grouped_df, [first_code, second_code])
    print(f"Number of unique clinical records for {first_code} and {second_code} combined: {len(combined_filtered)}")

# All ICD-10 codes combined
all_filtered = filter_by_icd10(grouped_df, valid_icd10)
print(f"Total number of unique clinical records with any of the ICD-10 codes: {len(all_filtered)}")

# Display distribution of ICD-10 codes
print("\nDistribution of ICD-10 codes:")
icd10_distribution = grouped_df['ICD10'].apply(lambda x: x.split(',')).explode().value_counts()
print(icd10_distribution)

# Display a few rows to verify the result
print("\nSample of final dataframe:")
print(grouped_df.head())

print(f"\nTotal rows in final dataframe: {len(grouped_df)}")

# Optional: Save the final dataframe to a CSV file.
# The "saved" message used to be printed even though the save itself was
# commented out; it is now tied to the write so it cannot report a file that
# was never written. Set the flag to True to export.
save_grouped_df = False
if save_grouped_df:
    grouped_df.to_csv('final_clinical_icd10_data.csv', index=False)
    print("Final dataframe saved to 'final_clinical_icd10_data.csv'")

In [None]:
set(grouped_df.Clinical_Records) == set(final_merged_df_gs.Clinical_Records_y)

# Glaucoma

In [None]:
# All code for creating Glaucoma entries

subject_obs_clinic_data_g = subject_obs_clinic_data.copy(deep=True)

subject_obs_clinic_data_g


In [None]:

# Keep only the rows whose condition name is POAG.
is_poag = subject_obs_clinic_data_g['Condition_Name'].eq('POAG')
subject_obs_clinic_data_g = subject_obs_clinic_data_g[is_poag]

subject_obs_clinic_data_g

In [None]:


icd10_g = icd10.copy(deep=True)

icd10_g

In [None]:



# ICD-10 codes retained for the glaucoma cohort.
glaucoma_icd10_codes = [
    "H40.1130",
    "H40.1131",
    "H40.1132",
    "H40.1133",
    "H40.1134",
    "H40.1231",
    "H40.1232",
    "H40.1233",
    "H40.1234",
]
has_glaucoma_code = icd10_g['ICD10'].isin(glaucoma_icd10_codes)
icd10_g = icd10_g[has_glaucoma_code]

icd10_g

In [None]:



icd10_asso_g = icd10_asso.copy(deep=True)

icd10_asso_g

In [None]:



# Pair each retained ICD-10 row with the association rows that reference it.
# A left join keeps ICD-10 rows that have no association row.
icd10_and_icd10_asso_merged_df = icd10_g.merge(
    icd10_asso_g,
    how='left',
    left_on='RID',
    right_on='ICD10_Eye',
    suffixes=("_icd10_g_df", "_icd10_asso_g_df"),
)

icd10_and_icd10_asso_merged_df


In [None]:

# Inner-join the filtered POAG rows to the ICD-10 associations, matching
# RID_clinic_df on the left against Clinical_Records on the right.
# No suffixes are passed, so pandas applies its default '_x'/'_y' suffixes to
# column names present in both frames.
# NOTE(review): icd10_and_icd10_asso_merged_df is also assigned in the GS section
# of this notebook; this cell expects the value bound by the glaucoma merge.
final_merged_df_g = pd.merge(subject_obs_clinic_data_g, icd10_and_icd10_asso_merged_df, 
                           left_on='RID_clinic_df', right_on='Clinical_Records', how='inner')

# Display the result
# print(final_merged_df)

final_merged_df_g


In [None]:


final_merged_df_g.Clinical_Records_y.nunique()


In [None]:


final_merged_df_g.Subject_ID

In [None]:
final_merged_df_g.to_csv("final_merged_df_g.csv", index=False)

In [None]:
# # @title Execute Training algorithm
# from eye_ai.models.vgg19_hyper_parameter_tuning import main #import the new logistic module.
# with EA.execution(execution_rid=configuration_records.execution_rid) as exec:
#   main()


In [None]:
# # @title Save Execution Assets (model) and Metadata
# uploaded_assets = EA.execution_upload(configuration_records.execution_rid, False)

# 