<a href="https://colab.research.google.com/github/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/VGG19_Diagnosis_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multimodal Initial analyses

In [None]:
# import sys
# IN_COLAB = 'google.colab' in sys.modules

# if IN_COLAB:
#     !pip install deriva
#     !pip install bdbag
#     !pip install --upgrade --force pydantic
#     !pip install git+https://github.com/informatics-isi-edu/deriva-ml git+https://github.com/informatics-isi-edu/eye-ai-ml

In [None]:
repo_dir = "Repos"   # Set this to be where your github repos are located.
%load_ext autoreload
%autoreload 2

# Update the load path so python can find modules for the model
import sys
from pathlib import Path
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))

In [None]:
# Prerequisites

import json
import os
from eye_ai.eye_ai import EyeAI
import pandas as pd
from pathlib import Path, PurePath
import logging
# import torch

# INFO-level logging with timestamps; force=True replaces any handlers that are already installed.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    force=True,
)

In [None]:
# Show up to 100 rows and every column when a DataFrame is displayed.
pd.options.display.max_rows = 100
pd.options.display.max_columns = None

In [None]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
# import the class
from sklearn.linear_model import LogisticRegression
# Import label encoder 
from sklearn import preprocessing 

import numpy as np
import matplotlib.pyplot as plt

In [None]:

from deriva.core.utils.globus_auth_utils import GlobusNativeLogin

catalog_id = "eye-ai" #@param
host = 'www.eye-ai.org'

# Log in to the host unless a session already exists.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    gnl.login([host], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)
    print("Login Successful")
else:
    print("You are already logged in.")

Connect to the Eye-AI catalog. Configure the local cache and working directories used to store data. Initialize Eye-AI for a pending execution based on the provided configuration file.

In [None]:
# Variables to configure the rest of the notebook.

# cache_dir:   directory in which materialized BDBags for datasets are cached.
# working_dir: directory in which output files are placed for later upload.
# Both currently point at the same location.
cache_dir = working_dir = '/data'

# RID handed to EA.execution_init as the execution configuration (created by the notebook author).
configuration_rid = "2-CC3W"



In [None]:
# Eye-AI client for the catalog on `host`, configured with the cache and working directories set above.
EA = EyeAI(
    hostname=host,
    catalog_id=catalog_id,
    cache_dir=cache_dir,
    working_dir=working_dir,
)

In [None]:
# @title Initiate an Execution
# Initialise an execution from the configuration identified by configuration_rid.
# Later cells read `bag_paths` and `execution_rid` from the returned object.
configuration_records = EA.execution_init(configuration_rid=configuration_rid)
# Display the returned record via its model_dump() output.
configuration_records.model_dump()

# Generate multimodal wide table

In [None]:
# old method using local files -- NOT recommended
# multimodal_wide_path = "/data/yukim3003/EyeAI_working/Execution_Assets/Multimodal_Analysis/wide_multimodal_full.csv"
# multimodal_wide = pd.read_csv(multimodal_wide_path)
# multimodal_wide

In [None]:
# TRAIN bag: configuration_records.bag_paths[0]
train_bag_path = configuration_records.bag_paths[0]
wide_train_raw = EA.severity_analysis(train_bag_path)

In [None]:
# TEST bag: configuration_records.bag_paths[1]  (the TRAIN bag is bag_paths[0])
test_bag_path = configuration_records.bag_paths[1]
wide_test_raw = EA.severity_analysis(test_bag_path)

# Create a new table with only the more severe eye for each patient

In [None]:
# old method with bugs:
#1. if eye1 is GS and eye2 is NaN, then eye2 also becomes GS; and similarly if eye1 has CDR 0.9 but eye2 is NaN, then eye2 also gets CDR 0.9. Fixed this by adding skipna=False to first()
#2. if eye1 has an RNFL but eye2 RNFL is NaN, then this method will consider eye1 to be more severe, whereas it's better to move on to assessing MD in that case. Fixing this fixed 11 eyes

#def pick_severe_eye(df):    
#    # Sort by RNFL, HVF, CDR whereby first row is most severe
#    df = df.sort_values(by=['Average_RNFL_Thickness(μm)', 'MD', 'CDR'], ascending=[True, True, False])
#    
#    # Group by subject and get the first row in each group. If all tied, will just pick the first eye - ie the right eye
#    return df.groupby('RID_Subject').first(skipna=False).reset_index() # first computes the first entry of each column within each group, but NaN's dont count as a value; so if one eye has NaN for any random column, then the value for the other eye is transferred to that eye

#wide_train = pick_severe_eye(wide_train_raw)
#wide_test = pick_severe_eye(wide_test_raw)

# the row that made me realize the bug requiring skipna in first() # if I did want to apply the label of any eye to both eyes, this could be useful
# wide_train[wide_train['RID_Subject']=='2-7KVA']

In [None]:
# Current severity rule: prioritize RNFL > HVF (MD) > CDR.
# If you don't want thresholds, pass 0 for them.
def pick_severe_eye(df, rnfl_threshold, md_threshold):
    """Keep one row per subject: the eye judged to be the most severe.

    Eyes of the same ``RID_Subject`` are compared hierarchically:

    1. ``Average_RNFL_Thickness(μm)`` - lower is more severe. Eyes whose value
       lies within ``rnfl_threshold`` of the subject's minimum count as tied.
    2. ``MD`` - lower is more severe. Applied to the eyes still tied, using
       ``md_threshold`` in the same way.
    3. ``CDR`` - higher is more severe. Applied to the eyes still tied, with no
       threshold, so it always yields a row.

    A criterion that is NaN for every remaining eye is skipped and the next one
    is tried on the same eyes. If RNFL, MD and CDR are all NaN, the first eye in
    the sorted order is returned.

    NOTE(review): comparisons against NaN are False, so when only one eye has a
    value for a criterion that eye is selected outright (the other eye is not
    compared on the later criteria). Confirm this is the intended rule.

    Args:
        df: Wide table with one row per eye; must contain ``RID_Subject``,
            ``Average_RNFL_Thickness(μm)``, ``MD`` and ``CDR``.
        rnfl_threshold: Maximum RNFL difference for two eyes to count as tied.
        md_threshold: Maximum MD difference for two eyes to count as tied.

    Returns:
        DataFrame with the same columns as ``df``, one row per subject, ordered
        by ``RID_Subject`` and carrying a fresh 0..n-1 index.
    """
    rnfl_col = 'Average_RNFL_Thickness(μm)'

    # Most severe first: RNFL and MD ascending, CDR descending (NaN sorts last).
    # The fresh index gives every row a unique label to select it by later.
    ordered = df.sort_values(
        by=[rnfl_col, 'MD', 'CDR'], ascending=[True, True, False]
    ).reset_index(drop=True)

    def select_label(group):
        """Return the index label of the most severe row of one subject."""
        candidates = group
        for column, threshold in ((rnfl_col, rnfl_threshold), ('MD', md_threshold)):
            worst = candidates[column].min()  # min is more severe for RNFL and MD
            tied = candidates[np.abs(candidates[column] - worst) <= threshold]
            if len(tied) == 1:
                return tied.index[0]
            if len(tied) > 1:
                candidates = tied
            # len(tied) == 0: the column is NaN for every candidate, so it
            # cannot discriminate; keep the candidates and try the next one.

        # Still tied (or nothing comparable): highest CDR wins. The stable sort
        # keeps the incoming order for equal / all-NaN CDR values.
        return candidates.sort_values(by='CDR', ascending=False, kind='stable').index[0]

    # Select rows by label rather than via groupby.apply: this keeps the
    # RID_Subject column and the original column dtypes regardless of how the
    # installed pandas treats grouping columns inside apply().
    picked = [select_label(group) for _, group in ordered.groupby('RID_Subject')]
    return ordered.loc[picked].reset_index(drop=True)

In [None]:
# Baseline selection with both thresholds set to 0 (no tolerance when comparing eyes).
wide_train_nothresh, wide_test_nothresh = (
    pick_severe_eye(raw_table, 0, 0) for raw_table in (wide_train_raw, wide_test_raw)
)

In [None]:
# Tolerances within which two eyes count as tied on RNFL thickness and on MD.
rnfl_thresh, md_thresh = 4, 2
wide_train, wide_test = (
    pick_severe_eye(raw_table, rnfl_thresh, md_thresh)
    for raw_table in (wide_train_raw, wide_test_raw)
)

In [None]:
# Show which subjects changed eyes by adding thresholds.
# keep_equal=True keeps equal values as they are (keep_equal=False would show them as NaN);
# align_axis=0 stacks the two versions of every row on top of each other.
stacked_selection = wide_train.compare(
    wide_train_nothresh,
    align_axis=0,
    keep_shape=True,
    keep_equal=True,
)
# Rows that have a duplicate are dropped entirely, leaving only the rows that differ.
diff_values = stacked_selection.drop_duplicates(keep=False)
print(f"# subjects where eye choice changed: {len(diff_values) // 2}")
diff_values[['RID_Subject', 'Side', 'Label', 'Average_RNFL_Thickness(μm)', 'MD', 'CDR']]

# Transform Train and Test Data and Handle Missing Values

In [None]:
# Candidate feature groups; fx_cols at the bottom selects the ones given to the model.
demographic_fx = ['Gender', 'Ethnicity']
# Not included: 'Gonioscopy' (mostly NaN, annotation not standardized) and CCT (mostly NaN)
clinic_fx = ['LogMAR_VA', 'IOP', 'CDR']
# Not included: 'PSD' (mostly NaN). The PSD and PSD.1 columns should probably be merged before using it.
HVF_fx = ['MD', 'VFI']
# Average_C/D_Ratio would be the RNFL-derived CDR
RNFL_fx = ['Average_RNFL_Thickness(μm)']
# One column per clock hour (1-12), in case each clock hour is wanted as a feature
RNFL_clockhr_fx = [f'Clock_Hours_{hour}' for hour in range(1, 13)]
RNFL_quad_fx = [f'Quadrants_{quadrant}' for quadrant in 'SNTI']

# selected feature cols from above
fx_cols = [*demographic_fx, *clinic_fx, *HVF_fx, *RNFL_fx]

In [None]:
# Method to transform data, to apply to wide_train and wide_test
def transform_data(multimodal_wide):
    """Build the model inputs ``(X, y)`` from a one-row-per-subject wide table.

    Steps:
      1. Keep only rows labelled PACG, POAG or GS (missing and other labels are dropped).
      2. Select the feature columns listed in the notebook-level ``fx_cols``.
      3. One-hot encode ``Gender`` and ``Ethnicity``; each group of encoded
         columns is moved to the end of the frame in alphabetical order.
      4. Convert ``VFI`` from percent text (e.g. ``"95%"``) to a fraction;
         ``"Off"`` and missing values become NaN.
      5. Encode the target as 0 = GS, 1 = Glaucoma (POAG and PACG combined).

    The one-hot encoder is fitted on the frame passed in, so the encoded columns
    depend on the categories present in that frame. Make sure train and test
    frames end up with the same columns before fitting / predicting.

    Args:
        multimodal_wide: DataFrame containing ``Label`` and every column in ``fx_cols``.

    Returns:
        Tuple ``(X_transformed, y)``: the feature DataFrame (missing values are
        kept as NaN) and an integer Series of labels on the same index. The
        input frame is not modified.
    """
    # feature_engine's encoder is used instead of scikit-learn's because it can
    # one-hot encode only the desired columns of a DataFrame.
    from feature_engine.encoding import OneHotEncoder

    # Explicit target coding. It matches what an alphabetical LabelEncoder gives
    # when both classes are present ('GS' sorts before 'Glaucoma'), but stays the
    # same when a frame happens to contain only one class.
    label_codes = {'GS': 0, 'POAG': 1, 'PACG': 1}

    # isin() is False for NaN, so rows without a label are dropped together with
    # rows whose label is anything other than PACG / POAG / GS.
    labelled = multimodal_wide[multimodal_wide['Label'].isin(list(label_codes))]

    X = labelled[fx_cols].copy()  # Features (copied so the caller's frame is never touched)
    y = labelled['Label'].map(label_codes).astype(int)  # Target variable

    ### categorical data
    # one-hot because the categories are not ranked; note this increases the
    # dimensionality of the data, which is bad if >1/3rd of the features are one-hot
    # https://datascience.stackexchange.com/questions/9443/when-to-use-one-hot-encoding-vs-labelencoder-vs-dictvectorizor
    categorical_vars = ['Gender', 'Ethnicity']
    encoder = OneHotEncoder(variables=categorical_vars)
    X_transformed = encoder.fit_transform(X)

    ### sort categorical encoded columns so that they're in alphabetical order
    for var in categorical_vars:
        encoded_cols = sorted(col for col in X_transformed.columns if col.startswith(var))
        other_cols = [col for col in X_transformed.columns if col not in encoded_cols]
        X_transformed = X_transformed[other_cols + encoded_cols]

    ### format numerical data
    def convert_percent(value):
        """Convert a VFI entry such as '95%' to 0.95; 'Off' and missing values give NaN."""
        if pd.isnull(value):
            return np.nan
        text = str(value).strip()
        if text == 'Off':
            return np.nan
        return float(text.rstrip('%')) / 100

    # assign() returns a new frame, which avoids writing into a column subset.
    X_transformed = X_transformed.assign(VFI=X_transformed['VFI'].map(convert_percent))

    return X_transformed, y

In [None]:
# Training features (missing values not yet imputed) and integer labels.
X_train_keep_missing, y_train = transform_data(wide_train)

In [None]:
def handle_missing(X_transformed, fit_on=None):
    """Return a copy of ``X_transformed`` with NaNs replaced by column means.

    Temporary simple (mean) imputation.

    Args:
        X_transformed: Numeric feature DataFrame that may contain NaN.
        fit_on: Optional DataFrame with the same columns from which the column
            means are learned, e.g. the training features when imputing the
            test set. Defaults to ``X_transformed`` itself.

    Returns:
        A new DataFrame with the same index and columns as ``X_transformed``.
        The input frame is left untouched.

    Raises:
        ValueError: If a column of the frame used for fitting has no observed
            value, since no mean can be computed for it.
    """
    ### Handle missing values
    # Xu:
    # - In the past, we’ve used multiple imputation as long as the % of missing values was less than 10% for any given variable. I attached a paper we wrote where we used this technique.
    # - Balancing can be done by upsampling the minority class, although in this case the two are fairly similar in number.
    # https://scikit-learn.org/stable/modules/impute.html
    from sklearn.impute import SimpleImputer

    reference = X_transformed if fit_on is None else fit_on

    # Fail with a clear message: the imputer would otherwise return fewer columns
    # than it was given and the result could not be mapped back to column names.
    empty_cols = [col for col in reference.columns if reference[col].isna().all()]
    if empty_cols:
        raise ValueError(
            "Cannot mean-impute columns without any observed value: %s" % empty_cols
        )

    imputer = SimpleImputer(missing_values=np.nan, strategy='mean').fit(reference)
    return pd.DataFrame(
        imputer.transform(X_transformed),
        columns=X_transformed.columns,
        index=X_transformed.index,
    )

In [None]:
# Pass a copy so that X_train_keep_missing is guaranteed to keep its NaNs,
# regardless of how handle_missing treats the frame it receives.
X_train = handle_missing(X_train_keep_missing.copy())

In [None]:
# repeat transform and handle missing for test
X_test_keep_missing, y_test = transform_data(wide_test)

# The one-hot columns depend on the categories present in each split. Align the
# test frame to the training columns so the model sees the same features in the
# same order: a category absent from test becomes an all-zero column, and a
# category unseen in training is dropped.
X_test_keep_missing = X_test_keep_missing.reindex(columns=X_train.columns, fill_value=0)

# Pass a copy so that X_test_keep_missing is guaranteed to keep its NaNs.
# NOTE(review): missing test values are filled from statistics of the test set
# itself; consider imputing with the training statistics instead.
X_test = handle_missing(X_test_keep_missing.copy())

# Logistic Regression

In [None]:
# Logistic regression with the lbfgs solver, a fixed random_state and max_iter=1000
# (every other hyper-parameter is left at its default).
logreg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=16)

# train on the training split
logreg.fit(X_train, y_train)

# predicted class for every row of the test split
y_pred = logreg.predict(X_test)

In [None]:
# model performance
# https://medium.com/javarevisited/evaluating-the-logistic-regression-ae2decf42d61
from sklearn import metrics


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


print('Training set count: %i' % len(X_train))
print('Test set count: %i' % len(X_test))

# evaluate predictions
mae = metrics.mean_absolute_error(y_test, y_pred)
print('MAE: %.3f' % mae)

# Class distribution of the testing set. Because y_test only contains ones and
# zeros, its mean is the fraction of ones.
print('Test set class counts:')
print(y_test.value_counts())
print('Fraction of ones: %.3f' % y_test.mean())
print('Fraction of zeros: %.3f' % (1 - y_test.mean()))

# # Metrics computed from a confusion matrix (before thresholding)

# The confusion matrix is used to evaluate the correctness of a classification
# model. labels=[0, 1] pins the 2x2 layout even if a class is missing from
# y_test / y_pred. It is stored as `cm` so the sklearn function name is not
# overwritten by its own result.
cm = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
print('Confusion matrix (rows = actual, columns = predicted):')
print(cm)

TP = cm[1, 1]
TN = cm[0, 0]
FP = cm[0, 1]
FN = cm[1, 0]

# Classification Accuracy: Overall, how often is the classifier correct?
print('Accuracy: %.3f' % metrics.accuracy_score(y_test, y_pred))

# Sensitivity (recall): When the actual value is positive, how often is the prediction correct?
sensitivity = _ratio(TP, FN + TP)
print('Sensitivity: %.3f' % sensitivity)

# Specificity: When the actual value is negative, how often is the prediction correct?
specificity = _ratio(TN, TN + FP)
print('Specificity: %.3f' % specificity)

# False Positive Rate (= 1 - specificity): When the actual value is negative, how often is the prediction incorrect?
false_positive_rate = _ratio(FP, TN + FP)
print('FPR: %.3f' % false_positive_rate)

# Precision: When a positive value is predicted, how often is the prediction correct?
precision = metrics.precision_score(y_test, y_pred)
print('Precision: %.3f' % precision)

# F score
f_score = metrics.f1_score(y_test, y_pred)
print('F1 score: %.3f' % f_score)

# Evaluate the model using other performance metrics
print(metrics.classification_report(y_test, y_pred))

# Labels follow the target encoding used for y: 0 = GS, 1 = Glaucoma.
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['GS', 'Glaucoma'])
cm_display.plot()
plt.show()

In [None]:
# Imported here as well so this cell does not depend on an alias created by another cell.
from sklearn import metrics

# Predicted probability of the positive class (second column of predict_proba).
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label="data 1, auc="+str(auc))
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.set_title('ROC curve (test set)')
ax.legend(loc=4)  # 4 = lower right
plt.show()

# template stuff I haven't deleted

In [None]:
# View data

# subject = pd.read_csv(configuration_records.bag_paths[0]/'data/Subject.csv')
# subject

# observation = pd.read_csv(configuration_records.bag_paths[0]/'data/Observation.csv')
# observation

# clinic = pd.read_csv(configuration_records.bag_paths[0]/'data/Clinical_Records.csv')
# clinic

# observation_clinic_asso = pd.read_csv(configuration_records.bag_paths[0]/'data/Observation_Clinic_Asso.csv')
# observation_clinic_asso # association table between observation table and clinic record table

# icd10 = pd.read_csv(configuration_records.bag_paths[0]/'data/Clinic_ICD10.csv')
# icd10

# icd10_asso = pd.read_csv(configuration_records.bag_paths[0]/'data/Clinic_ICD_Asso.csv')
# icd10_asso # association table between clinic record table and ICD10 code

# report = pd.read_csv(configuration_records.bag_paths[0]/'data/Report.csv')
# report

# RNFL_OCR = pd.read_csv(configuration_records.bag_paths[0]/'data/RNFL_OCR.csv')
# RNFL_OCR

# Load the HVF OCR table from the first (training) bag and display it.
hvf_ocr_csv = configuration_records.bag_paths[0] / 'data/HVF_OCR.csv'
HVF_OCR = pd.read_csv(hvf_ocr_csv)
HVF_OCR


In [None]:
# @title Execute Training algorithm
# NOTE(review): this imports the VGG19 hyper-parameter tuning entry point, while the
# original comment refers to "the new logistic module" - confirm which module is
# meant to run in this notebook.
from eye_ai.models.vgg19_hyper_parameter_tuning import main

# Run main() inside the execution context of the execution initialised earlier.
# The context object is bound to `execution` rather than `exec` so the builtin
# exec() is not shadowed for the rest of the session.
with EA.execution(execution_rid=configuration_records.execution_rid) as execution:
    main()


In [None]:
# @title Save Execution Assets (model) and Metadata
# Upload the outputs of the execution identified by execution_rid and keep the result.
# NOTE(review): the meaning of the positional `False` is not visible in this
# notebook - check EyeAI.execution_upload and consider passing it by keyword.
uploaded_assets = EA.execution_upload(configuration_records.execution_rid, False)

# 