<a href="https://colab.research.google.com/github/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/VGG19_Diagnosis_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multimodal analyses: linear regression predicting MD

In [None]:
# import sys
# IN_COLAB = 'google.colab' in sys.modules

# if IN_COLAB:
#     !pip install deriva
#     !pip install bdbag
#     !pip install --upgrade --force pydantic
#     !pip install git+https://github.com/informatics-isi-edu/deriva-ml git+https://github.com/informatics-isi-edu/eye-ai-ml

In [None]:
# Folder (relative to the home directory) that holds the local git checkouts.
repo_dir = "Repos"   # Set this to be where your github repos are located.
# autoreload mode 2: re-import edited modules before each cell runs, so local
# changes to the eye-ai-ml checkout are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

# Update the load path so python can find modules for the model.
# Inserting at index 0 gives ~/<repo_dir>/eye-ai-ml priority over any other
# copy of the package that is already on sys.path.
import sys
from pathlib import Path
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))

In [None]:
# Prerequisites

import json
import os
from eye_ai.eye_ai import EyeAI
import pandas as pd
from pathlib import Path, PurePath
import logging
# import torch

# Log INFO and above with a timestamp/level prefix. force=True replaces any
# handlers already attached to the root logger, so this format still applies
# when the cell is re-run or another library configured logging first.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [None]:
# Show up to 100 rows and never truncate columns when a DataFrame is displayed.
pd.options.display.max_rows = 100
pd.options.display.max_columns = None

In [None]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
# import the class
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
# Import label encoder 
from sklearn import preprocessing 
from sklearn import metrics

import numpy as np
import matplotlib.pyplot as plt

In [None]:

from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
catalog_id = "eye-ai" #@param
host = 'www.eye-ai.org'

# Authenticate against the Eye-AI host, reusing an existing session when there is one.
gnl = GlobusNativeLogin(host=host)
if not gnl.is_logged_in([host]):
    # Not logged in yet: run the login flow with the options below.
    gnl.login([host], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)
    print("Login Successful")
else:
    print("You are already logged in.")

Connect to the Eye-AI catalog. Configure the local cache and working directories used to store data. Initialize Eye-AI for a pending execution based on the provided configuration file.

In [None]:
# Variables to configure the rest of the notebook.

cache_dir = '/data'        # Directory in which to cache materialized BDBags for datasets
working_dir = '/data'    # Directory in which to place output files for later upload.

# RID of the execution configuration record handed to EA.initialize_execution();
# per the author's note it carries the minids of both the train and test sets.
configuration_rid= "2-CCD4" # rid I created with my config containing minid for both train and test sets


In [None]:
# Client object for the Eye-AI catalog on `host`; every EA.* helper below goes through it.
# NOTE(review): cache_dir and working_dir are passed as None, so the `cache_dir` /
# `working_dir` variables set in the configuration cell are never used and the
# library presumably falls back to its own defaults - confirm this is intended.
EA = EyeAI(hostname = host, catalog_id = catalog_id, cache_dir= None, working_dir=None)

In [None]:
# @title Initiate an Execution
# Start an execution from the configuration record. The returned object is used
# further down for its bag_paths, working_dir and execution_rid attributes.
configuration_records = EA.initialize_execution(configuration_rid=configuration_rid)
# Display the resolved configuration (output of model_dump()) as the cell result.
configuration_records.model_dump()

# Generate multimodal wide table

In [None]:
# TRAIN: configuration_records.bag_paths[0]
# Build the raw wide multimodal table for the training set from the first bag
# path of the execution configuration.
wide_train_raw = EA.severity_analysis(configuration_records.bag_paths[0])

In [None]:
# TEST: configuration_records.bag_paths[1]
# Build the raw wide multimodal table for the test set from the second bag path
# of the execution configuration.
wide_test_raw = EA.severity_analysis(configuration_records.bag_paths[1])

In [None]:
# Attach each subject's age to the wide tables.
# The age CSV identifies subjects by 'RID'; the column is renamed to
# 'RID_Subject' so it matches the join key used below. Left joins keep every
# wide-table row even when no age is on file for that subject.
# NOTE: re-running this cell without rebuilding wide_*_raw merges the age
# columns a second time.
age_path = "/data/yukim3003/EyeAI_working/Execution_Assets/Multimodal_Analysis/multimodal_subject_age.csv"
age_df = pd.read_csv(age_path).rename(columns={'RID': 'RID_Subject'})
wide_train_raw = wide_train_raw.merge(age_df, how='left', on='RID_Subject')
wide_test_raw = wide_test_raw.merge(age_df, how='left', on='RID_Subject')

# Create new table with only more severe eye for each patient

In [None]:
# eye_ai.py:     def pick_severe_eye(self, df, rnfl_threshold, md_threshold):

In [None]:
# Baseline selection with both thresholds at 0 (argument order per eye_ai.py:
# df, rnfl_threshold, md_threshold). Kept under separate names so the effect of
# the thresholds chosen in the next cell can be compared against it.
wide_train_nothresh = EA.pick_severe_eye(wide_train_raw, 0, 0)
wide_test_nothresh = EA.pick_severe_eye(wide_test_raw, 0, 0)

In [None]:
# Thresholds used for the tables that feed the rest of the notebook.
# With both set to 0 these calls receive the same arguments as the
# *_nothresh baseline above; change the values here to study their effect.
rnfl_thresh = 0
md_thresh = 0
wide_train = EA.pick_severe_eye(wide_train_raw, rnfl_thresh, md_thresh)
wide_test = EA.pick_severe_eye(wide_test_raw, rnfl_thresh, md_thresh)

In [None]:
# Show which subjects changed eyes by adding thresholds
# compare(..., align_axis=0) stacks a 'self' row (wide_train) and an 'other' row
# (wide_train_nothresh) for every original row; keep_shape/keep_equal keep all
# rows and their actual values instead of blanking the equal ones.
diff_values = wide_train.compare(wide_train_nothresh, align_axis=0, keep_shape=True, keep_equal=True) #keep_equal=False --> values that are equal are represented as NaN
# A self/other pair with identical values is a pair of duplicate rows; keep=False
# removes both, leaving only the pairs that differ.
diff_values = diff_values.drop_duplicates(keep=False) # drop rows that have a duplicate
# Two remaining rows (self + other) per changed subject, hence the division by 2.
print("# subjects where eye choice changed: %i" % (len(diff_values)/2))
diff_values[['RID_Subject', 'Side', 'Label', 'Average_RNFL_Thickness(μm)', 'MD', 'CDR']]

# Choose Features

In [None]:
# Feature groups (wide-table column names) and the target variable.
demographic_fx = ['Gender', 'Ethnicity', 'Age']
# 'Gonioscopy' - mostly NaN, not standardized annotation; CCT - mostly NaN
clinic_fx = ['LogMAR_VA', 'IOP']
CDR_fx = ['CDR']
# Average_C/D_Ratio - for RNFL-derived CDR
RNFL_fx = ['Average_RNFL_Thickness(μm)']
# One column per clock hour (1..12), if I want to use each clock hour
RNFL_clockhr_fx = ['Clock_Hours_%d' % hour for hour in range(1, 13)]
RNFL_quad_fx = ['Quadrants_' + quadrant for quadrant in 'SNTI']
RNFL_IS_fx = ['Quadrants_' + quadrant for quadrant in 'SI']
# 'PSD' - mostly NaN. I think PSD and PSD.1 columns should be merged to use this column if desired
HVF_fx = ['MD', 'VFI']

# All Project Fx
fx_cols = [
    *demographic_fx,
    *clinic_fx,
    *CDR_fx,
    *RNFL_fx,
    *RNFL_IS_fx,
    *HVF_fx,
]

# Transform Train and Test Data

### Function

In [None]:
# transferred to eye_ai.py
# def transform_data(multimodal_wide, fx_cols, y_method="all_glaucoma" or "urgent_glaucoma"):
# Returns: X_transformed, y

### run transform

In [None]:
# Build the model matrices. The labels returned alongside X are kept (not
# discarded) because the "Counts / data info" cells report the class balance
# from y_train_keep_missing / y_test_keep_missing.
X_train_keep_missing, y_train_keep_missing = EA.transform_data(wide_train, fx_cols)
X_test_keep_missing, y_test_keep_missing = EA.transform_data(wide_test, fx_cols)

# Drop rows missing MD (the regression target). The same boolean mask is applied
# to the labels so X and y stay row-aligned.
# NOTE(review): assumes transform_data returns X and y with one entry per row,
# in the same order - confirm against eye_ai.py.
train_has_md = X_train_keep_missing['MD'].notna().to_numpy()
test_has_md = X_test_keep_missing['MD'].notna().to_numpy()
X_train_keep_missing = X_train_keep_missing[train_has_md].copy()
y_train_keep_missing = y_train_keep_missing[train_has_md]
X_test_keep_missing = X_test_keep_missing[test_has_md].copy()
y_test_keep_missing = y_test_keep_missing[test_has_md]

print(len(X_train_keep_missing))
print(len(X_test_keep_missing))

# Counts / data info

In [None]:
# Total number of rows (train + test) that still have an MD value.
len(X_train_keep_missing) + len(X_test_keep_missing)

In [None]:
# Class balance of the labels, reported separately for each split.
for split_name, split_labels in (("TRAIN", y_train_keep_missing), ("TEST", y_test_keep_missing)):
    # np.unique(..., return_counts=True) -> (distinct labels, occurrences of each)
    counts = np.unique(split_labels, return_counts=True)
    print(counts) # #GS vs #Glaucoma
    # Share of each label within the split.
    print("Percent mild-GS vs mod-severe in %s:" % split_name, counts[1] / sum(counts[1])) # percent



In [None]:
# Demographics over train and test together. Each column is pooled first so
# every statistic is computed once over all subjects that were kept.
counts = pd.concat([X_train_keep_missing['Gender_M'], X_test_keep_missing['Gender_M']]).sum()
print("Num male:", counts)
counts = pd.concat([X_train_keep_missing['Gender_F'], X_test_keep_missing['Gender_F']]).sum()
print("Num female:", counts)

# Series.mean() leaves missing ages out of BOTH the numerator and the
# denominator. A sum divided by the total row count would treat every missing
# age as 0 while still counting the row, which biases the mean low.
mean_age = pd.concat([X_train_keep_missing['Age'], X_test_keep_missing['Age']]).mean()
print("Mean age:", mean_age)

In [None]:
# Missing-data summary per split.
# The number of rows with a NaN in any column grows as more features are chosen.
for split_name, split_frame in (("train", X_train_keep_missing), ("test", X_test_keep_missing)):
    # Rows that have at least one NaN across the selected columns.
    num_rows_with_nan = split_frame.isnull().any(axis=1).sum()
    print("Number of %s rows with any nan: %i" % (split_name, num_rows_with_nan))
    # Same figure as a percentage of the rows in this split.
    print("Percent of %s rows with any nan: %f" % (split_name, (num_rows_with_nan / len(split_frame)) * 100))

# Standardize Data
(NOT REQUIRED FOR LINEAR REGRESSION but makes coefficients easier to interpret)

In [None]:
### Standardize the numeric features so they share one scale instead of wildly different scales.
# Not required for plain (unregularized) regression, but needed for regularized regression.
# Kept out of transform_data so that the scaler fitted on train can be reused for test.

# how? https://datascience.stackexchange.com/questions/54908/data-normalization-before-or-after-train-test-split
# why? https://stackoverflow.com/questions/52670012/convergencewarning-liblinear-failed-to-converge-increase-the-number-of-iterati

# eye_ai.py: def standardize_data(self, fx_cols, X_train, X_test):

# NOTE(review): fx_cols contains the target 'MD' (via HVF_fx), so MD is presumably
# standardized as well; the errors printed by run_linreg would then be in
# standardized units rather than dB - confirm this is intended.
X_train_keep_missing, X_test_keep_missing = EA.standardize_data(fx_cols, X_train_keep_missing, X_test_keep_missing)


# A) Simple imputation

In [None]:
# Imputation strategy, forwarded to EA.simple_impute below.
#
# STRATEGIES
#   "mean"          - replace missing values using the mean along each column.
#                     Can only be used with numeric data.
#   "median"        - replace missing values using the median along each column.
#                     Can only be used with numeric data.
#   "most_frequent" - replace missing using the most frequent value along each
#                     column. Can be used with strings or numeric data. If there
#                     is more than one such value, only the smallest is returned.
#   "constant"      - replace missing values with fill_value. Can be used with
#                     strings or numeric data.
strat = 'mean'

# Simple imputation is fitted to X_train and then also applied to X_test.
# eye_ai.py: def simple_impute(self, X_train_keep_missing, X_test_keep_missing, strat = "mean"):
# `strat` is passed explicitly so that editing the setting above takes effect
# (previously the call always used the function's default).
X_train, X_test = EA.simple_impute(X_train_keep_missing, X_test_keep_missing, strat=strat)

# B) Multiple imputations (10 imputations)

In [None]:
# good article on MCAR vs MAR vs MNAR and how to appropriately handle missing values in each case: https://datascience.stackexchange.com/questions/116622/what-should-you-do-with-nan-values

# return list of pandas dataframes, each containing 1 of 10 imputations
# eye_ai.py:     def mult_impute_missing(self, X, train_data=None):

In [None]:
# Multiple imputation of the training set; no train_data is given, so the
# imputation is driven by the training rows themselves.
X_train_imputedsets = EA.mult_impute_missing(X_train_keep_missing) # list of 10 imputed X_trains

In [None]:
# Impute the test set with the imputation model fitted on the TRAINING data
# (train_data=X_train_keep_missing), never on the test data itself, so that no
# test-set information leaks into the imputer.
X_test_imputedsets = EA.mult_impute_missing(X_test_keep_missing, train_data=X_train_keep_missing)

# C) Drop NA
## Don't drop until later, once the specific variables to run have been chosen, so that rows are not dropped because of NaNs in columns that are not used

In [None]:
# Option C: use the un-imputed frames; run_linreg drops rows with NaN in the
# features it is given.
# NOTE: this rebinds X_train / X_test, so when the notebook is run top to bottom
# it replaces the simple-imputed frames produced in section A.
X_train = X_train_keep_missing
X_test = X_test_keep_missing

# Model methods

In [None]:
# transferred to eye_ai.py

# Multivariate Linear Regression DROPNA or SIMPLEIMPUTER

## NORMAL LINEAR REGRESSION

In [None]:
def run_linreg(chosen_fx, X_train, X_test):
    """Fit an ordinary least-squares model that predicts 'MD' and print its evaluation.

    Parameters
    ----------
    chosen_fx : list of str
        Predictor columns to use. One-hot reference columns ('Gender_M',
        'Ethnicity_Other') are removed automatically when present.
    X_train, X_test : pandas.DataFrame
        Frames holding both the predictor columns and the target column 'MD'.
        They are not modified; the function works on filtered copies.

    Returns
    -------
    None
        Prints the statsmodels OLS summary for the training fit, R-squared on
        train and test, and MSE / MAE of the test predictions.
    """
    # Local import: statsmodels is only needed by this function.
    import statsmodels.api as sm

    # Drop incomplete rows BEFORE separating the target so X and y stay aligned.
    # 'MD' is included so a missing target can never reach the fit. After simple
    # or multiple imputation there should be nothing left to drop.
    required_cols = list(dict.fromkeys(list(chosen_fx) + ['MD']))
    X_train = X_train.dropna(subset=required_cols)
    X_test = X_test.dropna(subset=required_cols)

    y_train = X_train['MD']
    y_test = X_test['MD']
    X_train = X_train[chosen_fx]
    X_test = X_test[chosen_fx]
    print("X_train length: %i \n X_test length: %i" % (len(X_train), len(X_test)))

    # MUST DROP REFERENCE COLUMN FOR ONE-HOT-ENCODED VARIABLES (AVOID DUMMY VARIABLE TRAP)
    chosen_ref_labels = ['Gender_M', 'Ethnicity_Other']
    drop_cols = [x for x in X_train.columns if x in chosen_ref_labels]
    X_train = X_train.drop(columns=drop_cols)
    X_test = X_test.drop(columns=drop_cols)

    # sklearn fit: used only for the R-squared scores printed below.
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)

    # statsmodels fit: prints coefficients, intercept, p-values etc. in one
    # summary (a superset of what sklearn exposes).
    # has_constant='add' always appends the intercept column. The default
    # ('skip') silently omits it when a column happens to be constant, e.g. a
    # one-hot column that is all ones in a small test split, which would give
    # the train and test design matrices different columns.
    X2_train = sm.add_constant(X_train, has_constant='add')
    model = sm.OLS(y_train, X2_train).fit()
    print(model.summary())

    # print out of metrics
    print("\nR-squared for y_train vs y_pred", linreg.score(X_train, y_train))
    print("R-squared for y_test vs y_pred2:", linreg.score(X_test, y_test)) # because only sklearn computes this
    y_pred2 = model.predict(sm.add_constant(X_test, has_constant='add'))
    print('mean_squared_error of test predictions: ', metrics.mean_squared_error(y_test, y_pred2))
    print('mean_absolute_error of test predictions: ', metrics.mean_absolute_error(y_test, y_pred2))
    print("---------------------------------------------------------------------------------------------------")


# Predictor sets derived from the columns that are actually present in X_train.
# MD is the target and VFI is excluded together with it in every set.
_hvf_cols = {'MD', 'VFI'}
all_fx = [col for col in X_train.columns if col not in _hvf_cols] # every available feature
all_fx_exceptIS = [col for col in all_fx if col not in RNFL_IS_fx] # every feature except the S/I quadrants
demographic_fx_onehot = [col for col in all_fx_exceptIS if col not in ['LogMAR_VA', 'IOP', 'CDR'] + RNFL_fx]

# One regression per predictor set, each announced by its heading.
for _heading, _features in [
    ("\nAll fx", all_fx),
    ("\nAll non-redundant fx", all_fx_exceptIS),
    ("\nDemographics\n", demographic_fx_onehot),
    ("\nCDR + RNFL\n", CDR_fx + RNFL_fx),
    ("\nCDR\n", CDR_fx),
    ("\nRNFL\n", RNFL_fx),
]:
    print(_heading)
    run_linreg(_features, X_train, X_test)

# Expectation of coefficient signs: IOP-, CDR-, RNFL+

# NOTHING AFTER THIS IS UPDATED #

## Ridge and Lasso

In [None]:
#### Regularization params
# NOTE(review): these are classification settings ('roc_auc' scoring,
# LogisticRegressionCV), whereas the notebook title targets the continuous MD
# value. This section is flagged above as not updated - confirm before reuse.
k_folds = 10 #5-10 standard
scoring = 'roc_auc' # 'neg_log_loss', 'neg_brier_score', 'accuracy' (default), 'roc_auc', 'neg_mean_absolute_error' ...options on sklearn.metrics: https://scikit-learn.org/stable/api/sklearn.metrics.html#module-sklearn.metrics
max_iter=1000
solver='saga'
# for elastic net only:
lambda_inverse = 20  # of C's (=inverse of lambda) to try; 10 by default
# 20 evenly spaced l1_ratio candidates between 0 and 1 (inclusive).
alpha_range = np.linspace(0, 1, 20)

from sklearn.linear_model import LogisticRegressionCV

In [None]:
# 1) Ridge
# LogisticRegressionCV with its default penalty (L2 in scikit-learn); C is
# selected by k_folds-fold cross-validation on `scoring`.
# NOTE(review): y_train / y_test are not assigned at notebook scope in any
# visible cell (run_linreg only binds them locally), so this cell presumably
# fails on a fresh kernel - confirm.
ridge_cv = LogisticRegressionCV(cv=k_folds, scoring=scoring, solver=solver, max_iter=max_iter)
ridge_cv.fit(X_train, y_train)
# Retrieve the best hyperparameters
best_C = ridge_cv.C_[0]
print(f"Best C (inverse of regularization strength): {best_C}")

# Coefficient summary on the training data, then test-set performance
# (the *_youden variant is the one used for the ROC plots further down).
EA.model_summary(ridge_cv, X_train)
EA.compute_performance(ridge_cv, X_test, y_test)
EA.compute_performance_youden(ridge_cv, X_test, y_test)

In [None]:
# 2) Elastic Net
#https://stackoverflow.com/questions/66787845/how-to-perform-elastic-net-for-a-classification-problem
# SAGA should be considered more advanced and used over SAG. For more information, see: https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-defintions
# Cross-validates over `lambda_inverse` values of C and every l1_ratio in alpha_range.
# NOTE(review): relies on a notebook-level y_train that no visible cell defines - confirm.
en_cv = LogisticRegressionCV(cv=k_folds, scoring=scoring, penalty='elasticnet', Cs = lambda_inverse, l1_ratios=alpha_range, solver=solver, max_iter=max_iter)
en_cv.fit(X_train, y_train)

# Retrieve the best hyperparameters
best_C = en_cv.C_[0]
best_l1_ratio = en_cv.l1_ratio_[0]
print(f"Best C (inverse of regularization strength): {best_C}")
print(f"Best l1_ratio (mixing parameter): {best_l1_ratio}")

In [None]:
# Repeat the selected hyperparameters next to the evaluation output, then
# summarize the elastic-net model and its test-set performance.
print(f"Best C (inverse of regularization strength): {best_C}")
print(f"Best l1_ratio (mixing parameter): {best_l1_ratio}")
EA.model_summary(en_cv, X_train)
EA.compute_performance(en_cv, X_test, y_test)
EA.compute_performance_youden(en_cv, X_test, y_test)

In [None]:
# Display the predictor columns currently held in X_train.
X_train.columns

# Multivariate Logistic Regression MULTIPLE IMPUTATIONS
### To check if what I did is best method: used mode of y_pred, and averaged prediction probabilities of each imputed model to determine AUC, and averaged p-values

### Normal logistic regression

In [None]:
# eye_ai.py: 
#     def compute_performance_mice(self, logreg_models, Xtest_finals, y_test):
#     def model_summary_mice(self, logreg_models, Xtrain_finals):

In [None]:
# how to do prediction after multiple imputation:
# https://github.com/amices/mice/issues/82
# https://stackoverflow.com/questions/68460923/how-to-do-the-prediction-after-multiple-imputation-with-mice-package

# LogisticRegression is used in the loop below, but the rest of the notebook
# only imports LogisticRegressionCV; import it here so the cell can run.
from sklearn.linear_model import LogisticRegression

# One fitted model and one pair of final design matrices per imputed data set.
logreg_models = []
Xtrain_finals = []
Xtest_finals = []

# MUST DROP REFERENCE COLUMN FOR ONE-HOT-ENCODED VARIABLES
#chosen_ref_labels = ['GHT_Within Normal Limits', 'Gender_M', 'Ethnicity_Other']
chosen_ref_labels = ['GHT_Within Normal Limits','GHT_Borderline', 'Gender_M', 'Ethnicity_Other']
penalty=None#'l1', 'l2', 'elasticnet', or None
solver='saga' # 'lbfgs', 'saga' (only saga supports l1 and elasticnet)

# NOTE(review): the loop variables rebind the notebook-level X_train / X_test to
# the last imputed pair, and y_train / y_test are not assigned at notebook scope
# in any visible cell - confirm before running this section.
for X_train, X_test in zip(X_train_imputedsets, X_test_imputedsets):
    # NORMAL LOGISTIC REGRESSION
    drop_cols = [x for x in X_train.columns if x in chosen_ref_labels]
    X_train_dropped = X_train.drop(columns=drop_cols)
    X_test_dropped = X_test.drop(columns=drop_cols)

    logreg = LogisticRegression(random_state=16, solver=solver, max_iter=1000, penalty=penalty)
    logreg.fit(X_train_dropped, y_train)
    logreg_models.append(logreg)

    Xtrain_finals.append(X_train_dropped)
    Xtest_finals.append(X_test_dropped)

# Pool the per-imputation models into one summary / one performance report.
EA.model_summary_mice(logreg_models, Xtrain_finals)
EA.compute_performance_mice(logreg_models, Xtest_finals, y_test)

# Alternative models

In [None]:
# Alternative classifier (currently an SVC) evaluated with repeated stratified
# k-fold cross-validation on the training rows and then on the test rows.
# don't have to onehotencode, but xgboost performs better if does
# keep dummy variables, don't drop ref label for decision trees

from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

# NOTE(review): x, x_t, y and y_t are only assigned inside the `if`, so setting
# drop_NA to False makes the following lines fail unless they already exist.
# y_train / y_test must also exist at notebook scope; no visible cell defines them.
drop_NA=True
if drop_NA:
    # Drop NA if desired
    x = X_train_keep_missing.dropna()
    x_t = X_test_keep_missing.dropna()

    # Keep only the labels whose index survived the row drop above.
    y = y_train[y_train.index.isin(x.index)]
    y_t = y_test[y_test.index.isin(x_t.index)]

print(x.columns)

#model = BaggingClassifier(estimator=SVC(), n_estimators=10, random_state=0) # bagged SVC
#model=BaggingClassifier() # bagged decision trees (bc DecisionTree is default)
model=SVC(probability=True) # probability=True to enable predict_proba function (slow)
# Fit on all complete training rows; this fitted estimator is the one scored on the test rows below.
clf = model.fit(x,y)

# define cross-validation evaluation procedure
k = 10
cv = RepeatedStratifiedKFold(n_splits=k, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, x, y, scoring='roc_auc', cv=cv)
# summarize performance
print('Mean AUC using %i-fold cross-validation: %.3f' % (k, mean(scores)))# AUC from 10-fold cv on TRAINING set, as opposed to AUC on test set computed in compute_performance -- if this better than AUC for test set, then model probably overfit
print("")

# test performance
EA.compute_performance(clf, x_t, y_t)
EA.compute_performance_youden(clf, x_t, y_t)

# PLOT multiple ROC curves
- current version of this code requires running above multiple times for each roc curve I want to plot, then saving them manually and adding to global dictionary before plotting combined ROC curve
- X_test and y_test have different #s for drop_NA bc drop_NA may drop diff # rows depending on which variables are included

In [None]:
#models = {} # model label name: (model, associated X_test, associated y_test)
# start with univariate models dict
# NOTE(review): `models` and `models_univariate` must already exist in the
# kernel; no visible cell creates them (the initializer above is commented out),
# so this line presumably fails on a fresh kernel - confirm.
models ={**models, **models_univariate} ## don't overwrite models just in case already contains stuff
# map univariate model names
# Dictionary keys to replace with shorter display labels; keys not listed here are kept as they are.
key_mapping = {
    'Average_RNFL_Thickness(μm)': 'OCT',
    'MD': 'HVF',
    'ML Feature Selection (Elastic Net)': 'ML Elastic Net'
}
# Function to rename keys in a dictionary
def rename_keys(d, key_map):
    """Return a copy of `d` whose keys are replaced according to `key_map`.

    Keys without an entry in `key_map` are kept unchanged and `d` itself is not
    modified. If two keys end up with the same new name, the value of the later
    one wins.
    """
    renamed = {}
    for old_key, value in d.items():
        new_key = key_map.get(old_key, old_key)
        renamed[new_key] = value
    return renamed
# Apply the renaming function to the dictionary
# rename_keys returns a new dict, so `models` is rebound to the relabelled copy.
models = rename_keys(models, key_mapping)

### Manual model additions -- EDIT THE NAME AND MODEL NAME

In [None]:
### Manual model additions
# Register one fitted model under a display name, together with the test data it
# should be evaluated on: models[name] = (model, X_test, y_test).
name = "ML Elastic Net" # "Demographics"
# NOTE(review): the label says elastic net but `logreg` is registered; per the
# inline hint en_cv is the elastic-net model - confirm which one is intended.
mod = logreg #en_cv for elastic net model
models[name] = (mod, X_test, y_test)

In [None]:
## how to combine 2 dictionaries
#all_models = {**models_univariate, **selected_models}

### Plot Models

In [None]:
# select which models to plot
# The order of wanted_keys is the order of the curves in the figure legend;
# names that have no entry in `models` are silently skipped.
wanted_keys = [
    'Demographics',
    'LogMAR_VA',
    'CDR',
    'OCT',
    'CDR+OCT',
    'All Significant Features',
    'ML Elastic Net',
] # The keys you want
selected_models = {key: models[key] for key in wanted_keys if key in models}

In [None]:
# Show which of the wanted models were actually found.
selected_models.keys()

In [None]:
# Combined ROC figure: one curve per selected model.
# An explicit Figure/Axes pair is used so the curves land on this figure even if
# the performance helper touches pyplot's "current figure" internally.
fig, ax = plt.subplots(figsize=(9, 8))
for name, (m, xt, yt) in selected_models.items():
    print(name)
    # plot=False: only the ROC data is needed here; drawing is done below.
    fpr, tpr, auc, optimal_idx, optimal_threshold = EA.compute_performance_youden(m, xt, yt, plot=False)
    #ax.plot(fpr, tpr, label="%s (AUC=%s, Youden's=%.3f)" % (name, auc, (tpr[optimal_idx] - fpr[optimal_idx])))
    ax.plot(fpr, tpr, label="%s (AUC=%s)" % (name, auc))
    #ax.scatter(fpr[optimal_idx], tpr[optimal_idx], marker='o', color='red')
    print("")

# Chance diagonal for reference.
ax.plot([0, 1], [0, 1], color='black', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curves')
ax.legend(loc='best')
# Saved from the cell that draws the figure (saving from a later cell does not
# work with the notebook backend). The file goes to the current user's home
# directory instead of one hard-coded account's home.
fig.savefig(Path.home() / "Figure_1_urgent_glaucoma.png", format="png", dpi=300)
#plt.show()

## Save a high quality plot - nvm this has to be in same cell as original plot creation to work

In [None]:
# Folder for figures stored with the execution assets.
# os.makedirs(..., exist_ok=True) also creates missing parent folders and
# tolerates re-running this cell; os.mkdir raises in both situations.
os.makedirs(configuration_records.working_dir/'Execution_Assets/Multimodal_Figures/', exist_ok=True)
fig_path = configuration_records.working_dir/'Execution_Assets/Multimodal_Figures/Figure_1_urgent_glaucoma.png'

# Save the plot with higher DPI - but only while a figure is still open.
# With the notebook backend the figure is gone once its cell has finished, and
# plt.savefig would then write a blank image to fig_path.
if plt.get_fignums():
    plt.savefig(fig_path, dpi=300)
else:
    print("No open figure to save; save from the cell that creates the plot.")

In [None]:
# Display where the figure is (to be) stored.
fig_path

In [None]:
# workaround
# Only save while a figure is still open: once the plotting cell has finished,
# plt.savefig would create a new empty figure and overwrite the PNG written by
# the plotting cell with a blank image. The target is the current user's home
# directory rather than one hard-coded account's home.
if plt.get_fignums():
    plt.savefig(Path.home() / "Figure_1_urgent_glaucoma.png", format="png")
else:
    print("No open figure to save; the PNG written by the plotting cell is left untouched.")

## Cache models

In [None]:
import pickle
cache_path = configuration_records.working_dir/'Execution_Assets/Multimodal_Analysis/models_cache_urgent_glaucoma.pkl'

# Make sure the target folder exists; open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError on a fresh working directory.
os.makedirs(os.path.dirname(cache_path), exist_ok=True)

# Cache the models dictionary to a file
with open(cache_path, 'wb') as f:
    pickle.dump(models, f)

In [None]:
# To load the cached models dictionary later
# Security: pickle.load can execute code embedded in the file, so only load
# caches that this notebook wrote itself - never a file from an untrusted source.
with open(cache_path, 'rb') as f:
    cached_models = pickle.load(f)

## Access a specific saved model

In [None]:
# compute a specific model that is saved
# Looks the entry up in the in-memory `models` dict (not in `cached_models`);
# each entry is a (model, X_test, y_test) tuple.
name = "ML Elastic Net"
m, xt, yt = models[name]
EA.compute_performance(m, xt, yt)
EA.compute_performance_youden(m, xt, yt, plot=True)

In [None]:
# Presumably uploads the outputs of this execution (identified by its RID) to the catalog.
# NOTE(review): the meaning of the positional False is not visible in this
# notebook - check the signature of EyeAI.execution_upload.
uploaded_assets = EA.execution_upload(configuration_records.execution_rid, False)