<a href="https://colab.research.google.com/github/informatics-isi-edu/eye-ai-exec/blob/main/notebooks/VGG19_Diagnosis_Train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multimodal Initial analyses

In [None]:
# import sys
# IN_COLAB = 'google.colab' in sys.modules

# if IN_COLAB:
#     !pip install deriva
#     !pip install bdbag
#     !pip install --upgrade --force pydantic
#     !pip install git+https://github.com/informatics-isi-edu/deriva-ml git+https://github.com/informatics-isi-edu/eye-ai-ml

In [None]:
repo_dir = "Repos"   # Set this to be where your github repos are located.
%load_ext autoreload
%autoreload 2

# Update the load path so python can find modules for the model
import sys
from pathlib import Path
sys.path.insert(0, str(Path.home() / repo_dir / "eye-ai-ml"))

In [None]:
# Prerequisites

import json
import os
from eye_ai.eye_ai import EyeAI
import pandas as pd
from pathlib import Path, PurePath
import logging
# import torch

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s', force=True)

In [None]:
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

In [None]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
# import the class
from sklearn.linear_model import LogisticRegression
# Import label encoder 
from sklearn import preprocessing 

import numpy as np
import matplotlib.pyplot as plt

In [None]:

from deriva.core.utils.globus_auth_utils import GlobusNativeLogin
catalog_id = "eye-ai" #@param
host = 'www.eye-ai.org'


gnl = GlobusNativeLogin(host=host)
if gnl.is_logged_in([host]):
    print("You are already logged in.")
else:
    gnl.login([host], no_local_server=True, no_browser=True, refresh_tokens=True, update_bdbag_keychain=True)
    print("Login Successful")

Connect to the Eye-AI catalog. Configure the local cache and working directories used to store data. Initialize Eye-AI for the pending execution based on the provided configuration file.

In [None]:
# Variables to configure the rest of the notebook.

cache_dir = '/data'        # Directory in which to cache materialized BDBags for datasets
working_dir = '/data'    # Directory in which to place output files for later upload.

configuration_rid= "2-CCD4" # rid I created with my config containing minid for both train and test sets


In [None]:
EA = EyeAI(hostname = host, catalog_id = catalog_id, cache_dir= cache_dir, working_dir=working_dir)

In [None]:
# @title Initiate an Execution
configuration_records = EA.execution_init(configuration_rid=configuration_rid)
configuration_records.model_dump()

# Generate multimodal wide table

In [None]:
# TRAIN: configuration_records.bag_paths[0]
wide_train_raw = EA.severity_analysis(configuration_records.bag_paths[0])

In [None]:
# TEST: configuration_records.bag_paths[1]
wide_test_raw = EA.severity_analysis(configuration_records.bag_paths[1])

In [None]:
# add age to table
age_path = "/data/yukim3003/EyeAI_working/Execution_Assets/Multimodal_Analysis/multimodal_subject_age.csv"
age_df = pd.read_csv(age_path)
age_df.rename(columns={'RID': 'RID_Subject'}, inplace=True)
wide_train_raw = wide_train_raw.merge(age_df, on='RID_Subject', how='left')
wide_test_raw = wide_test_raw.merge(age_df, on='RID_Subject', how='left')

# Create new table with only more severe eye for each patient

In [None]:
# current severity rule: prioritize RNFL > HVF > CDR
# if don't want thresholds, just make threshold 0
# just return the first eye if RNFL, MD, CDR all NaN
def pick_severe_eye(df, rnfl_threshold, md_threshold):
    """Keep one row (the more severe eye) per subject.

    Severity is decided in priority order RNFL > HVF (MD) > CDR:

    1. Rows without a ``Label`` are discarded, so a subject with a single
       labelled eye keeps that eye and a subject with no labelled eye does not
       appear in the result.
    2. The eye with the lowest ``Average_RNFL_Thickness(μm)`` wins. Eyes whose
       RNFL lies within ``rnfl_threshold`` of the lowest value count as tied.
    3. Ties (or RNFL missing for every eye) are broken by the lowest ``MD``;
       eyes within ``md_threshold`` of the lowest value count as tied.
    4. Remaining ties (or MD missing for every remaining eye) are broken by the
       highest ``CDR``. If CDR is missing too, the first remaining row is used.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``RID_Subject``, ``Label``,
        ``Average_RNFL_Thickness(μm)``, ``MD`` and ``CDR``.
    rnfl_threshold, md_threshold : float
        Tie tolerances for RNFL and MD; pass 0 to require exact equality.

    Returns
    -------
    pandas.DataFrame
        One row per subject, ordered by ``RID_Subject``, with a fresh
        ``RangeIndex``.
    """
    rnfl_col = 'Average_RNFL_Thickness(μm)'

    # Most severe first: RNFL ascending, MD ascending, CDR descending.
    # This fixes which row is "first" whenever a later step has to fall back on row order.
    df = df.sort_values(by=[rnfl_col, 'MD', 'CDR'], ascending=[True, True, False])

    # 1. Keep labelled eyes only (covers subjects where just one eye was labelled).
    #    A plain boolean filter is used instead of groupby.apply so the grouping
    #    column is never lost, and the index is made unique for the .loc lookup below.
    df = df[df['Label'].notna()].reset_index(drop=True)

    def within_threshold(candidates, column, threshold):
        # Lower is more severe for both RNFL and MD.
        # Comparisons against NaN are False, so an all-NaN column yields an empty frame.
        most_severe = candidates[column].min()
        return candidates[np.abs(candidates[column] - most_severe) <= threshold]

    def select_row_label(group):
        """Return the index label of the most severe eye in one subject's rows."""
        candidates = group
        for column, threshold in ((rnfl_col, rnfl_threshold), ('MD', md_threshold)):
            tied = within_threshold(candidates, column, threshold)
            if len(tied) == 1:
                return tied.index[0]
            if len(tied) > 1:
                # Several eyes are "equal" on this measure: only they stay in the running.
                candidates = tied
            # len(tied) == 0 means the measure is missing for every candidate;
            # keep the candidates unchanged and move on to the next measure.

        # No CDR threshold is applied, so this always picks something (NaN sorts last).
        # A stable sort keeps the pre-sorted order for equal CDR values.
        return candidates.sort_values(by=['CDR'], ascending=[False], kind='mergesort').index[0]

    chosen = [select_row_label(group) for _, group in df.groupby('RID_Subject')]
    return df.loc[chosen].reset_index(drop=True)

In [None]:
wide_train_nothresh = pick_severe_eye(wide_train_raw, 0, 0)
wide_test_nothresh = pick_severe_eye(wide_test_raw, 0, 0)

In [None]:
rnfl_thresh = 0
md_thresh = 0
wide_train = pick_severe_eye(wide_train_raw, rnfl_thresh, md_thresh)
wide_test = pick_severe_eye(wide_test_raw, rnfl_thresh, md_thresh)

In [None]:
# Show which subjects changed eyes by adding thresholds
diff_values = wide_train.compare(wide_train_nothresh, align_axis=0, keep_shape=True, keep_equal=True) #keep_equal=False --> values that are equal are represented as NaN
diff_values = diff_values.drop_duplicates(keep=False) # drop rows that have a duplicate
print("# subjects where eye choice changed: %i" % (len(diff_values)/2))
diff_values[['RID_Subject', 'Side', 'Label', 'Average_RNFL_Thickness(μm)', 'MD', 'CDR']]

# Choose Features

In [None]:
#split dataset in features and target variable
demographic_fx = ['Gender', 'Ethnicity', 'Age']
clinic_fx = ['LogMAR_VA', 'IOP'] # 'Gonioscopy' - mostly NaN, not standardized annotation # CCT - mostly NaN
CDR_fx = ['CDR']
RNFL_fx = ['Average_RNFL_Thickness(μm)'] # Average_C/D_Ratio - for RNFL-derived CDR
RNFL_clockhr_fx = ['Clock_Hours_1', 'Clock_Hours_2', 'Clock_Hours_3', 'Clock_Hours_4', 'Clock_Hours_5', 'Clock_Hours_6', 'Clock_Hours_7', 'Clock_Hours_8', 'Clock_Hours_9', 'Clock_Hours_10', 'Clock_Hours_11', 'Clock_Hours_12'] # if I want to use each clock hour
RNFL_quad_fx = ['Quadrants_S', 'Quadrants_N', 'Quadrants_T', 'Quadrants_I']
RNFL_IS_fx = ['Quadrants_S', 'Quadrants_I']
HVF_fx = ['MD', 'VFI'] # 'PSD' - mostly NaN. I think PSD and PSD.1 columns should be merged to use this column if desired
GHT = ['GHT']

## Adjust fx_cols depending on what variables I want to include in model
# Clinic Data
fx_cols = demographic_fx + clinic_fx
# OCT+HVF
#fx_cols = ['MD'] + RNFL_fx
# CDR+OCT+HVF
#fx_cols = ['CDR', 'MD'] + RNFL_fx

# All Project Fx - also use this if running univariate analysis or elastic net
#fx_cols = demographic_fx + clinic_fx + CDR_fx + HVF_fx + RNFL_fx + RNFL_IS_fx + GHT

# Domain knowledge fx
#fx_cols = ['Age', 'Gender', 'LogMAR_VA', 'CDR'] + RNFL_fx + ['MD', 'GHT']  # Age, Gender, VA, CDR, avg RNFL, MD, GHT outside normal limits.

# Transform Train and Test Data

### Function

In [None]:
# transferred to eye_ai.py
# def transform_data(multimodal_wide, fx_cols, y_method="all_glaucoma" or "urgent_glaucoma"):
# Returns: X_transformed, y

### run transform

In [None]:
X_train_keep_missing, y_train_keep_missing = EA.transform_data(wide_train, fx_cols, y_method="all_glaucoma")
X_test_keep_missing, y_test_keep_missing = EA.transform_data(wide_test, fx_cols, y_method="all_glaucoma")

# Counts / data info

In [None]:
len(X_train_keep_missing) + len(X_test_keep_missing)

In [None]:
counts = np.unique(y_train_keep_missing, return_counts=True)
print(counts) # #GS vs #Glaucoma
print("Percent GS vs Glaucoma in TRAIN:", counts[1] / sum(counts[1])) # percent

counts = np.unique(y_test_keep_missing, return_counts=True)
print(counts) # #GS vs #Glaucoma
print("Percent GS vs Glaucoma in TEST:", counts[1] / sum(counts[1])) # percent



In [None]:
# Gender counts over train + test combined (one-hot columns, so the sum is the count).
counts = sum(X_train_keep_missing['Gender_M']) + sum(X_test_keep_missing['Gender_M'])
print("Num male:", counts)
counts = sum(X_train_keep_missing['Gender_F']) + sum(X_test_keep_missing['Gender_F'])
print("Num female:", counts)

# Mean age over train + test combined.
# Series.mean() skips NaN in both the numerator and the denominator; dividing a
# NaN-skipping sum by the total row count would bias the mean low whenever Age is missing.
mean_age = pd.concat([X_train_keep_missing['Age'], X_test_keep_missing['Age']]).mean()
print("Mean age:", mean_age)

In [None]:
# #NAN
### the number of rows with nan in any column will increase if I choose more features

# count number / percent of rows with nan value
num_rows_with_nan = X_train_keep_missing.isnull().any(axis=1).sum()
print ("Number of train rows with any nan: %i" % num_rows_with_nan)

# Calculate the percentage of rows with NaN values
print ("Percent of train rows with any nan: %f" % ((num_rows_with_nan / len(X_train_keep_missing)) * 100))

# count number / percent of rows with nan value
num_rows_with_nan = X_test_keep_missing.isnull().any(axis=1).sum()
print ("Number of test rows with any nan: %i" % num_rows_with_nan)

# Calculate the percentage of rows with NaN values
print ("Percent of test rows with any nan: %f" % ((num_rows_with_nan / len(X_test_keep_missing)) * 100))

# Standardize Data
#### may not be required for univariate analysis to improve interpretability (but still good to center/scale to improve Gaussian-ness of distribution)

In [None]:
### normalize numeric training data (so that features are on same scale instead of wildly different scales)
# not required for typical logistic regression, but do need for regularized regression
# I didn't put this in transform_data because I want to use the scaler fitted on train for test too

# how? https://datascience.stackexchange.com/questions/54908/data-normalization-before-or-after-train-test-split
# why? https://stackoverflow.com/questions/52670012/convergencewarning-liblinear-failed-to-converge-increase-the-number-of-iterati

from sklearn.preprocessing import StandardScaler
categorical_vars = ['Gender', 'Ethnicity', 'GHT']
numeric_vars = sorted(set(fx_cols) - set(categorical_vars), key=fx_cols.index)

scaler = StandardScaler()

normalized_numeric_X_train = pd.DataFrame(
    scaler.fit_transform(X_train_keep_missing[numeric_vars]),
    columns = numeric_vars
)
cat_df = X_train_keep_missing.drop(numeric_vars, axis=1)
X_train_keep_missing = pd.concat([normalized_numeric_X_train.set_index(cat_df.index), cat_df], axis=1)

# normalize test data, but using scaler fitted to training data to prevent data leakage
normalized_numeric_X_test = pd.DataFrame(
    scaler.transform(X_test_keep_missing[numeric_vars]),
    columns = numeric_vars
)
cat_df = X_test_keep_missing.drop(numeric_vars, axis=1)
X_test_keep_missing = pd.concat([normalized_numeric_X_test.set_index(cat_df.index), cat_df], axis=1)


# A) Simple imputation

In [None]:
strat = 'mean'
# NOTE: the following code imputes X_test based on the imputer fitted to X_train

"""
STRATEGIES
If “mean”, then replace missing values using the mean along each column. Can only be used with numeric data.

If “median”, then replace missing values using the median along each column. Can only be used with numeric data.

If “most_frequent”, then replace missing using the most frequent value along each column. Can be used with strings or numeric data. If there is more than one such value, only the smallest is returned.

If “constant”, then replace missing values with fill_value. Can be used with strings or numeric data.
"""

from sklearn.impute import SimpleImputer
# Fit the imputer on the training data only, then apply it to both splits.
imputer = SimpleImputer(missing_values=np.nan, strategy=strat)
imputer = imputer.fit(X_train_keep_missing)
X_train_imputed = imputer.transform(X_train_keep_missing)
X_test_imputed = imputer.transform(X_test_keep_missing)
# Convert back into pandas dataframes instead of np arrays. The original index is
# carried over so that the rows stay aligned with y_train / y_test by label as well
# as by position.
X_train = pd.DataFrame(X_train_imputed, columns=X_train_keep_missing.columns, index=X_train_keep_missing.index)
X_test = pd.DataFrame(X_test_imputed, columns=X_test_keep_missing.columns, index=X_test_keep_missing.index)

y_train = y_train_keep_missing
y_test = y_test_keep_missing

# B) Multiple imputations (10 imputations)

In [None]:
# good article on MCAR vs MAR vs MNAR and how to appropriately handle missing values in each case: https://datascience.stackexchange.com/questions/116622/what-should-you-do-with-nan-values

In [None]:
# return list of pandas dataframes, each containing 1 of 10 imputations
def mult_impute_missing(X, train_data=None, n_imputations=10):
    """Create several imputed copies of ``X`` with sklearn's ``IterativeImputer``.

    Parameters
    ----------
    X : pandas.DataFrame
        Data whose missing values are to be imputed.
    train_data : pandas.DataFrame, optional
        Data the imputers are fitted on. Defaults to ``X`` itself. Pass the
        training features when imputing a test set so that nothing is learned
        from the test data.
    n_imputations : int, default 10
        Number of imputed datasets to produce (3-10 imputations are standard).

    Returns
    -------
    list of pandas.DataFrame
        ``n_imputations`` frames with the same columns and index as ``X``.
        Frame ``i`` comes from an imputer created with ``random_state=i``.
    """
    if train_data is None:
        train_data = X

    ### multiple imputation method using IterativeImputer from sklearn
    from sklearn.experimental import enable_iterative_imputer  # noqa: F401 -- import needed to enable IterativeImputer
    from sklearn.impute import IterativeImputer

    imputed_datasets = []
    for seed in range(n_imputations):
        # A new imputer per draw, so that `seed` governs this draw independently of
        # any earlier fit instead of relying on re-seeding an already fitted object.
        imp = IterativeImputer(max_iter=10, random_state=seed, sample_posterior=True)
        imp.fit(train_data)
        X_imputed = imp.transform(X)
        # Keep X's index so the imputed rows stay aligned with the corresponding y.
        imputed_datasets.append(pd.DataFrame(X_imputed, columns=X.columns, index=X.index))

    # ALTERNATIVE
    #from statsmodels.imputation import mice.MICEData # alternative package for MICE imputation
    # official docs: https://www.statsmodels.org/dev/generated/statsmodels.imputation.mice.MICE.html#statsmodels.imputation.mice.MICE
    # multiple imputation example using statsmodels: https://github.com/kshedden/mice_workshop
    #imp = mice.MICEData(data)
    #fml = 'y ~ x1 + x2 + x3 + x4' # variables used in multiple imputation model
    #mice = mice.MICE(fml, sm.OLS, imp) # OLS chosen; can change this up
    #results = mice.fit(10, 10) # 10 burn-in cycles to skip, 10 imputations
    #print(results.summary())

    return imputed_datasets

In [None]:
X_train_imputedsets = mult_impute_missing(X_train_keep_missing) # list of 10 imputed X_trains

In [None]:
# Impute the test data with imputers fitted on the TRAINING data, not on the test data,
# so that nothing is learned from the test set.
X_test_imputedsets = mult_impute_missing(X_test_keep_missing, train_data=X_train_keep_missing)

In [None]:
y_train = y_train_keep_missing
y_test = y_test_keep_missing

# C) Drop NA
### DON'T use this with the univariate loop -- it drops a row whenever ANY feature is missing. Drop NA inside the univariate loop instead, so that rows are dropped only when the single variable under analysis is missing.

In [None]:
# drop rows with nan
X_train = X_train_keep_missing.dropna()
X_test = X_test_keep_missing.dropna()
print(len(X_train))
print(len(X_test))

y_train = y_train_keep_missing[y_train_keep_missing.index.isin(X_train.index)]
y_test = y_test_keep_missing[y_test_keep_missing.index.isin(X_test.index)]

# Model methods

In [None]:
# transferred to eye_ai.py

In [None]:
"""
    def model_summary(self, model, X_train):
    def calc_stats(self, y_pred, y_test):
    def compute_performance(self, model, X_test, y_test):
    def compute_performance_youden(self, model, X_test, y_test, plot=True):
"""

# Univariate Simple Logistic Regression loop

In [None]:
drop_NA=True # change if do simple imputation instead
if drop_NA:
    X_train = X_train_keep_missing
    X_test = X_test_keep_missing
    y_train = y_train_keep_missing
    y_test = y_test_keep_missing

In [None]:
# Iterate through all feature columns
# (code isn't all that different from the simple one below, I just wanted to use loop instead of doing by hand)

# MUST DROP REFERENCE COLUMN FOR ONE-HOT-ENCODED VARIABLES
chosen_ref_labels = ['GHT_Within Normal Limits', 'Gender_M', 'Ethnicity_Other']
# Drop only the reference columns that are actually present: depending on fx_cols some
# of them (e.g. the GHT dummies) do not exist, and re-running this cell must not fail
# once the columns are already gone.
drop_cols = [col for col in X_train.columns if col in chosen_ref_labels]
X_train = X_train.drop(columns=drop_cols)
X_test = X_test.drop(columns=drop_cols)

penalty=None#'l1', 'l2', 'elasticnet', or None
solver='saga' # 'lbfgs', 'saga' (only saga supports l1 and elasticnet)

# save models in dict to access later
models_univariate = {} # model label name: (model, associated X_test, associated y_test)

def process_fx(fx, X, X_t, Y, Y_t):
    """Fit and evaluate a univariate logistic regression for one feature.

    Parameters
    ----------
    fx : str
        Feature name as listed in ``fx_cols``. For a one-hot-encoded feature
        every column named ``<fx>_<level>`` is used.
    X, X_t : pandas.DataFrame
        Training and test features.
    Y, Y_t : pandas.Series
        Training and test targets, index-aligned with ``X`` / ``X_t``.

    Returns
    -------
    tuple
        ``(fitted model, test features used, test targets used)``.

    Raises
    ------
    ValueError
        If no column of ``X`` belongs to ``fx``.

    Notes
    -----
    Reads the notebook-level settings ``solver``, ``penalty`` and ``drop_NA``.
    """
    logreg = LogisticRegression(random_state=16, solver=solver, max_iter=1000, penalty=penalty)

    print(fx)
    # Select the column itself or its one-hot dummies ("<fx>_<level>").
    # A plain substring test would also match unrelated columns, e.g. 'Clock_Hours_1'
    # would pull in 'Clock_Hours_10', 'Clock_Hours_11' and 'Clock_Hours_12'.
    cols = [col for col in X.columns if col == fx or col.startswith(fx + '_')]
    if not cols:
        raise ValueError("No columns found for feature %r" % fx)
    x = X[cols]
    x_t = X_t[cols]
    # Defaults for the case where nothing is dropped (drop_NA is False).
    y = Y
    y_t = Y_t

    if drop_NA:
        # Drop rows that are missing this feature only, and keep the targets in step.
        x = x.dropna()
        x_t = x_t.dropna()

        y = Y[Y.index.isin(x.index)]
        y_t = Y_t[Y_t.index.isin(x_t.index)]

    # fit the model with data
    logreg.fit(x, y)
    EA.model_summary(logreg, x)
    EA.compute_performance(logreg, x_t, y_t)
    EA.compute_performance_youden(logreg, x_t, y_t)
    print("")
    return logreg, x_t, y_t

# fx_cols = demographic_fx + clinic_fx + HVF_fx + RNFL_fx + RNFL_IS_fx + GHT -- must have set this in fx_cols at top before transforming data
for fx in fx_cols:
    models_univariate[fx] = process_fx(fx, X_train, X_test, y_train, y_test)
    print("----------------------------------------------------------------------------------------------------------")
    print("----------------------------------------------------------------------------------------------------------")

# Multivariate Logistic Regression DROPNA or SIMPLEIMPUTER

## SIMPLE LOGISTIC REGRESSION

In [None]:
# MUST DROP REFERENCE COLUMN FOR ONE-HOT-ENCODED VARIABLES
#chosen_ref_labels = ['GHT_Within Normal Limits', 'Gender_M', 'Ethnicity_Other']
chosen_ref_labels = ['GHT_Within Normal Limits','GHT_Borderline', 'Gender_M', 'Ethnicity_Other'] 
drop_cols = [x for x in X_train.columns if x in chosen_ref_labels]
X_train = X_train.drop(columns=drop_cols)
X_test = X_test.drop(columns=drop_cols)

penalty=None#'l1', 'l2', 'elasticnet', or None
solver='saga' # 'lbfgs', 'saga' (only saga supports l1 and elasticnet)

logreg = LogisticRegression(random_state=16, solver=solver, max_iter=1000, penalty=penalty) 

# fit the model with data
logreg.fit(X_train, y_train)
EA.model_summary(logreg, X_train)
EA.compute_performance(logreg, X_test, y_test)
EA.compute_performance_youden(logreg, X_test, y_test)

## Ridge and Lasso

In [None]:
#### Regularization params
k_folds = 10 #5-10 standard
scoring = 'roc_auc' # 'neg_log_loss', 'neg_brier_score', 'accuracy' (default), 'roc_auc', 'neg_mean_absolute_error' ...options on sklearn.metrics: https://scikit-learn.org/stable/api/sklearn.metrics.html#module-sklearn.metrics
max_iter=1000
solver='saga'
# for elastic net only:
lambda_inverse = 20  # of C's (=inverse of lambda) to try; 10 by default
alpha_range = np.linspace(0, 1, 20)

from sklearn.linear_model import LogisticRegressionCV

In [None]:
# 1) Ridge
ridge_cv = LogisticRegressionCV(cv=k_folds, scoring=scoring, solver=solver, max_iter=max_iter)
ridge_cv.fit(X_train, y_train)
# Retrieve the best hyperparameters
best_C = ridge_cv.C_[0]
print(f"Best C (inverse of regularization strength): {best_C}")

EA.model_summary(ridge_cv, X_train)
EA.compute_performance(ridge_cv, X_test, y_test)
EA.compute_performance_youden(ridge_cv, X_test, y_test)

In [None]:
# 2) Elastic Net
#https://stackoverflow.com/questions/66787845/how-to-perform-elastic-net-for-a-classification-problem
# SAGA should be considered more advanced and used over SAG. For more information, see: https://stackoverflow.com/questions/38640109/logistic-regression-python-solvers-defintions
en_cv = LogisticRegressionCV(cv=k_folds, scoring=scoring, penalty='elasticnet', Cs = lambda_inverse, l1_ratios=alpha_range, solver=solver, max_iter=max_iter)
en_cv.fit(X_train, y_train)

# Retrieve the best hyperparameters
best_C = en_cv.C_[0]
best_l1_ratio = en_cv.l1_ratio_[0]
print(f"Best C (inverse of regularization strength): {best_C}")
print(f"Best l1_ratio (mixing parameter): {best_l1_ratio}")

In [None]:
print(f"Best C (inverse of regularization strength): {best_C}")
print(f"Best l1_ratio (mixing parameter): {best_l1_ratio}")
EA.model_summary(en_cv, X_train)
EA.compute_performance(en_cv, X_test, y_test)
EA.compute_performance_youden(en_cv, X_test, y_test)

In [None]:
X_train.columns

# Multivariate Logistic Regression MULTIPLE IMPUTATIONS
### To check if what I did is best method: used mode of y_pred, and averaged prediction probabilities of each imputed model to determine AUC, and averaged p-values

### Simple logistic regression

In [None]:
### After performing logistic regression on each imputed dataset, pool the results using Rubin’s rules to obtain a single set of estimates.

# print model coefficients, ORs, p-values
def model_summary_mice(logreg_models, Xtrain_finals):
    """Print pooled coefficients, odds ratios, standard errors and p-values.

    Parameters
    ----------
    logreg_models : list
        Fitted binary logistic-regression models, one per imputed dataset.
    Xtrain_finals : list of pandas.DataFrame
        The training features each model was fitted on, in the same order.

    Notes
    -----
    * Estimates are pooled on the log-odds scale (mean over imputations) and the
      odds ratio is the exponential of the pooled coefficient, matching how the
      intercept is reported.
    * Standard errors follow Rubin's rules: total variance = mean within-imputation
      variance + (1 + 1/m) * between-imputation variance. The within-imputation
      variance is the diagonal of the inverse of X'WX with W = p(1 - p).
      NOTE(review): this is the usual maximum-likelihood variance for an unpenalised
      fit; confirm it is what you want if a penalty is used.
    * P-values are still the plain mean of the per-imputation p-values.
      NOTE(review): averaging p-values is not part of Rubin's rules -- confirm.
    * ``logit_pvalue`` and ``format_dec`` are not defined in the visible part of this
      notebook and must be in scope when this is called. ``logit_pvalue`` is assumed
      to return one p-value for the intercept followed by one per coefficient
      (it has to match the table index) -- TODO confirm.
    """
    n_models = len(logreg_models)
    print("Training set: %i" % len(Xtrain_finals[0]))

    def _within_variance(model, X):
        # Variance of [intercept, coefficients] from the inverse information matrix.
        design = np.column_stack([np.ones(len(X)), np.asarray(X, dtype=float)])
        prob = model.predict_proba(X)[:, 1]
        weights = prob * (1 - prob)
        information = design.T @ (design * weights[:, None])
        # pinv rather than inv so that a singular design does not abort the summary.
        return np.diag(np.linalg.pinv(information))

    # One row per imputation: [intercept, coef_1, ..., coef_k]
    estimates = np.array([np.append(model.intercept_[0], model.coef_[0]) for model in logreg_models])
    within_vars = np.array([_within_variance(model, X) for model, X in zip(logreg_models, Xtrain_finals)])
    p_values = np.array([logit_pvalue(model, Xtrain_finals[i]) for i, model in enumerate(logreg_models)])

    # Calculate pooled estimates
    pooled_estimates = np.mean(estimates, axis=0)
    if n_models > 1:
        between_var = np.var(estimates, axis=0, ddof=1)
    else:
        # A single imputation has no between-imputation variance.
        between_var = np.zeros(estimates.shape[1])
    pooled_ses = np.sqrt(np.mean(within_vars, axis=0) + (1 + 1/n_models) * between_var)

    pooled_p_values = np.mean(p_values, axis=0)

    # Display pooled results
    results = pd.DataFrame({
        'Coefficient': format_dec(pooled_estimates),
        'Odds Ratio': format_dec(np.exp(pooled_estimates)),
        'Standard Error': format_dec(pooled_ses),
        'P-value': format_dec(pooled_p_values)
    }, index=['Intercept'] + list(Xtrain_finals[0].columns))
    print(results)
    print("")

In [None]:
# model performance
# https://medium.com/javarevisited/evaluating-the-logistic-regression-ae2decf42d61
def compute_performance_mice(logreg_models, Xtest_finals, y_test):
    """Evaluate a set of models fitted on multiply-imputed data and plot the results.

    Parameters
    ----------
    logreg_models : list
        Fitted binary classifiers, one per imputed dataset.
    Xtest_finals : list of pandas.DataFrame
        The imputed test features for each model, in the same order.
    y_test : array-like
        True binary labels (0 = GS, 1 = POAG).

    Notes
    -----
    The class prediction for each sample is the mode of the per-model predictions
    (on a tie pandas puts the smallest value first, i.e. 0). The probability used
    for the ROC curve / AUC is the mean of the per-model predicted probabilities.
    Prints MAE, accuracy, sensitivity, specificity, precision, F1 and AUC, then
    shows the confusion matrix and the ROC curve. Returns nothing.
    """
    import sklearn.metrics as metrics

    print("Test set: %i" % len(Xtest_finals[0]))

    y_pred_results = []
    y_pred_proba_results = []
    for model, X_test_imputed in zip(logreg_models, Xtest_finals):
        y_pred_results.append(model.predict(X_test_imputed))
        y_pred_proba_results.append(model.predict_proba(X_test_imputed)[:, 1])

    # One row per model, one column per test sample.
    ypred_df = pd.DataFrame(np.vstack(y_pred_results))
    y_pred = np.array(ypred_df.mode(axis=0).loc[0].astype(int)) ##### used the mode of y_pred across the imputations
    y_pred_proba = np.mean(y_pred_proba_results, axis=0)

    # evaluate predictions
    mae = metrics.mean_absolute_error(y_test, y_pred)
    print('MAE: %.3f' % mae)

    # Confusion matrix (before thresholding). labels=[0, 1] forces a 2x2 matrix even
    # when one class is absent from both y_test and y_pred, so the indexing below is safe.
    cm = metrics.confusion_matrix(y_test, y_pred, labels=[0, 1])
    TP = cm[1, 1]
    TN = cm[0, 0]
    FP = cm[0, 1]
    FN = cm[1, 0]

    # Classification Accuracy: Overall, how often is the classifier correct?
    print('Accuracy: %.3f' % metrics.accuracy_score(y_test, y_pred))

    # Sensitivity (recall): When the actual value is positive, how often is the prediction correct?
    sensitivity = TP / float(FN + TP) if (FN + TP) else float('nan')
    print('Sensitivity: %.3f' % sensitivity)

    # Specificity: When the actual value is negative, how often is the prediction correct?
    specificity = TN / float(TN + FP) if (TN + FP) else float('nan')
    print('Specificity: %.3f' % specificity)

    # Precision: When a positive value is predicted, how often is the prediction correct?
    print('Precision: %.3f' % metrics.precision_score(y_test, y_pred))

    # F score
    print('F1 score: %.3f' % metrics.f1_score(y_test, y_pred))

    # AUC
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
    auc = metrics.roc_auc_score(y_test, y_pred_proba)
    print('AUC: %.3f' % auc)

    # CM matrix plot
    cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=None)
    cm_display.plot()
    plt.show()
    # 0 = GS, 1 = POAG

    # ROC curve plot
    plt.plot(fpr, tpr, label="auc="+str(auc))
    plt.xlabel("False positive rate (1-specificity)")
    plt.ylabel("True positive rate (sensitivity)")
    plt.legend(loc=4)
    plt.show()

In [None]:
# how to do prediction after multiple imputation:
# https://github.com/amices/mice/issues/82
# https://stackoverflow.com/questions/68460923/how-to-do-the-prediction-after-multiple-imputation-with-mice-package
logreg_models = []
Xtrain_finals = []
Xtest_finals = []

# MUST DROP REFERENCE COLUMN FOR ONE-HOT-ENCODED VARIABLES
#chosen_ref_labels = ['GHT_Within Normal Limits', 'Gender_M', 'Ethnicity_Other']
chosen_ref_labels = ['GHT_Within Normal Limits','GHT_Borderline', 'Gender_M', 'Ethnicity_Other']
penalty=None#'l1', 'l2', 'elasticnet', or None
solver='saga' # 'lbfgs', 'saga' (only saga supports l1 and elasticnet)

# The loop uses its own names (X_train_imp, X_test_imp, logreg_imp) so that the
# notebook-level X_train, X_test and logreg used by other cells are not silently
# replaced by the last imputed dataset / model. The per-imputation models are
# available in logreg_models.
for X_train_imp, X_test_imp in zip(X_train_imputedsets, X_test_imputedsets):
    # SIMPLE LOGISTIC REGRESSION
    drop_cols = [x for x in X_train_imp.columns if x in chosen_ref_labels]
    X_train_dropped = X_train_imp.drop(columns=drop_cols)
    X_test_dropped = X_test_imp.drop(columns=drop_cols)

    logreg_imp = LogisticRegression(random_state=16, solver=solver, max_iter=1000, penalty=penalty)
    logreg_imp.fit(X_train_dropped, y_train)
    logreg_models.append(logreg_imp)

    Xtrain_finals.append(X_train_dropped)
    Xtest_finals.append(X_test_dropped)

model_summary_mice(logreg_models, Xtrain_finals)
compute_performance_mice(logreg_models, Xtest_finals, y_test)

# Alternative models

In [None]:
# don't have to onehotencode, but xgboost performs better if does
# keep dummy variables, don't drop ref label for decision trees

from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import BaggingClassifier
from sklearn.svm import SVC

drop_NA=True
if drop_NA:
    # Drop NA if desired
    x = X_train_keep_missing.dropna()
    x_t = X_test_keep_missing.dropna()

    y = y_train[y_train.index.isin(x.index)]
    y_t = y_test[y_test.index.isin(x_t.index)]

print(x.columns)

#model = BaggingClassifier(estimator=SVC(), n_estimators=10, random_state=0) # bagged SVC
#model=BaggingClassifier() # bagged decision trees (bc DecisionTree is default)
model=SVC(probability=True) # probability=True to enable predict_proba function (slow)
clf = model.fit(x,y)

# define cross-validation evaluation procedure
k = 10
cv = RepeatedStratifiedKFold(n_splits=k, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, x, y, scoring='roc_auc', cv=cv)
# summarize performance
print('Mean AUC using %i-fold cross-validation: %.3f' % (k, mean(scores)))# AUC from 10-fold cv on TRAINING set, as opposed to AUC on test set computed in compute_performance -- if this better than AUC for test set, then model probably overfit
print("")

# test performance
EA.compute_performance(clf, x_t, y_t)
EA.compute_performance_youden(clf, x_t, y_t)

# PLOT multiple ROC curves
- The current version of this code requires re-running the modelling cells above once for each ROC curve to be plotted, then manually saving each fitted model into the global `models` dictionary before plotting the combined ROC curve.
- `X_test` and `y_test` contain a different number of rows for each drop-NA model, because dropping NA removes a different number of rows depending on which variables are included.

In [None]:
#models = {} # model label name: (model, associated X_test, associated y_test)
# start with univarate models dict
#models ={**models, **models_univariate} ## don't overwrite models just in case already contains stuff
# map univariate model names
key_mapping = {
    'Average_RNFL_Thickness(μm)': 'OCT',
    'MD': 'HVF',
    'ML Feature Selection (Elastic Net)': 'ML Elastic Net'
}
# Function to rename keys in a dictionary
def rename_keys(d, key_map):
    """Return a copy of ``d`` whose keys are translated through ``key_map``.

    Keys missing from ``key_map`` are kept as they are; values are untouched.
    If two keys translate to the same new key, the later one wins.
    """
    renamed = {}
    for old_key, value in d.items():
        new_key = key_map.get(old_key, old_key)
        renamed[new_key] = value
    return renamed
# Apply the renaming function to the dictionary.
# On a fresh kernel `models` does not exist yet (its initialisation above is commented
# out), so start from the univariate models in that case; an existing `models`
# dictionary is left as it is and only has its keys renamed.
if 'models' not in globals():
    models = dict(models_univariate)
models = rename_keys(models, key_mapping)

### Manual model additions -- EDIT THE NAME AND MODEL NAME

In [None]:
### Manual model additions
name = "Clinic Data"
mod = logreg
models[name] = (mod, X_test, y_test)

In [None]:
## how to combine 2 dictionaries
#all_models = {**models_univariate, **selected_models}

In [None]:
# select which models to plot
#all_keys = ['Clinic Data', 'CDR', 'OCT', 'HVF', 'OCT+HVF', 'CDR+OCT+HVF', 'All Features', 'Domain Knowledge', 'All Features (Ridge)', 'ML Elastic Net'] # The keys you want
#wanted_keys = ['All Features', 'Domain Knowledge', 'ML Elastic Net']
wanted_keys = ['Clinic Data', 'CDR', 'OCT', 'HVF', 'OCT+HVF', 'CDR+OCT+HVF', 'Domain Knowledge', 'ML Elastic Net'] # The keys you want
selected_models = dict((k, models[k]) for k in wanted_keys if k in models)

In [None]:
selected_models.keys()

In [None]:
plt.figure(figsize=(9, 8))
for name, (m, xt, yt) in selected_models.items():
    print (name)
    fpr, tpr, auc, optimal_idx, optimal_threshold = EA.compute_performance_youden(m, xt, yt, plot=False)
    #plt.plot(fpr, tpr, label="%s (AUC=%s, Youden's=%.3f)" % (name, auc, (tpr[optimal_idx] - fpr[optimal_idx])))
    plt.plot(fpr, tpr, label="%s (AUC=%s)" % (name, auc))
    #plt.scatter(fpr[optimal_idx], tpr[optimal_idx], marker='o', color='red')
    print ("")

plt.plot([0, 1], [0, 1], color='black', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.legend(loc='best')
plt.savefig("/home/yukim3003/Figure_1.png", format="png", dpi=300)
#plt.show()

## Save a high-quality plot (note: this has to run in the same cell that creates the plot in order to work)

In [None]:
fig_dir = configuration_records.working_dir/'Execution_Assets/Multimodal_Figures'
# makedirs + exist_ok: create missing parent directories and do not fail when the
# directory is already there (e.g. when this cell is run a second time).
os.makedirs(fig_dir, exist_ok=True)
fig_path = fig_dir/'Figure_1.png'

# Save the plot with higher DPI
plt.savefig(fig_path, dpi=300)

In [None]:
fig_path

In [None]:
# workaround
plt.savefig("/home/yukim3003/Figure_1.png", format="png")

## Cache models

In [None]:
import pickle
cache_path = configuration_records.working_dir/'Execution_Assets/Multimodal_Analysis/models_cache.pkl'

# Cache the models dictionary to a file
with open(cache_path, 'wb') as f:
    pickle.dump(models, f)

In [None]:
# To load the cached models dictionary later
with open(cache_path, 'rb') as f:
    cached_models = pickle.load(f)

## Access a specific saved model

In [None]:
# compute a specific model that is saved
name = "Domain Knowledge"
m, xt, yt = models[name]
EA.compute_performance(m, xt, yt)
EA.compute_performance_youden(m, xt, yt, plot=True)

In [None]:
uploaded_assets = EA.execution_upload(configuration_records.execution_rid, False)