In [None]:
library(arrow)
library(dplyr)
library(mgcv)
library(tidyverse)
library(data.table)
library(metagam)
library(latex2exp)
library(eivtools)
library(plotly)
library(stats)
library(plotrix)
library(fishmethods)
library(metafor)
library(jsonlite)
library(stringr)
library(doParallel)
library(parallel)
library(MASS)
library(tidygam)

In [None]:
# Sites included in the analysis. The order matters: later cells read one
# config per site with lapply() and pick a config by its position in this vector.
site_list <- c(
  'KUMC', 'UTSW', 'MCW', 'UofU', 'UIOWA',
  'UMHC', 'UPITT', 'UTHSCSA', 'UNMC'
)

In [None]:
# Read a site's `key = value` configuration file into a named list.
#
# Arguments:
#   site                 Site identifier; stored in the result as `site`.
#   suffix               Suffix used in the file name `configs_<suffix>.txt`.
#                        Defaults to `site` when NULL.
#   config_base_location Directory that contains the config files.
#
# Returns a list holding `site`, `config_base_location`, `config_filename` and
# one character element per `key = value` line of the file. All values are
# returned as strings; callers must convert numbers/booleans themselves.
#
# Parsing rules:
#   * everything after the first '#' on a line is a comment (also inside quotes);
#   * blank lines and lines without '=' are ignored;
#   * the line is split at the FIRST '=', so values may themselves contain '=';
#   * `key =` with nothing after it yields an empty string;
#   * one pair of matching surrounding quotes (single or double) is removed.
#
# Stops with an error when the config file does not exist.
read_config <- function(site, suffix=NULL, config_base_location='/home/hoyinchan/code/AKI_CDM_PY/configs_files/publish_config/') {
  if (is.null(suffix)) {
    suffix <- site
  }

  config <- list()
  config$site <- site
  config$config_base_location <- config_base_location
  config$config_filename <- paste0('configs_', suffix, '.txt')

  filename <- file.path(config$config_base_location, config$config_filename)
  if (!file.exists(filename)) {
    stop("Config file not found: ", filename, call. = FALSE)
  }

  for (raw_line in readLines(filename, warn = FALSE)) {
    # Drop the comment part. strsplit() is avoided on purpose: it returns
    # character(0) for an empty line, which turns into NA further down.
    hash_pos <- regexpr("#", raw_line, fixed = TRUE)[[1L]]
    if (hash_pos > 0L) {
      raw_line <- substr(raw_line, 1L, hash_pos - 1L)
    }
    line <- trimws(raw_line)

    if (!nzchar(line)) {
      next
    }

    # Split at the first '=' only, so the value keeps any later '='.
    eq_pos <- regexpr("=", line, fixed = TRUE)[[1L]]
    if (eq_pos < 1L) {
      next
    }
    key <- trimws(substr(line, 1L, eq_pos - 1L))
    value <- trimws(substr(line, eq_pos + 1L, nchar(line)))

    if (!nzchar(key)) {
      warning("Ignoring config line without a key in ", filename, ": ", line,
              call. = FALSE)
      next
    }

    # Remove one pair of matching surrounding quotes; a lone quote character
    # is left untouched.
    if (nchar(value) >= 2L) {
      first_char <- substr(value, 1L, 1L)
      last_char <- substr(value, nchar(value), nchar(value))
      if (first_char == last_char && first_char %in% c('"', "'")) {
        value <- substr(value, 2L, nchar(value) - 1L)
      }
    }

    config[[key]] <- value
  }

  return(config)
}

# Helper functions for startsWith and endsWith in base R
# Element-wise test of whether `string` begins with `prefix`.
# Defined at top level under the same name as base R's startsWith(), which it
# therefore masks for the rest of the session.
startsWith <- function(string, prefix) {
  prefix_len <- nchar(prefix)
  leading <- substr(string, 1, prefix_len)
  leading == prefix
}

# Element-wise test of whether `string` finishes with `suffix`.
# Defined at top level under the same name as base R's endsWith(), which it
# therefore masks for the rest of the session.
endsWith <- function(string, suffix) {
  string_len <- nchar(string)
  tail_start <- string_len - nchar(suffix) + 1
  trailing <- substr(string, tail_start, string_len)
  trailing == suffix
}

In [None]:
# Read one config per site. Elements are named after their site so a config
# can be looked up by site name; positional indexing keeps working as before.
configs_variables <- setNames(lapply(site_list, read_config), site_list)

In [None]:
# Pick the config whose settings are used for the rest of the notebook.
# The site is located by name in site_list (it is the 8th entry) instead of
# through a bare positional index, so reordering site_list cannot silently
# select a different site.
model_site <- 'UTHSCSA'
model_site_index <- match(model_site, site_list)
stopifnot(!is.na(model_site_index))
configs_variable_m <- configs_variables[[model_site_index]]

In [None]:
# Year token inserted into the SHAP parquet file names built further down.
year <- 3000

# Settings taken from the selected site's config. read_config() returns every
# value as a character string.
datafolder <- configs_variable_m$datafolder
home_directory <- configs_variable_m$home_directory
stg <- configs_variable_m$stg
fs <- configs_variable_m$fs
oversample <- configs_variable_m$oversample
model_type <- configs_variable_m$model_type

# The flag arrives as text (e.g. "True"/"False"), so coerce it explicitly and
# fail with a clear message when it is missing or not interpretable, instead
# of relying on `if` to coerce a string or choke on NULL.
drop_correlation_catboost <- configs_variable_m$drop_correlation_catboost
drop_correlation_flag <- as.logical(drop_correlation_catboost)
if (length(drop_correlation_flag) != 1L || is.na(drop_correlation_flag)) {
  stop(
    "`drop_correlation_catboost` must be a single TRUE/FALSE value in the config, got: '",
    paste(format(drop_correlation_catboost), collapse = ", "), "'",
    call. = FALSE
  )
}

# File-name suffix: 'nc' when the flag is set, empty otherwise.
if (drop_correlation_flag) {
  suffix <- 'nc'
} else {
  suffix <- ''
}

In [None]:
# Load the `toptop` table from a parquet file located one directory above the
# current working directory.
toptop <- arrow::read_parquet("../toptop.parquet")

In [None]:
# Display the loaded table for inspection.
toptop

In [None]:
# Cache of loaded SHAP tables, keyed "<site_m>_<site_d>". Each entry is a list
# with the two tables read for that pair: `dfraw` and `dfrawX`.
data_dict <- list()

# Walk over every ordered (site_m, site_d) pair and load the pair's two parquet
# files, but only when both of them exist on disk.
for (site_m in site_list) {
  for (site_d in site_list) {
    # Part of the file name shared by both files of the pair.
    file_tail <- paste0(
      model_type, '_', site_m, '_', site_d, '_',
      year, '_', stg, '_', fs, '_', oversample, suffix, '.parquet'
    )
    path <- paste0(datafolder, site_m, '/shapdataraw_', file_tail)
    pathX <- paste0(datafolder, site_m, '/shapdatarawX_', file_tail)

    # Pairs with a missing file are simply left out of data_dict.
    if (!file.exists(path) || !file.exists(pathX)) {
      next
    }

    dfraw <- arrow::read_parquet(path)
    dfrawX <- arrow::read_parquet(pathX)

    pair_key <- paste(site_m, site_d, sep = "_")
    data_dict[[pair_key]] <- list(dfraw = dfraw, dfrawX = dfrawX)
  }
}

In [None]:
# Collect one feature's column from every loaded (site_m, site_d) pair into a
# single long data frame.
#
# Arguments:
#   feature    Column name to extract (character scalar).
#   site_list  Site identifiers; every ordered pair "<site_m>_<site_d>" is
#              looked up in data_dict.
#   data_dict  Named list keyed "<site_m>_<site_d>", each entry holding the
#              tables `dfraw` and `dfrawX`.
#
# Returns a data.frame with columns
#   val     the feature's column taken from `dfraw`,
#   Name    the feature's column taken from `dfrawX`,
#   site_m, site_d, feature
# or NULL when no pair contains the feature. Pairs that are absent from
# data_dict, or whose tables lack the column, are skipped.
#
# Stops when the two tables of a pair have different numbers of rows, because
# they are combined row by row.
filter_single_feature <- function(feature, site_list, data_dict){
    pieces <- list()

    for (site_m in site_list) {
      for (site_d in site_list) {
          key <- paste(site_m, site_d, sep = "_")
          if (!(key %in% names(data_dict))) {
              next
          }
          dfraw <- data_dict[[key]]$dfraw
          dfrawX <- data_dict[[key]]$dfrawX

          # The column must be present in both tables of the pair.
          if (!(feature %in% colnames(dfraw)) || !(feature %in% colnames(dfrawX))) {
              next
          }

          n_rows <- nrow(dfraw)
          if (nrow(dfrawX) != n_rows) {
              stop("dfraw and dfrawX have different row counts for pair '", key, "'",
                   call. = FALSE)
          }

          # rep() keeps this valid for zero-row tables as well.
          pieces[[length(pieces) + 1L]] <- data.frame(
              val = dfraw[[feature]],
              Name = dfrawX[[feature]],
              site_m = rep(site_m, n_rows),
              site_d = rep(site_d, n_rows),
              feature = rep(feature, n_rows),
              stringsAsFactors = FALSE
          )
      }
    }

    if (length(pieces) == 0L) {
        return(NULL)
    }
    # Bind once at the end instead of growing the frame inside the loop.
    combined_df <- do.call(rbind, pieces)
    return(combined_df)
}

In [None]:
# Quick check: build and display the combined frame for the 'AGE' feature
# across all loaded site pairs.
filter_single_feature('AGE', site_list, data_dict)

In [None]:
# Collect two features' columns from every loaded (site_m, site_d) pair into a
# single long data frame.
#
# Arguments:
#   featureX, featureY  Column names to extract (character scalars).
#   site_list           Site identifiers; every ordered pair
#                       "<site_m>_<site_d>" is looked up in data_dict.
#   data_dict           Named list keyed "<site_m>_<site_d>", each entry
#                       holding the tables `dfraw` and `dfrawX`.
#
# Returns a data.frame with columns
#   val.x, Name.x   featureX taken from `dfraw` and `dfrawX` respectively,
#   val.y, Name.y   featureY taken from `dfraw` and `dfrawX` respectively,
#   site_m, site_d,
#   val             val.x + val.y,
#   featureX, featureY
# or NULL when no pair contains both features. Pairs that are absent from
# data_dict, or whose tables lack either column, are skipped.
#
# Stops when the two tables of a pair have different numbers of rows, because
# they are combined row by row.
filter_double_feature <- function(featureX, featureY, site_list, data_dict){
    pieces <- list()
    wanted <- c(featureX, featureY)

    for (site_m in site_list) {
      for (site_d in site_list) {
          key <- paste(site_m, site_d, sep = "_")
          if (!(key %in% names(data_dict))) {
              next
          }
          dfraw <- data_dict[[key]]$dfraw
          dfrawX <- data_dict[[key]]$dfrawX

          # Both columns must be present in both tables of the pair.
          if (!all(wanted %in% colnames(dfraw)) || !all(wanted %in% colnames(dfrawX))) {
              next
          }

          n_rows <- nrow(dfraw)
          if (nrow(dfrawX) != n_rows) {
              stop("dfraw and dfrawX have different row counts for pair '", key, "'",
                   call. = FALSE)
          }

          # rep() keeps this valid for zero-row tables as well.
          pieces[[length(pieces) + 1L]] <- data.frame(
              val.x = dfraw[[featureX]],
              Name.x = dfrawX[[featureX]],
              val.y = dfraw[[featureY]],
              Name.y = dfrawX[[featureY]],
              site_m = rep(site_m, n_rows),
              site_d = rep(site_d, n_rows),
              val = dfraw[[featureX]] + dfraw[[featureY]],
              featureX = rep(featureX, n_rows),
              featureY = rep(featureY, n_rows),
              stringsAsFactors = FALSE
          )
      }
    }

    if (length(pieces) == 0L) {
        return(NULL)
    }
    # Bind once at the end instead of growing the frame inside the loop.
    combined_df <- do.call(rbind, pieces)
    return(combined_df)
}

In [None]:
# Quick check: build and display the combined frame for the feature pair
# 'AGE' / 'RACE_03' across all loaded site pairs.
filter_double_feature('AGE', 'RACE_03', site_list, data_dict)

In [None]:
#dfraw$site_d <- as.factor(dfraw$site_d)
#dfraw$site_m <- as.factor(dfraw$site_m)

In [None]:
# Features handled as categorical. gam_proc and gam_proc2d test membership in
# this list with `%in%` and convert the matching feature's values to a factor.
cattarget <- list("PX:CH:J1940", "PX:09:96.72")
#cattarget <- names(df)[sapply(df, is.logical)]

In [None]:
# Fit one model with mgcv::bam (REML) and collect what is exported afterwards.
#
# Arguments:
#   eqn     Model formula.
#   dfraw2  Data frame holding the variables used in `eqn`.
#   target  Feature label; printed and returned unchanged.
#   type    Model label (e.g. 'linear'); printed and returned unchanged.
#   weight  When TRUE the fit uses the column `rocw` of `dfraw2` as prior
#           weights; that column must then exist in the data.
#
# Side effects: prints `target`, `type` and the model summary, and draws the
# model's plots on the active graphics device (via plot()).
#
# Returns an unnamed list: target, type, summary object, the plot data
# returned by plot() with each element's `raw` entry removed, and the
# termplot() data.
fit_proc <- function(eqn, dfraw2, target, type, weight=FALSE){
    if (weight) {
        # `weights` is spelled out in full; the argument is evaluated inside
        # `dfraw2`, so `rocw` refers to a column of the data.
        xfit <- bam(eqn, data=dfraw2, method='REML', weights=rocw)
    } else {
        xfit <- bam(eqn, data=dfraw2, method='REML')
    }

    # Compute the summary once and reuse it for printing and for the result.
    sxfit <- summary(xfit)
    print(target)
    print(type)
    print(sxfit)
    flush.console()

    pxfit <- plot(xfit)
    pxfit2 <- termplot(xfit, data=dfraw2, se = TRUE, plot = FALSE)

    # Drop the `raw` entry of every plot record before returning
    # (presumably to keep the serialized output small -- TODO confirm).
    # seq_along() also copes with an empty plot list.
    for (i in seq_along(pxfit)){
        pxfit[[i]]$raw <- NULL
    }
    return(list(target, type, sxfit, pxfit, pxfit2))
}

In [None]:
# Fit the four single-feature models ('linear', 'quadratic', 'spline',
# 'spline_interaction') for every target feature and export the results.
#
# Arguments:
#   outputname  Path of the JSON file to write (ignored when returnf is TRUE).
#   cattarget   Collection of feature names treated as categorical; for these
#               `Name` is converted to a factor and entered as a plain term.
#   targets     Feature names to process.
#   site_list   Site identifiers, forwarded to filter_single_feature().
#   data_dict   Loaded SHAP tables, forwarded to filter_single_feature().
#   returnf     When TRUE, return the result list instead of writing JSON.
#   weight      Forwarded to fit_proc() for every fit.
#   noAUC       When FALSE the formulas include the `roc2` main effect.
#
# Returns the list of per-target results when returnf is TRUE; otherwise
# writes the list as JSON to `outputname` and returns NULL invisibly.
# Each per-target result is an unnamed list of four fit_proc() results in the
# order linear, quadratic, spline, spline_interaction.
#
# Targets for which filter_single_feature() finds no data are skipped with a
# warning.
#
# NOTE(review): the formulas reference `roc2` (and fit_proc uses `rocw` when
# weight is TRUE), but the frame built by filter_single_feature() in this file
# only has val, Name, site_m, site_d and feature -- confirm where these
# variables are expected to come from.
gam_proc <- function(outputname, cattarget, targets, site_list, data_dict, returnf=FALSE, weight=FALSE, noAUC=FALSE) {
    fit_types <- c('linear', 'quadratic', 'spline', 'spline_interaction')
    resultlist <- list()

    for (target in targets){
        print(target)
        flush.console()

        dfraw2 <- filter_single_feature(target, site_list, data_dict)
        if (is.null(dfraw2)) {
            warning("No data found for feature '", target, "'; skipping it.",
                    call. = FALSE)
            next
        }

        # The random-effect smooths s(..., bs="re") are meant for factor
        # grouping variables; as.factor() is a no-op if they already are.
        dfraw2$site_d <- as.factor(dfraw2$site_d)
        dfraw2$site_m <- as.factor(dfraw2$site_m)

        if (target %in% cattarget){
            # Categorical feature: all four labels share the same formula.
            dfraw2$Name <- as.factor(dfraw2$Name)
            if (noAUC) {
                eqn_cat <- val ~ Name + s(site_d,bs="re") + s(site_m,bs="re")
            } else {
                eqn_cat <- val ~ Name + s(site_d,bs="re") + s(site_m,bs="re") + roc2
            }
            eqns <- list(eqn_cat, eqn_cat, eqn_cat, eqn_cat)
        } else if (noAUC) {
            # NOTE(review): the interaction model still uses roc2 inside ti()
            # even though noAUC is TRUE -- kept as in the original, confirm.
            eqns <- list(
                val ~ poly(Name,1,raw=TRUE) + s(site_d,bs="re") + s(site_m,bs="re"),
                val ~ poly(Name,2,raw=TRUE) + s(site_d,bs="re") + s(site_m,bs="re"),
                val ~ s(Name,k=10,bs='cr') + s(site_d,bs="re") + s(site_m,bs="re"),
                val ~ s(Name,k=10,bs='cr') + s(site_d,bs="re") + s(site_m,bs="re") + ti(Name,roc2,bs='cr')
            )
        } else {
            eqns <- list(
                val ~ poly(Name,1,raw=TRUE) + s(site_d,bs="re") + s(site_m,bs="re") + roc2,
                val ~ poly(Name,2,raw=TRUE) + s(site_d,bs="re") + s(site_m,bs="re") + roc2,
                val ~ s(Name,k=10,bs='cr') + s(site_d,bs="re") + s(site_m,bs="re") + roc2,
                val ~ s(Name,k=10,bs='cr') + s(site_d,bs="re") + s(site_m,bs="re") + roc2 + ti(Name,roc2,bs='cr')
            )
        }

        # `weight` is forwarded for every combination of branch and model.
        result <- vector("list", length(fit_types))
        for (i in seq_along(fit_types)) {
            result[[i]] <- fit_proc(eqns[[i]], dfraw2, target, fit_types[i], weight=weight)
        }
        resultlist[[length(resultlist) + 1L]] <- result
    }

    if (returnf){
        return(resultlist)
    }
    output_to_python <- toJSON(resultlist, force = TRUE, digits = 30)
    write(output_to_python, paste0(outputname))
    invisible(NULL)
}

In [None]:
# outputname <- "gamalltmp_single_AUC_populationweight.json"
# gam_proc(outputname, cattarget, targets, dfraw, weight=TRUE, noAUC=FALSE)

In [None]:
# Run the single-feature fits with noAUC=TRUE and weight=FALSE and write the
# serialized results to `outputname` (relative to the working directory).
# NOTE(review): `targets` is not assigned in any visible cell of this notebook;
# it must already exist in the session -- confirm where it is defined.
outputname <- "gamalltmp_single_noAUC.json"
gam_proc(outputname, cattarget, targets, site_list, data_dict, weight=FALSE, noAUC=TRUE)

In [None]:
# outputname <- "gamalltmp_single_weightAUC2.json"
# gam_proc(outputname, cattarget, targets, dfraw, weight=TRUE, noAUC=TRUE)

In [None]:
# outputname <- "gamalltmp_single_weightAUC2.json"
# gam_proc(outputname, cattarget, targets, dfraw, weight=TRUE, noAUC=TRUE)

In [None]:
# dfraw2 <- dfraw %>% filter(Feature=='LAB::2345-7(mg/dL)') 
# #eqnl <- val ~ poly(Name,1,raw=TRUE) + s(site_d,bs="re") + s(site_m,bs="re") 
# eqnq <- val ~ poly(Name,2,raw=TRUE) + s(site_d,bs="re") + s(site_m,bs="re") 
# #eqns <- val ~ s(Name,k=10,bs='cr') + s(site_d,bs="re") + s(site_m,bs="re") 
# #eqnt <- val ~ s(Name,k=10,bs='cr') + s(site_d,bs="re") + s(site_m,bs="re")  + ti(Name,roc2,bs='cr')            
# resultq<-fit_proc(eqnq, dfraw2, 'AGE', 'quadratic')
# #results<-fit_proc(eqns, dfraw2, 'AGE', 'spline')

# toJSON(resultq, force = TRUE, digit=30)

# summary(resultq)

# ## 2D

# dfraw2 <- dfraw %>% filter(Feature=='AGE')
# eqnl <- val ~ s(Name,k=10,bs='cr') + s(site_d,bs="re") + s(site_m,bs="re") + roc2
# xfit <- bam(eqnl, data=dfraw2, method='REML') 
# plot(xfit)

# dfraw2 <- dfraw %>% filter(Feature=='AGE')
# eqnl <- val ~ s(Name,k=10,bs='cr') + s(site_d,bs="re") + s(site_m,bs="re") + roc2 + s(site_d,roc2,bs="re")
# xfit <- bam(eqnl, data=dfraw2, method='REML') 
# plot(xfit)

# summary(xfit)

In [None]:
# Try 2 feature interaction

In [None]:
# All unordered pairs of target features, one character vector of length 2 per pair.
target_combo <- combn(targets, 2, simplify = FALSE)
outputname <- "gamalltmp_double_interaction_quadratic.json"
# The same pairs with the two features swapped ...
reversed_target_combo <- lapply(target_combo, function(pair) rev(pair))
# ... and both orientations together, original pairs first.
combined_list <- c(target_combo, reversed_target_combo)

In [None]:
# Fit a two-feature interaction model with mgcv::bam (REML) and export it.
#
# Arguments:
#   cattarget   Collection of feature names treated as categorical.
#   dfraw       Unused; kept so existing calls keep working.
#   f1, f2      Feature names. `f1` must be continuous (not in cattarget);
#               `f2` may be continuous or categorical.
#   site_list   Site identifiers, forwarded to filter_double_feature().
#   data_dict   Loaded SHAP tables, forwarded to filter_double_feature().
#   returnf     When TRUE, return the result instead of writing JSON.
#   filename    Output path. When NULL a per-pair name
#               "gam2d_tmp_<f1>_<f2>.json" is built inside `output_dir`, with
#               '::', '/', '(' and ')' in the feature names replaced.
#   output_dir  Directory used for the default file name.
#
# Returns NULL when `f1` is categorical (nothing is fitted). Otherwise returns
# list(f1, f2, summary-as-list, plot data) when returnf is TRUE, or writes that
# list as JSON and returns NULL invisibly.
#
# Stops when filter_double_feature() finds no data for the pair.
#
# Side effect: plot() draws the model's plots on the active graphics device.
gam_proc2d <- function(cattarget, dfraw, f1, f2, site_list, data_dict, returnf = FALSE, filename = NULL, output_dir = ".") {

    # Only continuous-by-continuous and continuous-by-categorical pairs are
    # modelled, so a categorical first feature ends the call before any data
    # is assembled.
    if (f1 %in% cattarget){
        return(NULL)
    }

    # Turn a feature name into something usable inside a file name.
    sanitize_name <- function(x) {
        x <- gsub('::', '_', x, fixed = TRUE)
        x <- gsub('/', 'per', x, fixed = TRUE)
        x <- gsub('(', '_', x, fixed = TRUE)
        gsub(')', '_', x, fixed = TRUE)
    }

    # continuous x continuous: two smooths plus their tensor interaction
    eqn_cc <- val ~ s(Name.x,k=10,bs='cr') + s(Name.y,k=10,bs='cr') + s(site_d,bs="re") + s(site_m,bs="re") + ti(Name.x,Name.y,k=10,bs='cr')
    # continuous x categorical: a smooth of Name.x per level of Name.y
    eqn_cd <- val ~ s(Name.x,k=10,bs='cr') + s(Name.x,by=Name.y,k=10,bs='cr') + Name.y + s(site_d,bs="re") + s(site_m,bs="re")

    dfraw23 <- filter_double_feature(f1, f2, site_list, data_dict)
    if (is.null(dfraw23)) {
        stop("No data found for feature pair '", f1, "' / '", f2, "'", call. = FALSE)
    }

    # The random-effect smooths s(..., bs="re") are meant for factor grouping
    # variables; as.factor() is a no-op if they already are.
    dfraw23$site_d <- as.factor(dfraw23$site_d)
    dfraw23$site_m <- as.factor(dfraw23$site_m)

    if (f2 %in% cattarget){
        eqn <- eqn_cd
        dfraw23$Name.y <- as.factor(dfraw23$Name.y)
    } else {
        eqn <- eqn_cc
    }

    xfit <- bam(eqn, data=dfraw23, method='REML')
    sxfit <- summary(xfit)
    pxfit <- plot(xfit)
    # Drop the `raw` entry of every plot record before returning
    # (presumably to keep the serialized output small -- TODO confirm).
    for (i in seq_along(pxfit)){
        pxfit[[i]]$raw <- NULL
    }
    result <- list(f1, f2, as.list(sxfit), pxfit)
    if (returnf){
        return(result)
    }

    # Build the output path here so that every pair gets its own file and the
    # function no longer depends on a global `filename`.
    if (is.null(filename)) {
        filename <- file.path(
            output_dir,
            paste0('gam2d_tmp_', sanitize_name(f1), '_', sanitize_name(f2), '.json')
        )
    }
    output_to_python <- toJSON(result, force = TRUE, digits = 30)
    write(output_to_python, filename)
    invisible(NULL)
}

In [None]:
#gam_proc2d(cattarget, dfraw, target_combo[[179]][1], target_combo[[179]][2], stg, fs, oversample, model_type)

In [None]:
# Load libraries
library(foreach)
library(doParallel)

# Register parallel backends
numCores <- 10  # Or set a specific number
registerDoParallel(cores=numCores)

# Fit every feature pair in parallel. Each task yields NA when it completed
# and the error message when it failed, so failures are reported below
# instead of being discarded.
gam2d_status <- foreach(tar = iter(target_combo)) %dopar% {
#for (tar in target_combo){
    tryCatch({
        print(paste(tar[1], tar[2], 'running'))
        flush.console()
        # gam_proc2d's seventh positional argument is `returnf`, so nothing
        # may follow data_dict here.
        gam_proc2d(cattarget, dfraw, tar[1], tar[2], site_list, data_dict)
        NA_character_
    },
    error = function(cond) {
        conditionMessage(cond)
    }
    )
}

# Summarise failed pairs; foreach returns results in the order of target_combo.
gam2d_failed <- !vapply(gam2d_status, is.na, logical(1))
if (any(gam2d_failed)) {
    for (i in which(gam2d_failed)) {
        message(paste(target_combo[[i]], collapse = ' x '), ': ', gam2d_status[[i]])
    }
    warning(sum(gam2d_failed), " of ", length(gam2d_failed),
            " feature pairs failed; see messages above.", call. = FALSE)
}

In [None]:
# Marker printed when execution reaches this cell.
print('done')

In [None]:
# Single-pair run kept in memory (returnf=TRUE) instead of being written to
# disk: "LAB::4544-3(%)" is not in cattarget, 'PX:CH:J1940' is.
# The arguments after the two feature names follow gam_proc2d's signature
# (site_list, data_dict).
xxx <- gam_proc2d(cattarget, dfraw, "LAB::4544-3(%)", 'PX:CH:J1940', site_list, data_dict, returnf=TRUE)

In [None]:
# Scratch cell: a step-by-step, top-level copy of the two-feature fit for one
# hard-coded pair. It repeats the logic of gam_proc2d outside a function.
# NOTE(review): this cell filters `dfraw` on a `Feature` column and joins on
# ID/roc/roc2, i.e. it expects a long-format table; the `dfraw` left behind by
# the loading loop in this notebook is a per-pair parquet table -- confirm that
# this cell is still meant to run.
f2 <- 'AGE'
f1 <- 'SYSTOLIC'
returnf=FALSE

# File-name-safe versions of the feature names: '::' -> '_', '/' -> 'per',
# '(' and ')' -> '_'.
f1str <- str_replace_all(f1,'::','_')
f1str <- str_replace_all(f1str,'/','per')
f1str <- str_replace_all(f1str,'\\(','_')
f1str <- str_replace_all(f1str,'\\)','_')

f2str <- str_replace_all(f2,'::','_')
f2str <- str_replace_all(f2str,'/','per')
f2str <- str_replace_all(f2str,'\\(','_')
f2str <- str_replace_all(f2str,'\\)','_')    

# Build the per-pair output path and bail out if it already exists.
# NOTE(review): return() is used here outside any function, so it cannot act
# as an early exit for the rest of the cell -- confirm the intended behaviour.
if (!returnf){
    filename <- paste0('/home/hoyinchan/blue/program_data/AKI_CDM_PY/MetaRegression/gam2d_tmp/','gam2d_tmp_noAUC_partial_','_',f1str,'_',f2str,'_',stg,'_',fs,'_',oversample,'_',model_type,'.json')
    if (file.exists(filename)){
        return()
    }
}

# eqn_cc: continuous x continuous (here without a separate smooth of Name.y);
# eqn_cd: continuous x categorical, one smooth of Name.x per level of Name.y.
eqn_cc <- val ~ s(Name.x,k=10,bs='cr') + s(site_d,bs="re") + s(site_m,bs="re") + ti(Name.x,Name.y,k=10,bs='cr')
eqn_cd <- val ~ s(Name.x,k=10,bs='cr') + s(Name.x,by=Name.y,k=10,bs='cr') + Name.y + s(site_d,bs="re") + s(site_m,bs="re")

# Swap the features so that a categorical one, if any, ends up as f2.
if (f1 %in% cattarget & !f2 %in% cattarget){
    tmp = f1
    f1 = f2
    f2 = tmp
}

# Pair up the rows of the two features; the join suffixes the clashing
# columns with .x (f1) and .y (f2). The response is f1's value only.
dfraw21 <- dfraw %>% filter(Feature==f1)
dfraw22 <- dfraw %>% filter(Feature==f2)
dfraw23 <- dfraw21 %>% inner_join(dfraw22, by=c('ID','site_d', 'site_m', 'roc', 'roc2'))
#dfraw23 <- dfraw23 %>% mutate(val=val.x+val.y)
dfraw23 <- dfraw23 %>% mutate(val=val.x)

# Choose the formula from the feature types; Name.y becomes a factor when f2
# is categorical.
if (!f1 %in% cattarget & f2 %in% cattarget){
    eqn <- eqn_cd
    dfraw23$Name.y <- as.factor(dfraw23$Name.y)
} else if (!f1 %in% cattarget & !f2 %in% cattarget){
    eqn <- eqn_cc
#    eqn <- eqn_qcc        
} else {
    return()
}

# Fit, summarise and plot; `sxfit` is displayed in the next cell.
xfit <- bam(eqn, data=dfraw23, method='REML') 
sxfit<-summary(xfit)
pxfit<-plot(xfit)

# Drop the `raw` entry of every plot record.
for (i in 1:length(pxfit)){
    pxfit[[i]]$raw=NULL
}
result<-list(f1, f2, as.list(sxfit), pxfit)
if (returnf){
    return(result)
}
# NOTE(review): the JSON string is built but never written to `filename` in
# this cell.
output_to_python <- toJSON(result, force = TRUE, digit=30)

In [None]:
# Display the model summary stored in `sxfit` by the preceding cell.
sxfit