In [None]:
library(arrow)
library(dplyr)
library(mgcv)
library(tidyverse)
library(data.table)
library(metagam)
library(latex2exp)
library(eivtools)
library(plotly)
library(stats)
library(plotrix)
library(fishmethods)
library(metafor)
library(jsonlite)
library(stringr)
library(doParallel)
library(parallel)
library(MASS)

In [None]:
# Run configuration. These four tags are handed to gam_proc2d(), which
# embeds them in the name of each per-pair output file.
stg <- "stg01"
fs <- "nofs"
oversample <- "raw"
model_type <- "catd"

In [None]:
# Read the parquet table, drop the serialized index column, remove every row
# whose site_d or site_m is MCRI, and store both site columns as factors.
path <- "/home/hoyinchan/blue/Data/data2021/data2021/shapalltmp.parquet"
dfraw <- arrow::read_parquet(path) %>%
    dplyr::select(-"__index_level_0__") %>%
    dplyr::filter(site_d != "MCRI", site_m != "MCRI") %>%
    dplyr::mutate(
        site_d = as.factor(site_d),
        site_m = as.factor(site_m)
    )

In [None]:
# Distinct values of the Feature column; the fits below iterate over these.
targets <- unique(dfraw[["Feature"]])

In [None]:
# Features that gam_proc() and gam_proc2d() skip instead of fitting
# (presumably the categorical ones, going by the name -- TODO confirm).
cattarget <- as.list(c("PX:CH:J1940", "PX:09:96.72"))

In [None]:
# Mean roc over the distinct (site_d, site_m, roc) combinations, so each
# combination counts once regardless of how many rows it has.
dfmean <- dfraw %>%
    dplyr::distinct(site_d, site_m, roc) %>%
    dplyr::pull(roc) %>%
    mean()
# rocw is roc rescaled by that mean; gam_proc() uses it as the weight column.
dfraw$rocw <- dfraw$roc / dfmean

In [None]:
# Fit one GAM per feature and write all summaries to a single JSON file.
#
# For every element of `targets` that is not in `cattarget`, the rows of
# `dfraw` with Feature == target are fitted with bam() using REML. The model
# summary and the per-term plot data returned by plot() (with the bulky `raw`
# element removed) are collected as list(target, summary, plot_data). The
# collected list is serialised with toJSON() and written to `outputname`.
#
# Arguments:
#   eqn        model formula passed to bam().
#   outputname path of the JSON file to write.
#   cattarget  features to skip.
#   targets    feature names to iterate over.
#   dfraw      data frame with a `Feature` column and the formula variables;
#              must also contain `rocw` when weight is TRUE.
#   name2      unused; kept so existing positional calls keep working.
#   weight     if TRUE, the `rocw` column is passed to bam() as weights.
#
# Called for its side effects: prints each fitted target, draws the plots
# produced by plot(), and writes the JSON file.
gam_proc <- function(eqn, outputname, cattarget, targets, dfraw, name2, weight=FALSE) {
    # Decide up front which targets are fitted so the result list can be
    # allocated once instead of being grown with append() in the loop.
    fit_targets <- targets[!(targets %in% cattarget)]
    resultlist <- vector("list", length(fit_targets))

    for (idx in seq_along(fit_targets)) {
        target <- fit_targets[[idx]]
        print(target)
        dfraw2 <- dplyr::filter(dfraw, Feature == target)

        if (isTRUE(weight)) {
            # Spell out `weights`: the old `weight=` only worked through
            # partial argument matching.
            xfit <- bam(eqn, data = dfraw2, weights = rocw, method = "REML")
        } else {
            xfit <- bam(eqn, data = dfraw2, method = "REML")
        }

        sxfit <- summary(xfit)
        pxfit <- plot(xfit)
        # seq_along() is safe when plot() returns an empty list; 1:length()
        # would iterate over c(1, 0).
        for (i in seq_along(pxfit)) {
            pxfit[[i]]$raw <- NULL
        }

        resultlist[[idx]] <- list(target, as.list(sxfit), pxfit)
    }

    output_to_python <- toJSON(resultlist, force = TRUE)
    write(output_to_python, outputname)
}

In [None]:
# Single-feature model, "noAUC" variant: a smooth of Name plus a random
# effect for site_d. The original variant also had a linear roc2 term:
#   eqn <- val ~ s(Name,k=10,bs='cr') + s(site_d,bs="re") + roc2
eqn <- val ~ s(Name, k = 10, bs = "cr") + s(site_d, bs = "re")
# NOTE(review): the file name says "weightedAUC" but the call below is
# unweighted -- confirm which one is intended.
outputname <- "gamalltmp_single_nointeraction_weightedAUC.json"
# Unweighted run; use weight = TRUE instead to weight rows by rocw.
gam_proc(
    eqn, outputname, cattarget, targets, dfraw,
    name2 = FALSE, weight = FALSE
)

In [None]:
# eqn <- val ~ s(Name,k=10,bs='cr') + s(site_d,bs="re") + roc2 + ti(Name,roc2,k=10,bs='cr')
# outputname <- "gamalltmp_single_interaction.json"
# gam_proc(eqn, outputname, cattarget, targets, dfraw, FALSE)

In [None]:
# dfraw2 <- dfraw %>% filter(Feature=='AGE')
# xfit <- bam(eqn, data=dfraw2, method='REML')

# summary(xfit)

In [None]:
# Interaction variant of the single-feature model: adds a linear roc2 term
# and a ti() interaction between Name and roc2.
eqn <- val ~ s(Name, k = 10, bs = "cr") + s(site_d, bs = "re") + roc2 +
    ti(Name, roc2, k = 10, bs = "cr")
# Try it on one feature only.
dfraw2 <- dplyr::filter(dfraw, Feature == "AGE")

In [None]:
xfit <- bam(eqn, data = dfraw2, method = "REML")

In [None]:
summary(xfit)

In [None]:
# Keep the value returned by plot() so the per-term data can be inspected.
testx <- plot(xfit)

In [None]:
# Third entry of the returned plot data.
testxx <- testx[[3]]

In [None]:
testxx[["fit"]]

In [None]:
# Try two-feature interactions (one model per pair of features)

In [None]:
# Every unordered pair of features, as a list of length-2 vectors.
target_combo <- combn(targets, m = 2, simplify = FALSE)
outputname <- "gamalltmp_double_interaction.json"

In [None]:
# Fit the two-feature interaction GAM for one pair of features and write the
# result to gam2d_tmp/gam2d_tmp_<f1>_<f2>_<stg>_<fs>_<oversample>_<model_type>.json.
#
# The rows of `dfraw` for `f1` and `f2` are inner-joined on
# ID, site_d, site_m, roc and roc2, and the response `val` is the sum of the
# two per-feature values (val.x + val.y). The model has a smooth for each
# feature's Name, a random effect for site_d, a linear roc2 term, and ti()
# interactions Name.x:roc2, Name.y:roc2 and Name.x:Name.y. The JSON holds
# list(f1, f2, summary, plot_data), with the `raw` element stripped from each
# plot entry.
#
# Arguments:
#   cattarget  features for which no model is fitted.
#   dfraw      data frame with columns Feature, ID, site_d, site_m, roc,
#              roc2, Name and val.
#   f1, f2     the two feature names.
#   stg, fs, oversample, model_type
#              tags that only affect the output file name.
#
# Returns NULL invisibly. Nothing is fitted or written when the output file
# already exists or when either feature is in `cattarget`.
gam_proc2d <- function(cattarget, dfraw, f1, f2, stg, fs, oversample, model_type) {
    # Replace the characters '::', '/', '(' and ')' so a feature name can be
    # embedded in a file name.
    sanitize_name <- function(feature) {
        cleaned <- str_replace_all(feature, '::', '_')
        cleaned <- str_replace_all(cleaned, '/', 'per')
        cleaned <- str_replace_all(cleaned, '\\(', '_')
        str_replace_all(cleaned, '\\)', '_')
    }

    filename <- paste0(
        'gam2d_tmp/', 'gam2d_tmp', '_', sanitize_name(f1), '_', sanitize_name(f2),
        '_', stg, '_', fs, '_', oversample, '_', model_type, '.json'
    )

    # An existing output file means this pair was already processed.
    if (file.exists(filename)) {
        return(invisible(NULL))
    }

    # Only pairs where neither feature is in `cattarget` are fitted. This is
    # checked before any data is touched so skipped pairs cost nothing.
    # The model for one feature in `cattarget` (as Name.y) is not enabled yet;
    # its intended formula was:
    #   val ~ s(Name.x, k=10, bs='cr') + s(Name.x, by=Name.y, k=10, bs='cr') +
    #       Name.y + s(site_d, bs="re") + roc2 + ti(Name.x, roc2, k=10, bs='cr')
    if (f1 %in% cattarget || f2 %in% cattarget) {
        return(invisible(NULL))
    }

    eqn <- val ~ s(Name.x, k = 10, bs = 'cr') + s(Name.y, k = 10, bs = 'cr') +
        s(site_d, bs = "re") + roc2 +
        ti(Name.x, roc2, k = 10, bs = 'cr') +
        ti(Name.y, roc2, k = 10, bs = 'cr') +
        ti(Name.x, Name.y, k = 10, bs = 'cr')

    # Pair up the two features' rows; shared column names get .x / .y suffixes.
    rows_f1 <- dplyr::filter(dfraw, Feature == f1)
    rows_f2 <- dplyr::filter(dfraw, Feature == f2)
    pair_data <- dplyr::inner_join(
        rows_f1, rows_f2,
        by = c('ID', 'site_d', 'site_m', 'roc', 'roc2')
    )
    pair_data <- dplyr::mutate(pair_data, val = val.x + val.y)

    # Make sure the output directory exists before the expensive fit, so the
    # final write cannot fail just because the directory is missing.
    dir.create(dirname(filename), showWarnings = FALSE, recursive = TRUE)

    xfit <- bam(eqn, data = pair_data, method = 'REML')
    sxfit <- summary(xfit)
    pxfit <- plot(xfit)
    # seq_along() is safe if plot() returns an empty list.
    for (i in seq_along(pxfit)) {
        pxfit[[i]]$raw <- NULL
    }

    result <- list(f1, f2, as.list(sxfit), pxfit)
    output_to_python <- toJSON(result, force = TRUE)
    write(output_to_python, filename)
    invisible(NULL)
}

In [None]:
# Run one pair on its own (entry 179 of target_combo).
gam_proc2d(
    cattarget, dfraw,
    f1 = target_combo[[179]][1],
    f2 = target_combo[[179]][2],
    stg = stg, fs = fs, oversample = oversample, model_type = model_type
)

In [None]:
# registerDoParallel(cores=1)
# foreach(i=1:length(target_combo),.errorhandling='remove') %do% {
#     gam_proc2d(cattarget, dfraw, target_combo[[i]][1], target_combo[[i]][2], stg, fs, oversample, model_type)
# }
# Process every feature pair in turn, printing each pair before it starts.
for (tar in target_combo) {
    print(paste(tar[1], tar[2], "running"))
    flush.console()
    gam_proc2d(
        cattarget, dfraw,
        f1 = tar[1], f2 = tar[2],
        stg = stg, fs = fs, oversample = oversample, model_type = model_type
    )
}

In [None]:
# Marker that all cells above have finished.
print("done")