## MWE
The minimal working example (MWE) data are available here: https://drive.google.com/drive/u/0/folders/1Ny3fzShONB-kfhvqcvnzL2VdyI1MC46J

In [None]:
# Run the data_merger workflow from pipeline/per_gene_data_merger.ipynb on the MWE inputs.
# The trailing '&' puts the job in the background of the current shell.
sos run pipeline/per_gene_data_merger.ipynb data_merger \
--genoFile ./mwe.region_plink_files/plink_files_list.txt \
--cwd MWE/rds_per_gene/ \
--region-list MWE.region.list \
--phenoFile MWE.phenotype.list \
--covFile MWE.covar.list &

In [None]:
[global]
# Tab-separated list of per-gene genotype files; it is read below into the columns gene_id and path
parameter: genoFile = path
# Phenotype list file; used as the first input of the data_merger step
parameter: phenoFile = path
# Covariate list file; used as the second input of the data_merger step
parameter: covFile = path
# Tab-separated region table; it must contain a gene_id column, which is the merge key below
parameter: region_list = path
# Directory in which the per-gene output files are written
parameter: cwd = path
# Prefix of the output file names
parameter: name = "demo"
import pandas as pd
region_tbl = pd.read_csv(region_list,sep = "\t")
# Re-bind genoFile from a path to a list of dicts, one per gene: the first line of the file is
# treated as a header and replaced by the names gene_id/path, then each row is joined with its
# row(s) of the region table (inner merge on gene_id, so genes absent from either table are dropped).
genoFile = pd.read_csv(genoFile,sep = "\t",names = ["gene_id","path"],header = 0).merge(region_tbl,on = "gene_id").to_dict("records")

# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 1
# Container passed to the R action of the data_merger step; empty by default
parameter: container = ""

In [None]:
[data_merger]
# For every record of genoFile, load the genotype, the per-tissue phenotypes and the covariates,
# and save them together in one RDS file per gene.
# NOTE(review): max_L is not referenced by the R script visible in this step - confirm it is still needed
parameter: max_L = 10
# remove a variant if it has more than imiss missing individual data
parameter: imiss = 1
# Variants whose minor allele frequency is below maf are removed
parameter: maf = 0
# The step runs once per genoFile record; _genoFile holds the current record
input: phenoFile,covFile, for_each = "genoFile"
output: f'{cwd:a}/{name}.{_genoFile["gene_id"]}.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads
# "${ }" expressions inside the script below are substituted before the R code is executed
R:  expand= "${ }", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout',container = container
    library("plink2R")
    library("dplyr")
    library("readr")
    library("purrr")
    ## Define function
        # Minor allele frequency of a single variant.
        # Half of the mean genotype value (missing entries ignored) is taken as the
        # allele frequency; the smaller of that frequency and its complement is returned.
        # Assumes genotypes are coded as allele counts (0/1/2) - TODO confirm.
        compute_maf <- function(geno) {
          allele_freq <- mean(geno, na.rm = TRUE) / 2
          minor_freq <- min(allele_freq, 1 - allele_freq)
          minor_freq
        }

    # Fraction of the entries of `geno` that are missing (NA).
    compute_missing <- function(geno) {
      n_missing <- sum(is.na(geno))
      n_missing / length(geno)
    }
    
    # Replace every missing entry of a matrix by the mean of the observed
    # values in the same column.
    #
    # geno: matrix with one column per variant.
    # Returns the matrix with the same dimensions and dimnames, NAs filled in.
    # A matrix without columns is returned unchanged.
    mean_impute <- function(geno){
      col_means <- apply(geno, 2, function(x) mean(x, na.rm = TRUE))
      # seq_along() instead of 1:length(): the latter yields c(1, 0) for a
      # zero-column matrix and would index a column that does not exist.
      for (i in seq_along(col_means)) {
        geno[is.na(geno[, i]), i] <- col_means[i]
      }
      return(geno)
    }

    # TRUE when a variant carries no variation among its observed values.
    #
    # Missing entries are ignored: a column such as c(1, NA, 1) has a single
    # observed value and becomes constant once the NAs are mean-imputed, so it
    # must be reported as zero-variance. A column with no observed value at all
    # is also reported as zero-variance.
    is_zero_variance <- function(x) {
      observed <- x[!is.na(x)]
      length(unique(observed)) <= 1
    }
  
    # Filter the columns (variants) of a genotype matrix and mean-impute what is left.
    #
    # X:                   matrix with one column per variant.
    # missing_rate_thresh: columns whose missing rate is above this value are removed.
    # maf_thresh:          columns whose minor allele frequency is below this value are removed.
    #
    # The three filters (missing rate, MAF, zero variance) are applied in that
    # order, each on the columns that survived the previous one.
    # Returns the filtered matrix with missing entries replaced by column means.
    filter_X <- function(X, missing_rate_thresh, maf_thresh) {
        # drop = FALSE keeps X a matrix even when a single column survives;
        # without it X collapses to a vector and the next apply() fails.
        drop_columns <- function(mat, idx) {
            if (length(idx) > 0) mat <- mat[, -idx, drop = FALSE]
            mat
        }
        X <- drop_columns(X, which(apply(X, 2, compute_missing) > missing_rate_thresh))
        X <- drop_columns(X, which(apply(X, 2, compute_maf) < maf_thresh))
        X <- drop_columns(X, which(apply(X, 2, is_zero_variance)))
        # Nothing left to impute when every variant was filtered out.
        if (ncol(X) == 0) return(X)
        return(mean_impute(X))
    }
  
    # Regress the covariates (plus an intercept) out of every column of X and
    # standardise the residuals.
    #
    # X:     numeric matrix, one column per variant.
    # covar: numeric matrix of covariates. .lm.fit() matches rows by position,
    #        so row k of covar must describe the same sample as row k of X.
    # Returns the matrix produced by scale() on the residuals, with the same
    # dimensions as X.
    remove_covX = function(X,covar){
      design <- cbind(1, covar)
      # seq_len() so that a matrix without columns skips the loop instead of
      # iterating over c(1, 0).
      for (i in seq_len(ncol(X))) {
        X[, i] <- .lm.fit(x = design, y = X[, i])$residuals
      }
      # Return the value visibly rather than as the result of an assignment.
      scale(X)
    }
    
    # Read the phenotype record of the current gene from one indexed phenotype file.
    #
    # path: phenotype file that is queried with the tabix command line tool.
    # The queried region is chr:start-(start+1) of the current genoFile record;
    # these values are substituted into the command string before R runs.
    # Returns a one-row table whose column names come from the header line.
    read_gene_pheno = function(path){
      # Build the shell command; -h makes tabix print the header before the records
      arg = paste0(c("tabix -h ",path," ${_genoFile["#chr"]}:${_genoFile["start"]}-${_genoFile["start"]+1}"),collapse = "")
      # intern = T captures the command output as a character vector, one element per line
      result = system(arg,intern = T)
      # Only the second output line is parsed as data.
      # NOTE(review): this assumes the header is exactly one line and that the first
      # returned record belongs to this gene - confirm for files with several genes at one position
      output = read.table(text = result[2], sep = "\t")
      # The first output line is split on tabs to obtain the column names
      name = result[1]%>%stringr::str_split("\t")%>%unlist()
      # A one-column parse result is replaced by a single all-NA row with one column per
      # header field (presumably the case where tabix returned no record - TODO confirm)
      if(ncol(output) == 1) { output = matrix(,ncol = (length(name)))%>%as_tibble }
      colnames(output) = name
      return(output)
      }
    ## Data Loader
    gene_id = "${_genoFile["gene_id"]}"
    ### Genotype
    # Load the plink files of the current gene and filter / mean-impute the variants
    geno = read_plink("${path(_genoFile["path"]):n}")
    X = filter_X(geno$bed,${imiss}, ${maf} )
    ### Phenotype
    # Both list files are tab-delimited tables that are joined on their tissue column
    phenotype_list = read_delim("${_input[0]}","\t")
    covar_list = read_delim("${_input[1]}","\t")
    # One covariate matrix per tissue: drop the #id column, drop rows containing NA, then
    # transpose so that the columns of the file become the rows of the matrix
    covar_list = covar_list%>%mutate(covar = map(path, ~read_delim(.x,"\t")%>%select(-`#id`)%>%na.omit%>%t()))
    phenotype_list = inner_join(phenotype_list,covar_list, by = "tissue")
    phenotype_list = phenotype_list%>%mutate(Y = map(path.x, ~read_gene_pheno(.x)%>%select(-c(`#chr`,start,end,gene_id))%>%t%>%as.matrix))%>%mutate(
                                            #### Get residue for each of tissue, for the expression containing NA, the original NA matrix will be returned.
                                                Y_resid = map2(Y,covar,~tryCatch(.lm.fit(x = cbind(1,.y), y = .x[rownames(.y),])$residuals%>%t%>%as_tibble,error = function(e) return(.x[rownames(.y),]%>%t%>%as_tibble))))
    Y = phenotype_list%>%select(Y)%>%tidyr::unnest(Y)%>%t%>%as.matrix
    Y_resid = phenotype_list%>%select(Y_resid)%>%tidyr::unnest(Y_resid)%>%t%>%as.matrix
    colnames(Y_resid) = phenotype_list$tissue
    ### Genotype residuals, one matrix per tissue
    # Genotype rows are named "<id>:<id>" while covariate rows are named "<id>", hence the pasted keys.
    # Only samples present in both are kept, in the row order of X. The covariate rows are
    # reordered to that same order because .lm.fit() matches rows by position, not by name.
    # drop = FALSE keeps both objects matrices when a single row or column remains.
    X_list = map(phenotype_list$covar, function(covar) {
        sample_keys = paste0(rownames(covar), ":", rownames(covar))
        shared = intersect(rownames(X), sample_keys)
        remove_covX(X = X[shared, , drop = FALSE],
                    covar = covar[match(shared, sample_keys), , drop = FALSE])
    })
    names(X_list) = phenotype_list$tissue
    # X_list is already the list of residual matrices, so it is stored directly
    # (X_list$X_resid would be NULL). Element order is X, X_resid, Y, Y_resid.
    list(X = X, X_resid = X_list, Y = Y, Y_resid = Y_resid)%>%saveRDS("${_output}")
  