# Fine-mapping with SuSiE model
This notebook conducts fine-mapping with complete data. Unlike the susie_RSS module, this module performs the analysis on one theme at a time.

sos run pipeline/SuSiE.ipynb uni_susie \
    --genoFile <(head GRCh38_plink_files_list.txt)  \
    --region-list ALL.log2cpm.region_list \
    --phenoFile ALL_pseudobulk_phenotype_list.txt \
    --covFile ALL_pseudobulk_covariate_list.txt -J 800 -c csg.yml -q csg  &

In [None]:
sos run pipeline/SuSiE.ipynb uni_susie \
    --genoFile <(cat GRCh38_plink_files_list.txt | grep ENSG00000197943  )\
    --region-list ALL.log2cpm.region_list \
    --phenoFile ALL_pseudobulk_phenotype_list.txt \
    --covFile ALL_pseudobulk_covariate_list.txt --cwd ALL_PLCG2 --name ALL_pseudo_bulk  & 

In [None]:
sos run pipeline/per_gene_data_merger.ipynb data_merger \
    --genoFile <(cat GRCh38_plink_files_list.txt | grep ENSG00000197943  )\
    --region-list ALL.log2cpm.region_list \
    --phenoFile ALL_pseudobulk_phenotype_list.txt \
    --covFile ALL_pseudobulk_covariate_list.txt --cwd ALL_PLCG2 --name ALL_pseudo_bulk --cwd data_merger_test  & 

## Input

1. A region list documenting the regions to be analyzed
2. A list of paths to the per-gene phenotype data
3. A list of paths to the per-gene genotype data

By default, inputs 2 and 3 are the outputs of the data_preprocessing module.

### Example genotype list

region        dir
ENSG00000000457 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000000457.bed
ENSG00000000460 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000000460.bed
ENSG00000000938 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000000938.bed
ENSG00000000971 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000000971.bed
ENSG00000001036 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000001036.bed
ENSG00000001084 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000001084.bed
ENSG00000001167 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000001167.bed
ENSG00000001460 /mnt/mfs/statgen/xqtl_workflow_testing/demo/genotype_reformmating/demo_per_gene_plink/ENSG00000001460.bed


### Example phenotype list


region dir
ENSG00000000457 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000000457.mol_phe.bed.gz
ENSG00000000460 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000000460.mol_phe.bed.gz
ENSG00000000938 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000000938.mol_phe.bed.gz
ENSG00000000971 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000000971.mol_phe.bed.gz
ENSG00000001036 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000001036.mol_phe.bed.gz
ENSG00000001084 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000001084.mol_phe.bed.gz
ENSG00000001167 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000001167.mol_phe.bed.gz
ENSG00000001460 /mnt/mfs/statgen/xqtl_workflow_testing/demo/phenotype_reformat/demo.ENSG00000001460.mol_phe.bed.gz


## Output

For each analysis unit we output:

1. Analysis results in RDS format: a fitted SuSiE / mvSuSiE model
2. A VCF file with the selected SNPs, with the following per-sample fields
        ES:PIP:CS

## MWE

In [None]:
sos run pipeline/SuSiE.ipynb uni_susie \
--genoFile /mnt/mfs/statgen/snuc_pseudo_bulk/Ast/genotype_per_gene/MWE.region_plink_files/plink_files_list.txt  \
--cwd MWE/susie_per_gene/ \
--region-list data/mwe/MWE.region.list \
--phenoFile data/mwe/MWE.phenotype.list \
--covFile data/mwe/MWE.covar.list -J 8 -c csg.yml -q csg  &

sos run pipeline/SuSiE.ipynb mv_susie \
--genoFile ./mwe.region_plink_files/plink_files_list.txt \
--cwd MWE/rds_per_gene/ \
--region-list MWE.region.list \
--phenoFile MWE.phenotype.list \
--covFile MWE.covar.list &


In [1]:
[global]
import glob
import pandas as pd
# Input
# Path to a tab-delimited list of per-gene genotype files; it is read as two columns (gene ID, file path)
parameter: genoFile = path
# A list of file path for phenotype and covariates, must be in the same order.
parameter: phenoFile = path
# Path to the list of covariate files
parameter: covFile = path
# Tab-delimited region table; it must contain a gene_id column, which is used to join it with the genotype list
parameter: region_list = path
# Path to work directory where output locates
parameter: cwd = path("output")
# Name embedded in the output file names
parameter: name = "demo"
region_tbl = pd.read_csv(region_list,sep = "\t")
# Rebind genoFile from a path to a list of dicts, one per gene present in both tables,
# carrying the genotype path together with the region columns; steps loop over it with for_each.
genoFile = pd.read_csv(genoFile,sep = "\t",names = ["gene_id","path"],header = 0).merge(region_tbl,on = "gene_id").to_dict("records")

## Containers that contains the necessary packages
parameter: container = ""
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20

# use this function to edit memory string for PLINK input
from sos.utils import expand_size

## Univariate SuSiE

In [1]:
[uni_susie_1]
# Value passed to susie as L
parameter: max_L = 10
# remove a variant if its fraction of missing genotypes is larger than imiss
parameter: imiss = 0.1
# remove a variant if its minor allele frequency is below maf
parameter: maf = 0.05
# One substep per record of genoFile; every substep receives the phenotype list and the covariate list
input: phenoFile,covFile, for_each = "genoFile"
# One RDS file per gene, holding the list of per-tissue fits saved by the script below
output: f'{cwd:a}/cache/{name}.{_genoFile["gene_id"]}.unisusie.fit.rds'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output:nn}.stdout", stderr = f"{_output:nn}.stderr", container = container
    ## Define function
    # Minor allele frequency of one variant: half the mean genotype value (NAs ignored),
    # folded so the result never exceeds 0.5.
    # NOTE(review): the division by 2 presumes genotypes coded 0/1/2 - confirm.
    compute_maf <- function(geno){
      allele_freq <- mean(geno, na.rm = TRUE) / 2
      min(allele_freq, 1 - allele_freq)
    }

    # Fraction of the entries of `geno` that are NA.
    compute_missing <- function(geno){
      n_missing <- sum(is.na(geno))
      n_missing / length(geno)
    }
    
    # Replace every NA in a numeric matrix with the mean of the observed values of its column.
    # Returns a matrix of the same dimensions; a zero-column matrix is returned unchanged.
    mean_impute <- function(geno){
      col_means <- apply(geno, 2, function(x) mean(x, na.rm = TRUE))
      # seq_along() rather than 1:length(): the latter iterates over c(1, 0) when there
      # are no columns and then fails on geno[, 1].
      for (j in seq_along(col_means)) {
        missing_rows <- which(is.na(geno[, j]))
        geno[missing_rows, j] <- col_means[j]
      }
      return(geno)
    }

    # TRUE when `x` holds exactly one distinct value, FALSE otherwise.
    is_zero_variance <- function(x) {
      n_distinct_values <- length(unique(x))
      n_distinct_values == 1
    }
  
    # Drop the columns of X that fail quality control, then mean-impute what is left.
    # Filters run in sequence: missing rate above missing_rate_thresh, minor allele
    # frequency below maf_thresh, and columns holding a single distinct value.
    # Returns a matrix (possibly with one or zero columns).
    filter_X <- function(X, missing_rate_thresh, maf_thresh) {
        # drop = FALSE keeps X a matrix when a single column survives a filter; without
        # it X collapses to a plain vector and the next apply() call errors out.
        rm_col <- which(apply(X, 2, compute_missing) > missing_rate_thresh)
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, compute_maf) < maf_thresh)
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, is_zero_variance))
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        return(mean_impute(X))
    }
  
    # Estimate a covariance matrix for the columns of Y from a flashier factor model and
    # rescale it so that its diagonal equals the observed column variances of Y.
    #   Y           : numeric matrix
    #   error_cache : optional path of an RDS file. When given, a failure inside the
    #                 FLASH block is recorded there (data plus message) and the identity
    #                 matrix is used as the correlation structure; when NULL the error
    #                 is re-raised.
    # Returns a square matrix with ncol(Y) rows.
    compute_cov_flash <- function(Y, error_cache = NULL){
        # Start from the identity so the fallback path has a valid matrix to rescale.
        covar <- diag(ncol(Y))
        tryCatch({
        fl <- flashier::flash(Y, var.type = 2, prior.family = c(flashier::prior.normal(), flashier::prior.normal.scale.mix()), backfit = TRUE, verbose.lvl=0)
        # nrow is given explicitly: diag() on a length-one vector x builds an identity
        # matrix of size floor(x) rather than a 1 x 1 matrix holding x.
        resid_var <- fl$residuals.sd^2
        if(fl$n.factors==0){
          covar <- diag(resid_var, nrow = length(resid_var))
        } else {
          fsd <- sapply(fl$fitted.g[[1]], '[[', "sd")
          covar <- diag(resid_var, nrow = length(resid_var)) + crossprod(t(fl$flash.fit$EF[[2]]) * fsd)
        }
        if (nrow(covar) == 0) {
          covar <- diag(ncol(Y))
          stop("Computed covariance matrix has zero rows")
        }
        }, error = function(e) {
          if (!is.null(error_cache)) {
            # Store the message text itself; the original obtained it by calling
            # warning(e), which raised an additional warning as a side effect.
            saveRDS(list(data=Y, message=conditionMessage(e)), error_cache)
            warning("FLASH failed. Using Identity matrix instead.")
            warning(conditionMessage(e))
          } else {
            stop(e)
          }
        })
        # Rescale: convert to a correlation matrix, then multiply by the observed SDs.
        s <- apply(Y, 2, sd, na.rm=TRUE)
        if (length(s)>1) s = diag(s)
        else s = matrix(s,1,1)
        covar <- s%*%cov2cor(covar)%*%s
        return(covar)
    }
    # Read the phenotype record of the current gene from a tabix-indexed file.
    # The query window is start to start+1 on the gene's chromosome. The first line
    # returned by `tabix -h` is used as the header and the second line as the single
    # data record; any further lines are ignored.
    # Returns a one-row data frame whose column names come from the header line.
    read_gene_pheno = function(path){
      cmd = paste0("tabix -h ", path, " ${_genoFile["#chr"]}:${_genoFile["start"]}-${_genoFile["start"]+1}")
      result = system(cmd, intern = TRUE)
      # Fail with an explicit message instead of an obscure column-count error further
      # down when tabix returns the header only (or nothing at all).
      if (length(result) < 2) {
        stop("tabix returned no phenotype record from ", path, " for gene ${_genoFile["gene_id"]}")
      }
      output = read.table(text = result[2], sep = "\t")
      # Base strsplit() so the helper does not depend on the pipe operator being attached
      colnames(output) = unlist(strsplit(result[1], "\t", fixed = TRUE))
      return(output)
      }
    
    # Regress the covariates (plus an intercept) out of every column of X and return
    # the residual matrix after scale() (centered, unit standard deviation per column).
    remove_covX = function(X,covar){
      design = cbind(1, covar)
      # seq_len() so that a zero-column X skips the loop instead of iterating over c(1, 0)
      for (j in seq_len(ncol(X))) {
        X[, j] = .lm.fit(x = design, y = X[, j])$residuals
      }
      # Explicit return: the original ended on an assignment, so the value was invisible
      return(scale(X))
    }
    
    ## Load Library
    library("susieR")
    library("plink2R")
    library("dplyr")
    library("readr")
    library("stringr")
    library("purrr")
    ###
    # Core code
    ###
    # Input
    ### Genotype
    # Load the PLINK fileset of this gene (the path is passed without its extension), then
    # apply the missing-rate / MAF / zero-variance filters followed by mean imputation.
    geno = read_plink("${path(_genoFile["path"]):n}")
    X = filter_X(geno$bed,${imiss}, ${maf} )

    ### Phenotype
    # Both list files are tab-delimited and are joined on their shared `tissue` column below.
    phenotype_list = read_delim("${_input[0]}","\t")
    covar_list = read_delim("${_input[1]}","\t")
    # Read every covariate file, drop its `#id` column and any incomplete row, then transpose:
    # the row names of `covar` are therefore the column names of the file.
    covar_list = covar_list%>%mutate(covar = map(path, ~read_delim(.x,"\t")%>%select(-`#id`)%>%na.omit%>%t()))
    # After the join, `path.x` is the phenotype file of a tissue (both tables have a `path` column).
    phenotype_list = inner_join(phenotype_list,covar_list, by = "tissue")
    # Y: the gene's phenotype record without its annotation columns, transposed to a one-column matrix.
    phenotype_list = phenotype_list%>%mutate(Y = map(path.x, ~read_gene_pheno(.x)%>%select(-c(`#chr`,start,end,gene_id))%>%t%>%as.matrix))%>%mutate(
                                            #### Residuals of each tissue's phenotype regressed on an intercept plus its covariates;
                                            #### Y is first reordered to the row order of the covariate matrix.
                                              Y_resid = map2(Y,covar,~.lm.fit(x = cbind(1,.y), y = .x[rownames(.y),])$residuals%>%t%>%as_tibble))

    # Stack the one-row residual tables of all tissues, then transpose so that there is
    # one column per tissue.
    # NOTE(review): presumably unnest() aligns the tissues by sample name and leaves NA where a
    # tissue lacks a sample - confirm that the column names survive as_tibble().
    Y_resid = phenotype_list%>%select(Y_resid)%>%tidyr::unnest(Y_resid)%>%t%>%as.matrix
    colnames(Y_resid) = phenotype_list$tissue
    
    # Per tissue: keep the genotype rows named "<id>:<id>" for the covariate row names <id>,
    # regress the covariates out of every variant and standardize the residuals.
    # NOTE(review): intersect() returns the rows in the order of rownames(X) and may drop samples,
    # whereas cbind(1, .y) keeps every covariate row in covariate order - confirm both agree.
    X_list = phenotype_list%>%mutate( X_data = map(covar,~X[intersect(rownames(X),paste0(rownames(.x),":",rownames(.x))),]),   # Get only the intersect samples
                                          X_resid = map2(X_data,covar,~.lm.fit(x = cbind(1,.y), y = .x)$residuals%>%scale))%>%pull(X_resid)       # Remove covariate by tissues
    non_missing = list()  
    fitted = list()
    # Fine-mapping with SuSiE
    # One fit per column (tissue) of Y_resid, using only the rows where that column is not NA.
    # NOTE(review): non_missing holds row positions of Y_resid but also indexes the per-tissue
    # X matrix - confirm that both share the same rows in the same order.
       for (r in 1:ncol(Y_resid)) {
        non_missing[[r]] = which(!is.na(Y_resid[,r]))
        st = proc.time()
        X = X_list[[r]]
    print(paste("Dimension of X matrix:", nrow(X), ncol(X)))
    print(paste("Dimension of Y matrix:", nrow(Y_resid), ncol(Y_resid)))
        fitted[[r]] <- susieR::susie(X[non_missing[[r]],], Y_resid[non_missing[[r]],r],
                           L=${max_L},
                           max_iter=1000,
                           estimate_residual_variance=TRUE,
                           estimate_prior_variance=TRUE,
                           refine=TRUE,compute_univariate_zscore = TRUE )
        # Extra elements stored next to the fit: elapsed time, credible-set correlations,
        # names of the variants in any credible set, all variant names, and the coefficients.
        fitted[[r]]$time = proc.time() - st
        fitted[[r]]$cs_corr = susieR:::get_cs_correlation(fitted[[r]], X=X[non_missing[[r]],])
        fitted[[r]]$cs_snps = names(fitted[[r]]$X_column_scale_factors[unlist(fitted[[r]]$sets$cs)])
        fitted[[r]]$variable_name =  names(fitted[[r]]$X_column_scale_factors)
        fitted[[r]]$coef =  coef.susie(fitted[[r]])
    }
    # The saved object is a list of fits named by tissue.
    names(fitted) = phenotype_list$tissue 
    saveRDS(fitted, ${_output[0]:r})

In [None]:
[uni_susie_2]
# Convert the RDS file of each gene produced by the previous step into a bgzipped VCF
# written next to it (same path with the last extension replaced by .vcf.bgz).
input: group_with = "genoFile"
output: f"{_input:n}.vcf.bgz"
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
# container is passed here as in uni_susie_1, so the step runs in the user-specified container
R: expand = '${ }', stdout = f"{_output:nn}.stdout", stderr = f"{_output:nn}.stderr", container = container
   ## Define create_vcf function
    # Build a VariantAnnotation VCF object: one record per variant and one sample column
    # per entry of `name`.
    #   chrom, pos : chromosome and position of every variant (must have equal length)
    #   nea, ea    : alleles written to REF (nea) and to ALT (ea)
    #   snp        : optional variant IDs used as record names; defaults to "chrom:pos"
    #   ea_af, effect, se, pval, cs, pip : optional values stored in the AF, ES, SE, LP
    #                (-log10 of pval), CS and PIP genotype fields, each reshaped to a
    #                matrix with one row per variant
    #   name       : sample names of the genotype columns
    # Returns the VCF object after sort().
    create_vcf = function (chrom, pos, nea, ea, snp = NULL, ea_af = NULL, effect = NULL, 
        se = NULL, pval = NULL, name = NULL,cs = NULL, pip = NULL) 
    {
        stopifnot(length(chrom) == length(pos))
        # Build the default IDs only when the caller supplied none; the original rebuilt
        # them unconditionally and thereby discarded any user-provided `snp`.
        if (is.null(snp)) {
            snp <- paste0(chrom, ":", pos)
        }
        nsnp <- length(chrom)
        gen <- list()
        ## Set up the data content for each sample column
        if (!is.null(ea_af)) 
            gen[["AF"]] <- matrix(ea_af, nsnp)
        if (!is.null(effect)) 
            gen[["ES"]] <- matrix(effect, nsnp)
        if (!is.null(se)) 
            gen[["SE"]] <- matrix(se, nsnp)
        if (!is.null(pval)) 
            gen[["LP"]] <- matrix(-log10(pval), nsnp)
        if (!is.null(cs)) 
            gen[["CS"]] <- matrix(cs, nsnp)
        if (!is.null(pip)) 
            gen[["PIP"]] <- matrix(pip, nsnp)
        gen <- S4Vectors::SimpleList(gen)
        
        ## Set up the variant information for the fixed columns; a record spans as many
        ## bases as its longer allele
        gr <- GenomicRanges::GRanges(chrom, IRanges::IRanges(start = pos, 
            end = pos + pmax(nchar(nea), nchar(ea)) - 1, names = snp))
        coldata <- S4Vectors::DataFrame(Studies = name, row.names = name)
        ## Set up the header information
        hdr <- VariantAnnotation::VCFHeader(header = IRanges::DataFrameList(fileformat = S4Vectors::DataFrame(Value = "VCFv4.2", 
            row.names = "fileformat")), sample = name)
        VariantAnnotation::geno(hdr) <- S4Vectors::DataFrame(Number = c("A", 
            "A", "A", "A", "A", "A"), Type = c("Float", "Float", 
            "Float", "Float", "Float", "Float"), Description = c("Effect size estimate relative to the alternative allele", 
            "Standard error of effect size estimate", "-log10 p-value for effect estimate",  
            "Alternate allele frequency in the association study",
            "The CS this variate are captured, 0 indicates not in any cs", "The posterior inclusion probability to a CS"), 
            row.names = c("ES", "SE", "LP", "AF", "CS", "PIP"))
        ## Keep only the header entries of the fields that were actually supplied
        VariantAnnotation::geno(hdr) <- subset(VariantAnnotation::geno(hdr), 
            rownames(VariantAnnotation::geno(hdr)) %in% names(gen))
        ## Assemble the VCF
        vcf <- VariantAnnotation::VCF(rowRanges = gr, colData = coldata, 
            exptData = list(header = hdr), geno = gen)
        VariantAnnotation::alt(vcf) <- Biostrings::DNAStringSetList(as.list(ea))
        VariantAnnotation::ref(vcf) <- Biostrings::DNAStringSet(nea)
        ## Add fixed values
        VariantAnnotation::fixed(vcf)$FILTER <- "PASS"
        return(sort(vcf))
    }
    library("susieR")
    library("dplyr")
    library("tibble")
    library("purrr")
    library("readr")
    library("tidyr")
    library("stringr")
    
    # Get list of cs snps
    # The input is the list of per-tissue fits saved by the previous step.
    susie_list = readRDS(${_input:r})
    if (length(susie_list) == 0) stop("The input RDS file contains no SuSiE fit")

    # Position (within the list of credible sets) of the first credible set that
    # contains variant `idx`, or 0 when no credible set does. The original tested
    # `sets$cs %in% idx`, which compares whole sets against one index and then failed
    # on as.numeric() for every variant outside a credible set.
    find_cs = function(cs_sets, idx) {
        hit = which(vapply(cs_sets, function(members) idx %in% members, logical(1)))
        if (length(hit) > 0) hit[[1]] else 0
    }

    # One table per fit: variant name, column index, credible set, PIP and coefficient
    susie_tb_ls = list()
    for (i in seq_along(susie_list)){
        fit = susie_list[[i]]
        keep = which(fit$pip >= 0)
        # Computed once per fit; the first element is skipped through the +1 offset below
        fit_coef = coef.susie(fit)
        susie_tb_ls[[i]] = tibble(snps = names(fit$pip)[keep], snps_index = keep)%>%mutate(
                                 cs = map_dbl(snps_index, ~find_cs(fit$sets$cs, .x)),
                                 pip = map_dbl(snps_index, ~fit$pip[[.x]]),
                                 coef = map_dbl(snps_index, ~fit_coef[[.x+1]]))
        }
    # Join the tables of all fits on the variant name; columns of the same kind stay in fit order
    combined_tb = reduce(susie_tb_ls, full_join, by = "snps")

    # Variant names are parsed as "<chr>:<pos>_<alt>_<ref>"
    id_parts = str_split_fixed(combined_tb$snps, ":", 2)
    allele_parts = str_split_fixed(id_parts[, 2], "_", 3)
    snps_tb = combined_tb%>%mutate(
                         chr = id_parts[, 1],
                         pos_alt_ref = id_parts[, 2],
                         pos = as.numeric(allele_parts[, 1]),
                         alt = allele_parts[, 2],
                         ref = allele_parts[, 3])
    
    # Keep the variants whose two alleles both contain at least one of A, C, T, G
    snps_tb = snps_tb%>%filter(str_detect(ref, "[ACTG]") & str_detect(alt, "[ACTG]"))
    output_vcf = create_vcf(
            chrom = snps_tb$chr,
             pos = snps_tb$pos,
             ea = snps_tb$alt,
             nea = snps_tb$ref,
             effect = snps_tb%>%select(contains("coef"))%>%as.matrix ,
             pip = snps_tb%>%select(contains("pip"))%>%as.matrix,
             cs = snps_tb%>%select(contains("cs"))%>%as.matrix,
             name = names(susie_list))
    VariantAnnotation::writeVcf(output_vcf,${_output:nr},index = TRUE)

In [None]:
[*_susie_3]
# Write the absolute paths of all files produced by the previous step into one list file,
# one path per line, without a header.
input: group_by = "all"
output: f'{cwd}/{name}.susie.output_list.txt'
python: expand= "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import pandas as pd
    # The separator must be a real tab. The original passed the letter "t", which made
    # pandas quote every path containing that letter because it clashed with the delimiter.
    pd.DataFrame({"output_vcf" : [$[_input:ar,]]}).to_csv("$[_output]",index = False ,header = False, sep = "\t")

In [None]:
[mv_susie_1]
# Value passed to mvsusie as L
parameter: max_L = 10
# remove a variant if its fraction of missing genotypes is larger than imiss
parameter: imiss = 0.1
# remove a variant if its minor allele frequency is below maf
parameter: maf = 0.05
# Only analyze `cis` variants -- cis = N means using N variants around the center column of X matrix  
parameter: cis = 'NULL'
# Path to an RDS file with the mixture prior; its `U` and `w` elements are used
parameter: prior = path
input: phenoFile,covFile, for_each = "genoFile"
# `cwd` replaces the undefined name `wd`. The gene ID replaces the input base names, which
# are the same for every gene and therefore made all substeps write to one file.
output: mvsusie = f'{cwd:a}/{_genoFile["gene_id"]}{("_cis_%s" % cis) if cis != "NULL" else ""}_{name}.mvsusie.rds'
task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = '${ }', stdout = f"{_output[0]:nn}.stdout", stderr = f"{_output[0]:nn}.stderr", container = container
    ###
    # Utility functions
    ###
    # Minor allele frequency of one variant: half the mean genotype value (NAs ignored),
    # folded so the result never exceeds 0.5.
    # NOTE(review): the division by 2 presumes genotypes coded 0/1/2 - confirm.
    compute_maf <- function(geno){
      allele_freq <- mean(geno, na.rm = TRUE) / 2
      min(allele_freq, 1 - allele_freq)
    }

    # Fraction of the entries of `geno` that are NA.
    compute_missing <- function(geno){
      n_missing <- sum(is.na(geno))
      n_missing / length(geno)
    }
    
    # Replace every NA in a numeric matrix with the mean of the observed values of its column.
    # Returns a matrix of the same dimensions; a zero-column matrix is returned unchanged.
    mean_impute <- function(geno){
      col_means <- apply(geno, 2, function(x) mean(x, na.rm = TRUE))
      # seq_along() rather than 1:length(): the latter iterates over c(1, 0) when there
      # are no columns and then fails on geno[, 1].
      for (j in seq_along(col_means)) {
        missing_rows <- which(is.na(geno[, j]))
        geno[missing_rows, j] <- col_means[j]
      }
      return(geno)
    }

    # TRUE when `x` holds exactly one distinct value, FALSE otherwise.
    is_zero_variance <- function(x) {
      n_distinct_values <- length(unique(x))
      n_distinct_values == 1
    }
  
    # Drop the columns of X that fail quality control, then mean-impute what is left.
    # Filters run in sequence: missing rate above missing_rate_thresh, minor allele
    # frequency below maf_thresh, and columns holding a single distinct value.
    # Returns a matrix (possibly with one or zero columns).
    filter_X <- function(X, missing_rate_thresh, maf_thresh) {
        # drop = FALSE keeps X a matrix when a single column survives a filter; without
        # it X collapses to a plain vector and the next apply() call errors out.
        rm_col <- which(apply(X, 2, compute_missing) > missing_rate_thresh)
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, compute_maf) < maf_thresh)
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        rm_col <- which(apply(X, 2, is_zero_variance))
        if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
        return(mean_impute(X))
    }

    # Estimate a covariance matrix for the columns of Y from a flashier factor model and
    # rescale it so that its diagonal equals the observed column variances of Y.
    #   Y           : numeric matrix
    #   error_cache : optional path of an RDS file. When given, a failure inside the
    #                 FLASH block is recorded there (data plus message) and the identity
    #                 matrix is used as the correlation structure; when NULL the error
    #                 is re-raised.
    # Returns a square matrix with ncol(Y) rows.
    compute_cov_flash <- function(Y, error_cache = NULL){
        # Start from the identity so the fallback path has a valid matrix to rescale.
        covar <- diag(ncol(Y))
        tryCatch({
        fl <- flashier::flash(Y, var.type = 2, prior.family = c(flashier::prior.normal(), flashier::prior.normal.scale.mix()), backfit = TRUE, verbose.lvl=0)
        # nrow is given explicitly: diag() on a length-one vector x builds an identity
        # matrix of size floor(x) rather than a 1 x 1 matrix holding x.
        resid_var <- fl$residuals.sd^2
        if(fl$n.factors==0){
          covar <- diag(resid_var, nrow = length(resid_var))
        } else {
          fsd <- sapply(fl$fitted.g[[1]], '[[', "sd")
          covar <- diag(resid_var, nrow = length(resid_var)) + crossprod(t(fl$flash.fit$EF[[2]]) * fsd)
        }
        if (nrow(covar) == 0) {
          covar <- diag(ncol(Y))
          stop("Computed covariance matrix has zero rows")
        }
        }, error = function(e) {
          if (!is.null(error_cache)) {
            # Store the message text itself; the original obtained it by calling
            # warning(e), which raised an additional warning as a side effect.
            saveRDS(list(data=Y, message=conditionMessage(e)), error_cache)
            warning("FLASH failed. Using Identity matrix instead.")
            warning(conditionMessage(e))
          } else {
            stop(e)
          }
        })
        # Rescale: convert to a correlation matrix, then multiply by the observed SDs.
        s <- apply(Y, 2, sd, na.rm=TRUE)
        if (length(s)>1) s = diag(s)
        else s = matrix(s,1,1)
        covar <- s%*%cov2cor(covar)%*%s
        return(covar)
    }
  
    # Diagonal covariance matrix holding the variance of every column of Y (NAs ignored).
    # Returns an ncol(Y) x ncol(Y) matrix.
    compute_cov_diag <- function(Y){
        col_var <- apply(Y, 2, var, na.rm = TRUE)
        # nrow is given explicitly: diag() on a single number builds an identity matrix
        # of that (truncated) size instead of a 1 x 1 matrix, which is wrong for a
        # one-column Y.
        covar <- diag(col_var, nrow = length(col_var))
        return(covar)
    }

    # Indices of a window of half-width k/2 around n/2, clipped to the range 1..n.
    # A NULL k selects every index 1..n.
    get_center <- function(k,n) {
      if (is.null(k)) {
          return(1:n)
      }
      midpoint <- n/2
      half_width <- k/2
      # Clip both ends so the window never leaves the valid range
      first <- max(1, floor(midpoint - half_width))
      last <- min(n, floor(midpoint + half_width))
      return(first:last)
    }
    
    # Positions of the column names of Y within the column names of U, so that the
    # prior rows/columns can be matched to the columns of Y.
    # Returns NULL when either matrix has no column names or when the names already
    # agree exactly.
    get_prior_indices <- function(Y, U) {
      target_names = colnames(Y)
      prior_names = colnames(U)
      names_missing = is.null(target_names) || is.null(prior_names)
      if (names_missing || identical(target_names, prior_names)) {
          return(NULL)
      }
      return(match(target_names, prior_names))
    }
    
    ###
    # Core code
    ###
    ## Load Library
    # The code below uses read_plink, read_delim, map/map2, the pipe operator and dplyr
    # verbs; the original step attached none of the packages that provide them.
    library("plink2R")
    library("dplyr")
    library("readr")
    library("stringr")
    library("purrr")

    # Read the phenotype record of the current gene with tabix. This helper is called
    # below but was only defined in the script of uni_susie_1, which is a separate R session.
    read_gene_pheno = function(path){
      arg = paste0(c("tabix -h ",path," ${_genoFile["#chr"]}:${_genoFile["start"]}-${_genoFile["start"]+1}"),collapse = "")
      result = system(arg,intern = TRUE)
      output = read.table(text = result[2], sep = "\t")
      colnames(output) = result[1]%>%stringr::str_split("\t")%>%unlist()
      return(output)
    }

    # Input
    ### Genotype
    geno = read_plink("${path(_genoFile["path"]):n}")
    X = filter_X(geno$bed,${imiss}, ${maf} )
    # drop = FALSE keeps X a matrix even when the selected window holds a single variant
    X = X[,get_center(${cis}, ncol(X)), drop = FALSE]
    ### Phenotype
    phenotype_list = read_delim("${_input[0]}","\t")
    covar_list = read_delim("${_input[1]}","\t")
    # Covariate files are transposed after dropping `#id`, so their row names are the file's column names
    covar_list = covar_list%>%mutate(covar = map(path, ~read_delim(.x,"\t")%>%select(-`#id`)%>%na.omit%>%t()))
    phenotype_list = inner_join(phenotype_list,covar_list, by = "tissue")
    phenotype_list = phenotype_list%>%mutate(Y = map(path.x, ~read_gene_pheno(.x)%>%select(-c(`#chr`,start,end,gene_id))%>%t%>%as.matrix))%>%mutate(
                                            #### Residuals of each tissue's phenotype regressed on an intercept plus its covariates
                                              Y_resid = map2(Y,covar,~.lm.fit(x = cbind(1,.y), y = .x[rownames(.y),])$residuals%>%t%>%as_tibble))
    
    Y_resid = phenotype_list%>%select(Y_resid)%>%tidyr::unnest(Y_resid)%>%t%>%as.matrix
    colnames(Y_resid) = phenotype_list$tissue
    # FIXME: handle it when prior does not exist
    prior = readRDS(${prior:r})
    print(paste("Number of components in the mixture prior:", length(prior$U)))
    prior = mvsusieR::create_mash_prior(mixture_prior=list(weights=prior$w, matrices=prior$U), include_indices = get_prior_indices(Y_resid, prior$U[[1]]), max_mixture_len=-1)
    print(paste("Dimension of X matrix:", nrow(X), ncol(X)))
    print(paste("Dimension of Y matrix:", nrow(Y_resid), ncol(Y_resid)))
    # NOTE(review): X keeps every genotyped sample while the rows of Y_resid come from the
    # phenotype/covariate files - confirm both have the same samples in the same order
    # before trusting the fits below.

    # Rows with an observed phenotype, per condition. The original referenced
    # `non_missing` without ever defining it in this step.
    non_missing = lapply(seq_len(ncol(Y_resid)), function(r) which(!is.na(Y_resid[, r])))

    # GWAS Summary statistics
    univariate_res = lapply(seq_len(ncol(Y_resid)), function(r) susieR:::univariate_regression(X[non_missing[[r]], , drop = FALSE], Y_resid[non_missing[[r]], r]))
    bhat = do.call(cbind, lapply(seq_len(ncol(Y_resid)), function(r) univariate_res[[r]]$betahat))
    sbhat = do.call(cbind, lapply(seq_len(ncol(Y_resid)), function(r) univariate_res[[r]]$sebetahat))
    saveRDS(list(bhat=bhat, sbhat=sbhat), "${_output[0]:nn}.sumstat.rds")
    rm(bhat)
    rm(sbhat)

    # Residual covariance handed to mvsusie. The original passed `resid_Y` without
    # assigning it anywhere.
    # NOTE(review): the FLASH-based estimate is used because it falls back to an identity
    # correlation (and records the failure) when an error cache is given - confirm this
    # choice against compute_cov_diag.
    resid_Y = compute_cov_flash(Y_resid, "${_output[0]:nn}.flash_error.rds")

    # Multivariate fine-mapping
    st = proc.time()
    mv_res = mvsusieR::mvsusie(X, Y_resid, L=${max_L}, 
                              prior_variance=prior, residual_variance=resid_Y, 
                              precompute_covariances=FALSE, compute_objective=TRUE, 
                              estimate_residual_variance=FALSE, estimate_prior_variance=TRUE, estimate_prior_method='EM',
                              max_iter = 100, n_thread=1, approximate=FALSE)
    mv_res$time = proc.time() - st
    mv_res$cs_corr = susieR:::get_cs_correlation(mv_res, X=X)
    # Variant names; mv_susie_2 reads them back as res$variable_name
    mv_res$variable_name = colnames(X)
    saveRDS(mv_res, ${_output[0]:r})

In [None]:
[mv_susie_2]
# Convert the RDS file of each gene produced by the previous step into a bgzipped VCF
# written next to it (same path with the last extension replaced by .vcf.bgz).
input: group_with = "genoFile"
output: f"{_input:n}.vcf.bgz"
task: trunk_workers = 1, trunk_size = 1, walltime = '2h', mem = '55G', cores = 1, tags = f'{step_name}_{_output[0]:bn}'
# container is passed here as in mv_susie_1, so the step runs in the user-specified container
R: expand = '${ }', stdout = f"{_output:nn}.stdout", stderr = f"{_output:nn}.stderr", container = container
   ## Define create_vcf function
    # Build a VariantAnnotation VCF object: one record per variant and one sample column
    # per entry of `name`.
    #   chrom, pos : chromosome and position of every variant (must have equal length)
    #   nea, ea    : alleles written to REF (nea) and to ALT (ea)
    #   snp        : optional variant IDs used as record names; defaults to "chrom:pos"
    #   ea_af, effect, se, pval, cs, pip : optional values stored in the AF, ES, SE, LP
    #                (-log10 of pval), CS and PIP genotype fields, each reshaped to a
    #                matrix with one row per variant
    #   name       : sample names of the genotype columns
    # Returns the VCF object after sort().
    create_vcf = function (chrom, pos, nea, ea, snp = NULL, ea_af = NULL, effect = NULL, 
        se = NULL, pval = NULL, name = NULL,cs = NULL, pip = NULL) 
    {
        stopifnot(length(chrom) == length(pos))
        # Build the default IDs only when the caller supplied none; the original rebuilt
        # them unconditionally and thereby discarded any user-provided `snp`.
        if (is.null(snp)) {
            snp <- paste0(chrom, ":", pos)
        }
        nsnp <- length(chrom)
        gen <- list()
        ## Set up the data content for each sample column
        if (!is.null(ea_af)) 
            gen[["AF"]] <- matrix(ea_af, nsnp)
        if (!is.null(effect)) 
            gen[["ES"]] <- matrix(effect, nsnp)
        if (!is.null(se)) 
            gen[["SE"]] <- matrix(se, nsnp)
        if (!is.null(pval)) 
            gen[["LP"]] <- matrix(-log10(pval), nsnp)
        if (!is.null(cs)) 
            gen[["CS"]] <- matrix(cs, nsnp)
        if (!is.null(pip)) 
            gen[["PIP"]] <- matrix(pip, nsnp)
        gen <- S4Vectors::SimpleList(gen)
        
        ## Set up the variant information for the fixed columns; a record spans as many
        ## bases as its longer allele
        gr <- GenomicRanges::GRanges(chrom, IRanges::IRanges(start = pos, 
            end = pos + pmax(nchar(nea), nchar(ea)) - 1, names = snp))
        coldata <- S4Vectors::DataFrame(Studies = name, row.names = name)
        ## Set up the header information
        hdr <- VariantAnnotation::VCFHeader(header = IRanges::DataFrameList(fileformat = S4Vectors::DataFrame(Value = "VCFv4.2", 
            row.names = "fileformat")), sample = name)
        VariantAnnotation::geno(hdr) <- S4Vectors::DataFrame(Number = c("A", 
            "A", "A", "A", "A", "A"), Type = c("Float", "Float", 
            "Float", "Float", "Float", "Float"), Description = c("Effect size estimate relative to the alternative allele", 
            "Standard error of effect size estimate", "-log10 p-value for effect estimate",  
            "Alternate allele frequency in the association study",
            "The CS this variate are captured, 0 indicates not in any cs", "The posterior inclusion probability to a CS"), 
            row.names = c("ES", "SE", "LP", "AF", "CS", "PIP"))
        ## Keep only the header entries of the fields that were actually supplied
        VariantAnnotation::geno(hdr) <- subset(VariantAnnotation::geno(hdr), 
            rownames(VariantAnnotation::geno(hdr)) %in% names(gen))
        ## Assemble the VCF
        vcf <- VariantAnnotation::VCF(rowRanges = gr, colData = coldata, 
            exptData = list(header = hdr), geno = gen)
        VariantAnnotation::alt(vcf) <- Biostrings::DNAStringSetList(as.list(ea))
        VariantAnnotation::ref(vcf) <- Biostrings::DNAStringSet(nea)
        ## Add fixed values
        VariantAnnotation::fixed(vcf)$FILTER <- "PASS"
        return(sort(vcf))
    }
    library("susieR")
    library("dplyr")
    library("tibble")
    library("purrr")
    library("readr")
    library("tidyr")
    
    # Get list of cs snps
    res = readRDS(${_input:r})

    # Position (within the list of credible sets) of the first credible set that
    # contains variant `idx`, or 0 when no credible set does. The original tested
    # `sets$cs %in% idx`, which compares whole sets against one index and then failed
    # on as.numeric() for every variant outside a credible set.
    find_cs = function(cs_sets, idx) {
        hit = which(vapply(cs_sets, function(members) idx %in% members, logical(1)))
        if (length(hit) > 0) hit[[1]] else 0
    }

    # Use the stored variant names when present, otherwise the names of the PIP vector
    variant_names = if (!is.null(res$variable_name)) res$variable_name else names(res$pip)
    keep = which(res$pip >= 0)
    # Variant names are parsed as "<chr>:<pos>_<alt>_<ref>"
    id_parts = stringr::str_split_fixed(variant_names[keep], ":", 2)
    allele_parts = stringr::str_split_fixed(id_parts[, 2], "_", 3)
    output_snps = tibble( snps = variant_names[keep], snps_index = keep )
    output_snps = output_snps%>%mutate( cs = map_dbl(snps_index, ~find_cs(res$sets$cs, .x)),
                             pip = map_dbl(snps_index, ~res$pip[[.x]]),
                     chr = id_parts[, 1],
                     pos_alt_ref = id_parts[, 2],
                     pos = as.numeric(allele_parts[, 1]),
                     alt = allele_parts[, 2],
                     ref = allele_parts[, 3])
    
    # NOTE(review): this reads res$coef with a +1 offset, but the script of mv_susie_1 shown
    # in this file does not store a `coef` element - confirm where it is expected to come from.
    effect_mtr = res$coef[output_snps$snps_index+1]%>%as.matrix
    colnames(effect_mtr) = "${name}"
    rownames(effect_mtr) = output_snps$snps
    # The CS and PIP matrices reuse the dimnames of the one-column effect matrix;
    # whole-column assignment also works when there are no rows, unlike a 1:nrow() loop.
    cs_mtr = effect_mtr
    cs_mtr[, 1] = output_snps$cs
    pip_mtr = effect_mtr
    pip_mtr[, 1] = output_snps$pip
    
    output_vcf = create_vcf(
           chrom = output_snps$chr,
            pos = output_snps$pos,
            ea = output_snps$alt,
            nea = output_snps$ref,
            effect = effect_mtr ,
            pip = pip_mtr,
            cs = cs_mtr,
            name = colnames(effect_mtr)
              )
    VariantAnnotation::writeVcf(output_vcf,${_output:nr},index = TRUE)