# Summary statistics formatting
This notebook takes more than one collection of summary-statistics (sumstat) text files and produces a collection of per-gene merged `.rds` files that can serve as the input to both MASH and MVSuSiE analyses.

## Input
1. A sumstat list with columns "#chr", theme1, theme2, theme3; each cell not under "#chr" is the path to one sumstat file (generated by the yml generator).
2. region_list: a table with columns chr, start, end, gene_ID, used to partition the data.
## Output
1. 23 merged sumstat files in txt format, one per chromosome.
2. Merged sumstat files in rds format, one per gene.
3. Two files listing the outputs of items 1 and 2.

In [None]:
# Example invocation: run the `processing` workflow of this notebook on every
# merged VCF matched by the `ls` glob below. `-n` asks sos for a dry run.
sos run pipeline/sumstat_processing.ipynb processing \
    --sumstat_inv `ls /mnt/vast/hpc/csg/snuc_pseudo_bulk/eight_celltypes_analysis/output/data_intergration/TensorQTL/*norminal.cis_long_table.merged.vcf.gz` \
    -n 

In [None]:
[global]
# NOTE(review): glob is not referenced by any cell visible in this notebook.
import glob
# Working directory under which all outputs are written
parameter: cwd = path("./output")
# Container that provides the necessary packages (empty by default)
parameter: container = ''
# For cluster jobs, number of commands to run per job (used as trunk_size)
parameter: job_size = 1
# Expected wall clock time per task
parameter: walltime = "5h"
# Expected memory per task
parameter: mem = "16G"
# Number of threads (not referenced by the steps visible in this notebook)
parameter: numThreads = 8
# Paths of the merged summary-statistics VCF files (*.merged.vcf.gz) to process
parameter: sumstat_inv = paths
# Prefix of the final output list; defaults to the basename of the first input up to its first "."
parameter: name = f'{sumstat_inv[0]:b}'.split(".")[0]

In [None]:
[processing_1]
# Split each merged summary-statistics VCF into one RDS file per gene and
# write a two-column (gene, path) index of the RDS files that were produced.
input: sumstat_inv, group_by = 1
output: f'{cwd:a}/RDS/{_input:bnnn}.merged_rds.list'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime,  mem = mem, tags = f'{step_name}_{_output:bn}'
R: expand = "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container
    library("dplyr")
    library("tibble")
    library("purrr")
    library("readr")

    # Build list(bhat, sbhat, Z, snp) for the records named in `keep_snp`.
    #   stat, se : the STAT and SE genotype matrices of the VCF (records x samples)
    #   keep_snp : record names (row names of stat / se) belonging to one gene
    extract_data = function(stat, se, keep_snp){
        # drop = FALSE keeps the result a matrix even when only one record or
        # one sample column is selected; otherwise R returns a bare vector,
        # the row names are lost and the rownames<- assignments below fail.
        bhat = stat[keep_snp, , drop = FALSE]
        sbhat = se[keep_snp, , drop = FALSE]
        z = bhat / sbhat
        # Record names are split on the first ":" and the part after it is used as the SNP id.
        snp = map_chr(rownames(z), ~stringr::str_split(.x, pattern = ":", n = 2)[[1]][2])
        # Duplicated ids cannot be mapped one-to-one onto rows; fail with a clear message.
        if (anyDuplicated(snp) > 0) {
            stop("Duplicated SNP ids within one gene: ", paste(unique(snp[duplicated(snp)]), collapse = ", "))
        }
        rownames(bhat) = snp
        rownames(sbhat) = snp
        rownames(z) = snp
        return(list("bhat" = bhat, "sbhat" = sbhat, "Z" = z, "snp" = snp))
    }

    vcfObj = VariantAnnotation::readVcf("${_input}")
    # Extract the genotype matrices once instead of once per gene.
    stat = VariantAnnotation::geno(vcfObj)$STAT
    se = VariantAnnotation::geno(vcfObj)$SE
    info = VariantAnnotation::info(vcfObj) %>% as_tibble(rownames = "ID")
    GENE = unique(VariantAnnotation::info(vcfObj)$GENE) %>% as.list()

    # Inside filter() `GENE` is the INFO column; `.x` is the gene currently being processed.
    output_list = map(GENE, ~extract_data(stat, se, info %>% filter(GENE %in% .x) %>% pull(ID)))
    # One RDS path per gene, built from the output list name with its last two extensions removed.
    output_path = tibble(gene = unlist(GENE)) %>% mutate(path = paste0("${_output:nn}.", gene, ".rds"))
    walk2(output_list, output_path$path, ~saveRDS(.x, .y))
    # Headerless, tab-separated (gene, path) index consumed by the next step.
    output_path %>% write_delim("${_output}", "\t", col_names = FALSE)

In [None]:
[processing_2]
# Concatenate the per-input (gene, path) lists written by the previous step
# into a single tab-separated index that starts with a header line.
input: group_by = "all"
output: f'{cwd:a}/{name}.merged_rds.list'
bash: expand = "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container
    # Paths are interpolated with the :q converter so that names containing
    # spaces or shell metacharacters are passed to the shell safely.
    printf '#gene\t#path\n' > ${_output:q}
    cat ${_input:q} >> ${_output:q}

In [None]:
# Disabled legacy step: the section header below is commented out.
# NOTE(review): this cell uses `wd`, `theme`, `chrom`, `pos`, `variant_id`, `beta` and `se`,
# none of which are defined in the [global] section visible in this notebook (it defines `cwd`).
# Purpose: for each row of the sumstat list, merge the per-theme sumstat files into one table.
#[processing_1]
input:  for_each = "sumstat_inv"
output: f'{wd:a}/merge/{name}.{_sumstat_inv[0]}.merged.txt'
task: trunk_workers = 1, trunk_size = 20, walltime = '4h',  mem = '6G', tags = f'{step_name}_{_output:bn}'  
R: expand = "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container
    library("dplyr")
    library("tibble")
    library("purrr")
    library("readr")
    library("tidyr")
    library("stringr")
    ## Start
    # Theme names and file paths are interpolated as quoted, comma-separated R strings.
    Theme = c('$["','".join(theme)]')
    dir = c('$["','".join(_sumstat_inv[1:])]')
    # Read every tab-delimited file and keep only the position, id, effect and standard-error columns.
    tb = tibble(Theme = Theme, dir = dir)%>%mutate(data = map(dir,~read_delim(.x,"\t")%>%select(`$[chrom]`,$[pos],$[variant_id],$[beta],$[se])))
    # Inner-join all themes on chromosome, position and variant id, then rename the
    # effect and standard-error columns by pasting the Theme names onto them.
    data = tb$data%>%reduce(inner_join, by = c("$[chrom]","$[pos]","$[variant_id]"))%>%
      rename_if(str_detect(names(.),"$[beta]"), ~paste0("$[beta]_",Theme))%>%
      rename_if(str_detect(names(.),"$[se]"), ~paste0("$[se]_",Theme))
    # The merged table is written space-delimited.
    data%>%write_delim("$[_output]"," ")
# Disabled legacy step: the section header below is commented out.
# NOTE(review): `wd` and `sumstat_list` are not defined in the cells visible in this notebook.
# Purpose: write a tab-separated table pairing each "#chr" value of the sumstat list with an input file.
#[processing_2]
input: group_by = "all"
output: f'{wd:a}/merge/{name}_sumstat_list_per_chrom'
import pandas as pd
# Column "chr" comes from the "#chr" column of sumstat_list; "dir" holds the step inputs.
df = pd.DataFrame({"chr" : sumstat_list["#chr"], "dir" : _input})
# index = 0 suppresses the row index in the written file.
df.to_csv(_output,sep = "\t",index = 0)


In [None]:
# Disabled legacy step: the section header below is commented out.
# NOTE(review): `wd`, `pos`, `variant_id`, `beta` and `se` are not defined in the cells visible in this notebook.
# Purpose: for each region, subset the per-chromosome merged table and save bhat / sbhat / Z / snps as one RDS.
#[processing_3]
# Table of regions; the code below reads fields 0 (chromosome), 1 (position) and 3 (used in the output name)
parameter: region_list = path
# Parse the region list, skipping blank lines and lines starting with '#'
regions = [x.strip().split() for x in open(region_list).readlines() if x.strip() and not x.strip().startswith('#')]
# Half-width of the window kept around the region position
parameter: windows = 5000000
input: for_each = "regions"
output: f'{wd:a}/merge/RDS/{name}_{_regions[3]}.rds'
task: trunk_workers = 1, trunk_size = 1, walltime = '12h',  mem = '10G', tags = f'{step_name}_{_output:bn}'
R: expand= "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout',container = container
    library("tibble")
    library("readr")
    library("purrr")
    library("tidyr")
    library("dplyr")
    library("stringr")
    sumstat_list = read_delim("$[_input]","\t")
    # NOTE(review): the chromosome is interpolated unquoted, so a non-numeric value
    # (for example one starting with "chr") would be parsed by R as a symbol - confirm.
    sumstat_path = (sumstat_list%>%filter(chr == $[_regions[0]]))$dir
    # NOTE(review): both window bounds are computed from field 1 of the region; the upper
    # bound may have been meant to use field 2 (the region end) - confirm.
    sumstat_ftr = read_delim(sumstat_path,delim = " " )%>%filter( `$[pos]` >=  $[_regions[1]] - $[windows], `$[pos]` <= $[_regions[1]] + $[windows])%>%mutate($[variant_id] = str_remove($[variant_id],"chr"))
    output = list()
    # Effect-size matrix: one column per theme, with the column-name prefix removed
    output$bhat = as.matrix(sumstat_ftr%>%select(contains("$[beta]"))%>%rename_all(~str_replace(.,"$[beta]_","")))
    rownames(output$bhat) = (sumstat_ftr$$[variant_id])%>%unlist%>%as.character
    # Standard-error matrix, built the same way
    output$sbhat = as.matrix(sumstat_ftr%>%select(contains("$[se]"))%>%rename_all(~str_replace(.,"$[se]_","")))
    rownames(output$sbhat) = (sumstat_ftr$$[variant_id])%>%unlist%>%as.character
    # Z scores are the element-wise ratio of effect to standard error
    output$Z = output$bhat/output$sbhat
    #keep_index = which(!is.na(output$Z) && !is.nan(output$Z) && is.finite(output$Z))
    #output$bhat = output$bhat[keep_index]
    #output$sbhat = output$sbhat[keep_index]
    #output$Z = output$Z[keep_index]
    output$snps = rownames(output$bhat)
    output%>%saveRDS("$[_output]")

In [None]:
# Disabled legacy step: the section header below is commented out.
# NOTE(review): `wd` is not defined in the [global] section visible in this notebook (it defines `cwd`).
# Purpose: write the absolute paths of all input files to a headerless list, one path per line.
#[processing_4]
input: group_by = "all"
output: f'{wd:a}/merge/RDS/{name}.analysis_unit'
python: expand= "$[ ]", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout'
    import pandas as pd
    # The separator must be a real tab. With the letter "t" as separator pandas
    # wraps every path containing a "t" in double quotes.
    pd.DataFrame({"analysis_unit" : [$[_input:ar,]]}).to_csv("$[_output]",index = False ,header = False, sep = "\t")