# Covariate data formatting
This module merges the output of the factor analysis (and/or PCA) with the known covariates into a single covariate file that can be fed into both APEX and tensorQTL.

## Input
1. A factor + covariate file, as output by the PEER or BiCV factor module. It is assumed to have the columns `#id` followed by the sample names, with each row being a covariate or a factor (factor rows start with `factor_`).

1. pca file as output from the PCA module

## Output
1. PCA + Factor + Covariate file

## Minimal Working Example

An MWE is uploaded to [google drive](https://drive.google.com/drive/folders/1yjTwoO0DYGi-J9ouMsh9fHKfDmsXJ_4I?usp=sharing).
The singularity image (sif) for running this MWE is uploaded to [google drive](https://drive.google.com/drive/folders/1mLOS3AVQM8yTaWtCbO8Q3xla98Nr5bZQ)

In [None]:

# Merge the first 3 PCs (--k 3) of the PCA output with the covariate file,
# passing a missingness tolerance of 0.3 (--tol_cov).
sos run pipeline/covariate_formatting.ipynb merge_pca_covariate \
        --cwd output \
        --pcaFile ../genotype/MWE.pca.rds \
        --covFile MWE.covariate.cov.gz \
        --tol_cov 0.3  \
        --k 3 \
        --container containers/bioinfo.sif

# Same workflow, but --k is read from the scree table: the awk/tail/cut chain
# takes column 1 of the last row whose third column is below 0.7.
sos run /home/hs3163/GIT/xqtl-pipeline/pipeline/covariate_formatting.ipynb merge_pca_covariate \
        --cwd output/data_preprocessing/MWE/covariates \
        --pcaFile output/data_preprocessing/MWE/pca/MWE.MWE.related.filtered.extracted.pca.projected.rds \
        --covFile  MWE.covariate.cov.gz \
        --tol_cov 0.3  \
        --k `awk '$3 < 0.7' output/data_preprocessing/MWE/pca/MWE.MWE.related.filtered.extracted.pca.projected.scree.txt | tail -1 | cut -f 1 ` \
        --container containers/bioinfo.sif


# Regress the covariates out of the molecular phenotype and write the
# residuals as a bgzip-compressed, tabix-indexed BED file.
sos run pipeline/covariate_formatting.ipynb compute_residual \
        --cwd output \
        --phenoFile MWE.log2cpm.mol_phe.bed.gz \
        --covFile MWE.covariate.cov.gz \
        --container containers/bioinfo.sif

# Stack the rows of the factor file under the rows of the covariate file,
# keeping only the columns present in both.
sos run pipeline/covariate_formatting.ipynb merge_factor_covariate \
        --cwd output \
        --factorFile ALL.covariate.pca.BiCV.cov.gz \
        --covFile MWE.covariate.cov.gz  \
        --container containers/bioinfo.sif

The default file name can be overwritten by the `--name` parameter, as demonstrated below:

In [None]:
# With --name "demo" the merged file is written as <cwd>/demo.gz instead of
# the default name derived from the factor and covariate file names.
sos run pipeline/covariate_formatting.ipynb merge_factor_covariate \
        --cwd output \
        --factorFile ALL.covariate.pca.BiCV.cov.gz \
        --covFile MWE.covariate.cov.gz  \
        --container containers/bioinfo.sif --name "demo"

## Command Interface

In [4]:
sos run covariate_formatting.ipynb -h

usage: sos run covariate_formatting.ipynb
               [workflow_name | -t targets] [options] [workflow_options]
  workflow_name:        Single or combined workflows defined in this script
  targets:              One or more targets to generate
  options:              Single-hyphen sos parameters (see "sos run -h" for details)
  workflow_options:     Double-hyphen workflow-specific parameters

Workflows:
  merge_pca_covariate
  compute_residual
  merge_factor_covariate

Global Workflow Options:
  --cwd output (as path)
                        The output directory for generated files.
  --covFile VAL (as path, required)
                        The covariate file
  --job-size 1 (as int)
                        For cluster jobs, number commands to run per job
  --walltime 5h
                        Wall clock time expected
  --mem 2G
                        Memory expected
  --numThreads 8 (as int)
                        Number of threads
  --container ''
                        Softwa

In [2]:
[global]
# The output directory for generated files.
parameter: cwd = path("output")
# The covariate file (required).
parameter: covFile = path
# For cluster jobs, the number of commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "2G"
# Number of threads
parameter: numThreads = 8
# Software container option
parameter: container = ""
# The number of external covariates to include: -1 includes all of them, 0 includes none of them
# and keeps only the header (i.e. the step merely formats the PCs).
parameter: nCov = -1

# Tolerance of missingness in covariates. -1 means stop with an error if anything is missing; otherwise covariates
# with a missing rate larger than tol_cov are removed and those with a smaller missing rate are mean-imputed.
parameter: tol_cov = -1.0

# Resolve cwd to an absolute path so that later steps can use it directly.
cwd = path(f"{cwd:a}")

In [None]:
[merge_pca_covariate]
# Merge the first k principal components with the covariate table (one row per
# covariate / PC, one column per sample) and write the result as a tab-delimited file.

# The PCA file: an RDS file as output by the PCA module
parameter: pcaFile = path
# The number of PCs to retain (default 20); in practice this should be the number of PCs that capture more than 70% PVE
parameter: k = 20
parameter:name = f'{covFile:bn}.{pcaFile:bn}'
# Outliers
parameter: outliersFile = path(".") 
parameter: remove_outliers = False
## stop if no outliersFile was provided.
stop_if(remove_outliers and not outliersFile.is_file(), msg = "No outliers file specified, please add outliers file or remove the remove-outliers flag")
input: pcaFile, covFile
output:  f'{cwd:a}/{name}.gz'
task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container
        library("dplyr")
        library("tibble")
        library("readr")

        # Fraction of missing entries in a vector.
        compute_missing <- function(x) {
          sum(is.na(x)) / length(x)
        }

        # Replace the NAs of every column with that column's mean.
        mean_impute <- function(mtx) {
          col_means <- apply(mtx, 2, function(x) mean(x, na.rm = TRUE))
          # seq_along() keeps the loop empty when no column is left
          for (i in seq_along(col_means)) {
            mtx[is.na(mtx[, i]), i] <- col_means[i]
          }
          mtx
        }

        # Drop the columns whose missing rate exceeds missing_rate_thresh,
        # then mean-impute what is still missing in the remaining columns.
        filter_mtx <- function(X, missing_rate_thresh) {
          rm_col <- which(apply(X, 2, compute_missing) > missing_rate_thresh)
          # drop = FALSE keeps the matrix shape when a single column survives
          if (length(rm_col) > 0) X <- X[, -rm_col, drop = FALSE]
          mean_impute(X)
        }

        ## PC scores: transpose so that rows are PCs and columns are samples
        pca_output <- readRDS("$[_input[0]]")$pc_scores
        mtx <- pca_output %>% select(contains("PC")) %>% t()
        colnames(mtx) <- pca_output$IID

        ## Keep only the number of PCs specified
        n_pc <- $[k]
        if (n_pc < 1 || n_pc > nrow(mtx)) {
          stop("k = ", n_pc, " is not valid: the PCA file provides ", nrow(mtx), " PCs")
        }
        # drop = FALSE so that k = 1 still yields a one-row matrix
        mtx <- mtx[seq_len(n_pc), , drop = FALSE]
        mtx <- mtx %>% as_tibble(rownames = "#id")

        cov <- read_delim("$[_input[1]]", "\t")
        colnames(cov)[1] <- "#id"

        ## Retain only the overlapping samples ("#id" is shared by construction)
        int <- intersect(colnames(cov), colnames(mtx))
        ### Removal of outliers if needed (sample names are read from column 2)
        if ($["TRUE" if remove_outliers else "FALSE"]) {
          outlier <- read_delim("$[outliersFile]", "\t", col_names = FALSE)$X2
          int <- setdiff(int, outlier)
        }
        if (length(setdiff(int, "#id")) == 0) {
          stop("No samples are overlapped in two files!")
        }

        # Index by name with `[` rather than select(int): select() would pick
        # a column literally called "int" if one existed.
        cov <- cov[, int]
        # Keep only the desired number of covariates: a negative nCov keeps all,
        # 0 keeps only the header; never index past the last row.
        n_cov <- $[nCov]
        if (n_cov >= 0) {
          cov <- cov[seq_len(min(n_cov, nrow(cov))), ]
        }
        mtx <- mtx[, int]
        output <- bind_rows(cov, mtx)

        ## Handle missingness in the covariates
        tol_cov <- $[tol_cov]
        output <- as.data.frame(output)
        rownames(output) <- output$`#id`
        values <- output[, -1, drop = FALSE]
        if (tol_cov < 0) {
          # Strict mode: refuse any missing value and skip filtering entirely.
          # (Filtering with a negative threshold would remove every column.)
          if (sum(is.na(output)) > 0) {
            stop("NA in covariates/PCs input: Check input file or raise parameter tol_cov to allow for imputation & filtering")
          }
        } else {
          # Transpose so that covariates are the columns seen by filter_mtx():
          # a covariate with too many missing samples is removed and the rest
          # is imputed with the mean of that covariate across samples.
          values <- t(filter_mtx(t(as.matrix(values)), tol_cov))
        }
        values %>%
          as_tibble(rownames = "#id") %>%
          write_delim("$[_output]", "\t")

In [None]:
[compute_residual_1]
# Path to the input molecular phenotype data.
parameter: phenoFile = path
parameter:name = f'{phenoFile:bnn}.{covFile:bn}'
input: phenoFile, covFile
output: f'{cwd}/{name}.resid.bed'
task: trunk_workers = 1, trunk_size = 1, walltime = '4h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
R: expand = "${ }", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout' , container = container

    library(dplyr)
    library(readr)

    # Both inputs are tab-delimited tables with one column per sample.
    pheno <- read_delim(${_input[0]:r}, delim = "\t")
    covariate <- read_delim(${_input[1]:r}, delim = "\t")

    # Samples are matched by column name; anything absent from the covariate
    # header (e.g. previously removed outliers) is dropped here.
    shared_samples <- intersect(colnames(pheno), colnames(covariate))
    if (length(shared_samples) == 0) {
      stop("No samples are overlapped in two files!")
    }

    # Report the samples that enter the regression.
    print("Listed samples are included in the analysis:")
    print(shared_samples)

    # Design matrix: samples in rows, covariates in columns.
    design <- t(as.matrix(covariate[, shared_samples]))
    # The first four columns of the phenotype table are carried over untouched.
    pheno_id <- select(pheno, 1:4)
    # Response matrix: samples in rows (same order as the design), features in columns.
    response <- t(as.matrix(select(pheno, rownames(design))))

    # Least-squares fit with an intercept column; keep the residuals only.
    pheno_resid <- .lm.fit(x = cbind(1, design), y = response)$residuals

    # Back to features in rows and samples in columns, prefixed by the id columns.
    pheno_output <- cbind(pheno_id, t(pheno_resid))
    write_delim(pheno_output, ${_output[0]:r}, delim = "\t")

# tabix via samtools
[compute_residual_2]
# Compress the residual BED file produced by the previous step and index it.
output: f'{_input}.gz'
task: trunk_workers = 1, trunk_size = 1, walltime = '4h',  mem = '20G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand = "${ }", stderr = f'{_output}.stderr', stdout = f'{_output}.stdout', container = container
    # -f lets bgzip overwrite an existing .gz left by an earlier run
    bgzip -f ${_input}
    # Build the tabix index using the BED preset
    tabix -p bed ${_output}

In [3]:
[merge_factor_covariate]
# Stack the rows of the factor file under the rows of the covariate file,
# restricted to the columns (row id + samples) that both files share.

parameter: factorFile = path
parameter:name = f'{factorFile:bnn}.{covFile:bn}'
input: factorFile, covFile
output:  f'{cwd:a}/{name}.gz'
task: trunk_workers = 1, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand= "$[ ]", stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout', container = container
        library("dplyr")
        library("readr")
        peer_res <- read_delim("$[_input[0]]", delim = "\t")
        cov_pca <- read_delim("$[_input[1]]", delim = "\t")

        ## Keep only common samples
        com_col <- intersect(colnames(peer_res), colnames(cov_pca))

        # The first column of each table is taken to be the row identifier
        # (e.g. "#id"), so it does not count as a sample. Without this check a
        # sample-name mismatch would silently produce an id-only or empty file.
        id_cols <- c(colnames(peer_res)[1], colnames(cov_pca)[1])
        if (length(setdiff(com_col, id_cols)) == 0) {
          stop("No samples are overlapped in two files!")
        }

        # Covariate rows first, factor rows after, in the covariate column order
        merged <- rbind(cov_pca[, com_col], peer_res[, com_col])
        write_delim(merged, "$[_output]", "\t")