# transQTL analysis workflows 
This is the QTL association pipeline of the xQTL workflow. It generates:
1. Nominal cis-QTL summary statistics

and/or
    
2. Nominal trans-QTL summary statistics


In [2]:
[global]
import os
import pandas as pd
# Work directory & output directory
parameter: wd = path
# Container forwarded to every downstream workflow through --container
parameter: container = 'gaow/twas'
# name for the analysis output
parameter: name = 'ROSMAP'
# For cluster jobs, number commands to run per job
parameter: job_size = 1
# Wall clock time expected
parameter: walltime = "5h"
# Memory expected
parameter: mem = "16G"
# Number of threads
parameter: numThreads = 20
# Directory of the executable workflows (the notebooks launched with `sos run`)
parameter: exe_dir = path("~/GIT/ADSPFG-xQTL/workflow")
# yml template, forwarded to the downstream workflows through -c
parameter: yml = f'{exe_dir:d}/code/csg.yml'
# queue for analysis, forwarded through -q
parameter: queue = "csg"
# Number of submissions, forwarded through -J
parameter: J = 200
# Recipe file: a tab-delimited table whose first row names the list files read below
parameter: recipe = path

# QTL options

# cis window, forwarded to the cis workflows through --window
parameter: window = 500000

def read_list_file(list_file):
    """Return the whitespace-split fields of each non-blank, non-comment line.

    Lines that are empty or start with '#' (after stripping) are skipped.
    The file is opened in a `with` block so the handle is always closed.
    """
    with open(list_file) as handle:
        stripped_lines = [raw.strip() for raw in handle]
    return [line.split() for line in stripped_lines if line and not line.startswith('#')]

# Only the first row of the recipe is used as the default of each parameter
file_dict = pd.read_csv(recipe, delimiter="\t").to_dict("list")
parameter: molecular_pheno_chr_list = file_dict["molecular_pheno_chr_list"][0]
parameter: covariate_factor_pca = file_dict["covariate_factor_pca"][0]
parameter: grm_list = file_dict["grm_list"][0]
parameter: qced_plink_genotype_list = file_dict["qced_plink_genotype_list"][0]
parameter: qced_vcf_genotype_list = file_dict["qced_vcf_genotype_list"][0]

## The four lists must have the same number of entries (one per partition / chromosome),
## because they are combined row by row below.
molecular_pheno_chr_inv = pd.read_csv(molecular_pheno_chr_list,header = None)
grm_inv = read_list_file(grm_list)
qced_plink_genotype_inv = read_list_file(qced_plink_genotype_list)
qced_vcf_genotype_inv = read_list_file(qced_vcf_genotype_list)

# One row per partition. The steps below index each row as
# [0] phenotype entry, [1] covariate file, [2] GRM fields,
# [3] PLINK genotype fields, [4] VCF genotype fields
# (assumes the phenotype list has a single column -- TODO confirm).
data_chr_inv = molecular_pheno_chr_inv.assign(covariate_factor_pca = covariate_factor_pca ,
                                              grm_inv = grm_inv,
                                              qced_plink_genotype_inv = qced_plink_genotype_inv,
                                              qced_vcf_genotype_inv = qced_vcf_genotype_inv  ).values.tolist()

## APEX
This section provides the APEX option for cis and trans analysis, including a shared step that generates the LMM.

### APEX cis


In [None]:
# Example of a direct `apex cis` invocation with hard-coded test paths.
# NOTE(review): the VCF and GRM are for chr22 while the phenotype/output are for chr21 -- confirm this is intended.
apex cis --vcf /mnt/mfs/statgen/xqtl_workflow_testing/testing_3/Data_Processing/Genotype/PCC_vcf_geno/PCC_chr22.vcf.gz \
--bed /mnt/mfs/statgen/xqtl_workflow_testing/testing_3/Data_Processing/Phenotype/PCC.chr21.mol_phe.bed.gz \
--cov ./test.cov \
--out /mnt/mfs/statgen/xqtl_workflow_testing/testing_3/QTL_association/APEX/cis/PCC.chr21.mol_phe \
--grm  /mnt/mfs/statgen/xqtl_workflow_testing/testing_3/Data_Processing/Genotype/GRM/PCC_chr22.grm \
--theta-file /mnt/mfs/statgen/xqtl_workflow_testing/testing_3/QTL_association/APEX/cis/PCC.chr21.mol_phe.theta.gz \
--long

In [None]:
[APEX_cis_1]
# Launch the APEX_cis workflow of APEX.ipynb once per row of data_chr_inv.
# Output names are derived from the basename of row element [0] with ".bed.gz" removed.
# Row elements used: [0] phenotype, [1] covariate, [4][1] VCF genotype, [2][1] GRM.
# NOTE(review): unlike the other association steps, `recipe` is not declared as an input here -- confirm.
input:  for_each = "data_chr_inv"
output: f'{wd}/APEX/cis/{_data_chr_inv[0].split("/")[-1].replace(".bed.gz","")}.theta.gz',
        f'{wd}/APEX/cis/{_data_chr_inv[0].split("/")[-1].replace(".bed.gz","")}.cis_gene_table.txt.gz',
        f'{wd}/APEX/cis/{_data_chr_inv[0].split("/")[-1].replace(".bed.gz","")}.cis_sumstats.txt.gz',
        APEX_cis = f'{wd}/APEX/cis/{_data_chr_inv[0].split("/")[-1].replace(".bed.gz","")}.cis_long_table.reformated.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '24h',  mem = '40G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    sos run $[exe_dir]/QTL_association/APEX/APEX.ipynb APEX_cis \
            --wd $[wd]/APEX/cis/ \
            --container $[container] \
            --name $[name] \
            --molecular_pheno $[_data_chr_inv[0]] \
            --covariate $[_data_chr_inv[1]]\
            --genotype_file $[_data_chr_inv[4][1]] \
            --grm $[_data_chr_inv[2][1]] \
            --window $[window] -J $[J] -q $[queue] -c $[yml]

### APEX trans

In [None]:
[APEX_trans_1]
# Launch the APEX_trans workflow of APEX.ipynb once per row of data_chr_inv.
# Output names are derived from the basename of row element [0] with ".bed.gz" removed.
# All declared outputs live under the same directory that is passed as --wd.
# Row elements used: [0] phenotype, [1] covariate, [4][1] VCF genotype, [2][1] GRM
# (same indexing as APEX_cis_1; [1] is a plain path and [2] is a field list).
input: recipe, for_each = "data_chr_inv"
output: f'{wd}/APEX/trans/{_data_chr_inv[0].split("/")[-1].replace(".bed.gz","")}.theta.gz',
        f'{wd}/APEX/trans/{_data_chr_inv[0].split("/")[-1].replace(".bed.gz","")}.trans_gene_table.txt.gz',
        APEX_trans = f'{wd}/APEX/trans/{_data_chr_inv[0].split("/")[-1].replace(".bed.gz","")}.trans_long_table.txt.gz'
task: trunk_workers = 1, trunk_size = 1, walltime = '24h',  mem = '40G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    sos run $[exe_dir]/QTL_association/APEX/APEX.ipynb APEX_trans \
            --wd $[wd]/APEX/trans/ \
            --container $[container] \
            --name $[name] \
            --molecular_pheno $[_data_chr_inv[0]] \
            --covariate $[_data_chr_inv[1]] \
            --genotype_file $[_data_chr_inv[4][1]] \
            --grm $[_data_chr_inv[2][1]] -J $[J] -q $[queue] -c $[yml]

## TensorQTL
This section provides the TensorQTL option for cis and trans analysis.

### TensorQTL Cis


In [None]:
[TensorQTL_cis_1]
# Launch the tensor_cis workflow of TensorQTL.ipynb once per row of data_chr_inv.
# Output names are derived from the basename of row element [0] with ".bed.gz" removed.
# Row elements used: [0] phenotype, [1] covariate, [3][1] PLINK genotype.
# NOTE(review): the "emprical" / "norminal" spellings in the output names presumably mirror
# the file names written by the downstream workflow -- confirm before renaming.
input: recipe, for_each = "data_chr_inv"
output: f'{wd}/TensorQTL/cis/{_data_chr_inv[0].split("/")[-1].replace(".bed.gz","")}.emprical.cis_sumstats.txt',
        TensorQTL_cis = f'{wd}/TensorQTL/cis/{_data_chr_inv[0].split("/")[-1].replace(".bed.gz","")}.norminal.cis_long_table.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '24h',  mem = '40G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    sos run $[exe_dir]/QTL_association/TensorQTL/TensorQTL.ipynb tensor_cis \
            --wd $[wd]/TensorQTL/cis/ \
            --container $[container] \
            --Prefix $[name] \
            --molecular_pheno $[_data_chr_inv[0]] \
            --covariate $[_data_chr_inv[1]]\
            --genotype_file $[_data_chr_inv[3][1]] \
            --window $[window] -J $[J] -q $[queue] -c $[yml]

### TensorQTL Trans

In [None]:
[TensorQTL_trans_1]
# Launch the tensor_trans workflow of TensorQTL.ipynb once per row of data_chr_inv.
# The output name is derived from the basename of row element [0] with ".bed.gz" removed.
# --wd points at the trans directory so that it matches the declared output location.
# Row elements used: [0] phenotype (the full entry, as in the other steps),
# [1] covariate, [3][1] PLINK genotype.
input: recipe, for_each = "data_chr_inv"
output:  TensorQTL_trans = f'{wd}/TensorQTL/trans/{_data_chr_inv[0].split("/")[-1].replace(".bed.gz","")}.trans_sumstats.txt'
task: trunk_workers = 1, trunk_size = 1, walltime = '24h',  mem = '40G', tags = f'{step_name}_{_output[0]:bn}'
bash: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    sos run $[exe_dir]/QTL_association/TensorQTL/TensorQTL.ipynb tensor_trans \
            --wd $[wd]/TensorQTL/trans/ \
            --container $[container] \
            --Prefix $[name] \
            --molecular_pheno $[_data_chr_inv[0]] \
            --covariate $[_data_chr_inv[1]] \
            --genotype_file $[_data_chr_inv[3][1]]  -J $[J] -q $[queue] -c $[yml]

## Output Recipe Generation
This step generates a list that documents all the outputs of this stage, so that they are easily accessible to the next step.

In [2]:
[APEX_cis_Recipe]
# Gather every "APEX_cis" output of APEX_cis_1 into a single-column TSV ("sumstat_dir").
input: output_from("APEX_cis_1")["APEX_cis"], group_by = "all"
output: f'{wd}/{name}.APEX_cis_QTL_recipe.tsv'
python: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    import pandas as pd
    # One entry per input file; the paths are interpolated by SoS before the script runs
    sumstat_paths = [$[_input:r,]]
    recipe_table = pd.DataFrame({"sumstat_dir": sumstat_paths})
    recipe_table.to_csv("$[_output]", index=False, sep="\t")

In [None]:
[APEX_trans_Recipe]
# Gather every "APEX_trans" output of APEX_trans_1 into a single-column TSV ("sumstat_dir").
input: output_from("APEX_trans_1")["APEX_trans"], group_by = "all"
output: f'{wd}/{name}.APEX_trans_QTL_recipe.tsv'
python: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    import pandas as pd
    # One entry per input file; the paths are interpolated by SoS before the script runs
    sumstat_paths = [$[_input:r,]]
    recipe_table = pd.DataFrame({"sumstat_dir": sumstat_paths})
    recipe_table.to_csv("$[_output]", index=False, sep="\t")

In [None]:
[TensorQTL_cis_Recipe]
# Gather every "TensorQTL_cis" output of TensorQTL_cis_1 into a single-column TSV ("sumstat_dir").
input: output_from("TensorQTL_cis_1")["TensorQTL_cis"], group_by = "all"
output: f'{wd}/{name}.TensorQTL_cis_QTL_recipe.tsv'
python: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    import pandas as pd
    # One entry per input file; the paths are interpolated by SoS before the script runs
    sumstat_paths = [$[_input:r,]]
    recipe_table = pd.DataFrame({"sumstat_dir": sumstat_paths})
    recipe_table.to_csv("$[_output]", index=False, sep="\t")

In [None]:
[TensorQTL_trans_Recipe]
# Gather every "TensorQTL_trans" named output into a single-column TSV ("sumstat_dir").
# The recipe gets its own TensorQTL_trans file name so that it does not collide with
# the file written by APEX_cis_Recipe.
input: named_output("TensorQTL_trans"), group_by = "all"
output: f'{wd}/{name}.TensorQTL_trans_QTL_recipe.tsv'
python: expand = "$[ ]", stderr = f'{_output[0]}.stderr', stdout = f'{_output[0]}.stdout'
    import pandas as pd
    # One entry per input file; the paths are interpolated by SoS before the script runs
    data_tempt = pd.DataFrame({
    "sumstat_dir" : [$[_input:r,]]
    })
    data_tempt.to_csv("$[_output]",index = False,sep = "\t" )