# Sample Matcher
This module trims phenotype/genotype files so that they contain only the samples listed in a sample_lookup file, as described [in this ticket](https://github.com/cumc/xqtl-pipeline/issues/137).

## Input
1. sample_lookup file

    A tab-delimited table with two columns, `sample_name_in_pheno(and cov), sample_name_in_geno`, that lists only the OVERLAP between these data sets. It also serves as a sample-name matching file when the sample names in the two data sets do not agree.
    
2. phenotype_file
    
    A gct-like table: either the output of bulk_expression_QC or an external input, before gene_annotation.
    
    Note: when preparing input for normalization, phenotype_trimming needs to be run on geneCount and geneTPM independently.
    
3. genotype_file
    
    A plink bed/bim/fam file set (given as the path to the `.bed` file), as output by the VCF_QC module.

## Output
    
1. A gct-like phenoFile that is ready to be fed into Normalization/gene_annotation
2. A plink genoFile that is ready to be fed into downstream analysis

## MWE
**FIXME:WIP**

In [None]:
[global]
# The output directory for generated files. MUST BE FULL PATH
# NOTE(review): the default below is the relative path "output"; pass an absolute path explicitly.
parameter: cwd = path("output")
# Input 1: the sample lookup table, used as an input by every step in this notebook
parameter: sample_participant_lookup = path
# For cluster jobs, number of commands to run per job (passed to `task` as trunk_size)
parameter: job_size = 1
# Wall clock time expected (passed to `task` as walltime)
parameter: walltime = "5h"
# Memory expected (passed to `task` as mem; genotype_trimming also derives plink's --memory from it)
parameter: mem = "16G"
# Number of threads (passed to `task` as cores; genotype_trimming also passes it to plink --threads)
parameter: numThreads = 8
# Software container option (forwarded to each R/bash action)
parameter: container = ""
# NOTE(review): `name` is not referenced by any step visible in this notebook;
# presumably an optional label/prefix for outputs -- confirm before relying on it.
parameter: name = ""

In [None]:
[phenotype_trimming]
# Keep only the phenotype columns (samples) whose names appear in the first column of
# the sample lookup table, and write the result as a gzipped gct-like table.

# The molecular phenotype matrix
parameter: phenoFile = path
input: phenoFile,sample_participant_lookup
output: f'{_input[0]:nn}.sample_matched.gct.gz'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
R: expand = "$[ ]", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout', container = container
    library("dplyr")
    library("readr")
    # Lines starting with "#" (e.g. the gct header lines) are skipped when reading.
    phenoFile = read_delim($[_input[0]:ar], "\t", col_names = T, comment = "#")
    sample_lookup = read_delim($[_input[1]:ar], "\t" ,col_names = T, comment = "#")
    ## Make phenoFile consistent with sampleLookup, remove samples by select()
    int = intersect(colnames(phenoFile),unlist(sample_lookup[,1]))
    if (length(int) == 0) {
        warning("No sample in the phenotype file matches the first column of the sample lookup table; the output will contain no sample columns.")
    }
    ## Only the first (identifier) column plus the matched sample columns are retained.
    phenoFile_tmp = phenoFile%>%select(c(colnames(phenoFile)[1],all_of(int)))
    ## Add 2 header lines, https://github.com/getzlab/rnaseqc/blob/286f99dfd4164d33014241dd4f3149da0cddf5bf/src/RNASeQC.cpp#L426
    ## The second header field is the number of sample columns actually written, i.e. length(int);
    ## deriving it as ncol - 2 under-counts by one because only one identifier column is kept.
    cat(paste("#1.2\n#", nrow(phenoFile_tmp), length(int), "\n"), file=$[_output:nr], append=FALSE)
    ## The table is first written uncompressed to <output minus .gz> ...
    phenoFile_tmp%>%write_delim($[_output:nr],delim = "\t",col_names = T, append = T)
bash: expand = "${ }", stderr = f'{_output:nn}.stderr', stdout = f'{_output:nn}.stdout', container = container
    # ... and then compressed in place so that the declared *.gct.gz output actually exists.
    gzip -f ${_output:n}

In [None]:
[genotype_trimming]
# Subset a plink bed/bim/fam trio to the samples listed in the second column of the
# sample lookup table, writing a new trio under `cwd`.

# The path to bed of plink trio
parameter: genoFile = path
input: genoFile, sample_participant_lookup
output: f'{cwd}/{_input[0]:bn}.sample_filtered.bed'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output:bn}'
bash:  expand= "${ }", stderr = f'{_output[0]:n}.stderr', stdout = f'{_output[0]:n}.stdout', container = container
    # The temporary keep list lives next to the output rather than next to the input lookup file:
    # the input directory may be read-only, and a name derived only from the lookup file would be
    # shared (and deleted) by every genotype file processed against the same lookup.
    # The trap removes it even when plink exits with an error.
    trap 'rm -f ${_output:n}.keep_samples.txt' EXIT
    # Build a two-column FID/IID list by duplicating column 2 of the lookup table.
    # NOTE(review): this assumes FID == IID in the .fam file -- confirm for your genotype data.
    paste ${_input[1]} ${_input[1]} | cut -f 2,4 > ${_output:n}.keep_samples.txt
    # --memory is given as a whole number: 90% of `mem`, divided by 1e6.
    plink \
      --bfile ${_input[0]:n} \
      --keep ${_output:n}.keep_samples.txt \
      --make-bed \
      --out ${_output:n} \
      --threads ${numThreads} \
      --memory ${int(expand_size(mem) * 0.9) // 1000000}

In [None]:
[filtered_sample_list]
# Restrict the sample lookup table to samples present in both the genotype .fam file and the
# phenotype file header, and write (1) the filtered lookup table and (2) a two-column
# genotype sample list.

# A genotype fam file
parameter: genoFile = path
# A phenotype file, can be bed.gz or tsv
parameter: phenoFile = path
# Whether the phenoFile sample name was translated into genoFile sample name already.
parameter: translated_phenoFile = False
input: genoFile,phenoFile,sample_participant_lookup
output: f'{cwd}/{path(_input[2]):bn}.filtered.txt', f'{cwd}/{path(_input[2]):bn}.filtered_geno.txt'
task: trunk_workers = 1, trunk_size = job_size, walltime = walltime, mem = mem, cores = numThreads, tags = f'{step_name}_{_output[0]:bn}'
R: expand = "$[ ]", stderr = f'{_output[0]:nn}.stderr', stdout = f'{_output[0]:nn}.stdout', container = container
    library("dplyr")
    library("readr")
    # Read data
    # read.table splits on any run of whitespace, so both space- and tab-delimited .fam files
    # are parsed; IDs are kept as character so numeric-looking IDs are not reformatted.
    genoFam = read.table($[_input[0]:ar], header = FALSE, colClasses = "character", stringsAsFactors = FALSE)
    # Only the column names of the phenotype file are used below.
    phenoFile = read_delim($[_input[1]:ar], "\t", col_names = T)
    sample_lookup = read_delim($[_input[2]:ar], "\t" ,col_names = T, col_types = cols(.default = col_character()))
    ## Get pheno sample list and geno sample list
    # Column 2 of a .fam file is the within-family (individual) ID, so participants are matched
    # against V2. The phenotype header is matched on participant_id when the phenotype file has
    # already been translated to genotype sample names, otherwise on sample_id.
    sample_lookup = sample_lookup%>%filter( participant_id%in%genoFam$V2,$["participant_id" if translated_phenoFile else "sample_id"]%in%colnames(phenoFile))
    # Genotype sample list: header V1/V2, first column fixed to 0, second column the participant ID.
    # NOTE(review): the constant 0 is presumably a "missing family ID" placeholder -- confirm that
    # downstream consumers expect it rather than the FID stored in the .fam file.
    tibble(V1 = "0", V2 = sample_lookup$participant_id)%>%write_delim($[_output[1]:r],"\t")
    sample_lookup%>%write_delim($[_output[0]:r],"\t")