<a id="19"></a> 
# Variant calling by the consensus approach

<a id="20"></a> 
## Variant calling

<a id="21"></a> 
### Call variants
This script uses the GATK HaplotypeCaller tool to generate a single-sample vcf (a gVCF written with `-ERC BP_RESOLUTION`) for each sample.

In [None]:
# Writes the HaplotypeCaller SLURM job script and a list of sbatch commands
# (one per line of fastq_list) that submit it.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/HaplotypeCaller
JOB_NAME=HaplotypeCaller

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself and "\\" becomes a line continuation.
cat > "$HOME_DIRECTORY/scripts/HaplotypeCaller.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=30G
#SBATCH --time=6-00:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Only the base name of the first argument is used to build file names.
sample=\$(basename "\$1")

module load java/jdk1.8.0_231
module load GATK/4.9.1

gatk HaplotypeCaller \\
  -R $REF_FASTA \\
  -I $SCRATCH_DIRECTORY/bam_files/"\$sample"_viewed_marked_addg.bam \\
  -O $HOME_DIRECTORY/vcf_files/"\$sample".g.vcf.gz \\
  -ploidy 2 \\
  -ERC BP_RESOLUTION \\
  --dont-use-soft-clipped-bases
EOF

# One sbatch line per row of fastq_list; only the third whitespace-separated
# column is passed to the job. The value is handed to printf as an argument,
# never as part of the format string.
while read -r _ _ input_file; do
  printf '\nsbatch %s %s\n' "$HOME_DIRECTORY/scripts/HaplotypeCaller.sh" "$input_file"
done < "$HOME_DIRECTORY/fastq_list" > "$HOME_DIRECTORY/scripts/HaplotypeCaller_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/HaplotypeCaller_submit_jobs


<a id="22"></a> 
### Import variants to a GenomicsDB
This script uses the GATK GenomicsDBImport tool to import single-sample vcfs into a GenomicsDB before joint genotyping. Specifically, the single-sample vcfs of the same regime are imported into one GenomicsDB (six single-sample vcfs per regime); in total, three GenomicsDBs are generated, corresponding to the three regimes (T1, T2, T5).

In [None]:
# Writes the GenomicsDBImport SLURM job script and a list of sbatch commands
# (one per line of regime_list) that submit it.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
REF_FASTA_INTERVALS=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz_contigs.intervals
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/GenomicsDBImport
JOB_NAME=GenomicsDBImport

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself and "\\" becomes a line continuation.
cat > "$HOME_DIRECTORY/scripts/GenomicsDBImport.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=80G
#SBATCH --time=0-24:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Only the base name of the first argument is used to build file names.
regime=\$(basename "\$1")

module load java/jdk1.8.0_231
module load GATK/4.9.1

gatk GenomicsDBImport \\
  --genomicsdb-workspace-path $HOME_DIRECTORY/vcf_handling/GenomicsDBImport_database/"\$regime"_database \\
  --tmp-dir $HOME_DIRECTORY/vcf_handling/GenomicsDBImport_temp/"\$regime"_temp \\
  --batch-size 10 \\
  --sample-name-map $HOME_DIRECTORY/"\$regime"_sample_map \\
  --reader-threads 10 \\
  -L $REF_FASTA_INTERVALS \\
  --showHidden True \\
  --merge-input-intervals True
EOF

# One sbatch line per row of regime_list. The value is handed to printf as an
# argument, never as part of the format string.
while read -r regime; do
  printf '\nsbatch %s %s\n' "$HOME_DIRECTORY/scripts/GenomicsDBImport.sh" "$regime"
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/GenomicsDBImport_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/GenomicsDBImport_submit_jobs


<a id="23"></a> 
### Joint genotyping
This script uses the GATK GenotypeGVCFs tool to perform joint genotyping and produce a multi-sample vcf for each GenomicsDB (i.e. for each regime).

In [None]:
# Writes the GenotypeGVCFs SLURM job script and a list of sbatch commands
# (one per line of regime_list) that submit it.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/GenotypeGVCFs
JOB_NAME=GenotypeGVCFs

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself and "\\" becomes a line continuation.
cat > "$HOME_DIRECTORY/scripts/GenotypeGVCFs.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=30G
#SBATCH --time=0-24:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Only the base name of the first argument is used to build file names.
regime=\$(basename "\$1")

module load java/jdk1.8.0_231
module load GATK/4.9.1

gatk GenotypeGVCFs \\
  -R $REF_FASTA \\
  -V gendb://$HOME_DIRECTORY/vcf_handling/GenomicsDBImport_database/"\$regime"_database \\
  -G StandardAnnotation \\
  -O $HOME_DIRECTORY/vcf_handling/GenotypeGVCFs/"\$regime"_GenotypeGVCFs.g.vcf.gz \\
  --include-non-variant-sites
EOF

# One sbatch line per row of regime_list. The value is handed to printf as an
# argument, never as part of the format string.
while read -r regime; do
  printf '\nsbatch %s %s\n' "$HOME_DIRECTORY/scripts/GenotypeGVCFs.sh" "$regime"
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/GenotypeGVCFs_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/GenotypeGVCFs_submit_jobs



<a id="24"></a> 
## Derive the coverage limits for each sample
This section calculates the mean coverage, as well as the upper and lower coverage limits, for each sample to facilitate [coverage filtering](#30).

<a id="25"></a> 
### Derive coverage for each site for each sample
This script uses the bcftools annotate tool to strip every annotation except the coverage (FORMAT/DP field), leaving the per-site coverage of each sample.

In [None]:
# Writes the bcftools-annotate SLURM job script and a list of sbatch commands
# (one per line of regime_list) that submit it.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/bcftools_annotate
JOB_NAME=bcftools_annotate

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself and "\\" becomes a line continuation.
cat > "$HOME_DIRECTORY/scripts/bcftools_annotate.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=6G
#SBATCH --time=0-24:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Only the base name of the first argument is used to build file names.
regime=\$(basename "\$1")

module load bcftools/1.15.1

# -x with a leading ^ removes every annotation except the listed one.
bcftools annotate -x '^FORMAT/DP' \\
  $HOME_DIRECTORY/vcf_handling/GenotypeGVCFs/"\$regime"_GenotypeGVCFs.g.vcf.gz \\
  -o $HOME_DIRECTORY/vcf_handling/bcftools_annotate/"\$regime"_bcftools_annotate_DP
EOF

# One sbatch line per row of regime_list. The value is handed to printf as an
# argument, never as part of the format string.
while read -r regime; do
  printf '\nsbatch %s %s\n' "$HOME_DIRECTORY/scripts/bcftools_annotate.sh" "$regime"
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/bcftools_annotate_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/bcftools_annotate_submit_jobs


<a id="26"></a> 
### Calculate mean coverage across sites for each sample

In [None]:
## Derive mean DP for each sample
# Writes the DP_range SLURM job script and a list of sbatch commands (one per
# line of regime_sample_field_for_DP) that submit it.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/DP_range
JOB_NAME=DP_range

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself.
cat > "$HOME_DIRECTORY/scripts/DP_range.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=3G
#SBATCH --time=0-12:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Arguments: regime, sample ID, and the column of the regime table to extract.
regime=\$(basename "\$1")
sample_ID=\$(basename "\$2")
field=\$3

dp_dir=$HOME_DIRECTORY/vcf_handling/bcftools_annotate
sample_dp=\$dp_dir/\${sample_ID}_bcftools_annotate_DP

# Drop the header lines, then keep only the requested column.
sed '/^#/d' "\$dp_dir/\${regime}_bcftools_annotate_DP" | cut -f"\$field" > "\$sample_dp"

# Mean of the extracted column. An empty input is reported and fails the job
# instead of aborting awk with a division by zero.
awk '{ total += \$1 } END { if (NR == 0) { print "no sites in input" > "/dev/stderr"; exit 1 } print total/NR }' "\$sample_dp" > "\${sample_dp}_mean"
EOF

# One sbatch line per row of regime_sample_field_for_DP. Values are handed to
# printf as arguments, never as part of the format string.
while read -r regime sample_ID field; do
  printf '\nsbatch %s %s %s %s\n' "$HOME_DIRECTORY/scripts/DP_range.sh" "$regime" "$sample_ID" "$field"
done < "$HOME_DIRECTORY/regime_sample_field_for_DP" > "$HOME_DIRECTORY/scripts/DP_range_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/DP_range_submit_jobs


<a id="27"></a> 
### Calculate the upper and lower coverage limits for each sample
This script calculates the upper (2 * mean coverage) and the lower (0.5 * mean coverage) limits for each sample, converts the numeric values to integers, and stores the results in a table.

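**Interpreting the table:** each row of `DP_range` describes one sample. Columns 1–3 are copied from `regime_sample_field_for_DP` (the regime, the sample ID and the field number passed to `DP_range.sh`), column 4 is the mean coverage, columns 5 and 6 are the lower (0.5 * mean) and upper (2 * mean) limits, and columns 7 and 8 are those two limits truncated to integers.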

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

DP_DIR=$HOME_DIRECTORY/vcf_handling/bcftools_annotate

#######################################
# Append each sample's mean coverage to its row of the sample list.
# The mean is looked up by sample ID (<ID>_bcftools_annotate_DP_mean), so the
# result does not depend on the order in which a glob would list the files.
# Arguments: $1 - list file with columns: regime, sample ID, field
#            $2 - directory holding the *_bcftools_annotate_DP_mean files
# Outputs:   tab-separated rows "regime sample field mean" on stdout;
#            a warning on stderr for every sample without a usable mean file
#            (that sample is left out rather than shifting the other rows)
#######################################
join_mean_coverage() {
  local list_file=$1 mean_dir=$2
  local regime sample_ID field mean_file
  while read -r regime sample_ID field; do
    [[ -n $sample_ID ]] || continue
    mean_file=$mean_dir/$(basename "$sample_ID")_bcftools_annotate_DP_mean
    if [[ ! -s $mean_file ]]; then
      printf 'warning: no mean coverage for %s (%s); sample skipped\n' "$sample_ID" "$mean_file" >&2
      continue
    fi
    printf '%s\t%s\t%s\t%s\n' "$regime" "$sample_ID" "$field" "$(< "$mean_file")"
  done < "$list_file"
}

join_mean_coverage "$HOME_DIRECTORY/regime_sample_field_for_DP" "$DP_DIR" > "$DP_DIR/DP_range"

# Columns 5/6: lower (0.5 * mean) and upper (2 * mean) limits.
# Columns 7/8: the same limits truncated to integers. The second awk reads the
# limits as printed by the first one, exactly as the two separate passes did.
awk '{ $5 = $4 * 0.5; $6 = $4 * 2; print $0 }' "$DP_DIR/DP_range" \
  | awk '{ $7 = int($5); $8 = int($6); print $0 }' > "$HOME_DIRECTORY/DP_range_temp" \
  && mv "$HOME_DIRECTORY/DP_range_temp" "$HOME_DIRECTORY/DP_range"


<a id="28"></a> 
## Variant filtering

<a id="29"></a> 
### Derive SNP sites
This script uses the GATK SelectVariants tool to select sites that are SNPs.

In [None]:
# Writes the SelectVariants (SNP) SLURM job script and a list of sbatch
# commands (one per line of regime_list) that submit it.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/SelectVariants_SNP
JOB_NAME=SelectVariants_SNP

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself and "\\" becomes a line continuation.
cat > "$HOME_DIRECTORY/scripts/SelectVariants_SNP.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=12G
#SBATCH --time=00-06:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Only the base name of the first argument is used to build file names.
regime=\$(basename "\$1")

module load java/jdk1.8.0_231
module load GATK/4.9.1

gatk SelectVariants \\
  -R $REF_FASTA \\
  -V $HOME_DIRECTORY/vcf_handling/GenotypeGVCFs/"\$regime"_GenotypeGVCFs.g.vcf.gz \\
  -O $HOME_DIRECTORY/vcf_handling/SelectVariants_SNP/"\$regime"_SelectVariants_SNP.g.vcf.gz \\
  --select-type-to-include SNP \\
  --select-type-to-include NO_VARIATION \\
  --exclude-non-variants false
EOF

# One sbatch line per row of regime_list. The value is handed to printf as an
# argument, never as part of the format string.
while read -r regime; do
  printf '\nsbatch %s %s\n' "$HOME_DIRECTORY/scripts/SelectVariants_SNP.sh" "$regime"
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/SelectVariants_SNP_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/SelectVariants_SNP_submit_jobs



<a id="30"></a> 
### Coverage filtering
This section filters the SNPs to keep those whose coverage lies between the lower and upper coverage limits of each sample.

<a id="31"></a> 
#### Split the multi-sample vcf to single-sample vcf
This script uses the bcftools split tool to split the multi-sample vcf containing SNPs for each regime (i.e. the output from [SelectVariants](#29)) into six single-sample vcfs; in total, 18 single-sample vcf files (six per regime) are generated.

In [None]:
# Writes the bcftools-split SLURM job script and a list of sbatch commands
# that submit it once per regime.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/bcftools_split
JOB_NAME=bcftools_split

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself.
cat > "$HOME_DIRECTORY/scripts/bcftools_split.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=3G
#SBATCH --time=0-04:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Arguments: the regime, followed by the IDs of the samples to index.
regime=\$(basename "\$1")
shift

out_dir=$HOME_DIRECTORY/vcf_handling/bcftools_DP/\$regime

module load bcftools/1.15.1
module load htslib/1.15.1

# The split writes the per-sample files of the whole regime in one go, so it
# runs once per regime instead of once per sample.
bcftools +split $HOME_DIRECTORY/vcf_handling/SelectVariants_SNP/"\$regime"_SelectVariants_SNP.g.vcf.gz -O z -o "\$out_dir"/

for sample_ID in "\$@"; do
  tabix -p vcf "\$out_dir/\$(basename "\$sample_ID").vcf.gz"
done
EOF

#######################################
# Collapse "regime sample ..." rows into one row per regime.
# Inputs:  stdin - rows whose first two columns are regime and sample ID
# Outputs: stdout - "regime sample1 sample2 ..." per regime, regimes and
#          samples in order of first appearance; rows with fewer than two
#          columns are ignored
#######################################
group_samples_by_regime() {
  awk '
    NF < 2 { next }
    !($1 in members) { order[++n] = $1 }
    { members[$1] = members[$1] " " $2 }
    END { for (i = 1; i <= n; i++) print order[i] members[order[i]] }
  '
}

# One sbatch line per regime. Submitting one job per sample would make several
# jobs rewrite the same per-sample files at the same time.
group_samples_by_regime < "$HOME_DIRECTORY/regime_sample_field_for_DP" \
  | while read -r regime samples; do
      printf '\nsbatch %s %s %s\n' "$HOME_DIRECTORY/scripts/bcftools_split.sh" "$regime" "$samples"
    done > "$HOME_DIRECTORY/scripts/bcftools_split_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/bcftools_split_submit_jobs


<a id="32"></a> 
#### Coverage filtering for each sample
This script uses the bcftools view tool to derive sites that pass the sample's coverage filters for each sample (i.e. sites with coverages within 0.5* and 2* of the mean coverage of the sample are kept; see [Calculate the upper and lower coverage limits for each sample](#27)). The output of this script is a single-sample vcf for each sample.

In [None]:
# Writes the per-sample coverage-filter SLURM job script and a list of sbatch
# commands (one per line of DP_range) that submit it.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/bcftools_DP
JOB_NAME=bcftools_view_DP

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself.
cat > "$HOME_DIRECTORY/scripts/bcftools_view_DP.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=3G
#SBATCH --time=0-02:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Arguments: regime, sample ID, lower and upper coverage limit (both exclusive).
regime=\$(basename "\$1")
sample_ID=\$(basename "\$2")
min_DP=\$3
max_DP=\$4

sample_prefix=$HOME_DIRECTORY/vcf_handling/bcftools_DP/\$regime/\$sample_ID

module load bcftools/1.15.1
module load htslib/1.15.1

# -O z is given explicitly so the .vcf.gz output is bgzip-compressed (tabix
# needs that) without relying on the output type being guessed from the name.
bcftools view -i "FMT/DP > \$min_DP & FMT/DP < \$max_DP" "\${sample_prefix}.vcf.gz" -O z -o "\${sample_prefix}_DP.vcf.gz"

tabix -p vcf "\${sample_prefix}_DP.vcf.gz"
EOF

# One sbatch line per row of DP_range: columns 1 and 2 (regime, sample ID) and
# columns 7 and 8 (integer lower and upper limits). Values are handed to printf
# as arguments, never as part of the format string.
while read -r regime sample_ID _ _ _ _ min_DP max_DP; do
  printf '\nsbatch %s %s %s %s %s\n' "$HOME_DIRECTORY/scripts/bcftools_view_DP.sh" "$regime" "$sample_ID" "$min_DP" "$max_DP"
done < "$HOME_DIRECTORY/DP_range" > "$HOME_DIRECTORY/scripts/bcftools_view_DP_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/bcftools_view_DP_submit_jobs


<a id="33"></a> 
#### Intersect to derive a multi-sample vcf for each regime
This script uses the bcftools isec tool to generate a multi-sample vcf for each regime containing the SNPs that pass the sample-specific coverage filters for all six samples within the regime; in total, three multi-sample vcf files are generated, corresponding to the three regimes (T1, T2, T5).

In [None]:
# Writes the bcftools-isec SLURM job script and a list of sbatch commands
# (one per line of regime_sample_wide) that submit it.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/bcftools_DP
JOB_NAME=bcftools_intersect

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself.
cat > "$HOME_DIRECTORY/scripts/bcftools_intersect.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=18G
#SBATCH --time=0-8:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Arguments: the regime, followed by the IDs of all samples of that regime.
regime=\$(basename "\$1")
shift

dp_dir=$HOME_DIRECTORY/vcf_handling/bcftools_DP/\$regime

# Coverage-filtered single-sample vcf of every sample given on the command line.
sample_vcfs=()
for sample in "\$@"; do
  sample_vcfs+=("\$dp_dir/\$(basename "\$sample")_DP.vcf.gz")
done

module load bcftools/1.15.1
module load htslib/1.15.1

# 1. generate the intersection within regime for each regime; this produces a vcf containing sites that passed the sample-specific DP filters for all samples within a regime
#    (-n is set to the number of samples given, so a site must be present in every one of them)

bcftools isec -n=\${#sample_vcfs[@]} -w1 "\${sample_vcfs[@]}" -o "\$dp_dir/\${regime}_bcftools_isec.vcf.gz" -O z

tabix -p vcf "\$dp_dir/\${regime}_bcftools_isec.vcf.gz"

# 2. generate the intersection between 1 and the multi-sample vcf containing SNPs generated by SelectVariants

bcftools isec -n=2 -w1 $HOME_DIRECTORY/vcf_handling/SelectVariants_SNP/"\$regime"_SelectVariants_SNP.g.vcf.gz "\$dp_dir/\${regime}_bcftools_isec.vcf.gz" -o "\$dp_dir/\${regime}_SNP_DPfiltered.vcf.gz" -O z

tabix -p vcf "\$dp_dir/\${regime}_SNP_DPfiltered.vcf.gz"
EOF

# One sbatch line per row of regime_sample_wide (regime followed by its sample
# IDs). The row is re-joined with single spaces and handed to printf as an
# argument, never as part of the format string.
while read -r -a fields; do
  (( ${#fields[@]} )) || continue
  printf '\nsbatch %s %s\n' "$HOME_DIRECTORY/scripts/bcftools_intersect.sh" "${fields[*]}"
done < "$HOME_DIRECTORY/regime_sample_wide" > "$HOME_DIRECTORY/scripts/bcftools_intersect_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/bcftools_intersect_submit_jobs



<a id="34"></a> 
### Further filtering and hard-filtering
This script uses the multi-sample vcf for each regime generated from the [coverage filtering step](#33) as input and further filters to derive biallelic sites where all samples of the same regime are genotyped (Step 1; see below) and for sites that pass the five hard-filter combinations (Step 2; see below), and outputs a tab-delimited table for each hard-filter combination (Step 3). Total 15 tables, five for each regime, are generated.

Specifically, the three steps are:
**Step 1:** Use GATK SelectVariants tool to derive biallelic sites where all samples of the same regime are genotyped. This step uses the multi-sample vcf for each regime from the [coverage filtering step](#33) and outputs a multi-sample vcf for each regime for Step 2.
**Step 2:** Use GATK VariantFiltration and bcftools view tools to derive sites that pass the hard-filters. Total five hard-filter combinations are designed (defined [here](#3)). This step uses the multi-sample vcf generated in Step 1 as input and outputs five resultant multi-sample vcf files for each input vcf, with each of the resultant vcf corresponding to a hard-filter combination.
**Step 3:** Use GATK VariantsToTable tool to extract INFO and FORMAT fields from the input vcf and outputs a tab-delimited table. This step uses the multi-sample vcf generated in Step 2 as input and outputs a tab-delimited table for each input vcf. Total five tables, each corresponding to a hard-filter combination, are generated for each regime. 

In [None]:
# Writes the hard-filtering SLURM job script and a list of sbatch commands
# (one per line of VariantFiltration_parameter_combinations) that submit it.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/VariantFiltration
JOB_NAME=VariantFiltration

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself and "\\" becomes a line continuation.
cat > "$HOME_DIRECTORY/scripts/VariantFiltration.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=12G
#SBATCH --time=0-06:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

regime=\$(basename "\$1")
parameter_combination=\$(basename "\$2")
QD_threshold=\$3
FS_threshold=\$4
SOR_threshold=\$5
MQ_threshold=\$6
MQRankSum_threshold=\$7
ReadPosRankSum_threshold=\$8

dp_prefix=$HOME_DIRECTORY/vcf_handling/bcftools_DP/\$regime/\${regime}_SNP_DPfiltered
vf_prefix=$HOME_DIRECTORY/vcf_handling/VariantFiltration/\${regime}_SNP_DPfiltered_AllCalled_BIALLELIC_VariantFiltration_\${parameter_combination}

# The Step 1 output carries the parameter combination in its name: the jobs of
# one regime run side by side, and a shared file name would let one job
# overwrite the file while another job is reading it in Step 2.
biallelic_vcf=\${dp_prefix}_AllCalled_BIALLELIC_\${parameter_combination}.vcf

module load java/jdk1.8.0_231
module load GATK/4.9.1

# 1. Filters to derive biallelic SNP sites where all samples within the regime are called

gatk SelectVariants -R $REF_FASTA \\
  -V "\${dp_prefix}.vcf.gz" \\
  --max-nocall-number 0 \\
  --restrict-alleles-to BIALLELIC \\
  -O "\$biallelic_vcf"

# 2. Filters to derive sites that passed the hard-filters

gatk VariantFiltration -R $REF_FASTA \\
  -V "\$biallelic_vcf" \\
  -O "\${vf_prefix}.vcf.gz" \\
  --filter "QD < \$QD_threshold" \\
  --filter-name "QD_threshold" \\
  --filter "FS > \$FS_threshold" \\
  --filter-name "FS_threshold" \\
  --filter "SOR > \$SOR_threshold" \\
  --filter-name "SOR_threshold" \\
  --filter "MQ < \$MQ_threshold" \\
  --filter-name "MQ_threshold" \\
  --filter "MQRankSum < \$MQRankSum_threshold" \\
  --filter-name "MQRankSum_threshold" \\
  --filter "ReadPosRankSum < \$ReadPosRankSum_threshold" \\
  --filter-name "ReadPosRankSum_threshold"

module load bcftools/1.15.1
module load htslib/1.15.1

# -O z is given explicitly so the .vcf.gz output is bgzip-compressed (tabix
# needs that) without relying on the output type being guessed from the name.
bcftools view -i 'FILTER="PASS"' "\${vf_prefix}.vcf.gz" -O z -o "\${vf_prefix}_PASS.vcf.gz"

tabix -p vcf "\${vf_prefix}_PASS.vcf.gz"

# 3. VariantsToTable

gatk VariantsToTable -R $REF_FASTA \\
  -V "\${vf_prefix}_PASS.vcf.gz" \\
  -O "\${vf_prefix}_PASS.table" \\
  -F CHROM -F POS -F REF -F ALT -F QD -F QUAL -F DP \\
  -F MQRankSum -F ReadPosRankSum -F MQ -F FS -F SOR -F AC -F AF -F AN -F HOM-REF -F HET -F HOM-VAR -F NO-CALL -F TRANSITION -F TYPE -F VAR \\
  -GF GT -GF AD -GF DP -GF GQ
EOF

# One sbatch line per row of VariantFiltration_parameter_combinations: regime,
# parameter combination and the six thresholds, in the order the job script
# reads them. Values are handed to printf as arguments, never as part of the
# format string.
while read -r regime combination qd fs sor mq mq_rank_sum read_pos_rank_sum; do
  printf '\nsbatch %s %s %s %s %s %s %s %s %s\n' "$HOME_DIRECTORY/scripts/VariantFiltration.sh" \
    "$regime" "$combination" "$qd" "$fs" "$sor" "$mq" "$mq_rank_sum" "$read_pos_rank_sum"
done < "$HOME_DIRECTORY/VariantFiltration_parameter_combinations" > "$HOME_DIRECTORY/scripts/VariantFiltration_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/VariantFiltration_submit_jobs



<a id="35"></a> 
### Derive heterozygous singletons
This section uses the tables generated in the [hard-filtering step](#34) as input and further filters them to derive sites where the mutant allele is found in only one of all samples in each regime (i.e. singletons), sites where the mutant sample is heterozygous and the non-mutant samples are homozygous (for the same allele, either reference or alternate), and sites where the allelic ratio of the mutant allele is between 0.25 and 0.75 in the mutant sample, and 0 in the non-mutant samples of the same regime.

Specifically, this section contains three scripts:
**[Script 1:](#36)** Split the GT and AD columns to facilitate filtering.
**[Script 2:](#37)** Derive singletons. Two types of singletons are derived: either the mutant sample is heterozygous and the non-mutant samples are homozygous for the reference allele, or the mutant sample is heterozygous and the non-mutant samples are homozygous for the alternate allele.
**[Script 3:](#38)** Derive sites where the allelic ratio of the mutant allele is between 0.25 and 0.75 in the mutant sample, and 0 in the non-mutant samples of the same regime.

<a id="36"></a> 
#### Split GT and AD columns

In [None]:
# Writes the column-splitting SLURM job script and a list of sbatch commands
# (one per line of VariantFiltration_parameter_combinations) that submit it.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/VariantFiltration
JOB_NAME=split_columns

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself and "\\" yields a single backslash.
cat > "$HOME_DIRECTORY/scripts/split_columns.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=3G
#SBATCH --time=0-01:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

regime=\$(basename "\$1")
parameter_combination=\$(basename "\$2")

table_prefix=$HOME_DIRECTORY/vcf_handling/VariantFiltration/\${regime}_SNP_DPfiltered_AllCalled_BIALLELIC_VariantFiltration_\${parameter_combination}_PASS

## split GT columns (on "|" and "/") and AD columns (on ",")
## The three substitutions run in this order on every line, in a single pass
## and without intermediate files.

sed -e 's/|/\\t/g' -e 's/\\//\\t/g' -e 's/,/\\t/g' "\${table_prefix}.table" > "\${table_prefix}_splited.table"
EOF

# One sbatch line per row of VariantFiltration_parameter_combinations; only the
# first two columns (regime, parameter combination) are passed on. Values are
# handed to printf as arguments, never as part of the format string.
while read -r regime combination _; do
  printf '\nsbatch %s %s %s\n' "$HOME_DIRECTORY/scripts/split_columns.sh" "$regime" "$combination"
done < "$HOME_DIRECTORY/VariantFiltration_parameter_combinations" > "$HOME_DIRECTORY/scripts/split_columns_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/split_columns_submit_jobs


<a id="37"></a> 
#### Derive singletons

In [None]:
# Writes the singleton-selection SLURM job script and a list of sbatch commands
# (one per line of VariantFiltration_parameter_combinations) that submit it.

HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/VariantFiltration
JOB_NAME=singletons

# Unquoted here-document: plain $VAR is filled in now (when the job script is
# written); \$ is kept for the job itself (including the awk field numbers).
cat > "$HOME_DIRECTORY/scripts/singletons.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=3G
#SBATCH --time=0-01:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

regime=\$(basename "\$1")
parameter_combination=\$(basename "\$2")

table_prefix=$HOME_DIRECTORY/vcf_handling/VariantFiltration/\${regime}_SNP_DPfiltered_AllCalled_BIALLELIC_VariantFiltration_\${parameter_combination}_PASS_splited

# Columns 16, 17 and 18 are the HOM-REF, HET and HOM-VAR fields, in the order
# VariantsToTable was asked to write them.

## HOM-REF == 5, HET == 1

awk '(\$16 == 5) && (\$17 == 1) && (\$18 == 0)' "\${table_prefix}.table" > "\${table_prefix}_singletons_HOMREF_HET.table"

## HOM-VAR == 5, HET == 1

awk '(\$16 == 0) && (\$17 == 1) && (\$18 == 5)' "\${table_prefix}.table" > "\${table_prefix}_singletons_HOMVAR_HET.table"
EOF

# One sbatch line per row of VariantFiltration_parameter_combinations; only the
# first two columns (regime, parameter combination) are passed on. Values are
# handed to printf as arguments, never as part of the format string.
while read -r regime combination _; do
  printf '\nsbatch %s %s %s\n' "$HOME_DIRECTORY/scripts/singletons.sh" "$regime" "$combination"
done < "$HOME_DIRECTORY/VariantFiltration_parameter_combinations" > "$HOME_DIRECTORY/scripts/singletons_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/singletons_submit_jobs



<a id="38"></a> 
#### Filter on the allelic ratio of the mutant allele
This script performs further filtering to derive sites where the allelic ratio of mutant allele is between 0.25 and 0.75 in the mutant sample, and no mutant allele is found in non-mutant samples of the same regime.


In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu


less $HOME_DIRECTORY/regime_parameter_type_sample1 | while read a b c; do printf "
## for sample1, HOM-REF/HET
awk ' (\$23 == \$3) && (\$24 == \$4) && (\$29 == \$3) && (\$30 == \$3) && (\$35 == \$3) && (\$36 == \$3) && (\$41 == \$3) && (\$42 == \$3) && (\$47 == \$3) && (\$48 == \$3) && (\$53 == \$3) && (\$54 == \$3) && (\$26/(\$26+\$25) >= 0.25) && (\$26/(\$26+\$25) <= 0.75) && (\$32 == 0) && (\$38 == 0) && (\$44 == 0) && (\$50 == 0) && (\$56 == 0) ' $HOME_DIRECTORY/vcf_handling/VariantFiltration/"$a"_SNP_DPfiltered_AllCalled_BIALLELIC_VariantFiltration_"$b"_PASS_splited_singletons_HOMREF_HET.table > $HOME_DIRECTORY/de_novo_mutation/consensus_approach/"$c"_"$b"_HOMREF_HET.table
## for sample1, HOM-VAR/HET
awk ' (\$23 == \$3) && (\$24 == \$4) && (\$29 == \$4) && (\$30 == \$4) && (\$35 == \$4) && (\$36 == \$4) && (\$41 == \$4) && (\$42 == \$4) && (\$47 == \$4) && (\$48 == \$4) && (\$53 == \$4) && (\$54 == \$4) && (\$26/(\$26+\$25) >= 0.25) && (\$26/(\$26+\$25) <= 0.75) && (\$31 == 0) && (\$37 == 0) && (\$43 == 0) && (\$49 == 0) && (\$55 == 0) ' $HOME_DIRECTORY/vcf_handling/VariantFiltration/"$a"_SNP_DPfiltered_AllCalled_BIALLELIC_VariantFiltration_"$b"_PASS_splited_singletons_HOMVAR_HET.table > $HOME_DIRECTORY/de_novo_mutation/consensus_approach/"$c"_"$b"_HOMVAR_HET.table
"; done > $HOME_DIRECTORY/scripts/singletons_filtering_sample1




less $HOME_DIRECTORY/regime_parameter_type_sample2 | while read a b c; do printf "
## for sample2, HOM-REF/HET
awk ' (\$23 == \$3) && (\$24 == \$3) && (\$29 == \$3) && (\$30 == \$4) && (\$35 == \$3) && (\$36 == \$3) && (\$41 == \$3) && (\$42 == \$3) && (\$47 == \$3) && (\$48 == \$3) && (\$53 == \$3) && (\$54 == \$3) && (\$32/(\$32+\$31) >= 0.25) && (\$32/(\$32+\$31) <= 0.75) && (\$26 == 0) && (\$38 == 0) && (\$44 == 0) && (\$50 == 0) && (\$56 == 0) ' $HOME_DIRECTORY/vcf_handling/VariantFiltration/"$a"_SNP_DPfiltered_AllCalled_BIALLELIC_VariantFiltration_"$b"_PASS_splited_singletons_HOMREF_HET.table > $HOME_DIRECTORY/de_novo_mutation/consensus_approach/"$c"_"$b"_HOMREF_HET.table
## for sample2, HOM-VAR/HET
awk ' (\$23 == \$4) && (\$24 == \$4) && (\$29 == \$3) && (\$30 == \$4) && (\$35 == \$4) && (\$36 == \$4) && (\$41 == \$4) && (\$42 == \$4) && (\$47 == \$4) && (\$48 == \$4) && (\$53 == \$4) && (\$54 == \$4) && (\$32/(\$32+\$31) >= 0.25) && (\$32/(\$32+\$31) <= 0.75) && (\$25 == 0) && (\$37 == 0) && (\$43 == 0) && (\$49 == 0) && (\$55 == 0) ' $HOME_DIRECTORY/vcf_handling/VariantFiltration/"$a"_SNP_DPfiltered_AllCalled_BIALLELIC_VariantFiltration_"$b"_PASS_splited_singletons_HOMVAR_HET.table > $HOME_DIRECTORY/de_novo_mutation/consensus_approach/"$c"_"$b"_HOMVAR_HET.table
"; done > $HOME_DIRECTORY/scripts/singletons_filtering_sample2



# Generate $HOME_DIRECTORY/scripts/singletons_filtering_sample3.
# For every "a b c" row of regime_parameter_type_sample3 two awk commands are
# emitted: one filters the <a>..._<b>_..._HOMREF_HET.table, the other the
# matching HOMVAR_HET.table, each into consensus_approach/<c>_<b>_*.table.
#
# The awk programs live in single-quoted variables (no escaping of `$`) and are
# handed to printf as %s arguments, so nothing read from the list file is ever
# interpreted as part of the printf format.
#
# In both programs fields 35/36 are the only pair required to be ($3, $4); all
# other listed pairs must both equal $3 (HOMREF) or $4 (HOMVAR), and
# $38/($38+$37) must lie in [0.25, 0.75].
# NOTE(review): presumably $3/$4 are REF/ALT and fields 37/38 are the depths of
# sample3 -- confirm against the table header. A row that passes the equality
# tests with $37 and $38 both 0 would abort awk with a division by zero.
homref_het_filter=' ($23 == $3) && ($24 == $3) && ($29 == $3) && ($30 == $3) && ($35 == $3) && ($36 == $4) && ($41 == $3) && ($42 == $3) && ($47 == $3) && ($48 == $3) && ($53 == $3) && ($54 == $3) && ($38/($38+$37) >= 0.25) && ($38/($38+$37) <= 0.75) && ($26 == 0) && ($32 == 0) && ($44 == 0) && ($50 == 0) && ($56 == 0) '
homvar_het_filter=' ($23 == $4) && ($24 == $4) && ($29 == $4) && ($30 == $4) && ($35 == $3) && ($36 == $4) && ($41 == $4) && ($42 == $4) && ($47 == $4) && ($48 == $4) && ($53 == $4) && ($54 == $4) && ($38/($38+$37) >= 0.25) && ($38/($38+$37) <= 0.75) && ($25 == 0) && ($31 == 0) && ($43 == 0) && ($49 == 0) && ($55 == 0) '

while read -r a b c; do
  # Shared path stems; only the _HOMREF_HET/_HOMVAR_HET suffix differs.
  table_prefix="$HOME_DIRECTORY/vcf_handling/VariantFiltration/${a}_SNP_DPfiltered_AllCalled_BIALLELIC_VariantFiltration_${b}_PASS_splited_singletons"
  out_prefix="$HOME_DIRECTORY/de_novo_mutation/consensus_approach/${c}_${b}"

  printf '\n'
  printf '## for sample3, HOM-REF/HET\n'
  printf "awk '%s' %s > %s\n" "$homref_het_filter" "${table_prefix}_HOMREF_HET.table" "${out_prefix}_HOMREF_HET.table"
  printf '## for sample3, HOM-VAR/HET\n'
  printf "awk '%s' %s > %s\n" "$homvar_het_filter" "${table_prefix}_HOMVAR_HET.table" "${out_prefix}_HOMVAR_HET.table"
done < "$HOME_DIRECTORY/regime_parameter_type_sample3" > "$HOME_DIRECTORY/scripts/singletons_filtering_sample3"




# Generate $HOME_DIRECTORY/scripts/singletons_filtering_sample4.
# For every "a b c" row of regime_parameter_type_sample4 two awk commands are
# emitted: one filters the <a>..._<b>_..._HOMREF_HET.table, the other the
# matching HOMVAR_HET.table, each into consensus_approach/<c>_<b>_*.table.
#
# The awk programs live in single-quoted variables (no escaping of `$`) and are
# handed to printf as %s arguments, so nothing read from the list file is ever
# interpreted as part of the printf format.
#
# In both programs fields 41/42 are the only pair required to be ($3, $4); all
# other listed pairs must both equal $3 (HOMREF) or $4 (HOMVAR), and
# $44/($44+$43) must lie in [0.25, 0.75].
# NOTE(review): presumably $3/$4 are REF/ALT and fields 43/44 are the depths of
# sample4 -- confirm against the table header. A row that passes the equality
# tests with $43 and $44 both 0 would abort awk with a division by zero.
homref_het_filter=' ($23 == $3) && ($24 == $3) && ($29 == $3) && ($30 == $3) && ($35 == $3) && ($36 == $3) && ($41 == $3) && ($42 == $4) && ($47 == $3) && ($48 == $3) && ($53 == $3) && ($54 == $3) && ($44/($44+$43) >= 0.25) && ($44/($44+$43) <= 0.75) && ($26 == 0) && ($32 == 0) && ($38 == 0) && ($50 == 0) && ($56 == 0) '
homvar_het_filter=' ($23 == $4) && ($24 == $4) && ($29 == $4) && ($30 == $4) && ($35 == $4) && ($36 == $4) && ($41 == $3) && ($42 == $4) && ($47 == $4) && ($48 == $4) && ($53 == $4) && ($54 == $4) && ($44/($44+$43) >= 0.25) && ($44/($44+$43) <= 0.75) && ($25 == 0) && ($31 == 0) && ($37 == 0) && ($49 == 0) && ($55 == 0) '

while read -r a b c; do
  # Shared path stems; only the _HOMREF_HET/_HOMVAR_HET suffix differs.
  table_prefix="$HOME_DIRECTORY/vcf_handling/VariantFiltration/${a}_SNP_DPfiltered_AllCalled_BIALLELIC_VariantFiltration_${b}_PASS_splited_singletons"
  out_prefix="$HOME_DIRECTORY/de_novo_mutation/consensus_approach/${c}_${b}"

  printf '\n'
  printf '## for sample4, HOM-REF/HET\n'
  printf "awk '%s' %s > %s\n" "$homref_het_filter" "${table_prefix}_HOMREF_HET.table" "${out_prefix}_HOMREF_HET.table"
  printf '## for sample4, HOM-VAR/HET\n'
  printf "awk '%s' %s > %s\n" "$homvar_het_filter" "${table_prefix}_HOMVAR_HET.table" "${out_prefix}_HOMVAR_HET.table"
done < "$HOME_DIRECTORY/regime_parameter_type_sample4" > "$HOME_DIRECTORY/scripts/singletons_filtering_sample4"




# Generate $HOME_DIRECTORY/scripts/singletons_filtering_sample5.
# For every "a b c" row of regime_parameter_type_sample5 two awk commands are
# emitted: one filters the <a>..._<b>_..._HOMREF_HET.table, the other the
# matching HOMVAR_HET.table, each into consensus_approach/<c>_<b>_*.table.
#
# The awk programs live in single-quoted variables (no escaping of `$`) and are
# handed to printf as %s arguments, so nothing read from the list file is ever
# interpreted as part of the printf format.
#
# In both programs fields 47/48 are the only pair required to be ($3, $4); all
# other listed pairs must both equal $3 (HOMREF) or $4 (HOMVAR), and
# $50/($49+$50) must lie in [0.25, 0.75].
# NOTE(review): presumably $3/$4 are REF/ALT and fields 49/50 are the depths of
# sample5 -- confirm against the table header. A row that passes the equality
# tests with $49 and $50 both 0 would abort awk with a division by zero.
homref_het_filter=' ($23 == $3) && ($24 == $3) && ($29 == $3) && ($30 == $3) && ($35 == $3) && ($36 == $3) && ($41 == $3) && ($42 == $3) && ($47 == $3) && ($48 == $4) && ($53 == $3) && ($54 == $3) && ($50/($49+$50) >= 0.25) && ($50/($49+$50) <= 0.75) && ($26 == 0) && ($32 == 0) && ($38 == 0) && ($44 == 0) && ($56 == 0) '
homvar_het_filter=' ($23 == $4) && ($24 == $4) && ($29 == $4) && ($30 == $4) && ($35 == $4) && ($36 == $4) && ($41 == $4) && ($42 == $4) && ($47 == $3) && ($48 == $4) && ($53 == $4) && ($54 == $4) && ($50/($49+$50) >= 0.25) && ($50/($49+$50) <= 0.75) && ($25 == 0) && ($31 == 0) && ($37 == 0) && ($43 == 0) && ($55 == 0) '

while read -r a b c; do
  # Shared path stems; only the _HOMREF_HET/_HOMVAR_HET suffix differs.
  table_prefix="$HOME_DIRECTORY/vcf_handling/VariantFiltration/${a}_SNP_DPfiltered_AllCalled_BIALLELIC_VariantFiltration_${b}_PASS_splited_singletons"
  out_prefix="$HOME_DIRECTORY/de_novo_mutation/consensus_approach/${c}_${b}"

  printf '\n'
  printf '## for sample5, HOM-REF/HET\n'
  printf "awk '%s' %s > %s\n" "$homref_het_filter" "${table_prefix}_HOMREF_HET.table" "${out_prefix}_HOMREF_HET.table"
  printf '## for sample5, HOM-VAR/HET\n'
  printf "awk '%s' %s > %s\n" "$homvar_het_filter" "${table_prefix}_HOMVAR_HET.table" "${out_prefix}_HOMVAR_HET.table"
done < "$HOME_DIRECTORY/regime_parameter_type_sample5" > "$HOME_DIRECTORY/scripts/singletons_filtering_sample5"



# Generate $HOME_DIRECTORY/scripts/singletons_filtering_sample6.
# For every "a b c" row of regime_parameter_type_sample6 two awk commands are
# emitted: one filters the <a>..._<b>_..._HOMREF_HET.table, the other the
# matching HOMVAR_HET.table, each into consensus_approach/<c>_<b>_*.table.
#
# The awk programs live in single-quoted variables (no escaping of `$`) and are
# handed to printf as %s arguments, so nothing read from the list file is ever
# interpreted as part of the printf format.
#
# In both programs fields 53/54 are the only pair required to be ($3, $4); all
# other listed pairs must both equal $3 (HOMREF) or $4 (HOMVAR), and
# $56/($55+$56) must lie in [0.25, 0.75].
# NOTE(review): presumably $3/$4 are REF/ALT and fields 55/56 are the depths of
# sample6 -- confirm against the table header. A row that passes the equality
# tests with $55 and $56 both 0 would abort awk with a division by zero.
homref_het_filter=' ($23 == $3) && ($24 == $3) && ($29 == $3) && ($30 == $3) && ($35 == $3) && ($36 == $3) && ($41 == $3) && ($42 == $3) && ($47 == $3) && ($48 == $3) && ($53 == $3) && ($54 == $4) && ($56/($55+$56) >= 0.25) && ($56/($55+$56) <= 0.75) && ($26 == 0) && ($32 == 0) && ($38 == 0) && ($44 == 0) && ($50 == 0) '
homvar_het_filter=' ($23 == $4) && ($24 == $4) && ($29 == $4) && ($30 == $4) && ($35 == $4) && ($36 == $4) && ($41 == $4) && ($42 == $4) && ($47 == $4) && ($48 == $4) && ($53 == $3) && ($54 == $4) && ($56/($55+$56) >= 0.25) && ($56/($55+$56) <= 0.75) && ($25 == 0) && ($31 == 0) && ($37 == 0) && ($43 == 0) && ($49 == 0) '

while read -r a b c; do
  # Shared path stems; only the _HOMREF_HET/_HOMVAR_HET suffix differs.
  table_prefix="$HOME_DIRECTORY/vcf_handling/VariantFiltration/${a}_SNP_DPfiltered_AllCalled_BIALLELIC_VariantFiltration_${b}_PASS_splited_singletons"
  out_prefix="$HOME_DIRECTORY/de_novo_mutation/consensus_approach/${c}_${b}"

  printf '\n'
  printf '## for sample6, HOM-REF/HET\n'
  printf "awk '%s' %s > %s\n" "$homref_het_filter" "${table_prefix}_HOMREF_HET.table" "${out_prefix}_HOMREF_HET.table"
  printf '## for sample6, HOM-VAR/HET\n'
  printf "awk '%s' %s > %s\n" "$homvar_het_filter" "${table_prefix}_HOMVAR_HET.table" "${out_prefix}_HOMVAR_HET.table"
done < "$HOME_DIRECTORY/regime_parameter_type_sample6" > "$HOME_DIRECTORY/scripts/singletons_filtering_sample6"

#sh -e $HOME_DIRECTORY/scripts/singletons_filtering_sample1
#sh -e $HOME_DIRECTORY/scripts/singletons_filtering_sample2
#sh -e $HOME_DIRECTORY/scripts/singletons_filtering_sample3
#sh -e $HOME_DIRECTORY/scripts/singletons_filtering_sample4
#sh -e $HOME_DIRECTORY/scripts/singletons_filtering_sample5
#sh -e $HOME_DIRECTORY/scripts/singletons_filtering_sample6

<a id="39"></a> 
### Derive the number of candidate germline mutations for each sample

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# Count the lines of every filtered candidate table and aggregate the counts.
# Steps:
#   1. generate a helper script with one `wc -l ... >> counts` line per table,
#   2. run it,
#   3. turn each "<count> <path>" line into "<count> <d> <b> <c>",
#   4. sum the counts per (field 2, field 3) pair and replace the counts file
#      with "<field2> <field3> <sum> <number of lines summed>".
approach_dir="$HOME_DIRECTORY/de_novo_mutation/consensus_approach"
counts_file="$HOME_DIRECTORY/de_novo_mutation/consensus_approach_number_GM_before_manual_inspection"
count_script="$HOME_DIRECTORY/scripts/HaplotypeCaller_number_GM_before_manual_inspection"

# Start from an empty file. The helper script appends, and after a previous
# run this file already holds aggregated rows, so appending to it again would
# mix the two formats and corrupt the sums.
: > "$counts_file"

# Columns d..i of each row name the tables to count; b and c complete the
# file name <name>_<b>_<c>.table. Column a is read but not used here.
while read -r a b c d e f g h i; do
  printf '\n'
  for table_owner in "$d" "$e" "$f" "$g" "$h" "$i"; do
    printf 'wc -l %s/%s_%s_%s.table >> %s\n' "$approach_dir" "$table_owner" "$b" "$c" "$counts_file"
  done
done < "$HOME_DIRECTORY/regime_parameter_type_sample_wide" > "$count_script"

sh -e "$count_script"

# Reduce "<count> <dir>/<d>_<b>_<c>.table" to "<count> <d> <b> <c>":
#   - strip the directory prefix (derived from HOME_DIRECTORY, which must not
#     contain `|` or regex metacharacters),
#   - turn the first two underscores into spaces; these are the separators
#     after <d> and <b> provided neither contains an underscore itself,
#   - drop the literal ".table" suffix (dot escaped and anchored).
sed -i \
  -e "s|$approach_dir/||" \
  -e 's/_/ /' \
  -e 's/_/ /' \
  -e 's/\.table$//' \
  "$counts_file"

# Per (field 2, field 3) pair: sum field 1 and count the contributing lines.
# SUBSEP is set to a space so the composite key prints as two columns.
awk -F' ' 'BEGIN { SUBSEP = OFS = FS } { total[$2,$3] += $1; ++lines[$2,$3] } END { for (key in total) { print key, total[key], lines[key] } }' "$counts_file" | sort | uniq > "${counts_file}_temp"

mv "${counts_file}_temp" "$counts_file"


<a id="40"></a> 
### Derive bed files for manual inspection
This script derives a bed file for each sample by concatenating the sites from its HOM-REF/HET and HOM-VAR/HET tables.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu


# Generate $HOME_DIRECTORY/scripts/consensus_derive_bed.
# For every row of regime_sample (second column = sample name; the first
# column is ignored) the generated script:
#   - prints fields 1 2 3 4 of <sample>_1_HOMREF_HET.table into a .bed file,
#   - prints fields 1 2 4 3 of <sample>_1_HOMVAR_HET.table into a .bed file
#     (fields 3 and 4 swapped relative to the HOMREF case),
#   - concatenates both into de_novo_mutation/bed/<sample>_consensus_bed.
# Only the tables whose middle name component is "1" are used.
# The awk programs are passed to printf as %s arguments, so sample names are
# never interpreted as part of the printf format.
while read -r _ b; do
  table_stem="$HOME_DIRECTORY/de_novo_mutation/consensus_approach/${b}_1"

  printf '\n'
  printf "awk '%s' %s > %s\n" '{print $1" "$2" "$3" "$4}' "${table_stem}_HOMREF_HET.table" "${table_stem}_HOMREF_HET.bed"
  printf "awk '%s' %s > %s\n" '{print $1" "$2" "$4" "$3}' "${table_stem}_HOMVAR_HET.table" "${table_stem}_HOMVAR_HET.bed"
  printf '\n'
  printf 'cat %s %s > %s\n' "${table_stem}_HOMREF_HET.bed" "${table_stem}_HOMVAR_HET.bed" "$HOME_DIRECTORY/de_novo_mutation/bed/${b}_consensus_bed"
done < "$HOME_DIRECTORY/regime_sample" > "$HOME_DIRECTORY/scripts/consensus_derive_bed"

#sh -e $HOME_DIRECTORY/scripts/consensus_derive_bed

<a id="41"></a> 
## Callability

<a id="46"></a> 
### Derive genotyped sites for each regime
This script removes sites at which any sample of a regime has a missing genotype, keeping only the sites at which all samples of that regime were successfully genotyped.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/callability
JOB_NAME=callability_cohort

# Write the SLURM job script scripts/callability_cohort.sh.
# The script takes one argument (a regime name), keeps only the records of
# <regime>_GenotypeGVCFs.g.vcf.gz for which none of the genotypes at sample
# indices 0..5 is missing, writes them to
# vcf_handling/callability/<regime>_GenotypeGVCFs_callable.g.vcf.gz and builds
# a tabix index for that file.
#
# The text is one single-quoted string; the quotes are closed around
# $JOB_NAME, $JOB_OUTPUT and $HOME_DIRECTORY so those expand now, while $1,
# $regime and $(basename ...) stay literal for the job to evaluate.
# --output-type z is given explicitly so the output is bgzip-compressed VCF,
# as the following tabix indexing step needs, instead of depending on how the
# loaded bcftools version treats the .vcf.gz file name.
echo '#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL) 
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=3G
#SBATCH --time=0-04:00
#SBATCH --job-name='$JOB_NAME'
#SBATCH -o '$JOB_OUTPUT'/'$JOB_NAME'-%j.out
#SBATCH -e '$JOB_OUTPUT'/'$JOB_NAME'-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

regime=$1

module load bcftools/1.15.1

bcftools view -i '"'"' FORMAT/GT[0] != "mis" && FORMAT/GT[1] != "mis" && FORMAT/GT[2] != "mis"  && FORMAT/GT[3] != "mis" && FORMAT/GT[4] != "mis" && FORMAT/GT[5] != "mis" '"'"' '$HOME_DIRECTORY'/vcf_handling/GenotypeGVCFs/$(basename $regime)_GenotypeGVCFs.g.vcf.gz \
--output-type z \
--output '$HOME_DIRECTORY'/vcf_handling/callability/$(basename $regime)_GenotypeGVCFs_callable.g.vcf.gz

bcftools index -t '$HOME_DIRECTORY'/vcf_handling/callability/$(basename $regime)_GenotypeGVCFs_callable.g.vcf.gz

' > $HOME_DIRECTORY/scripts/callability_cohort.sh


# Write one sbatch submission line per row of regime_list (first column is the
# regime name; any further columns are ignored). The regime is passed as a %s
# argument so it cannot be interpreted as part of the printf format.
while read -r a _; do
  printf '\nsbatch %s %s\n' "$HOME_DIRECTORY/scripts/callability_cohort.sh" "$a"
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/callability_cohort_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/callability_cohort_submit_jobs


<a id="43"></a> 
### Derive the number of callable sites for each regime

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu


# Generate scripts/callability_cohort_derive_results: a script that loads
# bcftools and, for each regime in regime_list, appends the record count of
# <regime>_GenotypeGVCFs_callable.g.vcf.gz (read from its index via
# `bcftools index --nrecords`) to consensus_approach_callability_regime.
# NOTE(review): the generated script appends, so running it twice leaves two
# sets of counts in the results file.
callable_counts="$HOME_DIRECTORY/de_novo_mutation/consensus_approach_callability_regime"
derive_script="$HOME_DIRECTORY/scripts/callability_cohort_derive_results"

touch "$callable_counts"

{
  # Emitted directly as the first line, so it is present even when
  # regime_list yields no rows.
  printf 'module load bcftools/1.15.1\n'

  # Only the first column is the regime name; extra columns are discarded so
  # they cannot leak into the generated path.
  while read -r a _; do
    printf '\nbcftools index --nrecords %s >> %s\n' \
      "$HOME_DIRECTORY/vcf_handling/callability/${a}_GenotypeGVCFs_callable.g.vcf.gz" \
      "$callable_counts"
  done < "$HOME_DIRECTORY/regime_list"
} > "$derive_script"

# sh -e $HOME_DIRECTORY/scripts/callability_cohort_derive_results
