<a id="44"></a> 
# Variant calling by the probabilistic approach

<a id="44"></a> 
# Variant calling by the probabilistic approach

<a id="45"></a> 
## Merge bam files for each regime
This script uses the Picard MergeSamFiles tool to merge the single-sample BAM files of each regime into one multi-sample BAM file. In total, three multi-sample BAM files are generated, one per regime.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output
JOB_NAME=picard_MergeSamFiles

# Generate scripts/MergeSamFiles.sh, a SLURM job that merges the per-sample BAM
# files of one regime with Picard MergeSamFiles.
#   Job arguments: REGIME SAMPLE [SAMPLE ...]
#   Job inputs:    $SCRATCH_DIRECTORY/bam_files/<sample>_viewed_marked_addg.bam
#   Job output:    $HOME_DIRECTORY/accuMUlate/<regime>_MergeSamFiles.bam
# The job script is one single-quoted string; the '"$VAR"' splices expand the
# notebook variables now, while $1, "$@" and $(...) stay literal and are
# evaluated when the job runs.
# printf '%s\n' writes the text verbatim (the trailing backslashes must not be
# interpreted, which echo does not guarantee in every shell).
printf '%s\n' '#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL) 
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=60G
#SBATCH --time=0-48:00
#SBATCH --job-name='"$JOB_NAME"'
#SBATCH -o '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.out
#SBATCH -e '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Usage: MergeSamFiles.sh REGIME SAMPLE [SAMPLE ...]
if [ "$#" -lt 2 ]; then
    echo "usage: MergeSamFiles.sh REGIME SAMPLE [SAMPLE ...]" >&2
    exit 1
fi

regime=$1
shift

module load java/jdk1.8.0_231
module load picard/2.24.1

# Build one I= argument per sample, however many samples were passed.
inputs=()
for sample in "$@"; do
    inputs+=("I='"$SCRATCH_DIRECTORY"'/bam_files/$(basename "$sample")_viewed_marked_addg.bam")
done

java -jar /gpfs/software/ada/picard/2.24.1/picard.jar MergeSamFiles \
"${inputs[@]}" \
O='"$HOME_DIRECTORY"'/accuMUlate/"$(basename "$regime")"_MergeSamFiles.bam

' > "$HOME_DIRECTORY/scripts/MergeSamFiles.sh"


# Write one sbatch command per row of regime_sample_wide (regime name followed
# by its samples). Fields are passed to printf as data, never as part of the
# format string. Any fields beyond the seventh stay in $g and are forwarded
# unchanged, so rows with more than six samples are merged in full.
while read -r a b c d e f g; do
    # Skip blank rows instead of emitting an sbatch call without arguments.
    [ -n "$a" ] || continue
    printf '\nsbatch %s %s %s %s %s %s %s %s\n' \
        "$HOME_DIRECTORY/scripts/MergeSamFiles.sh" "$a" "$b" "$c" "$d" "$e" "$f" "$g"
done < "$HOME_DIRECTORY/regime_sample_wide" > "$HOME_DIRECTORY/scripts/MergeSamFiles_submit_jobs"

# Submission is left commented out; run it manually once the list looks right.
# sh -e $HOME_DIRECTORY/scripts/MergeSamFiles_submit_jobs


<a id="46"></a> 
## Add a readgroup for the ancestor to the header of the merged bam
This script uses the samtools view tool to add a dummy readgroup for the ancestor to the merged, multi-sample bam file.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/accuMUlate
JOB_NAME=addg


# Generate scripts/addg.sh, a SLURM job taking one argument (the regime name).
# The job:
#   1. dumps the header of <regime>_MergeSamFiles.bam;
#   2. inserts an @RG line for a dummy "Ancestral" read group before header
#      line 19;
#   3. re-encodes the alignments under the new header and indexes the result
#      (<regime>_MergeSamFiles_addg.bam);
#   4. writes the output of extract_samples.py to
#      <regime>_MergeSamFiles_params.ini and then places the shared
#      accuMUlate_parameters file after it.
# The '"$VAR"' splices expand the notebook variables now; $1, $prefix and
# $(...) stay literal and are evaluated when the job runs.
#
# Every @RG field is TAB-separated: a space between LB and PL would make
# "PL:..." part of the LB value instead of a field of its own.
# The params file is created with ">" (not ">>") so that re-running the job
# does not stack a second copy of the sample list and parameters onto it.
# NOTE(review): the insertion point (line 19) is hard-coded; confirm it matches
# the header layout of the merged BAM files.
# NOTE(review): if <regime>_MergeSamFiles_params.ini is meant to be pre-seeded
# by hand before this job runs, restore ">>" below.
printf '%s\n' '#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL) 
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=12G
#SBATCH --time=0-12:00
#SBATCH --job-name='"$JOB_NAME"'
#SBATCH -o '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.out
#SBATCH -e '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Fail the job when any stage of a pipeline fails, not only the last one.
set -o pipefail

regime=$1

# Common path prefix of every file this job reads or writes.
prefix='"$HOME_DIRECTORY"'/accuMUlate/$(basename "$regime")_MergeSamFiles

module load samtools/1.16.1

samtools view -H "$prefix.bam" > "$prefix.head.sam"

sed '"'"'19 i @RG\tID:Ancestral\tLB:lib1\tPL:illumina_TruSeq\tSM:Ancestral\tPU:unit1'"'"' "$prefix.head.sam" > "${prefix}_addg.head.sam"

samtools view "$prefix.bam" | cat "${prefix}_addg.head.sam" - | samtools view -Sb - > "${prefix}_addg.bam"

samtools index "${prefix}_addg.bam"


module load python/3.10.0

samtools view -H "${prefix}_addg.bam" | python '"$HOME_DIRECTORY"'/extract_samples.py Ancestral - > "${prefix}_params.ini"


cat "${prefix}_params.ini" '"$HOME_DIRECTORY"'/accuMUlate_parameters > "${prefix}_params_temp.ini" && mv "${prefix}_params_temp.ini" "${prefix}_params.ini"

' > "$HOME_DIRECTORY/scripts/addg.sh"



# Write one sbatch command per regime listed in regime_list. The regime name is
# passed to printf as data, never as part of the format string.
while read -r a; do
    # Skip blank lines instead of emitting an sbatch call without a regime.
    [ -n "$a" ] || continue
    printf '\nsbatch %s %s\n' "$HOME_DIRECTORY/scripts/addg.sh" "$a"
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/addg_submit_jobs"

# Submission is left commented out; run it manually once the list looks right.
# sh -e $HOME_DIRECTORY/scripts/addg_submit_jobs




<a id="47"></a> 
## Call variants
This script uses the accuMUlate tool to call variants.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
REF_FASTA_DECOMPRESSED=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/accuMUlate
JOB_NAME=accuMUlate

# Generate scripts/accuMUlate.sh, a SLURM job taking one argument (the regime
# name). The job runs accuMUlate on <regime>_MergeSamFiles_addg.bam against the
# decompressed reference, using <regime>_MergeSamFiles_params.ini as the
# configuration file, and writes <regime>_putative_mutations.tsv.
# The '"$VAR"' splices expand the notebook variables now; $1, $prefix and
# $(...) stay literal and are evaluated when the job runs.
printf '%s\n' '#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL) 
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=30G
#SBATCH --time=5-00:00
#SBATCH --job-name='"$JOB_NAME"'
#SBATCH -o '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.out
#SBATCH -e '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

regime=$1

module load accuMUlate/0.2.2

# Common path prefix of every per-regime file this job reads or writes.
prefix='"$HOME_DIRECTORY"'/accuMUlate/$(basename "$regime")

/gpfs/software/ada/accuMUlate/0.2.2/accuMUlate \
-b "${prefix}_MergeSamFiles_addg.bam" \
-r '"$REF_FASTA_DECOMPRESSED"' \
-q 13 \
-m 13 \
-p 0.10000000000000001 \
--ploidy-ancestor 2 \
--ploidy-descendant 2 \
-c "${prefix}_MergeSamFiles_params.ini" \
-o "${prefix}_putative_mutations.tsv"

' > "$HOME_DIRECTORY/scripts/accuMUlate.sh"


# Write one sbatch command per regime. Only the first field of each
# regime_list row is used; anything after it is read into $rest and ignored.
# The regime name is passed to printf as data, never as part of the format.
while read -r a rest; do
    # Skip blank lines instead of emitting an sbatch call without a regime.
    [ -n "$a" ] || continue
    printf '\nsbatch %s %s\n' "$HOME_DIRECTORY/scripts/accuMUlate.sh" "$a"
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/accuMUlate_submit_jobs"

# Submission is left commented out; run it manually once the list looks right.
# sh -e $HOME_DIRECTORY/scripts/accuMUlate_submit_jobs


<a id="48"></a> 
## Variant filtering
This script keeps only the sites that meet all of the following criteria:
1. Sites outside of soft-masked regions;
2. Sites where estimated probability of at least one mutation at this site = 1;
3. Sites where estimated probability of exactly one mutation at this site = 1; 
4. Sites where probability of the mutation direction above being correct = 1; 
5. Homozygous to heterozygous mutations;
6. Sites where the number of reads for the mutant allele in non-mutant samples = 0; 
7. Sites where the allelic ratio for the mutant allele in the mutant sample is between 0.25 and 0.75;
8. Sites where the AD test statistic for the mapping quality difference between mutant and non-mutant alleles is <= 1.95;
9. Sites where the AD test statistic for the difference in inferred insert size between mutant and non-mutant alleles is <= 1.95;
10. Sites where the p-value from Fisher's exact test of strand bias between mutant and non-mutant alleles is > 0.05;
11. Sites where the p-value from Fisher's exact test of pair-mapping rate difference between mutant and non-mutant alleles is > 0.05.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# Generate scripts/accuMUlate_filtering: one awk command per regime listed in
# regime_list. Each command reads
#   $HOME_DIRECTORY/accuMUlate/<regime>_putative_mutations.tsv
# and writes the rows that pass every filter to
#   $HOME_DIRECTORY/de_novo_mutation/probabilistic_approach/<regime>_filtered_mutations.tsv
#
# All tests are applied in a single awk pass, in the same order as the filter
# list in the notes above, so no intermediate temp files are created. Because
# "&&" short-circuits, each test is only evaluated for rows that passed the
# previous ones. The extra depth test keeps the ratio from dividing by zero,
# which would abort awk.
#
# Inside the here-document, \$N is an awk field reference written literally
# into the generated script, while $HOME_DIRECTORY and ${a} are expanded now.
while read -r a; do
    # Skip blank lines instead of emitting a command for an empty regime name.
    [ -n "$a" ] || continue
    cat <<EOF

# Filter the putative mutations of regime ${a}; a row is kept only if every test passes.
awk '
    (\$4 ~ /[A-Z]/) &&
    (\$7 == 1) && (\$8 == 1) && (\$9 == 1) &&
    (substr(\$6,1,1) == substr(\$6,2,1)) && (substr(\$6,1,1) == substr(\$6,5,1)) && (substr(\$6,5,1) != substr(\$6,6,1)) &&
    (\$15 == 0) &&
    ((\$14 + \$12 + \$13) > 0) &&
    (\$14 / (\$14 + \$12 + \$13) >= 0.25) && (\$14 / (\$14 + \$12 + \$13) <= 0.75) &&
    (\$16 <= 1.95) &&
    (\$17 <= 1.95) &&
    (\$18 > 0.05) &&
    (\$19 > 0.05)
' $HOME_DIRECTORY/accuMUlate/${a}_putative_mutations.tsv > $HOME_DIRECTORY/de_novo_mutation/probabilistic_approach/${a}_filtered_mutations.tsv
EOF
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/accuMUlate_filtering"

# Execution is left commented out; run it manually once the script looks right.
#sh -e $HOME_DIRECTORY/scripts/accuMUlate_filtering


<a id="49"></a> 
## Derive the number of candidate germline mutations for each sample

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# Directory holding the per-regime and per-sample filtered tables.
RESULT_DIR=$HOME_DIRECTORY/de_novo_mutation/probabilistic_approach
# Table of per-sample counts produced by this cell ("<count> <sample>" lines).
COUNTS=$HOME_DIRECTORY/de_novo_mutation/probabilistic_approach_number_GM_before_manual_inspection

# NOTE(review): this touches a file whose name differs from the counts table
# written below; kept as-is in case a later step expects it to exist.
touch "$HOME_DIRECTORY/de_novo_mutation/probabilistic_approach_number_GM"

# Start from an empty counts table so that re-running the cell does not append
# a second set of rows to the previous run.
: > "$COUNTS"

# Generate one grep + wc pair per (regime, sample) row of regime_sample:
#   grep  extracts the rows of <regime>_filtered_mutations.tsv that contain the
#         sample name into <sample>_filtered_mutations.tsv;
#   wc -l appends "<count> <path>" for that file to the counts table.
# grep exits with status 1 when no row matches; "|| [ \$? -eq 1 ]" accepts that
# case (a sample with zero candidates) so that "sh -e" does not stop at the
# first such sample, while real errors (status 2) still abort.
# NOTE(review): grep matches the sample name anywhere in the row, so a sample
# whose name is contained in another name would also collect that sample's
# rows; confirm the names in regime_sample cannot collide this way.
while read -r a b; do
    # Skip blank or incomplete rows.
    [ -n "$b" ] || continue
    printf "\ngrep '%s' %s > %s || [ \$? -eq 1 ]\n" \
        "$b" "$RESULT_DIR/${a}_filtered_mutations.tsv" "$RESULT_DIR/${b}_filtered_mutations.tsv"
    printf 'wc -l %s >> %s\n' "$RESULT_DIR/${b}_filtered_mutations.tsv" "$COUNTS"
done < "$HOME_DIRECTORY/regime_sample" > "$HOME_DIRECTORY/scripts/accuMUlate_number_GM_before_manual_inspection"

sh -e "$HOME_DIRECTORY/scripts/accuMUlate_number_GM_before_manual_inspection"

# Reduce each "<count> <path>" line to "<count> <sample>": strip the directory
# prefix (derived from $HOME_DIRECTORY rather than spelled out again) ...
sed -i "s|$RESULT_DIR/||" "$COUNTS"

# ... and the file-name suffix.
sed -i 's/_filtered_mutations\.tsv//' "$COUNTS"


<a id="45"></a> 
## Merge bam files for each regime
This script uses the Picard MergeSamFiles tool to merge the single-sample BAM files of each regime into one multi-sample BAM file. In total, three multi-sample BAM files are generated, one per regime.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output
JOB_NAME=picard_MergeSamFiles

# Generate scripts/MergeSamFiles.sh, a SLURM job that merges the per-sample BAM
# files of one regime with Picard MergeSamFiles.
#   Job arguments: REGIME SAMPLE [SAMPLE ...]
#   Job inputs:    $SCRATCH_DIRECTORY/bam_files/<sample>_viewed_marked_addg.bam
#   Job output:    $HOME_DIRECTORY/accuMUlate/<regime>_MergeSamFiles.bam
# The job script is one single-quoted string; the '"$VAR"' splices expand the
# notebook variables now, while $1, "$@" and $(...) stay literal and are
# evaluated when the job runs.
# printf '%s\n' writes the text verbatim (the trailing backslashes must not be
# interpreted, which echo does not guarantee in every shell).
printf '%s\n' '#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL) 
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=60G
#SBATCH --time=0-48:00
#SBATCH --job-name='"$JOB_NAME"'
#SBATCH -o '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.out
#SBATCH -e '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Usage: MergeSamFiles.sh REGIME SAMPLE [SAMPLE ...]
if [ "$#" -lt 2 ]; then
    echo "usage: MergeSamFiles.sh REGIME SAMPLE [SAMPLE ...]" >&2
    exit 1
fi

regime=$1
shift

module load java/jdk1.8.0_231
module load picard/2.24.1

# Build one I= argument per sample, however many samples were passed.
inputs=()
for sample in "$@"; do
    inputs+=("I='"$SCRATCH_DIRECTORY"'/bam_files/$(basename "$sample")_viewed_marked_addg.bam")
done

java -jar /gpfs/software/ada/picard/2.24.1/picard.jar MergeSamFiles \
"${inputs[@]}" \
O='"$HOME_DIRECTORY"'/accuMUlate/"$(basename "$regime")"_MergeSamFiles.bam

' > "$HOME_DIRECTORY/scripts/MergeSamFiles.sh"


# Write one sbatch command per row of regime_sample_wide (regime name followed
# by its samples). Fields are passed to printf as data, never as part of the
# format string. Any fields beyond the seventh stay in $g and are forwarded
# unchanged, so rows with more than six samples are merged in full.
while read -r a b c d e f g; do
    # Skip blank rows instead of emitting an sbatch call without arguments.
    [ -n "$a" ] || continue
    printf '\nsbatch %s %s %s %s %s %s %s %s\n' \
        "$HOME_DIRECTORY/scripts/MergeSamFiles.sh" "$a" "$b" "$c" "$d" "$e" "$f" "$g"
done < "$HOME_DIRECTORY/regime_sample_wide" > "$HOME_DIRECTORY/scripts/MergeSamFiles_submit_jobs"

# Submission is left commented out; run it manually once the list looks right.
# sh -e $HOME_DIRECTORY/scripts/MergeSamFiles_submit_jobs


<a id="46"></a> 
## Add a readgroup for the ancestor to the header of the merged bam
This script uses the samtools view tool to add a dummy readgroup for the ancestor to the merged, multi-sample bam file.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/accuMUlate
JOB_NAME=addg


# Generate scripts/addg.sh, a SLURM job taking one argument (the regime name).
# The job:
#   1. dumps the header of <regime>_MergeSamFiles.bam;
#   2. inserts an @RG line for a dummy "Ancestral" read group before header
#      line 19;
#   3. re-encodes the alignments under the new header and indexes the result
#      (<regime>_MergeSamFiles_addg.bam);
#   4. writes the output of extract_samples.py to
#      <regime>_MergeSamFiles_params.ini and then places the shared
#      accuMUlate_parameters file after it.
# The '"$VAR"' splices expand the notebook variables now; $1, $prefix and
# $(...) stay literal and are evaluated when the job runs.
#
# Every @RG field is TAB-separated: a space between LB and PL would make
# "PL:..." part of the LB value instead of a field of its own.
# The params file is created with ">" (not ">>") so that re-running the job
# does not stack a second copy of the sample list and parameters onto it.
# NOTE(review): the insertion point (line 19) is hard-coded; confirm it matches
# the header layout of the merged BAM files.
# NOTE(review): if <regime>_MergeSamFiles_params.ini is meant to be pre-seeded
# by hand before this job runs, restore ">>" below.
printf '%s\n' '#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL) 
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=12G
#SBATCH --time=0-12:00
#SBATCH --job-name='"$JOB_NAME"'
#SBATCH -o '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.out
#SBATCH -e '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Fail the job when any stage of a pipeline fails, not only the last one.
set -o pipefail

regime=$1

# Common path prefix of every file this job reads or writes.
prefix='"$HOME_DIRECTORY"'/accuMUlate/$(basename "$regime")_MergeSamFiles

module load samtools/1.16.1

samtools view -H "$prefix.bam" > "$prefix.head.sam"

sed '"'"'19 i @RG\tID:Ancestral\tLB:lib1\tPL:illumina_TruSeq\tSM:Ancestral\tPU:unit1'"'"' "$prefix.head.sam" > "${prefix}_addg.head.sam"

samtools view "$prefix.bam" | cat "${prefix}_addg.head.sam" - | samtools view -Sb - > "${prefix}_addg.bam"

samtools index "${prefix}_addg.bam"


module load python/3.10.0

samtools view -H "${prefix}_addg.bam" | python '"$HOME_DIRECTORY"'/extract_samples.py Ancestral - > "${prefix}_params.ini"


cat "${prefix}_params.ini" '"$HOME_DIRECTORY"'/accuMUlate_parameters > "${prefix}_params_temp.ini" && mv "${prefix}_params_temp.ini" "${prefix}_params.ini"

' > "$HOME_DIRECTORY/scripts/addg.sh"



# Write one sbatch command per regime listed in regime_list. The regime name is
# passed to printf as data, never as part of the format string.
while read -r a; do
    # Skip blank lines instead of emitting an sbatch call without a regime.
    [ -n "$a" ] || continue
    printf '\nsbatch %s %s\n' "$HOME_DIRECTORY/scripts/addg.sh" "$a"
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/addg_submit_jobs"

# Submission is left commented out; run it manually once the list looks right.
# sh -e $HOME_DIRECTORY/scripts/addg_submit_jobs




<a id="47"></a> 
## Call variants
This script uses the accuMUlate tool to call variants.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
REF_FASTA_DECOMPRESSED=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/accuMUlate
JOB_NAME=accuMUlate

# Generate scripts/accuMUlate.sh, a SLURM job taking one argument (the regime
# name). The job runs accuMUlate on <regime>_MergeSamFiles_addg.bam against the
# decompressed reference, using <regime>_MergeSamFiles_params.ini as the
# configuration file, and writes <regime>_putative_mutations.tsv.
# The '"$VAR"' splices expand the notebook variables now; $1, $prefix and
# $(...) stay literal and are evaluated when the job runs.
printf '%s\n' '#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL) 
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=30G
#SBATCH --time=5-00:00
#SBATCH --job-name='"$JOB_NAME"'
#SBATCH -o '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.out
#SBATCH -e '"$JOB_OUTPUT"'/'"$JOB_NAME"'-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

regime=$1

module load accuMUlate/0.2.2

# Common path prefix of every per-regime file this job reads or writes.
prefix='"$HOME_DIRECTORY"'/accuMUlate/$(basename "$regime")

/gpfs/software/ada/accuMUlate/0.2.2/accuMUlate \
-b "${prefix}_MergeSamFiles_addg.bam" \
-r '"$REF_FASTA_DECOMPRESSED"' \
-q 13 \
-m 13 \
-p 0.10000000000000001 \
--ploidy-ancestor 2 \
--ploidy-descendant 2 \
-c "${prefix}_MergeSamFiles_params.ini" \
-o "${prefix}_putative_mutations.tsv"

' > "$HOME_DIRECTORY/scripts/accuMUlate.sh"


# Write one sbatch command per regime. Only the first field of each
# regime_list row is used; anything after it is read into $rest and ignored.
# The regime name is passed to printf as data, never as part of the format.
while read -r a rest; do
    # Skip blank lines instead of emitting an sbatch call without a regime.
    [ -n "$a" ] || continue
    printf '\nsbatch %s %s\n' "$HOME_DIRECTORY/scripts/accuMUlate.sh" "$a"
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/accuMUlate_submit_jobs"

# Submission is left commented out; run it manually once the list looks right.
# sh -e $HOME_DIRECTORY/scripts/accuMUlate_submit_jobs


<a id="48"></a> 
## Variant filtering
This script keeps only the sites that meet all of the following criteria:
1. Sites outside of soft-masked regions;
2. Sites where estimated probability of at least one mutation at this site = 1;
3. Sites where estimated probability of exactly one mutation at this site = 1; 
4. Sites where probability of the mutation direction above being correct = 1; 
5. Homozygous to heterozygous mutations;
6. Sites where the number of reads for the mutant allele in non-mutant samples = 0; 
7. Sites where the allelic ratio for the mutant allele in the mutant sample is between 0.25 and 0.75;
8. Sites where the AD test statistic for the mapping quality difference between mutant and non-mutant alleles is <= 1.95;
9. Sites where the AD test statistic for the difference in inferred insert size between mutant and non-mutant alleles is <= 1.95;
10. Sites where the p-value from Fisher's exact test of strand bias between mutant and non-mutant alleles is > 0.05;
11. Sites where the p-value from Fisher's exact test of pair-mapping rate difference between mutant and non-mutant alleles is > 0.05.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# Generate scripts/accuMUlate_filtering: one awk command per regime listed in
# regime_list. Each command reads
#   $HOME_DIRECTORY/accuMUlate/<regime>_putative_mutations.tsv
# and writes the rows that pass every filter to
#   $HOME_DIRECTORY/de_novo_mutation/probabilistic_approach/<regime>_filtered_mutations.tsv
#
# All tests are applied in a single awk pass, in the same order as the filter
# list in the notes above, so no intermediate temp files are created. Because
# "&&" short-circuits, each test is only evaluated for rows that passed the
# previous ones. The extra depth test keeps the ratio from dividing by zero,
# which would abort awk.
#
# Inside the here-document, \$N is an awk field reference written literally
# into the generated script, while $HOME_DIRECTORY and ${a} are expanded now.
while read -r a; do
    # Skip blank lines instead of emitting a command for an empty regime name.
    [ -n "$a" ] || continue
    cat <<EOF

# Filter the putative mutations of regime ${a}; a row is kept only if every test passes.
awk '
    (\$4 ~ /[A-Z]/) &&
    (\$7 == 1) && (\$8 == 1) && (\$9 == 1) &&
    (substr(\$6,1,1) == substr(\$6,2,1)) && (substr(\$6,1,1) == substr(\$6,5,1)) && (substr(\$6,5,1) != substr(\$6,6,1)) &&
    (\$15 == 0) &&
    ((\$14 + \$12 + \$13) > 0) &&
    (\$14 / (\$14 + \$12 + \$13) >= 0.25) && (\$14 / (\$14 + \$12 + \$13) <= 0.75) &&
    (\$16 <= 1.95) &&
    (\$17 <= 1.95) &&
    (\$18 > 0.05) &&
    (\$19 > 0.05)
' $HOME_DIRECTORY/accuMUlate/${a}_putative_mutations.tsv > $HOME_DIRECTORY/de_novo_mutation/probabilistic_approach/${a}_filtered_mutations.tsv
EOF
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/accuMUlate_filtering"

# Execution is left commented out; run it manually once the script looks right.
#sh -e $HOME_DIRECTORY/scripts/accuMUlate_filtering


<a id="49"></a> 
## Derive the number of candidate germline mutations for each sample

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# Directory holding the per-regime and per-sample filtered tables.
RESULT_DIR=$HOME_DIRECTORY/de_novo_mutation/probabilistic_approach
# Table of per-sample counts produced by this cell ("<count> <sample>" lines).
COUNTS=$HOME_DIRECTORY/de_novo_mutation/probabilistic_approach_number_GM_before_manual_inspection

# NOTE(review): this touches a file whose name differs from the counts table
# written below; kept as-is in case a later step expects it to exist.
touch "$HOME_DIRECTORY/de_novo_mutation/probabilistic_approach_number_GM"

# Start from an empty counts table so that re-running the cell does not append
# a second set of rows to the previous run.
: > "$COUNTS"

# Generate one grep + wc pair per (regime, sample) row of regime_sample:
#   grep  extracts the rows of <regime>_filtered_mutations.tsv that contain the
#         sample name into <sample>_filtered_mutations.tsv;
#   wc -l appends "<count> <path>" for that file to the counts table.
# grep exits with status 1 when no row matches; "|| [ \$? -eq 1 ]" accepts that
# case (a sample with zero candidates) so that "sh -e" does not stop at the
# first such sample, while real errors (status 2) still abort.
# NOTE(review): grep matches the sample name anywhere in the row, so a sample
# whose name is contained in another name would also collect that sample's
# rows; confirm the names in regime_sample cannot collide this way.
while read -r a b; do
    # Skip blank or incomplete rows.
    [ -n "$b" ] || continue
    printf "\ngrep '%s' %s > %s || [ \$? -eq 1 ]\n" \
        "$b" "$RESULT_DIR/${a}_filtered_mutations.tsv" "$RESULT_DIR/${b}_filtered_mutations.tsv"
    printf 'wc -l %s >> %s\n' "$RESULT_DIR/${b}_filtered_mutations.tsv" "$COUNTS"
done < "$HOME_DIRECTORY/regime_sample" > "$HOME_DIRECTORY/scripts/accuMUlate_number_GM_before_manual_inspection"

sh -e "$HOME_DIRECTORY/scripts/accuMUlate_number_GM_before_manual_inspection"

# Reduce each "<count> <path>" line to "<count> <sample>": strip the directory
# prefix (derived from $HOME_DIRECTORY rather than spelled out again) ...
sed -i "s|$RESULT_DIR/||" "$COUNTS"

# ... and the file-name suffix.
sed -i 's/_filtered_mutations\.tsv//' "$COUNTS"


<a id="51"></a> 
## Derive bed files for manual inspection
This script derives a bed file for each sample from that sample's filtered mutation table.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# Generate scripts/probabilistic_derive_bed: one awk command per row of
# regime_sample (only the second field, the sample name, is used). Each command
# reads <sample>_filtered_mutations.tsv and writes, space-separated, columns 1,
# 3 and 4 plus the sixth character of column 6 to
# de_novo_mutation/bed/<sample>_probabilistic_bed.
# Inside the here-document, \$N is an awk field reference written literally
# into the generated script, while $HOME_DIRECTORY and ${b} are expanded now.
while read -r a b; do
    # Skip blank or incomplete rows; they would otherwise produce a command on
    # a non-existent file and stop the "sh -e" run below.
    [ -n "$b" ] || continue
    cat <<EOF

awk '{print \$1" "\$3" "\$4" "substr(\$6,6,1)}' $HOME_DIRECTORY/de_novo_mutation/probabilistic_approach/${b}_filtered_mutations.tsv > $HOME_DIRECTORY/de_novo_mutation/bed/${b}_probabilistic_bed
EOF
done < "$HOME_DIRECTORY/regime_sample" > "$HOME_DIRECTORY/scripts/probabilistic_derive_bed"

sh -e "$HOME_DIRECTORY/scripts/probabilistic_derive_bed"
