<a id="52"></a> 
# Manual inspection

<a id="53"></a> 
## Merge and annotate the bed files from both approaches for each sample

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

BED_DIRECTORY=$HOME_DIRECTORY/de_novo_mutation/bed
PREPARE_SCRIPT=$HOME_DIRECTORY/scripts/prepare_bed_files

# The merged bed files are written into this directory; create it up front so
# the generated script (run with `sh -e`) does not abort on a missing directory.
mkdir -p "$BED_DIRECTORY/manual_inspection"

# Generate one group of commands per line of regime_sample.
# Only the second column (the sample ID) is used; the first column is read
# solely so that the sample ID lands in its own variable.
# Inside the here-document, \$N is written literally as $N for awk, while
# $BED_DIRECTORY and ${sample} are expanded now, at generation time.
while read -r regime sample; do
[ -n "$sample" ] || continue   # skip blank / single-column lines
cat <<EOF

# Keep the first four columns and tag every row with the approach that produced it.
awk '{print \$1" "\$2" "\$3" "\$4" consensus"}' $BED_DIRECTORY/${sample}_consensus_bed > $BED_DIRECTORY/${sample}_consensus_bed_temp
awk '{print \$1" "\$2" "\$3" "\$4" probabilistic"}' $BED_DIRECTORY/${sample}_probabilistic_bed > $BED_DIRECTORY/${sample}_probabilistic_bed_temp

cat $BED_DIRECTORY/${sample}_consensus_bed_temp $BED_DIRECTORY/${sample}_probabilistic_bed_temp > $BED_DIRECTORY/${sample}_bed_temp

sort $BED_DIRECTORY/${sample}_bed_temp > $BED_DIRECTORY/manual_inspection/${sample}_bed_before_manual_inspection

# Remove only this sample's intermediates (not every *_temp file in the directory).
rm $BED_DIRECTORY/${sample}_consensus_bed_temp $BED_DIRECTORY/${sample}_probabilistic_bed_temp $BED_DIRECTORY/${sample}_bed_temp
EOF
done < "$HOME_DIRECTORY/regime_sample" > "$PREPARE_SCRIPT"

# -e: stop at the first failing command so a missing input bed file is noticed.
sh -e "$PREPARE_SCRIPT"


<a id="54"></a> 
## Derive single-sample vcf
This script splits the multi-sample VCF derived from joint genotyping into six single-sample VCFs.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# REF_FASTA is not referenced anywhere in this cell.
REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/bcftools_split
JOB_NAME=bcftools_split

# Slurm does not create the directory named in -o/-e; make sure it exists,
# otherwise the job cannot write its log files.
mkdir -p "$JOB_OUTPUT"

# Write the SLURM job script.
# Unescaped $VARS are expanded now (generation time); \$... is written
# literally and evaluated when the job runs.
# The job takes one argument: a regime name (only its basename is used) that
# selects <regime>_GenotypeGVCFs.g.vcf.gz as the input to `bcftools +split`.
cat > "$HOME_DIRECTORY/scripts/bcftools_split.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=3G
#SBATCH --time=0-04:00
#SBATCH --job-name=${JOB_NAME}
#SBATCH -o ${JOB_OUTPUT}/${JOB_NAME}-%j.out
#SBATCH -e ${JOB_OUTPUT}/${JOB_NAME}-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

regime=\$1

module load bcftools/1.15.1
module load htslib/1.15.1

bcftools +split ${HOME_DIRECTORY}/vcf_handling/GenotypeGVCFs/\$(basename "\$regime")_GenotypeGVCFs.g.vcf.gz -O z -o ${HOME_DIRECTORY}/de_novo_mutation/bed/manual_inspection/

EOF

# Write one `sbatch` line per regime listed in regime_list.
# The regime is passed to printf as an argument (not as part of the format
# string), so any % or \ in the list is written out verbatim.
while read -r regime; do
    [ -n "$regime" ] || continue   # skip blank lines
    printf 'sbatch %s %s\n' "$HOME_DIRECTORY/scripts/bcftools_split.sh" "$regime"
done < "$HOME_DIRECTORY/regime_list" > "$HOME_DIRECTORY/scripts/bcftools_split_submit_jobs"

# Submission is left commented out so that re-running this cell does not resubmit the jobs.
# sh -e $HOME_DIRECTORY/scripts/bcftools_split_submit_jobs


<a id="55"></a> 
## Index single-sample vcf
This script indexes the single-sample vcf files

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# REF_FASTA is not referenced anywhere in this cell.
REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
# Logs go to the same directory as the bcftools_split job logs.
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/bcftools_split
JOB_NAME=bcftools_split_tabix

# Slurm does not create the directory named in -o/-e; make sure it exists,
# otherwise the job cannot write its log files.
mkdir -p "$JOB_OUTPUT"

# Write the SLURM job script.
# Unescaped $VARS are expanded now (generation time); \$... is written
# literally and evaluated when the job runs.
# The job takes two arguments: regime (\$1, accepted but not used by the
# commands below) and sample ID (\$2, whose basename selects <sample>.vcf.gz).
cat > "$HOME_DIRECTORY/scripts/bcftools_split_tabix.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=3G
#SBATCH --time=0-04:00
#SBATCH --job-name=${JOB_NAME}
#SBATCH -o ${JOB_OUTPUT}/${JOB_NAME}-%j.out
#SBATCH -e ${JOB_OUTPUT}/${JOB_NAME}-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

regime=\$1
sample_ID=\$2

module load bcftools/1.15.1
module load htslib/1.15.1

tabix -p vcf ${HOME_DIRECTORY}/de_novo_mutation/bed/manual_inspection/\$(basename "\$sample_ID").vcf.gz

EOF

# Write one `sbatch` line per row of regime_sample_field_for_DP.
# Only the first two columns (regime, sample ID) are passed on; any further
# columns are read into `_` and ignored.
# Values are passed to printf as arguments (not as part of the format string),
# so any % or \ in the list is written out verbatim.
while read -r regime sample _; do
    [ -n "$sample" ] || continue   # skip blank / single-column lines
    printf 'sbatch %s %s %s\n' "$HOME_DIRECTORY/scripts/bcftools_split_tabix.sh" "$regime" "$sample"
done < "$HOME_DIRECTORY/regime_sample_field_for_DP" > "$HOME_DIRECTORY/scripts/bcftools_split_tabix_submit_jobs"

# Submission is left commented out so that re-running this cell does not resubmit the jobs.
# sh -e $HOME_DIRECTORY/scripts/bcftools_split_tabix_submit_jobs


<a id="56"></a> 
## Files needed for manual inspection

Files needed for manual inspection include:
1 and 2: the reference assembly;
3 and 4: bam and bai files for each sample;
5 and 6: single-sample vcf and its index for each sample;
7: bed files containing the coordinates of each candidate germline mutation for each sample.

1. /gpfs/scratch/wwh23wcu/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna
2. /gpfs/scratch/wwh23wcu/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.fai
3. /gpfs/scratch/wwh23wcu/bam_files/*_viewed_marked_addg.bam 
4. /gpfs/scratch/wwh23wcu/bam_files/*_viewed_marked_addg.bam.bai
5. /gpfs/home/wwh23wcu/de_novo_mutation/bed/manual_inspection/*.vcf.gz
6. /gpfs/home/wwh23wcu/de_novo_mutation/bed/manual_inspection/*.vcf.gz.tbi
7. /gpfs/home/wwh23wcu/de_novo_mutation/bed/manual_inspection/*_bed_before_manual_inspection

For each sample, load the reference assembly and the vcf in IGV; then load the bam files for all samples of the same regime in IGV. Manually inspect each candidate germline mutation and annotate it as 'ACCEPTED' or 'REJECTED' in the bed file. When finished, save the bed file as sampleID_bed_after_manual_inspection, and upload it to /gpfs/home/wwh23wcu/de_novo_mutation/bed/manual_inspection/.

<a id="57"></a> 
# Find overlapping variants

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

INSPECTION_DIRECTORY=$HOME_DIRECTORY/de_novo_mutation/bed/manual_inspection

# Generate one command per line of regime_sample (only the second column,
# the sample ID, is used). Each command keeps the first four columns of the
# sample's manually inspected bed file and writes the rows that occur more
# than once (`sort | uniq -d`) to <sample>_bed_after_manual_inspection_overlapping.
# Inside the here-document, \$N is written literally as $N for awk, while
# the directory variables and ${sample} are expanded now, at generation time.
while read -r regime sample; do
[ -n "$sample" ] || continue   # skip blank / single-column lines
cat <<EOF

awk '{print \$1" "\$2" "\$3" "\$4}' $INSPECTION_DIRECTORY/${sample}_bed_after_manual_inspection | sort | uniq -d > $HOME_DIRECTORY/de_novo_mutation/${sample}_bed_after_manual_inspection_overlapping
EOF
done < "$HOME_DIRECTORY/regime_sample" > "$HOME_DIRECTORY/scripts/find_overlapping_variants"

# NOTE(review): this cell only generates the script. Run it once the
# *_bed_after_manual_inspection files have been uploaded:
# sh -e $HOME_DIRECTORY/scripts/find_overlapping_variants

<a id="58"></a> 
# Aggregate the results
This script aggregates callability and the number of accepted germline mutations for each sample.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

RESULTS_DIRECTORY=$HOME_DIRECTORY/de_novo_mutation
CALLABILITY=$RESULTS_DIRECTORY/consensus_approach_callability_regime
CALLABILITY_TEMP=${CALLABILITY}_temp
RESULTS_TEMP=$RESULTS_DIRECTORY/consensus_approach_results_temp

# How many times each callability row is repeated before being pasted next to
# consensus_approach_number_GM.
# NOTE(review): presumably the number of rows per regime in that file — confirm.
REPEAT_COUNT=36

# Step 1: prefix every callability row with its regime name (in place).
# Guard: if the first column already matches regime_list, the labels were
# added by a previous run of this cell. Pasting again would add a second
# regime column and shift every column used below.
if cut -d ' ' -f 1 "$CALLABILITY" | cmp -s - "$HOME_DIRECTORY/regime_list"; then
    echo "Regime labels already present in $CALLABILITY; not adding them again." >&2
else
    paste -d ' ' "$HOME_DIRECTORY/regime_list" "$CALLABILITY" > "$CALLABILITY_TEMP" && mv "$CALLABILITY_TEMP" "$CALLABILITY"
fi

# Step 2: repeat each labelled row REPEAT_COUNT times.
awk -v n="$REPEAT_COUNT" '{for (i = 0; i < n; i++) print}' "$CALLABILITY" > "$CALLABILITY_TEMP"

# Step 3: put the repeated callability rows side by side with the
# germline-mutation counts.
paste -d ' ' "$CALLABILITY_TEMP" "$RESULTS_DIRECTORY/consensus_approach_number_GM" > "$RESULTS_TEMP"

# Step 4: blank out column 6 of the combined table.
# NOTE(review): presumably a column duplicated by the paste — confirm.
awk '{$6=""; print $0}' "$RESULTS_TEMP" > "$RESULTS_DIRECTORY/consensus_approach_results"

# Clean up intermediates; -f so a missing file is not an error.
rm -f "$CALLABILITY_TEMP"
rm -f "$RESULTS_TEMP"