<a id="9"></a> 
# Pre-processing

<a id="10"></a> 
## Trimming

<a id="11"></a> 
### Trim adapters

In [None]:
# Adapter trimming with fastp: write the Slurm job script and the file of
# sbatch commands (one per read pair) that submits it.
#
# Reads : $HOME_DIRECTORY/fastq_list -- one line per sample, whitespace-separated.
#         Columns 1 and 2 are passed to the job as the read-1 / read-2 FASTQ file
#         names (looked up under $SCRATCH_DIRECTORY/fastq_files); any further
#         columns are ignored by this step.
# Writes: $HOME_DIRECTORY/scripts/trimming.sh
#         $HOME_DIRECTORY/scripts/trimming_submit_jobs
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# REF_FASTA is not referenced by this step; it is kept so the cell leaves the
# same variables defined as before.
REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/trimming
JOB_NAME=trimming

# The job's -o/-e log files live in $JOB_OUTPUT and fastp writes into
# trimmed_seq; create everything up front so a missing directory cannot make
# the submitted jobs fail.
mkdir -p "$HOME_DIRECTORY/scripts" "$JOB_OUTPUT" "$SCRATCH_DIRECTORY/trimmed_seq"

# Unquoted here-document: $JOB_NAME, $JOB_OUTPUT and $SCRATCH_DIRECTORY are
# expanded now, while everything written as \$ is left for the job to expand
# at run time. A doubled backslash at end of line becomes a line continuation
# in the generated script.
cat > "$HOME_DIRECTORY/scripts/trimming.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=6G
#SBATCH --time=0-02:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

input_file_1=\$1
input_file_2=\$2

# Output names: the input name with .fastq.gz replaced by _fastp.fastq.gz
trimmed_1=\$(basename "\$input_file_1" .fastq.gz)_fastp.fastq.gz
trimmed_2=\$(basename "\$input_file_2" .fastq.gz)_fastp.fastq.gz

# Per-sample report prefix. Without --json/--html every job writes fastp.json
# and fastp.html into its working directory, so jobs running side by side
# overwrite each other's reports.
report=$JOB_OUTPUT/\$(basename "\$input_file_1" .fastq.gz)_fastp

module load fastp/0.23.2

fastp \\
    -i "$SCRATCH_DIRECTORY/fastq_files/\$input_file_1" \\
    -I "$SCRATCH_DIRECTORY/fastq_files/\$input_file_2" \\
    -o "$SCRATCH_DIRECTORY/trimmed_seq/\$trimmed_1" \\
    -O "$SCRATCH_DIRECTORY/trimmed_seq/\$trimmed_2" \\
    --json "\$report.json" \\
    --html "\$report.html" \\
    --detect_adapter_for_pe -q 15 -l 15
EOF

# One sbatch line per read pair. printf gets a fixed format string, so a "%" or
# backslash in a file name cannot be misread as a format directive. Blank lines
# in the list are skipped, and "|| [ -n "$a" ]" keeps a final line that has no
# trailing newline.
while read -r a b c || [ -n "$a" ]; do
    [ -n "$a" ] || continue
    printf 'sbatch %s %s %s\n' "$HOME_DIRECTORY/scripts/trimming.sh" "$a" "$b"
done < "$HOME_DIRECTORY/fastq_list" > "$HOME_DIRECTORY/scripts/trimming_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/trimming_submit_jobs


<a id="12"></a> 
## Mapping

<a id="13"></a> 
### Map adapter-trimmed reads against the reference assembly

In [None]:
# Map the fastp-trimmed read pairs against the reference with bwa mem: write
# the Slurm job script and the file of sbatch commands that submits it.
#
# Reads : $HOME_DIRECTORY/fastq_list -- one line per sample, whitespace-separated.
#         Columns 1 and 2 are the original read-1 / read-2 FASTQ names (the job
#         derives the *_fastp.fastq.gz names from them); column 3 is the name
#         used for the output SAM file.
# Writes: $HOME_DIRECTORY/scripts/bwa_mem.sh
#         $HOME_DIRECTORY/scripts/bwa_mem_submit_jobs
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/bwa_mem
JOB_NAME=bwa_mem

# Single source of truth for the thread count: it feeds both the Slurm CPU
# request and bwa's -t, so bwa never starts more threads than CPUs were
# requested for the job.
BWA_THREADS=6

# The job's -o/-e log files live in $JOB_OUTPUT; create it (and the scripts
# directory) before anything is submitted.
mkdir -p "$HOME_DIRECTORY/scripts" "$JOB_OUTPUT"

# Unquoted here-document: $JOB_NAME, $JOB_OUTPUT, $REF_FASTA, $BWA_THREADS and
# $SCRATCH_DIRECTORY are expanded now; everything written as \$ is left for the
# job to expand at run time.
cat > "$HOME_DIRECTORY/scripts/bwa_mem.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --cpus-per-task=$BWA_THREADS
#SBATCH --mem=18G
#SBATCH --time=0-72:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

input_file_1=\$1
input_file_2=\$2
output_file=\$3

# Trimmed reads are read from, and the SAM file is written to, trimmed_seq.
trimmed_dir=$SCRATCH_DIRECTORY/trimmed_seq
reads_1=\$trimmed_dir/\$(basename "\$input_file_1" .fastq.gz)_fastp.fastq.gz
reads_2=\$trimmed_dir/\$(basename "\$input_file_2" .fastq.gz)_fastp.fastq.gz
sam_out=\$trimmed_dir/\$(basename "\$output_file").sam

module load bwa/0.7.17

bwa mem -t $BWA_THREADS "$REF_FASTA" "\$reads_1" "\$reads_2" > "\$sam_out"
EOF

# One sbatch line per sample. printf gets a fixed format string, so a "%" or
# backslash in a name cannot be misread as a format directive. Blank lines in
# the list are skipped, and "|| [ -n "$a" ]" keeps a final line that has no
# trailing newline.
while read -r a b c || [ -n "$a" ]; do
    [ -n "$a" ] || continue
    printf 'sbatch %s %s %s %s\n' "$HOME_DIRECTORY/scripts/bwa_mem.sh" "$a" "$b" "$c"
done < "$HOME_DIRECTORY/fastq_list" > "$HOME_DIRECTORY/scripts/bwa_mem_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/bwa_mem_submit_jobs


<a id="14"></a> 
### Sort the SAM files and write them as BAM

In [None]:
# Sort each SAM file with samtools sort and write the result as BAM: write the
# Slurm job script and the file of sbatch commands that submits it.
#
# Reads : $HOME_DIRECTORY/fastq_list -- one line per sample, whitespace-separated;
#         only column 3 (the sample / output name) is used by this step.
# Writes: $HOME_DIRECTORY/scripts/samtools_sort_sam_to_bam.sh
#         $HOME_DIRECTORY/scripts/samtools_sort_sam_to_bam_submit_jobs
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# REF_FASTA is not referenced by this step; it is kept so the cell leaves the
# same variables defined as before.
REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/samtools_sort
JOB_NAME=samtools_sort_sam_to_bam

# The job's -o/-e log files live in $JOB_OUTPUT and the sorted BAM goes to
# bam_files; create both (and the scripts directory) before submitting.
mkdir -p "$HOME_DIRECTORY/scripts" "$JOB_OUTPUT" "$SCRATCH_DIRECTORY/bam_files"

# Unquoted here-document: $JOB_NAME, $JOB_OUTPUT and $SCRATCH_DIRECTORY are
# expanded now; everything written as \$ is left for the job to expand at run
# time. A doubled backslash at end of line becomes a line continuation in the
# generated script.
cat > "$HOME_DIRECTORY/scripts/samtools_sort_sam_to_bam.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=18G
#SBATCH --time=0-72:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

input_file=\$1
sample=\$(basename "\$input_file")

module load samtools/1.16.1

# -T sets the prefix for samtools sort temporary files (next to the input SAM).
samtools sort \\
    -T "$SCRATCH_DIRECTORY/trimmed_seq/\$sample" \\
    -o "$SCRATCH_DIRECTORY/bam_files/\$sample.bam" \\
    "$SCRATCH_DIRECTORY/trimmed_seq/\$sample.sam"
EOF

# One sbatch line per sample. printf gets a fixed format string, so a "%" or
# backslash in a name cannot be misread as a format directive. Lines without a
# third column (including blank lines) are skipped rather than submitted with
# an empty argument.
while read -r a b c || [ -n "$a" ]; do
    [ -n "$c" ] || continue
    printf 'sbatch %s %s\n' "$HOME_DIRECTORY/scripts/samtools_sort_sam_to_bam.sh" "$c"
done < "$HOME_DIRECTORY/fastq_list" > "$HOME_DIRECTORY/scripts/samtools_sort_sam_to_bam_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/samtools_sort_sam_to_bam_submit_jobs



<a id="15"></a> 
### View the bam files

In [None]:
# Re-write each sorted BAM through samtools view (header included, BAM output)
# as <sample>_viewed.bam: write the Slurm job script and the file of sbatch
# commands that submits it.
#
# Reads : $HOME_DIRECTORY/fastq_list -- one line per sample, whitespace-separated;
#         only column 3 (the sample name) is used by this step.
# Writes: $HOME_DIRECTORY/scripts/samtools_view_bam.sh
#         $HOME_DIRECTORY/scripts/samtools_view_bam_submit_jobs
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# REF_FASTA is not referenced by this step; it is kept so the cell leaves the
# same variables defined as before.
REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/samtools_view
JOB_NAME=samtools_view_bam

# The job's -o/-e log files live in $JOB_OUTPUT; create it (and the scripts
# directory) before anything is submitted.
mkdir -p "$HOME_DIRECTORY/scripts" "$JOB_OUTPUT"

# Unquoted here-document: $JOB_NAME, $JOB_OUTPUT and $SCRATCH_DIRECTORY are
# expanded now; everything written as \$ is left for the job to expand at run
# time. A doubled backslash at end of line becomes a line continuation in the
# generated script.
cat > "$HOME_DIRECTORY/scripts/samtools_view_bam.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=18G
#SBATCH --time=0-48:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

input_file=\$1
sample=\$(basename "\$input_file")

module load samtools/1.16.1

# Options come before the input file so the command does not depend on the
# option parser re-ordering arguments.
samtools view -h -b \\
    -o "$SCRATCH_DIRECTORY/bam_files/\${sample}_viewed.bam" \\
    "$SCRATCH_DIRECTORY/bam_files/\$sample.bam"
EOF

# One sbatch line per sample. printf gets a fixed format string, so a "%" or
# backslash in a name cannot be misread as a format directive. Lines without a
# third column (including blank lines) are skipped rather than submitted with
# an empty argument.
while read -r a b c || [ -n "$a" ]; do
    [ -n "$c" ] || continue
    printf 'sbatch %s %s\n' "$HOME_DIRECTORY/scripts/samtools_view_bam.sh" "$c"
done < "$HOME_DIRECTORY/fastq_list" > "$HOME_DIRECTORY/scripts/samtools_view_bam_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/samtools_view_bam_submit_jobs



<a id="16"></a> 
### Mark duplicate reads in the BAM files

In [None]:
# Mark duplicate reads with Picard MarkDuplicates: write the Slurm job script
# and the file of sbatch commands that submits it.
#
# Reads : $HOME_DIRECTORY/fastq_list -- one line per sample, whitespace-separated;
#         only column 3 (the sample name) is used by this step.
# Writes: $HOME_DIRECTORY/scripts/MarkDuplicates.sh
#         $HOME_DIRECTORY/scripts/MarkDuplicates_submit_jobs
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# REF_FASTA is not referenced by this step; it is kept so the cell leaves the
# same variables defined as before.
REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/MarkDuplicates
JOB_NAME=MarkDuplicates

# The job's -o/-e log files live in $JOB_OUTPUT; create it (and the scripts
# directory) before anything is submitted.
mkdir -p "$HOME_DIRECTORY/scripts" "$JOB_OUTPUT"

# Unquoted here-document: $JOB_NAME, $JOB_OUTPUT and $SCRATCH_DIRECTORY are
# expanded now; everything written as \$ is left for the job to expand at run
# time. A doubled backslash at end of line becomes a line continuation in the
# generated script.
cat > "$HOME_DIRECTORY/scripts/MarkDuplicates.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=36G
#SBATCH --time=0-02:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

input_file=\$1
sample=\$(basename "\$input_file")

module load java/jdk1.8.0_231
module load picard/2.24.1

# Input is the <sample>_viewed.bam written by the samtools view step; the
# marked BAM and the duplication metrics are written next to it.
java -jar /gpfs/software/ada/picard/2.24.1/picard.jar MarkDuplicates \\
    I="$SCRATCH_DIRECTORY/bam_files/\${sample}_viewed.bam" \\
    O="$SCRATCH_DIRECTORY/bam_files/\${sample}_viewed_marked.bam" \\
    METRICS_FILE="$SCRATCH_DIRECTORY/bam_files/\${sample}_metrics.txt" \\
    ASSUME_SORTED=true \\
    VALIDATION_STRINGENCY=SILENT
EOF

# One sbatch line per sample. printf gets a fixed format string, so a "%" or
# backslash in a name cannot be misread as a format directive. Lines without a
# third column (including blank lines) are skipped rather than submitted with
# an empty argument.
while read -r a b c || [ -n "$a" ]; do
    [ -n "$c" ] || continue
    printf 'sbatch %s %s\n' "$HOME_DIRECTORY/scripts/MarkDuplicates.sh" "$c"
done < "$HOME_DIRECTORY/fastq_list" > "$HOME_DIRECTORY/scripts/MarkDuplicates_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/MarkDuplicates_submit_jobs



<a id="17"></a> 
### Add read groups to the bam files

In [None]:
# Add read-group tags with Picard AddOrReplaceReadGroups: write the Slurm job
# script and the file of sbatch commands that submits it.
#
# Reads : $HOME_DIRECTORY/fastq_list -- one line per sample, whitespace-separated;
#         only column 3 (the sample name) is used by this step. It becomes both
#         the read-group ID (RGID) and the sample name (RGSM).
# Writes: $HOME_DIRECTORY/scripts/AddOrReplaceReadGroups.sh
#         $HOME_DIRECTORY/scripts/AddOrReplaceReadGroups_submit_jobs
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# REF_FASTA is not referenced by this step; it is kept so the cell leaves the
# same variables defined as before.
REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/AddOrReplaceReadGroups
JOB_NAME=AddOrReplaceReadGroups

# The job's -o/-e log files live in $JOB_OUTPUT; create it (and the scripts
# directory) before anything is submitted.
mkdir -p "$HOME_DIRECTORY/scripts" "$JOB_OUTPUT"

# Unquoted here-document: $JOB_NAME, $JOB_OUTPUT and $SCRATCH_DIRECTORY are
# expanded now; everything written as \$ is left for the job to expand at run
# time. A doubled backslash at end of line becomes a line continuation in the
# generated script.
cat > "$HOME_DIRECTORY/scripts/AddOrReplaceReadGroups.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=6G
#SBATCH --time=0-12:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

input_file=\$1
sample=\$(basename "\$input_file")

module load java/jdk1.8.0_231
module load picard/2.24.1

# RGPL is ILLUMINA: the SAM specification restricts the PL tag to a fixed list
# of platform names, and the previous value illumina_TruSeq is not on it.
# NOTE(review): RGLB=lib1 and RGPU=unit1 are the same for every sample; confirm
# that matches the sequencing design.
java -jar /gpfs/software/ada/picard/2.24.1/picard.jar AddOrReplaceReadGroups \\
    I="$SCRATCH_DIRECTORY/bam_files/\${sample}_viewed_marked.bam" \\
    O="$SCRATCH_DIRECTORY/bam_files/\${sample}_viewed_marked_addg.bam" \\
    RGID="\$sample" \\
    RGLB=lib1 \\
    RGPL=ILLUMINA \\
    RGSM="\$sample" \\
    RGPU=unit1
EOF

# One sbatch line per sample. printf gets a fixed format string, so a "%" or
# backslash in a name cannot be misread as a format directive. Lines without a
# third column (including blank lines) are skipped rather than submitted with
# an empty argument.
while read -r a b c || [ -n "$a" ]; do
    [ -n "$c" ] || continue
    printf 'sbatch %s %s\n' "$HOME_DIRECTORY/scripts/AddOrReplaceReadGroups.sh" "$c"
done < "$HOME_DIRECTORY/fastq_list" > "$HOME_DIRECTORY/scripts/AddOrReplaceReadGroups_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/AddOrReplaceReadGroups_submit_jobs





<a id="18"></a> 
### Index the bam files

In [None]:
# Index the final per-sample BAM files with samtools index: write the Slurm job
# script and the file of sbatch commands that submits it.
#
# Reads : $HOME_DIRECTORY/fastq_list -- one line per sample, whitespace-separated;
#         only column 3 (the sample name) is used by this step.
# Writes: $HOME_DIRECTORY/scripts/samtools_index.sh
#         $HOME_DIRECTORY/scripts/samtools_index_submit_jobs
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu

# REF_FASTA is not referenced by this step; it is kept so the cell leaves the
# same variables defined as before.
REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/samtools_index
JOB_NAME=samtools_index_bam

# The job's -o/-e log files live in $JOB_OUTPUT; create it (and the scripts
# directory) before anything is submitted.
mkdir -p "$HOME_DIRECTORY/scripts" "$JOB_OUTPUT"

# Unquoted here-document: $JOB_NAME, $JOB_OUTPUT and $SCRATCH_DIRECTORY are
# expanded now; everything written as \$ is left for the job to expand at run
# time.
cat > "$HOME_DIRECTORY/scripts/samtools_index.sh" <<EOF
#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL)
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=6G
#SBATCH --time=0-01:00
#SBATCH --job-name=$JOB_NAME
#SBATCH -o $JOB_OUTPUT/$JOB_NAME-%j.out
#SBATCH -e $JOB_OUTPUT/$JOB_NAME-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

input_file=\$1
sample=\$(basename "\$input_file")

module load samtools/1.16.1

# Index the BAM produced by the AddOrReplaceReadGroups step.
samtools index "$SCRATCH_DIRECTORY/bam_files/\${sample}_viewed_marked_addg.bam"
EOF

# One sbatch line per sample. printf gets a fixed format string, so a "%" or
# backslash in a name cannot be misread as a format directive. Lines without a
# third column (including blank lines) are skipped rather than submitted with
# an empty argument.
while read -r a b c || [ -n "$a" ]; do
    [ -n "$c" ] || continue
    printf 'sbatch %s %s\n' "$HOME_DIRECTORY/scripts/samtools_index.sh" "$c"
done < "$HOME_DIRECTORY/fastq_list" > "$HOME_DIRECTORY/scripts/samtools_index_submit_jobs"

# sh -e $HOME_DIRECTORY/scripts/samtools_index_submit_jobs

