<a id="5"></a> 
# Prepare the reference fasta 

<a id="6"></a> 
## Index the reference fasta 

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu
FILE_DIRECTORY=/gpfs/data/justin_rna/Hwei-yen

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz
JOB_OUTPUT=$SCRATCH_DIRECTORY/job_output/prepare_ref
JOB_NAME=prepare_reference_fasta

# The job script and the SLURM log files are written into these directories,
# so create them up front if they do not exist yet.
mkdir -p "$HOME_DIRECTORY/scripts" "$JOB_OUTPUT"

# Write the SLURM job script that indexes the reference fasta.
# Text inside the single quotes is copied literally into the script; each
# '$VAR' briefly closes the quotes so the value VAR has in this cell is baked
# into the script. REF_FASTA is baked in the same way because it is not
# exported here and would otherwise be empty inside the batch job.
# NOTE: do not use apostrophes in the script text below - they would end the quoting.
echo '#!/bin/bash -e
#SBATCH --mail-type=ALL #Mail events (NONE, BEGIN, END, FAIL, ALL) 
#SBATCH --mail-user=papilio.chen@gmail.com #Where to send mail
#SBATCH -p compute-24-96
#SBATCH --mem=3G
#SBATCH --time=0-02:00
#SBATCH --job-name='$JOB_NAME'
#SBATCH -o '$JOB_OUTPUT'/'$JOB_NAME'-%j.out
#SBATCH -e '$JOB_OUTPUT'/'$JOB_NAME'-%j.err

#ABOVE THIS LINE ARE THE SLURM DIRECTIVES
#BELOW THIS LINE ARE THE COMMANDS TO RUN YOUR JOB

# Abort if any stage of a pipeline fails, not only the last one.
set -o pipefail

module load bwa/0.7.17
module load java/jdk1.8.0_231
module load picard/2.24.1
module load htslib/1.15.1
module load samtools/1.16.1

# Full path of the gzipped reference fasta (filled in when this script was generated).
REF_FASTA="'$REF_FASTA'"

# Recompress the reference with bgzip so that samtools faidx can index it.
# The output goes to a temporary file first: redirecting straight onto the
# input would truncate it before zcat has read it.
zcat "$REF_FASTA" | bgzip -c > "$REF_FASTA.bgzip.tmp"
mv "$REF_FASTA.bgzip.tmp" "$REF_FASTA"

# BWA index; the index files use the fasta path as their prefix.
bwa index -a bwtsw -p "$REF_FASTA" "$REF_FASTA"

# Sequence dictionary, written next to the fasta as <name>.dict (".fna.gz" stripped).
java -jar /gpfs/software/ada/picard/2.24.1/picard.jar CreateSequenceDictionary -R "$REF_FASTA" -O "${REF_FASTA%.fna.gz}.dict"

# Fasta index (.fai, plus .gzi for the bgzip-compressed file).
samtools faidx "$REF_FASTA"

' > $HOME_DIRECTORY/scripts/prepare_ref.sh


# sbatch $HOME_DIRECTORY/scripts/prepare_ref.sh



<a id="7"></a> 
## Generate the contig.intervals file
This script generates the `<reference>_contigs.intervals` file (one `contig:1-length` line per contig, built from the `.fai` index) that will be used in the [GenomicsDBImport](#22) step.

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu
FILE_DIRECTORY=/gpfs/data/justin_rna/Hwei-yen

REF_FASTA=GCA_001643735.4_ASM164373v4_genomic.fna.gz

# Input: the fasta index (.fai); this cell reads its first two tab-separated
# columns as contig name and contig length.
REF_FAI=$SCRATCH_DIRECTORY/ref_fasta/$(basename $REF_FASTA).fai
# Output: one "contig:1-length" interval per contig, covering the whole contig.
CONTIG_INTERVALS=$SCRATCH_DIRECTORY/ref_fasta/$(basename $REF_FASTA)_contigs.intervals

# Build every interval line in a single pass; no intermediate files are needed.
awk -F'\t' '{ print $1 ":1-" $2 }' "$REF_FAI" > "$CONTIG_INTERVALS"





<a id="8"></a> 
## Decompress the reference fasta for the probabilistic approach
This script produces a decompressed FASTA of the reference assembly, which will be used for the [probabilistic approach](#XXX).

In [None]:
HOME_DIRECTORY=/gpfs/home/wwh23wcu
SCRATCH_DIRECTORY=/gpfs/scratch/wwh23wcu
FILE_DIRECTORY=/gpfs/data/justin_rna/Hwei-yen

REF_FASTA=$SCRATCH_DIRECTORY/ref_fasta/GCA_001643735.4_ASM164373v4_genomic.fna.gz

# Write an uncompressed copy next to the gzipped reference: same path with the
# trailing ".gz" removed. The compressed file itself is left in place.
zcat "$REF_FASTA" > "${REF_FASTA%.gz}"