In [None]:
#!/bin/bash
# Per-sample preprocessing job (steps 00-05), submitted as a SLURM job array:
# each array task processes exactly one SRA run accession from `files` below.
#SBATCH -N 1 # number of nodes
#SBATCH -n 16 # number of cores
#SBATCH --mem=48G
#SBATCH --job-name="Umar_preprocessing"
#SBATCH --mail-type=END
#SBATCH --partition=computeq
#SBATCH --mail-user=hrclndnn@memphis.edu
#SBATCH -t 14-00:00:00
#SBATCH -a 0-82 # one task per entry of `files` (83 entries -> indices 0..82); keep in sync with the list

module load bwa
module load samtools
module load picard
module load sratoolkit
module load gatk
module load bcftools

# SRA run accessions to process; the array index is the SLURM array task ID.
# NOTE(review): step 06 (CombineGVCFs) also lists SRR942213 and SRR942263,
# which are not in this array, and does not list SRR517013, which is.
# Confirm which sample set is intended.
files=(SRR517013
SRR518661
SRR518662
SRR518670
SRR518671
SRR518672
SRR518673
SRR518676
SRR518677
SRR518678
SRR518679
SRR518680
SRR518684
SRR518685
SRR518686
SRR518687
SRR518688
SRR518689
SRR518699
SRR518700
SRR518701
SRR518702
SRR518703
SRR827537
SRR827574
SRR827584
SRR827585
SRR827587
SRR827588
SRR827600
SRR942197
SRR942203
SRR942211
SRR942218
SRR942235
SRR942239
SRR942240
SRR942246
SRR942248
SRR942249
SRR942251
SRR942253
SRR942260
SRR942265
SRR942266
SRR942273
SRR942275
SRR942276
SRR942277
SRR942278
SRR942279
SRR942280
SRR942281
SRR942282
SRR942283
SRR942284
SRR942285
SRR942286
SRR942287
SRR942288
SRR942289
SRR942290
SRR942291
SRR942292
SRR942293
SRR942294
SRR942295
SRR942296
SRR942297
SRR942298
SRR942299
SRR942300
SRR942301
SRR942302
SRR942303
SRR942304
SRR942305
SRR942306
SRR942307
SRR942308
SRR942309
SRR942310
SRR947747)

# Pick this task's sample. Abort early if the task ID is unset or out of
# range, so later steps never run with an empty sample name in their paths.
file=${files[$SLURM_ARRAY_TASK_ID]}
if [ -z "${file}" ]; then
    echo "No sample for SLURM_ARRAY_TASK_ID='${SLURM_ARRAY_TASK_ID}' (valid: 0..$(( ${#files[@]} - 1 )))" >&2
    exit 1
fi

In [None]:
# Steps 00-05 for one sample; `${file}` is the SRA run accession chosen from
# the `files` array by the SLURM array task ID.
# Each step consumes the previous step's output, so stop at the first failure
# instead of running later tools on missing or partial files.
set -e

#00 pull samples from NCBI
cd /home/hrclndnn/sra_local/
prefetch --max-size 100G ${file}
fastq-dump --split-files ${file}
# NOTE(review): fastq-dump is run in sra_local with no output directory given,
# while step 01 reads its FASTQs from WGSUmari/00_rethread/. Presumably an
# intermediate step (not shown here) moves or produces those files - confirm.


#01 Align to reference genome
# The output name must be ${file}_mapped.sam: that is the exact path step 02 reads.
bwa mem -M -t 16 /home/mdpllard/Genomes/Ursus_maritimus.fna /home/hrclndnn/WGSUmari/00_rethread/${file}_1.fastq /home/hrclndnn/WGSUmari/00_rethread/${file}_2.fastq > /home/hrclndnn/WGSUmari/01_mappedSam/${file}_mapped.sam


#02 Create a sorted bam file from the mapped sam file
gatk --java-options '-Djava.io.tmpdir=/home/scratch/hrclndnn/' SortSam -I /home/hrclndnn/WGSUmari/01_mappedSam/${file}_mapped.sam -O /home/hrclndnn/WGSUmari/02_sortedBam/${file}.sorted.bam -SO coordinate

#03 Mark duplicates
gatk --java-options '-Djava.io.tmpdir=/home/scratch/hrclndnn/' MarkDuplicates -I /home/hrclndnn/WGSUmari/02_sortedBam/${file}.sorted.bam -O /home/hrclndnn/WGSUmari/03_markDups/${file}.dup.sorted.bam -M /home/hrclndnn/WGSUmari/03_markDups/${file}.dup.metrics --MAX_FILE_HANDLES 1000


#04 Add or replace read groups
cd /home/hrclndnn/WGSUmari/03_markDups/ #may not need, but sometimes GATK has errors related to paths
# RGSM is the sample name that ends up in the VCF header, so it is set to the
# bare accession rather than a filesystem path.
gatk AddOrReplaceReadGroups -I /home/hrclndnn/WGSUmari/03_markDups/${file}.dup.sorted.bam -O /home/hrclndnn/WGSUmari/04_addRepGroups/${file}.RG.dup.sorted.bam --RGLB NCBI --RGPL illumina --RGPU NCBI --RGSM ${file}

#05 call variants
# HaplotypeCaller needs a BAM index, hence the samtools index first.
samtools index /home/hrclndnn/WGSUmari/04_addRepGroups/${file}.RG.dup.sorted.bam
gatk --java-options '-DGATK_STACKTRACE_ON_USER_EXCEPTION=true' HaplotypeCaller -R /home/mdpllard/Genomes/Ursus_maritimus.fna -I /home/hrclndnn/WGSUmari/04_addRepGroups/${file}.RG.dup.sorted.bam -O /home/hrclndnn/WGSUmari/05_callVariants/${file}.vcf.gz --heterozygosity 0.0005 -ERC GVCF




#changeRGSMnames
#directory of scripts for each sample at: /home/hrclndnn/WGSUmari/scripts/changeRGSMnames
# NOTE(review): those scripts were presumably needed because RGSM used to be
# set to a full path in step 04; with RGSM=${file} they should no longer be
# required for newly processed samples - confirm.



In [None]:
#!/bin/bash
# 06_combinegVCFs.sh
# Merge the per-sample gVCFs written by step 05 into one multi-sample gVCF.
# The script name above is a comment on purpose: as a bare first line it would
# be executed as a command and would precede the shebang and the #SBATCH
# directives below.
#SBATCH -N 1 # number of nodes
#SBATCH -n 16 # number of cores
#SBATCH --mem=48G
#SBATCH --job-name="Ursus_maritimus_06"
#SBATCH --mail-type=END
#SBATCH --partition=computeq
#SBATCH --mail-user=hrclndnn@memphis.edu
#SBATCH -t 14-00:00:00

module load bwa
module load samtools
module load picard
module load sratoolkit
module load gatk
module load bcftools

#06_Combine gVCFs
# -V paths are relative to 05_callVariants, hence the cd.
# NOTE(review): SRR942213 and SRR942263 are listed here but are not in the
# step 00-05 sample array, and SRR517013 is in that array but not listed here.
# Confirm the intended sample set.
# The output name uses the "Jun1522" tag so that it matches the input path
# read by step 07 (GenotypeGVCFs).
cd /home/hrclndnn/WGSUmari/05_callVariants/
gatk CombineGVCFs -R /home/mdpllard/Genomes/Ursus_maritimus.fna \
	-V SRR518661.vcf.gz \
	-V SRR518662.vcf.gz \
	-V SRR518670.vcf.gz \
	-V SRR518671.vcf.gz \
	-V SRR518672.vcf.gz \
	-V SRR518673.vcf.gz \
	-V SRR518676.vcf.gz \
	-V SRR518677.vcf.gz \
	-V SRR518678.vcf.gz \
	-V SRR518679.vcf.gz \
	-V SRR518680.vcf.gz \
	-V SRR518684.vcf.gz \
	-V SRR518685.vcf.gz \
	-V SRR518686.vcf.gz \
	-V SRR518687.vcf.gz \
	-V SRR518688.vcf.gz \
	-V SRR518689.vcf.gz \
	-V SRR518699.vcf.gz \
	-V SRR518700.vcf.gz \
	-V SRR518701.vcf.gz \
	-V SRR518702.vcf.gz \
	-V SRR518703.vcf.gz \
	-V SRR827537.vcf.gz \
	-V SRR827574.vcf.gz \
	-V SRR827584.vcf.gz \
	-V SRR827585.vcf.gz \
	-V SRR827587.vcf.gz \
	-V SRR827588.vcf.gz \
	-V SRR827600.vcf.gz \
	-V SRR942197.vcf.gz \
	-V SRR942203.vcf.gz \
	-V SRR942211.vcf.gz \
	-V SRR942213.vcf.gz \
	-V SRR942218.vcf.gz \
	-V SRR942235.vcf.gz \
	-V SRR942239.vcf.gz \
	-V SRR942240.vcf.gz \
	-V SRR942246.vcf.gz \
	-V SRR942248.vcf.gz \
	-V SRR942249.vcf.gz \
	-V SRR942251.vcf.gz \
	-V SRR942253.vcf.gz \
	-V SRR942260.vcf.gz \
	-V SRR942263.vcf.gz \
	-V SRR942265.vcf.gz \
	-V SRR942266.vcf.gz \
	-V SRR942273.vcf.gz \
	-V SRR942275.vcf.gz \
	-V SRR942276.vcf.gz \
	-V SRR942277.vcf.gz \
	-V SRR942278.vcf.gz \
	-V SRR942279.vcf.gz \
	-V SRR942280.vcf.gz \
	-V SRR942281.vcf.gz \
	-V SRR942282.vcf.gz \
	-V SRR942283.vcf.gz \
	-V SRR942284.vcf.gz \
	-V SRR942285.vcf.gz \
	-V SRR942286.vcf.gz \
	-V SRR942287.vcf.gz \
	-V SRR942288.vcf.gz \
	-V SRR942289.vcf.gz \
	-V SRR942290.vcf.gz \
	-V SRR942291.vcf.gz \
	-V SRR942292.vcf.gz \
	-V SRR942293.vcf.gz \
	-V SRR942294.vcf.gz \
	-V SRR942295.vcf.gz \
	-V SRR942296.vcf.gz \
	-V SRR942297.vcf.gz \
	-V SRR942298.vcf.gz \
	-V SRR942299.vcf.gz \
	-V SRR942300.vcf.gz \
	-V SRR942301.vcf.gz \
	-V SRR942302.vcf.gz \
	-V SRR942303.vcf.gz \
	-V SRR942304.vcf.gz \
	-V SRR942305.vcf.gz \
	-V SRR942306.vcf.gz \
	-V SRR942307.vcf.gz \
	-V SRR942308.vcf.gz \
	-V SRR942309.vcf.gz \
	-V SRR942310.vcf.gz \
	-V SRR947747.vcf.gz \
	-O /home/hrclndnn/WGSUmari/06_combineGVCFs/Umar.combined.Jun1522.g.vcf.gz


In [None]:
#!/bin/bash
# 07_genoGVCFS.sh
# Joint-genotype the combined multi-sample gVCF into a single VCF.
# The script name above is a comment on purpose: as a bare first line it would
# be executed as a command and would precede the shebang and the #SBATCH
# directives below.
#SBATCH -N 1 # number of nodes
#SBATCH -n 16 # number of cores
#SBATCH --mem=48G
#SBATCH --job-name="UmarPrep7"
#SBATCH --mail-type=END
#SBATCH --partition=computeq
#SBATCH --mail-user=hrclndnn@memphis.edu
#SBATCH -t 14-00:00:00

module load bwa
module load samtools
module load picard
module load sratoolkit
module load gatk
module load bcftools

#07_Genotype gVCFs
# -V must be the exact file written by the CombineGVCFs step (06); verify the
# file name before submitting. The heterozygosity value is the same one passed
# to HaplotypeCaller in step 05.
gatk GenotypeGVCFs -R /home/mdpllard/Genomes/Ursus_maritimus.fna \
	-V /home/hrclndnn/WGSUmari/06_combineGVCFs/Umar.combined.Jun1522.g.vcf.gz --heterozygosity 0.0005 \
	-O /home/hrclndnn/WGSUmari/07_genotypeGVCFs/UmarGenotypedJun1522.vcf.gz


In [None]:
#08 Filter VCFs
# Input : genotyped VCF from step 07.
# Output: bgzip-compressed VCF (-Oz) in 08_bcftoolsFilter.
#   -i '...'  include expression on per-sample depth (FMT/DP) and genotype
#             quality (FMT/GQ): filters on depth of coverage and call quality
#   -m2 -M2   keep only biallelic records (min and max of 2 alleles)
#   (-s <list> / -s ^<list> could be added to keep / exclude specific samples)
# NOTE(review): the expression mixes '&' and '&&'. Per the bcftools
# expressions documentation, '&' requires both conditions in the same sample
# while '&&' lets them be met by different samples - confirm that GQ>=20 is
# really meant to be joined with '&&'.
bcftools view \
	-i 'FMT/DP>4 & FMT/DP<300 && FMT/GQ>=20' \
	-m2 -M2 \
	/home/hrclndnn/WGSUmari/07_genotypeGVCFs/UmarGenotypedJun1522.vcf.gz \
	-Oz \
	-o /home/hrclndnn/WGSUmari/08_bcftoolsFilter/UmarJun1522.filt.vcf.gz

In [None]:
#09 filter SNPs in VCFs
# Input : filtered biallelic VCF from step 08.
# Output: bgzip-compressed VCF (-Oz) restricted to SNP records (-v snps).
# NOTE(review): the output directory is "09filterSNPs" (no underscore), unlike
# every other stage directory (e.g. 08_bcftoolsFilter) - confirm it exists
# under that exact name.
bcftools view \
	-v snps \
	/home/hrclndnn/WGSUmari/08_bcftoolsFilter/UmarJun1522.filt.vcf.gz \
	-Oz \
	-o /home/hrclndnn/WGSUmari/09filterSNPs/UmarJun1522.snp.filt.vcf.gz