In [1]:
# Following variant calling pipeline below
# https://informatics.fas.harvard.edu/whole-genome-resquencing-for-population-genomics-fastq-to-vcf.html#preprocess

In [2]:
import io
import pandas as pd
import numpy as np
import math

In [3]:
def sbatch_header(job,mem,tasks,hours):
    """Create ``script_<job>.sh`` holding only the SLURM ``#SBATCH`` header.

    The file is created (or truncated) in the current working directory and
    always uses ``\\n`` line endings. Callers re-open it in append mode to add
    the commands the job should run.

    Parameters
    ----------
    job : str
        Job name; also used in the script file name and the ``-e`` log name.
    mem : str or int
        Memory request in gigabytes (``G`` is appended).
    tasks : str or int
        Value written to ``--ntasks``.
    hours : str or int
        Wall-clock limit in whole hours (``:00:00`` is appended).

    Returns
    -------
    None
    """
    script = 'script_' + job + '.sh'

    # str() lets callers pass plain numbers as well as the strings used so far.
    header = ''.join([
        '#!/bin/bash\n\n#SBATCH --job-name=' + job + '\n',
        '#SBATCH --mem=' + str(mem) + 'G \n',
        '#SBATCH --ntasks=' + str(tasks) + ' \n',
        '#SBATCH -e ' + job + '_%A_%a.err \n',
        '#SBATCH --time=' + str(hours) + ':00:00  \n',
        '#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n',
        '#SBATCH -p high \n\n',
    ])

    # The context manager closes the handle even if the write fails.
    with io.open(script, 'w', newline='\n') as outfile:
        outfile.write(header)
    
def sbatch_header_loop(job,mem,tasks,hours,infile):
    """Create ``script_<infile><job>.sh`` holding only the SLURM header.

    Variant of the single-job header writer for cells that emit one script
    per input: the SLURM job name (and the ``-e`` log name) is ``infile + job``.
    The file is created (or truncated) in the current working directory with
    ``\\n`` line endings; callers append the actual commands afterwards.

    Parameters
    ----------
    job : str
        Job suffix shared by every script of the batch (e.g. ``'_make_db'``).
    mem : str or int
        Memory request in gigabytes (``G`` is appended).
    tasks : str or int
        Value written to ``--ntasks``.
    hours : str or int
        Wall-clock limit in whole hours (``:00:00`` is appended).
    infile : str
        Per-script prefix (e.g. a chromosome or population label).

    Returns
    -------
    None
    """
    jobname = infile + job
    script = 'script_' + jobname + '.sh'

    # str() lets callers pass plain numbers as well as the strings used so far.
    header = ''.join([
        '#!/bin/bash\n\n#SBATCH --job-name=' + jobname + '\n',
        '#SBATCH --mem=' + str(mem) + 'G \n',
        '#SBATCH --ntasks=' + str(tasks) + ' \n',
        '#SBATCH -e ' + jobname + '_%A_%a.err \n',
        '#SBATCH --time=' + str(hours) + ':00:00 \n',
        '#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n',
        '#SBATCH -p high \n\n',
    ])

    # The context manager closes the handle even if the write fails.
    with io.open(script, 'w', newline='\n') as outfile:
        outfile.write(header)

In [None]:
# Commands run previously by Elias to trim and align fastq

# Directory and file assignment for each file and program
# my_dir=/home/eoziolor/phpopg/data/128.120.88.242/raw_data
# fq1=$my_dir/AWPH*$num/*1.fq.gz
# fq2=$my_dir/AWPH*$num/*2.fq.gz
# my_bwa=/home/eoziolor/program/bwa-0.7.17/bwa
# my_sbl=/home/eoziolor/program/samblaster/samblaster
# my_sam=/home/eoziolor/program/samtools-1.9/samtools
# my_out=/home/eoziolor/phpopg/data/align/
# my_gen=/home/eoziolor/phpopg/data/genome/CAADHV01.fasta
# my_list=/home/eoziolor/phpopg/data/list/zeros_samples.tsv
# 
# #others
# pop=$(cat $my_list | grep $sample | cut -f 2)
# rg=$(echo \@RG\\tID:$sample\\tPL:Illumina\\tPU:x\\tLB:combined\\tSM:$sample.$pop)
# outroot=$sample\_$pop
# 
# #Code
# paste <(zcat $fq1 | paste - - - -) \
#       <(zcat $fq2 | paste - - - -) |\
# tr '\t' '\n' |\
# cutadapt -j 1 --interleaved -a CTGTCTCTTATA -A CTGTCTCTTATA -u 10 -U 10 -q 30 --trim-n --minimum-length 36 - |\
# $my_bwa mem $my_gen -p -R $rg -t 2 - |\
# $my_sam view -S -h -u - | \
# $my_sam sort -T $my_out/$outroot > $my_out/$outroot

In [3]:
# Generate script_trim_align_high_cov_ph.sh for the high-coverage sample
# PH-Sitka-93. Shell lines that start with '#' are kept in the generated
# script purely as a record of earlier steps; the commands the job actually
# runs are `module load samtools`, `samtools sort` and `samtools index`.
job_name = 'trim_align_high_cov_ph'
script = 'script_' + job_name + '.sh'

sbatch_header(job_name,'16','4','24')

script_body = [
    # stage and decompress the raw reads
    '# cp /home/eoziolor/phgenome/data/raw/PH-Sitka-93_S1_L008_R1_001.fastq.gz /home/jamcgirr/ph/data/hi_cov/ \n',
    '# cp /home/eoziolor/phgenome/data/raw/PH-Sitka-93_S1_L008_R2_001.fastq.gz /home/jamcgirr/ph/data/hi_cov/ \n',
    '#gzip -d /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001.fastq.gz \n',
    '#gzip -d /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001.fastq.gz \n\n',
    # adapter/quality trimming
    '# module load trim_galore \n',
    '# trim_galore -q 20 --paired --illumina /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001.fastq /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001.fastq \n\n',
    '# mv PH-Sitka-93_S1_L008_R1_001_val_1.fq /home/jamcgirr/ph/data/hi_cov \n',
    '# mv PH-Sitka-93_S1_L008_R2_001_val_2.fq /home/jamcgirr/ph/data/hi_cov \n',
    # alignment
    '# module load bwa\n',
    '# bwa mem -aM -p -t 4 -R "@RG\\tID:group1\\tSM:PH-Sitka-93_S1_L008\\tPL:illumina\\tLB:lib1" /home/eoziolor/phpopg/data/genome_chr/c.harengus.fa /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R1_001_val_1.fq /home/jamcgirr/ph/data/hi_cov/PH-Sitka-93_S1_L008_R2_001_val_2.fq > /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam \n',
    # SAM -> BAM, then sort and index
    'module load samtools\n',
    '# samtools view -Shu /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam > /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam\n',
    '# samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam\n',
    '# rm /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sam \n',
    'samtools sort /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.bam -o /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.bam\n',
    'samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.bam\n',
    # reminder of how to submit the script
    '\n\n#run: sbatch ' + script,
]

# Append the commands below the #SBATCH header written by sbatch_header().
with io.open(script,'a+', newline='\n') as o:
    o.writelines(script_body)

In [4]:
# Generate script_mark_dups.sh. The picard MarkDuplicates step is emitted as
# a shell comment (record only); the job itself loads samtools and indexes
# the de-duplicated BAM.
job_name = 'mark_dups'
script = 'script_' + job_name + '.sh'

sbatch_header(job_name,'16','4','24')

# Append the commands below the #SBATCH header written by sbatch_header().
with io.open(script,'a+', newline='\n') as o:
    o.write('#module load picardtools \n')
    # This line must end with a newline: without it the following
    # `module load samtools` is glued onto the commented-out picard line and
    # never executes, so `samtools index` runs without the module loaded.
    o.write('#picard-tools MarkDuplicates INPUT=/home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.bam OUTPUT=/home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.dedup.bam METRICS_FILE=/home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.dedup.bam.metrics.txt MAX_FILE_HANDLES=1000\n')
    o.write('module load samtools \n')
    o.write('samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.dedup.bam \n\n')

    # reminder of how to submit the script
    o.write('\n\n#run: sbatch '+script)

In [10]:
# Generate script_call_snps.sh for the high-coverage sample. The GATK and
# vcftools steps are written as shell comments (record only); the active
# commands load samtools/bcftools and run `samtools mpileup | bcftools call`.
job_name = 'call_snps'
script = 'script_' + job_name + '.sh'

sbatch_header(job_name,'16','4','144')

script_body = [
    # index the de-duplicated BAM
    '#module load samtools \n',
    '#samtools index /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.dedup.bam\n',
    # GATK environment
    '#module load R \n',
    '#module load maven \n',
    '#module load java \n',
    '#module load GATK/4.1.4.1 \n\n',
    # GATK calling and SNP selection
    '#gatk HaplotypeCaller --java-options "-Xmx14g -XX:ParallelGCThreads=4" -R /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -I /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.dedup.bam -O /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.raw.g.vcf --emit-ref-confidence GVCF \n',
    '#gatk SelectVariants --java-options "-Xmx14g -XX:ParallelGCThreads=4" -R /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -V /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.raw.g.vcf --select-type-to-include SNP -O /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.raw_snps.vcf \n\n',
    # vcftools SNP extraction and INFO export
    '#module load vcftools \n',
    '#vcftools --vcf /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.raw.g.vcf --remove-indels --recode --recode-INFO-all --out /home/jamcgirr/ph/data/hi_cov/raw_snps_PH_Sitka_93 \n',
    '#vcftools --vcf /home/jamcgirr/ph/data/hi_cov/raw_snps_PH_Sitka_93.recode.vcf --get-INFO MQ --get-INFO QUAL --get-INFO SOR --get-INFO DP --get-INFO FS --get-INFO QD --get-INFO MQRankSum --get-INFO ReadPosRankSum \n\n',
    # active step: pileup-based calling
    'module load samtools \n',
    'module load bcftools \n',
    'samtools mpileup -uf /home/jamcgirr/ph/data/c_harengus/c.harengus.fa /home/jamcgirr/ph/data/hi_cov/PH_Sitka_93.sort.dedup.bam | bcftools call --threads 4 -m -Ov -o /home/jamcgirr/ph/data/hi_cov/raw_variants_PH_Sitka_93.vcf \n',
    # reminder of how to submit the script
    '\n\n#run: sbatch ' + script,
]

# Append the commands below the #SBATCH header written by sbatch_header().
with io.open(script,'a+', newline='\n') as o:
    o.writelines(script_body)

In [11]:
# One picard CollectWgsMetrics submission script per BAM file-name prefix.
# Each generated script loops over /home/eoziolor/phpopg/data/align/<prefix>*.bam
# and writes one metrics file per BAM.
job_name = '_CollectWgsMetrics'

# prefixes "00" .. "09" plus "1"
infiles = ['0' + str(digit) for digit in range(10)] + ['1']

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'8','4','48', infile)

    script_body = ''.join([
        'module load picardtools\n',
        # shell loop over every BAM that matches this prefix
        'for file in /home/eoziolor/phpopg/data/align/' + infile + '*.bam\n',
        'do\n',
        'filename=$(basename $file .bam)\n',
        'picard-tools CollectWgsMetrics I=$file R=/home/jamcgirr/ph/data/c_harengus/c.harengus.fa O=/home/jamcgirr/ph/familiarize/elias_qc_stats/wgsMetrics/$filename.collect_wgs_metrics.txt\n',
        'done\n',
        # reminder of how to submit the script
        '\n\n#run: sbatch ' + script,
    ])

    # Append below the #SBATCH header written by sbatch_header_loop().
    with io.open(script,'a+', newline='\n') as o:
        o.write(script_body)


In [4]:
# One GenomicsDBImport submission script per chromosome (chr1 .. chr26).
# Each generated script builds the workspace on /scratch, copies it to the
# home directory, removes the scratch copies and finally submits the two
# per-half genotyping scripts for that chromosome.
job_name = '_make_db'

# To regenerate only some chromosomes, slice this list (e.g. infiles[1:]).
infiles = ['chr' + str(number) for number in range(1, 27)]

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'16','4','72', infile)

    script_body = ''.join([
        '# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)\n\n',
        # shell variables used by the commands below
        'scratch_tmp=/scratch/jamcgirr/combine_gvcfs_' + infile + '_tmp \n',
        'scratch_cp_to_home=/scratch/jamcgirr/combine_gvcfs_' + infile + ' \n',
        'home_cp=/home/jamcgirr/ph/data/combine_gvcfs/' + infile + ' \n\n',
        # GATK environment
        'module load R \n',
        'module load maven \n',
        'module load java \n',
        'module load GATK/4.1.4.1 \n\n',
        'mkdir -vp $scratch_tmp \n\n',
        'gatk --java-options "-Xmx12g -Xms8g" GenomicsDBImport --genomicsdb-workspace-path $scratch_cp_to_home --batch-size 100 -L ' + infile + '_interval.list --sample-name-map /home/jamcgirr/ph/scripts/fastq_to_vcf/combine_gvcfs/ph.gvcfs_map --tmp-dir=$scratch_tmp --reader-threads 4 \n\n',
        'cp -r $scratch_cp_to_home/* $home_cp \n',
        # NOTE(review): the generated script has no `set -e`, so the scratch
        # directories are removed even if gatk or cp failed - confirm intended.
        '##rm SCRATCH ONLY!! \n',
        'rm -r $scratch_tmp \n',
        'rm -r $scratch_cp_to_home \n\n',
        # chain the genotyping jobs for both halves of this chromosome
        'sbatch script_' + infile + '_1_genotypegvcf.sh \n',
        'sbatch script_' + infile + '_2_genotypegvcf.sh \n',
        # reminder of how to submit the script
        '\n\n#run: sbatch ' + script,
    ])

    # Append below the #SBATCH header written by sbatch_header_loop().
    with io.open(script,'a+', newline='\n') as o:
        o.write(script_body)


In [4]:
# GATK GenotypeGVCFs submission scripts, one per chromosome half, written in
# two passes over the same intervals:
#   '_genotypegvcf'           -> raw_variants_<half>.vcf            (16G)
#   '_genotypegvcf_allsites'  -> raw_variants_allsites_<half>.vcf   (60G, adds -all-sites)

# chr1_1 .. chr26_1 followed by chr1_2 .. chr26_2
infiles = ['chr' + str(number) + '_' + half for half in ('1', '2') for number in range(1, 27)]

# Interval (start-end) passed to -L for every chromosome half.
# NOTE(review): every '_2' interval starts on the base where the matching
# '_1' interval ends (e.g. chr1: 1-16542129 and 16542129-33084258). If -L
# ranges are inclusive, that base is genotyped in both halves - confirm.
chr_dict = {
    "chr1_1":"1-16542129", "chr2_1":"1-16505160", "chr3_1":"1-16263781", "chr4_1":"1-16133824",
    "chr5_1":"1-15793431", "chr6_1":"1-15730777", "chr7_1":"1-15495311", "chr8_1":"1-15364778",
    "chr9_1":"1-15238691", "chr10_1":"1-15113866", "chr11_1":"1-15048164", "chr12_1":"1-15011240",
    "chr13_1":"1-14922870", "chr14_1":"1-14666386", "chr15_1":"1-14356761", "chr16_1":"1-13886911",
    "chr17_1":"1-13784255", "chr18_1":"1-13623647", "chr19_1":"1-13565322", "chr20_1":"1-13347081",
    "chr21_1":"1-13232991", "chr22_1":"1-12832026", "chr23_1":"1-12646449", "chr24_1":"1-10045549",
    "chr25_1":"1-7462096", "chr26_1":"1-6221605",
    "chr1_2":"16542129-33084258", "chr2_2":"16505160-33010319", "chr3_2":"16263781-32527562",
    "chr4_2":"16133824-32267647", "chr5_2":"15793431-31586861", "chr6_2":"15730777-31461554",
    "chr7_2":"15495311-30990621", "chr8_2":"15364778-30729556", "chr9_2":"15238691-30477381",
    "chr10_2":"15113866-30227731", "chr11_2":"15048164-30096327", "chr12_2":"15011240-30022480",
    "chr13_2":"14922870-29845739", "chr14_2":"14666386-29332771", "chr15_2":"14356761-28713521",
    "chr16_2":"13886911-27773822", "chr17_2":"13784255-27568510", "chr18_2":"13623647-27247294",
    "chr19_2":"13565322-27130643", "chr20_2":"13347081-26694162", "chr21_2":"13232991-26465981",
    "chr22_2":"12832026-25664052", "chr23_2":"12646449-25292897", "chr24_2":"10045549-20091098",
    "chr25_2":"7462096-14924191", "chr26_2":"6221605-12443209",
}

# (job suffix, memory in G, text placed between the interval and '<half>.vcf')
genotype_passes = [
    ('_genotypegvcf', '16', ' -O raw_variants_'),
    ('_genotypegvcf_allsites', '60', ' -all-sites -O raw_variants_allsites_'),
]

for job_name, mem_gb, output_args in genotype_passes:
    for infile in infiles:
        chr_name = infile.split("_")[0]  # 'chr7_2' -> 'chr7', the gendb workspace name
        script = 'script_' + infile + job_name + '.sh'
        sbatch_header_loop(job_name, mem_gb, '4', '144', infile)

        gatk_command = (
            'gatk GenotypeGVCFs -R /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -V gendb://' + chr_name
            + ' -L ' + chr_name + ':' + chr_dict[infile]
            + output_args + infile + '.vcf \n'
        )

        script_body = ''.join([
            '# This must be run in directory containing final DB (gendb://) (/home/jamcgirr/ph/data/combine_gvcfs)\n\n',
            'module load R \n',
            'module load maven \n',
            'module load java \n',
            'module load GATK/4.1.4.1 \n\n',
            gatk_command,
            # reminder of how to submit the script
            '\n\n#run: sbatch ' + script,
        ])

        # Append below the #SBATCH header written by sbatch_header_loop().
        with io.open(script,'a+', newline='\n') as o:
            o.write(script_body)

In [6]:
# DEPRECATED - do not use; superseded by the bcftools-based '_filter_gvcf'
# cell, which writes the same script_<chr>_filter_gvcf.sh file names.
# Original vcftools-based SNP filtering, one submission script per chromosome
# covering both halves (<chr>_1 and <chr>_2).
# NOTE(review): an earlier note here said the MAF filter had been removed,
# but the vcftools commands below still pass `--maf 0.05`.
job_name = '_filter_gvcf'
infiles = ['chr' + str(number) for number in range(1, 27)]

gvcf_dir = '/home/jamcgirr/ph/data/combine_gvcfs/'
reference_fa = '/home/jamcgirr/ph/data/c_harengus/c.harengus.fa'
vcftools_filters = '--remove-indels --maf 0.05 --min-alleles 2 --max-alleles 2 --minQ 20 --minDP 600 --maxDP 2000 --recode --recode-INFO-all'

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'8','4','24', infile)

    halves = (infile + '_1', infile + '_2')

    script_body = [
        'module load R \n',
        'module load maven \n',
        'module load java \n',
        'module load GATK/4.1.4.1 \n\n',
    ]

    # SNP selection (record only) followed by the vcftools site filters
    for part in halves:
        script_body.append('#gatk SelectVariants -R ' + reference_fa + ' -V ' + gvcf_dir + 'raw_variants_' + part + '.vcf --select-type-to-include SNP -O ' + gvcf_dir + 'raw_snps_' + part + '.vcf \n\n')
        if part == halves[0]:
            # vcftools is loaded once, right before its first use
            script_body.append('module load vcftools \n')
        script_body.append('vcftools --vcf ' + gvcf_dir + 'raw_snps_' + part + '.vcf ' + vcftools_filters + ' --out ' + gvcf_dir + 'filtered_snps_' + part + '_delete \n\n')

    # mapping-quality filter on the vcftools output, written bgzipped
    script_body.append('module load samtools \n')
    script_body.append('module load bcftools \n')
    for part in halves:
        script_body.append("bcftools filter -Oz -i 'MQ>30' " + gvcf_dir + 'filtered_snps_' + part + '_delete.recode.vcf -o ' + gvcf_dir + 'filtered_snps_' + part + '.recode.vcf.gz \n')

    # index the final files
    for part in halves:
        script_body.append('bcftools index ' + gvcf_dir + 'filtered_snps_' + part + '.recode.vcf.gz \n')
    script_body.append('\n')

    # clean-up of the intermediate '_delete' files stays commented out
    for part in halves:
        script_body.append('#rm ' + gvcf_dir + 'filtered_snps_' + part + '_delete.recode.vcf \n')

    # reminder of how to submit the script
    script_body.append('\n\n#run: sbatch ' + script)

    # Append below the #SBATCH header written by sbatch_header_loop().
    with io.open(script,'a+', newline='\n') as o:
        o.writelines(script_body)

    #EST time ~ 4 hr

In [9]:
# Revisited in 2021: repeating the original vcftools filtering steps removed
# many more SNPs than before, for reasons that were not identified. This
# version filters with bcftools instead, in case the vcftools behaviour was
# the cause. No minor-allele-frequency cutoff is applied here.
# One submission script per chromosome, covering both halves.
job_name = '_filter_gvcf'
infiles = ['chr' + str(number) for number in range(1, 27)]

gvcf_dir = '/home/jamcgirr/ph/data/combine_gvcfs/'

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'8','4','24', infile)

    halves = (infile + '_1', infile + '_2')

    script_body = ['module load samtools \n', 'module load bcftools \n\n']

    # subset samples + biallelic SNPs -> refresh tags -> MQ/QUAL/depth filter
    for part in halves:
        script_body.append(
            'bcftools view -S ' + gvcf_dir + 'plates_1_through_5_rm.txt -m2 -M2 -v snps ' + gvcf_dir + 'raw_variants_' + part + '.vcf'
            + " | bcftools +fill-tags -- -t all,'DP=sum(DP)'"
            + " | bcftools filter -Oz -i 'MQ>30 && QUAL>20 && INFO/DP>600 && INFO/DP<2000' -o " + gvcf_dir + 'filtered_snps_' + part + '.recode.vcf.gz \n'
        )
    script_body.append('\n')

    # index both filtered files
    for part in halves:
        script_body.append('bcftools index ' + gvcf_dir + 'filtered_snps_' + part + '.recode.vcf.gz \n')
    script_body.append('\n')

    # reminder of how to submit the script
    script_body.append('\n\n#run: sbatch ' + script)

    # Append below the #SBATCH header written by sbatch_header_loop().
    with io.open(script,'a+', newline='\n') as o:
        o.writelines(script_body)

    #EST time ~ 1.5hr

In [10]:
# Generate script_concat_snp_vcfs.sh: concatenate the 52 per-half filtered
# SNP files (chr1_1, chr1_2, chr2_1, ... chr26_2) into one BCF.
job_name = 'concat_snp_vcfs'
script = 'script_' + job_name + '.sh'

sbatch_header(job_name,'16','8','144')

gvcf_dir = '/home/jamcgirr/ph/data/combine_gvcfs/'

# Build the input list instead of spelling out 52 paths in one literal: a
# single-quoted string cannot span physical lines, and a generated list keeps
# the genomic order (chromosome 1..26, first half before second) explicit.
concat_inputs = [
    gvcf_dir + 'filtered_snps_chr' + str(number) + '_' + half + '.recode.vcf.gz'
    for number in range(1, 27)
    for half in ('1', '2')
]

# Append the commands below the #SBATCH header written by sbatch_header().
with io.open(script,'a+', newline='\n') as o:
    o.write('module load samtools \n')
    o.write('module load bcftools \n')
    o.write('bcftools concat -o ' + gvcf_dir + 'merged_filtered_snps.bcf -O b --threads 8 ' + ' '.join(concat_inputs) + ' \n')

    # reminder of how to submit the script
    o.write('\n\n#run: sbatch '+script)

# EST time ~6hr

In [3]:
#job_name = '_subset_snps_by_pop'

#infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]

#for infile in infiles:
#    script = 'script_' + infile + job_name + '.sh'
#    sbatch_header_loop(job_name,'8','4','24', infile)
#    o = io.open(script,'a+', newline='\n')
#        
#        
#    o.write('module load samtools \n')
#    o.write('module load bcftools \n')
#
#    o.write('bcftools view -S /home/jamcgirr/ph/scripts/angsd/SFS/SFS_by_pop/'+infile+'_plates_1_through_5_rm.txt -O z /home/jamcgirr/ph/data/combine_gvcfs/merged_filtered_snps.bcf > /home/jamcgirr/ph/data/combine_gvcfs/population_'+infile+'_snps.vcf.gz \n')
#   
#    o.write('module load vcftools \n')
#    #o.write('vcftools --gzvcf /home/jamcgirr/ph/data/combine_gvcfs/population_'+infile+'_snps.vcf.gz --minDP 25 --max-missing 0.5 --recode-INFO-all --recode --stdout | bgzip -c > /home/jamcgirr/ph/data/combine_gvcfs/population_'+infile+'_filtered_snps.vcf.gz \n')
#    # min DP of 25 across 14 population:year groups = min DP of 350 
#    # (minDP 350, maxDP 2000, maf 5%, minQ30 across all samples and 50% genotyping rate across each population:year group)
#
#    o.write('vcftools --gzvcf /home/jamcgirr/ph/data/combine_gvcfs/population_'+infile+'_snps.vcf.gz --max-missing 0.5 --recode-INFO-all --recode --stdout | bgzip -c > /home/jamcgirr/ph/data/combine_gvcfs/population_'+infile+'_filtered_snps.vcf.gz \n')
#    # (minDP 600, maxDP 2000, maf 5%, minQ20 across all samples and 80% genotyping rate across each population:year group)
#
#    o.write('bcftools index /home/jamcgirr/ph/data/combine_gvcfs/population_'+infile+'_filtered_snps.vcf.gz \n\n')
#
#    o.write('\n\n#command to run: sbatch '+script)
#    o.close()

# EST time ~2hr

In [4]:
# Revisited in 2021: repeating the original vcftools filtering steps removed
# many more SNPs than before, for reasons that were not identified. This
# version filters with bcftools instead, in case the vcftools behaviour was
# the cause. No minor-allele-frequency cutoff is applied here.
# One submission script per population:year group; each keeps only sites
# where NS exceeds ceil(0.5 * group size).
job_name = '_subset_snps_by_pop'

# group label -> number of individuals in that group
pop_n = {"BC17":"64","CA17":"70","PWS07":"46","PWS17":"56","PWS91":"58","PWS96":"72","SS06":"41","SS17":"64","SS96":"78","TB06":"52","TB17":"72","TB91":"74","TB96":"73","WA17":"72"}
infiles = sorted(pop_n)  # BC17, CA17, PWS07, ... WA17

gvcf_dir = '/home/jamcgirr/ph/data/combine_gvcfs/'

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'8','4','24', infile)

    # half of the group's individuals, rounded up; used as the NS threshold
    inds_with_data = math.ceil(int(pop_n[infile]) * 0.5)
    population_vcf = gvcf_dir + 'population_' + infile + '_filtered_snps.vcf.gz'

    script_body = ''.join([
        'module load bcftools \n',
        # subset to this group's samples -> refresh tags -> NS filter
        'bcftools view -S /home/jamcgirr/ph/scripts/angsd/SFS/SFS_by_pop/' + infile + '_plates_1_through_5_rm.txt ' + gvcf_dir + 'merged_filtered_snps.bcf'
        + " | bcftools +fill-tags -- -t all,'DP=sum(DP)'"
        + " | bcftools filter -Oz -i 'NS>" + str(inds_with_data) + " ' > " + population_vcf + ' \n',
        'bcftools index ' + population_vcf + ' \n\n',
        # reminder of how to submit the script
        '\n\n#command to run: sbatch ' + script,
    ])

    # Append below the #SBATCH header written by sbatch_header_loop().
    with io.open(script,'a+', newline='\n') as o:
        o.write(script_body)

# EST time ~1hr

In [25]:
# Intersect the per-population filtered SNP files instead of merging them:
# `bcftools isec -n=<number of files>` keeps only sites present in every file.
# Background (2021 revisit): repeating the original vcftools filtering removed
# many more SNPs than before, so filtering was redone with bcftools and
# without a minor-allele-frequency cutoff.
job_name = 'isec'
script = 'script_' + job_name + '.sh'

sbatch_header(job_name,'16','16','144')

populations = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]
population_vcfs = [
    '/home/jamcgirr/ph/data/combine_gvcfs/population_' + population + '_filtered_snps.vcf.gz'
    for population in populations
]

script_body = ''.join([
    'module load bcftools \n',
    # -n equals the number of inputs (14), i.e. sites shared by all groups
    'bcftools isec -n=' + str(len(population_vcfs)) + ' -p isec_test --threads 16 ' + ' '.join(population_vcfs) + ' \n\n',
    # reminder of how to submit the script
    '\n\n#run: sbatch ' + script,
])

# Append the commands below the #SBATCH header written by sbatch_header().
with io.open(script,'a+', newline='\n') as o:
    o.write(script_body)

In [3]:
# Generate script_final_subset_maf_0.01.sh: apply the allele-frequency filter
# to the intersected call set. The `bcftools view -R` subsetting step and its
# index are kept in the generated script as shell comments (record only).
# TODO: change the job name to reflect that the maf filter is still downstream.
job_name = 'final_subset_maf_0.01'
script = 'script_' + job_name + '.sh'

sbatch_header(job_name,'16','16','144')

gvcf_dir = '/home/jamcgirr/ph/data/combine_gvcfs/'
# output of the (commented-out) view -R step; input of the filter step
subset_vcf = gvcf_dir + 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5.vcf.gz'
maf_vcf = gvcf_dir + 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.01.vcf.gz'

# Append the commands below the #SBATCH header written by sbatch_header().
with io.open(script,'a+', newline='\n') as o:
    # bcftools must be loaded for the active filter command below, so this
    # line is emitted uncommented.
    o.write('module load bcftools \n')
    o.write('##bcftools index ' + gvcf_dir + 'merged_filtered_snps.bcf \n')
    o.write('#bcftools view -R ' + gvcf_dir + 'isec_test/sites.txt ' + gvcf_dir + 'merged_filtered_snps.bcf -Oz > ' + subset_vcf + ' \n')
    o.write('#bcftools index ' + subset_vcf + '  \n')
    # The filter needs an explicit input file; without one it has nothing to
    # read inside a batch job. The input is assumed to be the NS0.5 file made
    # by the view -R step above (the output name is that name + '_maf0.01').
    # NOTE(review): 'INFO/AF>0.01' tests the ALT allele frequency only; it is
    # not a two-sided minor-allele filter - confirm this is intended.
    o.write("bcftools filter -Oz -i 'INFO/AF>0.01' -o " + maf_vcf + ' ' + subset_vcf + ' \n')

    # reminder of how to submit the script
    o.write('\n\n#run: sbatch '+script)
# 5 hrs (for view)

In [7]:
# Write script_merge_filtered_pops.sh: merge the per-population filtered SNP
# VCFs into one VCF and count its lines.
# use intersect instead of merge, see above
job_name = 'merge_filtered_pops'

# Earlier filter settings that were tried (kept for reference):
#vcf_name = 'ph_filtered_snps_minDP350_maxDP2000_maf0.05_minQ30_maxmiss0.5' # used a dumb minDP method
#vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_maxmiss0.5' # 3 million
#vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ30_maxmiss0.5' # very few
#vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ30_minMQ30_maxmiss0.5' # very few
#vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ30_minMQ30_maxmiss0.8' # very few
#vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.8'

# using bcftools filter (no maf cutoff)
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_maxmiss0.8_outliers_rm' #1384195 SNPs

# Per-population input VCFs, in the order they are passed to `bcftools merge`.
merge_pops = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]
merge_inputs = ['/home/jamcgirr/ph/data/combine_gvcfs/population_' + pop + '_filtered_snps.vcf.gz' for pop in merge_pops]

# Output prefix shared by the merged VCF and the (disabled) plink conversion.
merged_prefix = '/home/jamcgirr/ph/data/vcfs/bcftools_filter/' + vcf_name

sbatch_header(job_name,'16','16','144')
script = 'script_' + job_name + '.sh'

# Lines starting with '#' are written to the script as disabled commands.
merge_lines = [
    'module load samtools \n',
    'module load bcftools \n',
    'bcftools merge -O z --threads 16 -o ' + merged_prefix + '.vcf.gz ' + ' '.join(merge_inputs) + ' \n\n',
    'bcftools view ' + merged_prefix + '.vcf.gz | wc -l > lines_' + vcf_name + '.txt \n',
    '#module load vcftools \n',
    '#vcftools --vcf ' + merged_prefix + '.vcf.gz --BEAGLE-PL --chr chr1 --out /home/jamcgirr/ph/data/vcfs/bcftools_filter/chr1_' + vcf_name + ' \n',
    '#vcftools --vcf ' + merged_prefix + '.vcf.gz --plink --out ' + merged_prefix + ' \n',
    '#module load plink \n',
    '#plink --file ' + merged_prefix + ' --pca --out /home/jamcgirr/ph/data/plink/bcftools_filter/' + vcf_name + '\n',
    '\n\n#run: sbatch ' + script,
]

# Append the commands after the header that sbatch_header() just wrote.
o = io.open(script,'a+', newline='\n')
o.write(''.join(merge_lines))
o.close()

In [13]:
# Write script_convert_vcf3.sh: convert a filtered VCF to BEAGLE-PL (chr1) and
# plink formats, run a plink PCA, and count the lines of the VCF.

# Coming back to the original vcf filtering step in 2021:
# I used the same steps to filter, but many more SNPs were filtered out for
# some reason. The following uses bcftools to filter, in case the problem is
# caused by vcftools being deprecated.
# NO minor allele freq cutoff

job_name = 'convert_vcf3'

# Alternative inputs (the active one is the last assignment):
#vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5' # 8732577
#vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.01' # 1107746
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05' # 204153

convert_dir = '/home/jamcgirr/ph/data/vcfs/bcftools_filter/'
convert_prefix = convert_dir + vcf_name   # input VCF stem and plink output prefix

sbatch_header(job_name,'8','4','24')
script = 'script_' + job_name + '.sh'

convert_lines = [
    'module load vcftools \n',
    'vcftools --gzvcf ' + convert_prefix + '.vcf.gz --BEAGLE-PL --chr chr1 --out ' + convert_dir + 'chr1_' + vcf_name + ' \n',
    'vcftools --gzvcf ' + convert_prefix + '.vcf.gz --plink --out ' + convert_prefix + ' \n\n',
    'module load plink \n',
    'plink --file ' + convert_prefix + ' --pca --out /home/jamcgirr/ph/data/plink/bcftools_filter/' + vcf_name + '\n\n',
    'module load bcftools \n',
    'bcftools view ' + convert_prefix + '.vcf.gz | wc -l > ' + convert_dir + 'lines_' + vcf_name + '\n',
    '\n\n#run: sbatch ' + script,
]

# Append the commands after the header that sbatch_header() just wrote.
o = io.open(script,'a+', newline='\n')
for convert_line in convert_lines:
    o.write(convert_line)
o.close()

In [5]:
# Write one script_<population>_mpileup.sh per entry in `infiles`; each job
# runs `bcftools mpileup` on that population's bam list and then
# `bcftools call` on the resulting bcf.
job_name = '_mpileup'

#https://speciationgenomics.github.io/variant_calling/

# use chr intervals to create one large vcf
# infiles = ["chr1_1","chr2_1","chr3_1","chr4_1","chr5_1","chr6_1","chr7_1","chr8_1","chr9_1","chr10_1","chr11_1","chr12_1","chr13_1","chr14_1","chr15_1","chr16_1","chr17_1","chr18_1","chr19_1","chr20_1","chr21_1","chr22_1","chr23_1","chr24_1","chr25_1","chr26_1","chr1_2","chr2_2","chr3_2","chr4_2","chr5_2","chr6_2","chr7_2","chr8_2","chr9_2","chr10_2","chr11_2","chr12_2","chr13_2","chr14_2","chr15_2","chr16_2","chr17_2","chr18_2","chr19_2","chr20_2","chr21_2","chr22_2","chr23_2","chr24_2","chr25_2","chr26_2"]
# chr_dict = {"chr1_1":"1-16542129","chr2_1":"1-16505160","chr3_1":"1-16263781","chr4_1":"1-16133824","chr5_1":"1-15793431","chr6_1":"1-15730777","chr7_1":"1-15495311","chr8_1":"1-15364778","chr9_1":"1-15238691","chr10_1":"1-15113866","chr11_1":"1-15048164","chr12_1":"1-15011240","chr13_1":"1-14922870","chr14_1":"1-14666386","chr15_1":"1-14356761","chr16_1":"1-13886911","chr17_1":"1-13784255","chr18_1":"1-13623647","chr19_1":"1-13565322","chr20_1":"1-13347081","chr21_1":"1-13232991","chr22_1":"1-12832026","chr23_1":"1-12646449","chr24_1":"1-10045549","chr25_1":"1-7462096","chr26_1":"1-6221605","chr1_2":"16542129-33084258","chr2_2":"16505160-33010319","chr3_2":"16263781-32527562","chr4_2":"16133824-32267647","chr5_2":"15793431-31586861","chr6_2":"15730777-31461554","chr7_2":"15495311-30990621","chr8_2":"15364778-30729556","chr9_2":"15238691-30477381","chr10_2":"15113866-30227731","chr11_2":"15048164-30096327","chr12_2":"15011240-30022480","chr13_2":"14922870-29845739","chr14_2":"14666386-29332771","chr15_2":"14356761-28713521","chr16_2":"13886911-27773822","chr17_2":"13784255-27568510","chr18_2":"13623647-27247294","chr19_2":"13565322-27130643","chr20_2":"13347081-26694162","chr21_2":"13232991-26465981","chr22_2":"12832026-25664052","chr23_2":"12646449-25292897","chr24_2":"10045549-20091098","chr25_2":"7462096-14924191","chr26_2":"6221605-12443209"}

# alternatively, just make one vcf for each population
# mpileup is really just converting bam to vcf
# after calling and pixy stats, use strict vcftools filters
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]
# The full population list above is immediately overridden: only BC17 is generated.
infiles = ["BC17"]

for infile in infiles:
    # Only needed by the per-chromosome-interval variant kept in comments below.
    chr_name = infile.split("_")[0]

    sbatch_header_loop(job_name,'8','24','144', infile)
    script = 'script_' + infile + job_name + '.sh'

    # Inputs/outputs for this population.
    bam_list = '/home/jamcgirr/ph/data/angsd/SFS/bamlist_test/' + infile + '_bams_p_1_5_rm.txt'
    pileup_bcf = '/home/jamcgirr/ph/data/mpileup/allsites_' + infile + '.bcf'
    calls_vcf = '/home/jamcgirr/ph/data/mpileup/allsites_calls_' + infile + '.vcf.gz'

    # get likelihoods (PL format) and call snps
    # o.write('#bcftools mpileup -f /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -b /home/jamcgirr/ph/data/angsd/SFS/bamlist_test/all_bams_p_1_5_rm.txt -I -r chr1:1-100000 --threads 24 | bcftools call -m -Ov -f GQ,GP --threads 24 -o delete2_all_sites_chr1.vcf
    # just get likelihoods (PL format) and call snps separately
    # bcf for angsd, vcf.gz for pixy
    # o.write('bcftools mpileup -f /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -b /home/jamcgirr/ph/data/angsd/SFS/bamlist_test/all_bams_p_1_5_rm.txt -I -r '+chr_name+':'+chr_dict[infile]+' --threads 24 -Ob -o allsites_'+infile+'.bcf \n')
    # o.write('bcftools call -m -Oz -f GQ --threads 24 -o allsites_calls_'+infile+'.vcf.gz allsites_'+infile+'.bcf \n')

    # use populations
    mpileup_lines = [
        'module load bcftools \n',
        'module load samtools \n',
        'bcftools mpileup -f /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -b ' + bam_list + ' -I -a AD,DP,SP --threads 24 -Ob -o ' + pileup_bcf + ' \n',
        'bcftools call -m -Oz -f GQ,GP --threads 24 -o ' + calls_vcf + ' ' + pileup_bcf + ' \n',
        # see also script_mpileup_test_allsites_chr1.sh
        '\n\n#run: sbatch ' + script,
    ]

    # Append the commands after the header that sbatch_header_loop() just wrote.
    o = io.open(script,'a+', newline='\n')
    for mpileup_line in mpileup_lines:
        o.write(mpileup_line)
    o.close()
    
#o.write('#bcftools reheader -s samples.txt all_'+infile+'.vcf.gz > allsites_'+infile+'.vcf.gz  \n')
#o.write('#rm all_'+infile+'.bcf \n')
    
    # data carpentry / TA workshops
    # noah
    # elias
    # lisa
    # jeff miller minnesota data carpentry email
    # titus brown introduction
    # picasso
    # data intensive listserv