In [2]:
import io
import pandas as pd
import numpy as np

def sbatch_header(job,mem,tasks,hours):
    """Create ``script_<job>.sh`` in the current directory containing a SLURM header.

    Any existing file of that name is overwritten. Callers are expected to
    re-open the file in append mode to add the actual commands.

    Parameters
    ----------
    job : str
        Job name; used for ``--job-name``, the stderr file name and the
        script file name.
    mem : str or int
        Memory request; written as ``--mem=<mem>G``.
    tasks : str or int
        Written as ``--ntasks=<tasks>``.
    hours : str or int
        Wall-time in whole hours; written as ``--time=<hours>:00:00``.

    Returns
    -------
    None
    """
    job = str(job)
    # Build the complete header before touching the filesystem so that a bad
    # argument can never leave a half-written script (or an open handle) behind.
    header = ''.join([
        '#!/bin/bash\n\n#SBATCH --job-name=' + job + '\n',
        '#SBATCH --mem=' + str(mem) + 'G \n',
        '#SBATCH --ntasks=' + str(tasks) + ' \n',
        '#SBATCH -e ' + job + '_%A_%a.err \n',
        '#SBATCH --time=' + str(hours) + ':00:00  \n',
        '#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n',
        '#SBATCH -p high \n\n',
    ])
    # newline='\n' keeps Unix line endings even when the notebook runs on Windows.
    with io.open('script_' + job + '.sh', 'w', newline='\n') as outfile:
        outfile.write(header)
    
def sbatch_header_loop(job,mem,tasks,hours,infile):
    """Create ``script_<infile><job>.sh`` containing a SLURM header.

    Variant of the plain header writer for per-input scripts: the job name
    (and therefore the stderr file and script file names) is ``infile + job``.
    Any existing file of that name is overwritten.

    Parameters
    ----------
    job : str
        Job-name suffix.
    mem : str or int
        Memory request; written as ``--mem=<mem>G``.
    tasks : str or int
        Written as ``--ntasks=<tasks>``.
    hours : str or int
        Wall-time in whole hours; written as ``--time=<hours>:00:00``.
    infile : str
        Job-name prefix identifying the input this script is for.

    Returns
    -------
    None
    """
    jobname = str(infile) + str(job)
    # Build the complete header before opening the file so that a bad argument
    # can never leave a half-written script (or an open handle) behind.
    header = ''.join([
        '#!/bin/bash\n\n#SBATCH --job-name=' + jobname + '\n',
        '#SBATCH --mem=' + str(mem) + 'G \n',
        '#SBATCH --ntasks=' + str(tasks) + ' \n',
        '#SBATCH -e ' + jobname + '_%A_%a.err \n',
        '#SBATCH --time=' + str(hours) + ':00:00 \n',
        '#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n',
        '#SBATCH -p high \n\n',
    ])
    # newline='\n' keeps Unix line endings even when the notebook runs on Windows.
    with io.open('script_' + jobname + '.sh', 'w', newline='\n') as outfile:
        outfile.write(header)

In [None]:
# Fst persite PWS96 + PWS07 + PWS17 vs. PWS 91
# Fst persite PWS96 + PWS07 + PWS17 vs. NEPac (SS96,SS06,SS17,BC17,WA17)
# Fst persite PWS17 vs. NEPac (SS17,BC17,WA17)

job_name = 'fst_pops_PWS96_07_17_v_PWS91'
# script_dir is not referenced anywhere in the visible notebook; kept so that
# any other cell relying on it keeps working.
script_dir = '/home/jamcgirr/ph/scripts/angsd/SFS/fst_fixed_PWS/'
fst_dir = '/home/jamcgirr/ph/data/angsd/SFS/fst_fixed_PWS/'

# sed commands that turn a list of BAM paths into a list of sample names.
# Raw strings keep the backslashes for sed without relying on Python's
# invalid-escape fallback ('\/'), and the dot is escaped so only a literal
# ".bam" is removed.
sed_strip_bam_dir = r"sed -i 's/\/home\/eoziolor\/phpopg\/data\/align\///g' "
sed_strip_bam_ext = r"sed -i 's/\.bam//g' "

# Write the SLURM header, then append the pipeline commands to the same file.
sbatch_header(job_name,'16','4','144')
script = 'script_' + job_name + '.sh'
# The context manager closes the script even if one of the writes raises.
with io.open(script,'a', newline='\n') as o:

    o.write('module load samtools \n')
    o.write('module load bcftools \n')
    o.write('cat /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS96_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS07_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS17_bams_p_1_5_rm.txt > '+fst_dir+'pops_PWS96_07_17.txt \n')
    o.write(sed_strip_bam_dir+fst_dir+'pops_PWS96_07_17.txt \n')
    o.write(sed_strip_bam_ext+fst_dir+'pops_PWS96_07_17.txt \n\n')

    o.write('cp /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS91_bams_p_1_5_rm.txt '+fst_dir+'pops_PWS91.txt \n')
    o.write(sed_strip_bam_dir+fst_dir+'pops_PWS91.txt \n')
    o.write(sed_strip_bam_ext+fst_dir+'pops_PWS91.txt \n\n')

    #subset pop vcfs
    o.write('bcftools view -S '+fst_dir+'pops_PWS96_07_17.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_PWS96_07_17.vcf \n')
    o.write('bcftools view -S '+fst_dir+'pops_PWS91.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_PWS91.vcf \n\n')

    # make saf from vcfs
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_PWS96_07_17.vcf -out '+fst_dir+'pops_PWS96_07_17 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_PWS91.vcf -out '+fst_dir+'pops_PWS91 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n\n')
    # Each rm must end with a newline: without it the two rm commands and the
    # following realSFS command are fused into a single broken shell line.
    o.write('rm '+fst_dir+'pops_PWS96_07_17.vcf \n')
    o.write('rm '+fst_dir+'pops_PWS91.vcf \n\n')

    # make folded sfs
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS96_07_17.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_PWS96_07_17.sfs \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS91.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_PWS91.sfs \n\n')

    # make 2dsfs
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS96_07_17.saf.idx '+fst_dir+'pops_PWS91.saf.idx -fold 1 -P 4 > '+fst_dir+'pops_PWS96_07_17_v_PWS91.sfs \n\n')

    # estimate fst
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst index '+fst_dir+'pops_PWS96_07_17.saf.idx '+fst_dir+'pops_PWS91.saf.idx -sfs '+fst_dir+'pops_PWS96_07_17_v_PWS91.sfs -fold 1 -fstout '+fst_dir+'pops_PWS96_07_17_v_PWS91 \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst stats2 '+fst_dir+'pops_PWS96_07_17_v_PWS91.fst.idx -win 50000 -step 10000 > '+fst_dir+'fst_50kb_win_10kb_step_folded_pops_PWS96_07_17_v_PWS91.txt \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst print '+fst_dir+'pops_PWS96_07_17_v_PWS91.fst.idx > '+fst_dir+'fst_persite_folded_pops_PWS96_07_17_v_PWS91.txt')

    # Trailing reminder of how to submit the generated script.
    o.write('\n\n#run: sbatch '+script)
    
    
    
###################################################################
job_name = 'fst_pops_PWS96_07_17_v_NEPac'

# sed commands that turn a list of BAM paths into a list of sample names.
# Raw strings keep the backslashes for sed without relying on Python's
# invalid-escape fallback ('\/'), and the dot is escaped so only a literal
# ".bam" is removed.
sed_strip_bam_dir = r"sed -i 's/\/home\/eoziolor\/phpopg\/data\/align\///g' "
sed_strip_bam_ext = r"sed -i 's/\.bam//g' "

# NOTE(review): every pops_PWS96_07_17 preparation step below is emitted as a
# shell comment ('#...'), so this job presumably relies on
# pops_PWS96_07_17.saf.idx already existing in fst_dir (e.g. produced by the
# fst_pops_PWS96_07_17_v_PWS91 job) - confirm before submitting on its own.
sbatch_header(job_name,'16','4','144')
script = 'script_' + job_name + '.sh'
# The context manager closes the script even if one of the writes raises.
with io.open(script,'a', newline='\n') as o:

    o.write('module load samtools \n')
    o.write('module load bcftools \n')
    o.write('cat /home/jamcgirr/ph/data/angsd/SFS/thetas/SS96_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/SS06_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/SS17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/BC17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/WA17_bams_p_1_5_rm.txt > '+fst_dir+'pops_NEPac.txt \n')
    o.write(sed_strip_bam_dir+fst_dir+'pops_NEPac.txt \n')
    o.write(sed_strip_bam_ext+fst_dir+'pops_NEPac.txt \n\n')

    o.write('# cat /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS96_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS07_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS17_bams_p_1_5_rm.txt > '+fst_dir+'pops_PWS96_07_17.txt \n')
    o.write('# '+sed_strip_bam_dir+fst_dir+'pops_PWS96_07_17.txt \n')
    o.write('# '+sed_strip_bam_ext+fst_dir+'pops_PWS96_07_17.txt \n\n')

    #subset pop vcfs
    o.write('#bcftools view -S '+fst_dir+'pops_PWS96_07_17.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_PWS96_07_17.vcf \n')
    o.write('bcftools view -S '+fst_dir+'pops_NEPac.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_NEPac.vcf \n\n')

    # make saf from vcfs
    o.write('#/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_PWS96_07_17.vcf -out '+fst_dir+'pops_PWS96_07_17 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_NEPac.vcf -out '+fst_dir+'pops_NEPac -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n\n')
    # Each rm must end with a newline: without it the live 'rm pops_NEPac.vcf'
    # is appended to the preceding '#rm ...' comment line and never runs.
    o.write('#rm '+fst_dir+'pops_PWS96_07_17.vcf \n')
    o.write('rm '+fst_dir+'pops_NEPac.vcf \n\n')

    # make folded sfs
    o.write('#/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS96_07_17.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_PWS96_07_17.sfs \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_NEPac.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_NEPac.sfs \n\n')

    # make 2dsfs
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS96_07_17.saf.idx '+fst_dir+'pops_NEPac.saf.idx -fold 1 -P 4 > '+fst_dir+'pops_PWS96_07_17_v_NEPac.sfs \n\n')

    # estimate fst
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst index '+fst_dir+'pops_PWS96_07_17.saf.idx '+fst_dir+'pops_NEPac.saf.idx -sfs '+fst_dir+'pops_PWS96_07_17_v_NEPac.sfs -fold 1 -fstout '+fst_dir+'pops_PWS96_07_17_v_NEPac \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst stats2 '+fst_dir+'pops_PWS96_07_17_v_NEPac.fst.idx -win 50000 -step 10000 > '+fst_dir+'fst_50kb_win_10kb_step_folded_pops_PWS96_07_17_v_NEPac.txt \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst print '+fst_dir+'pops_PWS96_07_17_v_NEPac.fst.idx > '+fst_dir+'fst_persite_folded_pops_PWS96_07_17_v_NEPac.txt')

    # Trailing reminder of how to submit the generated script.
    o.write('\n\n#run: sbatch '+script)
   

    
###################################################################
job_name = 'fst_pops_PWS17_v_NEPac17'

# sed commands that turn a list of BAM paths into a list of sample names.
# Raw strings keep the backslashes for sed without relying on Python's
# invalid-escape fallback ('\/'), and the dot is escaped so only a literal
# ".bam" is removed.
sed_strip_bam_dir = r"sed -i 's/\/home\/eoziolor\/phpopg\/data\/align\///g' "
sed_strip_bam_ext = r"sed -i 's/\.bam//g' "

# Write the SLURM header, then append the pipeline commands to the same file.
sbatch_header(job_name,'16','4','144')
script = 'script_' + job_name + '.sh'
# The context manager closes the script even if one of the writes raises.
with io.open(script,'a', newline='\n') as o:

    o.write('module load samtools \n')
    o.write('module load bcftools \n')
    o.write('cat /home/jamcgirr/ph/data/angsd/SFS/thetas/SS17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/BC17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/WA17_bams_p_1_5_rm.txt  > '+fst_dir+'pops_NEPac17.txt \n')
    o.write(sed_strip_bam_dir+fst_dir+'pops_NEPac17.txt \n')
    o.write(sed_strip_bam_ext+fst_dir+'pops_NEPac17.txt \n\n')

    o.write('cp /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS17_bams_p_1_5_rm.txt '+fst_dir+'pops_PWS17.txt \n')
    o.write(sed_strip_bam_dir+fst_dir+'pops_PWS17.txt \n')
    o.write(sed_strip_bam_ext+fst_dir+'pops_PWS17.txt \n\n')

    #subset pop vcfs
    o.write('bcftools view -S '+fst_dir+'pops_PWS17.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_PWS17.vcf \n')
    o.write('bcftools view -S '+fst_dir+'pops_NEPac17.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_NEPac17.vcf \n\n')

    # make saf from vcfs
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_PWS17.vcf -out '+fst_dir+'pops_PWS17 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_NEPac17.vcf -out '+fst_dir+'pops_NEPac17 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n\n')
    # Each rm must end with a newline: without it the two rm commands and the
    # following realSFS command are fused into a single broken shell line.
    o.write('rm '+fst_dir+'pops_PWS17.vcf \n')
    o.write('rm '+fst_dir+'pops_NEPac17.vcf \n\n')

    # make folded sfs
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS17.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_PWS17.sfs \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_NEPac17.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_NEPac17.sfs \n\n')

    # make 2dsfs
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS17.saf.idx '+fst_dir+'pops_NEPac17.saf.idx -fold 1 -P 4 > '+fst_dir+'pops_PWS17_v_NEPac17.sfs \n\n')

    # estimate fst
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst index  '+fst_dir+'pops_PWS17.saf.idx '+fst_dir+'pops_NEPac17.saf.idx -sfs '+fst_dir+'pops_PWS17_v_NEPac17.sfs -fold 1 -fstout '+fst_dir+'pops_PWS17_v_NEPac17 \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst stats2 '+fst_dir+'pops_PWS17_v_NEPac17.fst.idx -win 50000 -step 10000 > '+fst_dir+'fst_50kb_win_10kb_step_folded_pops_PWS17_v_NEPac17.txt \n')
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst print '+fst_dir+'pops_PWS17_v_NEPac17.fst.idx > '+fst_dir+'fst_persite_folded_pops_PWS17_v_NEPac17.txt')

    # Trailing reminder of how to submit the generated script.
    o.write('\n\n#run: sbatch '+script)
    
#EST run time ~ 3 days



In [3]:
# output SNP table

job_name = 'snp_table'

# Write the SLURM header, then append the GATK command to the same file.
sbatch_header(job_name,'8','4','24')
script = 'script_' + job_name + '.sh'
# The context manager closes the script even if one of the writes raises;
# plain append mode is enough because nothing is read back.
with io.open(script,'a', newline='\n') as o:

    o.write('module load R \n')
    o.write('module load maven \n')
    o.write('module load java \n')
    o.write('module load GATK/4.1.4.1 \n\n')

    # Export CHROM/POS/REF/ALT for every record of the filtered VCF as a table.
    o.write('gatk VariantsToTable -V /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz -F CHROM -F POS -F REF -F ALT -O /home/jamcgirr/ph/data/vcfs/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm_table.txt \n\n')

    # NOTE(review): the sed commands below are notebook comments only - they are
    # NOT written to the sbatch script. They look like manual post-processing
    # of the table for VEP; confirm whether they should be part of the job.
    #sed 's/chr//g' /home/jamcgirr/ph/data/vcfs/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm_table.txt > /home/jamcgirr/ph/data/LoF/snp_table_VEP.txt
    #sed -i 's/\t/:/g' /home/jamcgirr/ph/data/LoF/snp_table_VEP.txt
    #sed -i '1d' /home/jamcgirr/ph/data/LoF/snp_table_VEP.txt

    # Trailing reminder of how to submit the generated script.
    o.write('\n\n#run: sbatch '+script)
#EST run time ~ 1 hr

In [6]:
# parse SNP table with R script (LoF.rmd)
# run variant effect predictor (https://uswest.ensembl.org/info/docs/tools/vep/script/index.html)

job_name = 'vep'

# Write the SLURM header, then append the VEP command to the same file.
sbatch_header(job_name,'8','4','24')
script = 'script_' + job_name + '.sh'
# The context manager closes the script even if one of the writes raises;
# plain append mode is enough because nothing is read back.
with io.open(script,'a', newline='\n') as o:

    # Run Ensembl VEP on the reformatted SNP table using the local cache.
    o.write('/home/jamcgirr/ph/data/LoF/ensembl-vep/vep --input_file /home/jamcgirr/ph/data/LoF/snp_table_VEP_standard_format.txt --output_file /home/jamcgirr/ph/data/LoF/test_output.txt --cache --species clupea_harengus \n')

    # Trailing reminder of how to submit the generated script.
    o.write('\n\n#run: sbatch '+script)
        
#EST run time ~ 1 hr 20 min      