In [2]:
import io
import pandas as pd
import numpy as np
import math

In [3]:
def sbatch_header(job,mem,tasks,hours):
    """Write the SLURM header of a new submission script, ``script_<job>.sh``.

    The file is created (or truncated) in the current working directory.
    Callers reopen it in append mode to add the commands the job should run.

    Parameters
    ----------
    job : job name; also used in the script file name and the stderr file name.
    mem : memory request in gigabytes (written as ``--mem=<mem>G``).
    tasks : value written after ``--ntasks=``.
    hours : wall-clock limit in whole hours (written as ``--time=<hours>:00:00``).

    Every argument is passed through ``str()``, so integers are accepted as
    well as the strings used throughout this notebook.
    """
    job, mem, tasks, hours = str(job), str(mem), str(tasks), str(hours)
    script = 'script_' + job + '.sh'
    # The context manager closes the handle even if one of the writes raises.
    with io.open(script,'w', newline='\n') as outfile:
        outfile.write('#!/bin/bash\n\n#SBATCH --job-name='+job+'\n')
        outfile.write('#SBATCH --mem='+mem+'G \n')
        outfile.write('#SBATCH --ntasks='+tasks+' \n')
        # %A / %a are left for SLURM to expand in the stderr file name.
        outfile.write('#SBATCH -e '+job+'_%A_%a.err \n')
        outfile.write('#SBATCH --time='+hours+':00:00  \n')
        outfile.write('#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n')
        outfile.write('#SBATCH -p high \n\n')
    
def sbatch_header_loop(job,mem,tasks,hours,infile):
    """Write the SLURM header of ``script_<infile><job>.sh`` for per-input jobs.

    Same header as ``sbatch_header``, except that the job name (and therefore
    the script and stderr file names) is ``<infile><job>``, so one script can
    be generated per population / replicate / comparison.

    Parameters
    ----------
    job : job suffix appended after ``infile``.
    mem : memory request in gigabytes (written as ``--mem=<mem>G``).
    tasks : value written after ``--ntasks=``.
    hours : wall-clock limit in whole hours (written as ``--time=<hours>:00:00``).
    infile : label of the input this script is for; prefixed to ``job``.

    Every argument is passed through ``str()``, so an integer ``infile`` (for
    example a K value or replicate number) no longer raises ``TypeError``.
    """
    job, mem, tasks, hours, infile = str(job), str(mem), str(tasks), str(hours), str(infile)
    jobname = infile + job
    script = 'script_' + jobname + '.sh'
    # The context manager closes the handle even if one of the writes raises.
    with io.open(script,'w', newline='\n') as outfile:
        outfile.write('#!/bin/bash\n\n#SBATCH --job-name='+jobname+'\n')
        outfile.write('#SBATCH --mem='+mem+'G \n')
        outfile.write('#SBATCH --ntasks='+tasks+' \n')
        # %A / %a are left for SLURM to expand in the stderr file name.
        outfile.write('#SBATCH -e '+jobname+'_%A_%a.err \n')
        outfile.write('#SBATCH --time='+hours+':00:00 \n')
        outfile.write('#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n')
        outfile.write('#SBATCH -p high \n\n')


In [18]:
# Convert the filtered VCF to one BEAGLE genotype-likelihood file:
# vcftools is run once per chromosome, then the per-chromosome files are merged
# under a single header line.
job_name = 'vcf_to_beagle'

vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
vcf_dir = '/home/jamcgirr/ph/data/vcfs/'
n_chroms = 26

sbatch_header(job_name,'16','4','24')
script = 'script_' + job_name + '.sh'

# Per-chromosome outputs in chromosome order (my_chr1.BEAGLE.PL ... my_chr26.BEAGLE.PL).
per_chrom_files = ' '.join('my_chr' + str(i) + '.BEAGLE.PL' for i in range(1, n_chroms + 1))
# The merged file is named after vcf_name so it always matches the input VCF.
merged = vcf_name + '.BEAGLE.PL'

with io.open(script,'a+', newline='\n') as o:
    o.write('module load vcftools \n')
    # vcftools writes my_chr* into vcf_dir via absolute paths, while the merge
    # steps below use relative paths; cd makes them agree no matter where the
    # job is submitted from.
    o.write('cd '+vcf_dir+' \n\n')

    o.write('for i in {1..'+str(n_chroms)+'} \n')
    o.write('do \n')
    o.write('vcftools --gzvcf '+vcf_dir+vcf_name+'.vcf.gz --BEAGLE-PL --chr chr$i --out '+vcf_dir+'my_chr$i \n')
    o.write('done \n\n')

    o.write('cat '+per_chrom_files+' > my_chrs.BEAGLE.PL \n')
    # Keep one header line, drop the repeated per-chromosome header lines.
    o.write('head -1 my_chr1.BEAGLE.PL > my_chr_beagle_header.txt\n')
    o.write('grep -v "marker" my_chrs.BEAGLE.PL > my_chrs.BEAGLE.PL2 \n')
    o.write('cat my_chr_beagle_header.txt my_chrs.BEAGLE.PL2 > '+merged+'\n')
    o.write('wc -l '+merged+' > my_chr_lines.txt\n')
    o.write('#rm my_chr*\n')

    # convert to plink
    # o.write('#vcftools --vcf /home/jamcgirr/ph/data/vcfs/'+vcf_name+'.vcf --plink --out /home/jamcgirr/ph/data/vcfs/'+vcf_name+' \n')

    o.write('\n\n#command to run: sbatch '+script)

In [6]:
# PCAngsd on the genome-wide BEAGLE genotype likelihoods (noted run time: ~5min).
#
# One-time local install of PCAngsd:
#   git clone https://github.com/Rosemeis/pcangsd.git
#   cd pcangsd/
#   python setup.py build_ext --inplace
#
# An earlier variant ran the same command on the chr1-only, maf0.01 file
# (chr1_<vcf_name>.BEAGLE.PL.gz, -threads 4).
job_name = 'PCAngsd'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
script = 'script_' + job_name + '.sh'

beagle = '/home/jamcgirr/ph/data/vcfs/' + vcf_name + '.BEAGLE.PL'
pca_out = '/home/jamcgirr/ph/data/angsd/PCAngsd/' + vcf_name

commands = [
    'module load pcangsd \n',
    # gzip step is written commented-out; the .gz input is expected to exist already
    '#gzip ' + beagle + ' \n',
    'python /home/jamcgirr/apps/pcangsd/pcangsd.py -beagle ' + beagle + '.gz -o ' + pca_out + ' -threads 16 \n',
    '\n\n#command to run: sbatch ' + script,
]

sbatch_header(job_name,'16','16','144')
o = io.open(script,'a+', newline='\n')
o.write(''.join(commands))
o.close()

In [None]:
# PCAngsd job under a second job name (noted run time: ~5min).
# NOTE(review): the -beagle input and -o prefix written here are identical to the
# PCAngsd cell's, so running this job would overwrite that job's output --
# presumably this should point at a population-subset BEAGLE file; confirm.
#
# One-time local install of PCAngsd:
#   git clone https://github.com/Rosemeis/pcangsd.git
#   cd pcangsd/
#   python setup.py build_ext --inplace
#
# An earlier variant ran the same command on the chr1-only, maf0.01 file
# (chr1_<vcf_name>.BEAGLE.PL.gz, -threads 4).
job_name = 'PCAngsd_NEpops'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
script = 'script_' + job_name + '.sh'

beagle = '/home/jamcgirr/ph/data/vcfs/' + vcf_name + '.BEAGLE.PL'
pca_out = '/home/jamcgirr/ph/data/angsd/PCAngsd/' + vcf_name

sbatch_header(job_name,'16','16','144')
o = io.open(script,'a+', newline='\n')
o.writelines([
    'module load pcangsd \n',
    '#gzip ' + beagle + ' \n',
    'python /home/jamcgirr/apps/pcangsd/pcangsd.py -beagle ' + beagle + '.gz -o ' + pca_out + ' -threads 16 \n',
    '\n\n#command to run: sbatch ' + script,
])
o.close()

In [7]:
# plink --pca on the fileset prefix <vcf_name> under data/vcfs; results go to data/plink.
job_name = 'plink_PCA'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
script = 'script_' + job_name + '.sh'

plink_in = '/home/jamcgirr/ph/data/vcfs/' + vcf_name
plink_out = '/home/jamcgirr/ph/data/plink/' + vcf_name

sbatch_header(job_name,'8','4','24')
o = io.open(script,'a+', newline='\n')
o.writelines([
    'module load plink \n',
    'plink --file ' + plink_in + ' --pca --out ' + plink_out + '\n',
    '\n\n#command to run: sbatch ' + script,
])
o.close()

In [9]:
# Multidimensional scaling on chr1 with plink.
job_name = 'MDS'
# Defined here (it was commented out) so the cell no longer depends on whichever
# earlier cell last assigned vcf_name.
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'

sbatch_header(job_name,'8','4','24')
script = 'script_' + job_name + '.sh'

plink_in = '/home/jamcgirr/ph/data/vcfs/' + vcf_name
# NOTE(review): 'chr_1' is joined to vcf_name without a separator; kept as-is because
# the --read-genome input below must match the name produced by the --genome step.
plink_out = '/home/jamcgirr/ph/data/plink/chr_1' + vcf_name

# The context manager closes the script even if a write raises.
with io.open(script,'a+', newline='\n') as o:
    o.write('module load plink \n')
    # The --genome and --cluster steps are written commented-out; the active command
    # reads the .genome file that the --genome step would have produced.
    o.write('#plink --file '+plink_in+' --genome --chr chr1 --out '+plink_out+'\n')
    o.write('#plink --file '+plink_in+' --cluster --chr chr1 --out '+plink_out+'\n')
    o.write('plink --file '+plink_in+' --read-genome '+plink_out+'.genome --mds-plot 4 --cluster --chr chr1 --out '+plink_out+'\n')

    o.write('\n\n#command to run: sbatch '+script)

In [18]:
# NGSadmix for K = 1..20 on the genome-wide BEAGLE likelihoods (noted run time: ~4 days).
job_name = 'NGSadmix'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
script = 'script_' + job_name + '.sh'

likes = '/home/jamcgirr/ph/data/vcfs/' + vcf_name + '.BEAGLE.PL'
admix_out = '/home/jamcgirr/ph/data/angsd/NGSadmix/' + vcf_name

commands = [
    # PCAngsd install notes are copied into the script as shell comments
    '#git clone https://github.com/Rosemeis/pcangsd.git\n',
    '#cd pcangsd/\n',
    '#python setup.py build_ext --inplace\n\n',
    'module load angsd \n',
    'gzip ' + likes + ' \n\n',
    # $i is expanded by bash inside the generated script, one NGSadmix run per K
    'for i in {1..20} \n',
    'do \n',
    'NGSadmix -likes ' + likes + '.gz -K $i -o ' + admix_out + '_k$i -P 4 \n',
    'done \n',
    '\n\n#command to run: sbatch ' + script,
]

sbatch_header(job_name,'8','4','144')
o = io.open(script,'a+', newline='\n')
o.write(''.join(commands))
o.close()

# Nine replicate NGSadmix runs (rep1..rep9), one submission script per replicate,
# each covering K = 1..20 and writing into NGSadmix/reps/.
# Reuses job_name from the NGSadmix block above.
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
likes_gz = '/home/jamcgirr/ph/data/vcfs/' + vcf_name + '.BEAGLE.PL.gz'
reps_dir = '/home/jamcgirr/ph/data/angsd/NGSadmix/reps/'

for infile in np.arange(1, 10, 1):
    rep = str(infile)
    script = 'script_' + rep + job_name + '.sh'
    rep_out = reps_dir + 'rep' + rep + '_' + vcf_name

    sbatch_header_loop(job_name,'8','4','144', rep)
    o = io.open(script,'a+', newline='\n')
    o.writelines([
        'module load angsd \n',
        'for i in {1..20} \n',
        'do \n',
        'NGSadmix -likes ' + likes_gz + ' -K $i -o ' + rep_out + '_k$i -P 4 \n',
        'done \n',
        '\n\n#command to run: sbatch ' + script,
    ])
    o.close()

In [18]:
# evalAdmix on the NGSadmix results, one submission script per K (K = 2..20).
# EST run time ~1hr 30min
# NOTE(review): the chr1_ / k<K>_ file names used here differ from the
# <vcf_name>_k<K> prefixes written by the NGSadmix cell -- confirm these inputs exist.
job_name = 'evalAdmix'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5'
infiles = list(range(2,21))

beagle_gz = '/home/jamcgirr/ph/data/vcfs/chr1_'+vcf_name+'.BEAGLE.PL.gz'
admix_dir = '/home/jamcgirr/ph/data/angsd/NGSadmix/'

# K values are integers; convert each to str once so that the script name and the
# sbatch_header_loop call can concatenate it (the int form raised TypeError).
for infile in map(str, infiles):
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'16','4','24', infile)

    k_prefix = admix_dir+'k'+infile+'_chr1_'+vcf_name
    # The context manager closes the script even if a write raises.
    with io.open(script,'a+', newline='\n') as o:
        o.write('module load angsd \n')
        o.write('/home/jamcgirr/apps/evalAdmix/evalAdmix -beagle '+beagle_gz+' -fname '+k_prefix+'.fopt.gz -qname '+k_prefix+'.qopt -o '+admix_dir+'evalAdmix/eval_k'+infile+'_chr1_'+vcf_name+' -P 4 \n')

        o.write('\n\n#command to run: sbatch '+script)

In [13]:
# Split the filtered VCF into one VCF per population with bcftools and recompute
# the INFO tags on each subset. One submission script per population.
# EST run time ~15min
job_name = '_subset_pops_vcfs'
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]

# The same command has been used for three versions of the VCF; each version has its
# own output folder under split_pops/. `subset` selects the one written right now.
vcf_by_subset = {
    'maf01': 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.01',
    'maf05': 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05',
    'maf00': 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5',
}
subset = 'maf00'
vcf_name = vcf_by_subset[subset]

sample_list_dir = '/home/jamcgirr/ph/scripts/angsd/SFS/SFS_by_pop/'
all_vcfs = '/home/jamcgirr/ph/data/vcfs/'

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    samples = sample_list_dir + infile + '_plates_1_through_5_rm.txt'
    pop_vcf = all_vcfs + 'split_pops/' + subset + '/population_' + infile + '_' + vcf_name + '.vcf.gz'

    sbatch_header_loop(job_name,'8','4','24', infile)
    o = io.open(script,'a+', newline='\n')
    o.writelines([
        'module load bcftools \n',
        'bcftools view -S ' + samples + ' --threads 4 ' + all_vcfs + vcf_name + ".vcf.gz | bcftools +fill-tags -Oz -- -t all,'DP=sum(DP)' > " + pop_vcf + ' \n',
        '\n\n#command to run: sbatch ' + script,
    ])
    o.close()

In [25]:
# Per-population site allele frequency (saf) files and folded / unfolded 1D SFS,
# computed by angsd from the per-population VCFs. One script per population.
# EST run time ~15min
#
# angsd is installed locally to use the most recent version and avoid issues with vcf parsing:
#   git clone https://github.com/samtools/htslib.git;
#   git clone https://github.com/angsd/angsd.git;
#   cd htslib;make;cd ../angsd;make HTSSRC=../htslib
job_name = '_SFS'
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]

# Three versions of the VCF share this recipe; `subset` picks both the VCF name and
# the matching maf01 / maf05 / maf00 input and output folders.
vcf_by_subset = {
    'maf01': 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.01',
    'maf05': 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05',
    'maf00': 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5',
}
subset = 'maf05'
vcf_name = vcf_by_subset[subset]

angsd_root = '/home/jamcgirr/apps/angsd_sep_20/angsd/'
ancestral_fa = '/home/jamcgirr/ph/data/c_harengus/c.harengus.fa'

# Previously used, kept for reference (per-population allele frequencies):
#o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -out /home/jamcgirr/ph/data/angsd/SFS/dxy/mafs/'+infile+'_vcf -fai /home/jamcgirr/ph/data/c_harengus/c.harengus.fa.fai -doMajorMinor 1 -doMaf 3 -vcf-pl /home/jamcgirr/ph/data/vcfs/split_pops/population_'+infile+'_ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_plates_1_5_rm.vcf -P 4 \n')
#o.write('gzip -d /home/jamcgirr/ph/data/angsd/SFS/dxy/mafs/'+infile+'_vcf.mafs.gz \n')

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    pop_vcf = '/home/jamcgirr/ph/data/vcfs/split_pops/' + subset + '/population_' + infile + '_' + vcf_name + '.vcf.gz'
    saf_prefix = '/home/jamcgirr/ph/data/angsd/SFS/vcf/' + subset + '/' + infile

    sbatch_header_loop(job_name,'8','4','24', infile)
    o = io.open(script,'a+', newline='\n')
    o.writelines([
        # make saf from vcf
        angsd_root + 'angsd -doSaf 1 -vcf-pl ' + pop_vcf + ' -out ' + saf_prefix + ' -anc ' + ancestral_fa + ' \n',
        # make folded sfs
        angsd_root + 'misc/realSFS ' + saf_prefix + '.saf.idx -P 4 -fold 1 > ' + saf_prefix + '_folded.sfs \n',
        # make unfolded sfs
        angsd_root + 'misc/realSFS ' + saf_prefix + '.saf.idx -P 4 > ' + saf_prefix + '_unfolded.sfs \n',
        '\n\n#command to run: sbatch ' + script,
    ])
    o.close()

In [5]:
# Thetas (diversity statistics) per population from bam-derived saf files.
#
# Notes from earlier runs:
# - pi estimates using saf from vcf are way too high
# - using angsd to estimate saf from bam files generates reasonable theta
# - likely has to do with invariant site corrections and MAF cutoff
# - if using vcf sites, need to index sites.txt:
#     /home/jamcgirr/apps/angsd/angsd sites index sites.txt
#
# Only the folded-SFS steps are active; the saf-from-bam step and the unfolded
# steps are written into each script as commented-out lines.
job_name = '_thetas'
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]

angsd_root = '/home/jamcgirr/apps/angsd_sep_20/angsd/'
misc_dir = angsd_root + 'misc/'
theta_dir = '/home/jamcgirr/ph/data/angsd/SFS/thetas/'

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    tag = 'population_' + infile + '_minQ20_minMQ30'
    saf = theta_dir + tag                    # prefix of the saf files
    unf = theta_dir + 'unfolded/' + tag      # prefix of the unfolded outputs
    fol = theta_dir + 'folded/' + tag        # prefix of the folded outputs

    sbatch_header_loop(job_name,'60','8','120', infile)
    o = io.open(script,'a+', newline='\n')
    o.writelines([
        # make saf from bams
        '#' + angsd_root + 'angsd -bam ' + theta_dir + infile + '_bams_p_1_5_rm.txt -doSaf 1 -doMajorMinor 1 -doMaf 3 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -minMapQ 30 -minQ 20 -GL 1 -P 8 -out ' + saf + ' \n',
        # make unfolded sfs
        '#' + misc_dir + 'realSFS ' + saf + '.saf.idx -P 8 -nSites 100000 > ' + unf + '_unfolded.sfs \n',
        '#' + misc_dir + 'realSFS saf2theta ' + saf + '.saf.idx -outname ' + unf + ' -sfs ' + unf + '_unfolded.sfs \n',
        '#' + misc_dir + 'thetaStat do_stat ' + unf + '.thetas.idx \n',
        '#' + misc_dir + 'thetaStat do_stat ' + unf + '.thetas.idx -win 50000 -step 10000 -outnames ' + unf + '_50kb_win_10kb_step \n',
        # make folded sfs
        misc_dir + 'realSFS ' + saf + '.saf.idx -P 8 -nSites 100000 -fold 1 > ' + fol + '_folded.sfs \n',
        misc_dir + 'realSFS saf2theta ' + saf + '.saf.idx -outname ' + fol + ' -fold 1 -sfs ' + fol + '_folded.sfs \n',
        misc_dir + 'thetaStat do_stat ' + fol + '.thetas.idx \n',
        misc_dir + 'thetaStat do_stat ' + fol + '.thetas.idx -win 50000 -step 10000 -outnames ' + fol + '_50kb_win_10kb_step \n',
        # no trailing newline here: the footer below starts with its own blank lines
        '#rm ' + saf + '.saf.gz',
        '\n\n#run: sbatch ' + script,
    ])
    o.close()

# EST run time ~

In [15]:
# Steps for Fst and PBS after the angsd folding update
#
# 1. Create 2d sfs from saf: for every population pair write one script that runs
#    realSFS twice on the two saf indexes (folded and unfolded).

job_name = '_2d_sfs'

# vcf_name is not used by the commands in this block; the assignment is kept
# because later cells may read it.
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
realSFS = '/home/jamcgirr/apps/angsd/misc/realSFS'
wk_dir = '/home/jamcgirr/ph/data/angsd/SFS/fst_pbs/maf05/'
saf_dir = '/home/jamcgirr/ph/data/angsd/SFS/vcf/maf05/'

# Each entry is "<pop A>_<pop B>".
infiles = ["BC17_CA17","BC17_WA17","PWS07_PWS17","PWS07_SS06","PWS17_BC17","PWS17_CA17","PWS17_SS17","PWS17_WA17","PWS91_PWS07","PWS91_PWS17","PWS91_PWS96","PWS96_PWS07","PWS96_PWS17","PWS96_SS96","SS06_SS17","SS17_BC17","SS17_CA17","SS17_WA17","SS96_SS06","SS96_SS17","TB06_PWS07","TB06_SS06","TB06_TB17","TB17_BC17","TB17_CA17","TB17_PWS17","TB17_SS17","TB17_WA17","TB91_TB06","TB91_TB17","TB91_TB96","TB96_PWS96","TB96_SS96","TB96_TB06","TB96_TB17","WA17_CA17"]

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    pops = infile.split("_")
    saf_pair = saf_dir + pops[0] + '.saf.idx ' + saf_dir + pops[1] + '.saf.idx'

    sbatch_header_loop(job_name,'8','4','24', infile)
    o = io.open(script,'a+', newline='\n')
    # folded first, then unfolded; the two commands differ only in the -fold flag
    # and the output file prefix
    for sfs_prefix, fold_flag in (('folded_', ' -fold 1'), ('unfolded_', '')):
        o.write(realSFS + ' ' + saf_pair + fold_flag + ' -P 4 > ' + wk_dir + sfs_prefix + infile + '.sfs \n')
    o.write('\n\n#command to run: sbatch ' + script)
    o.close()

In [16]:
# 2. Estimate Fst and PBS: for every population trio, index per-site Fst from the
#    three saf files and the three pairwise folded 2d SFS, then summarise it in
#    50 kb windows with a 10 kb step.
#    (An earlier version indexed each pair separately from population_<pop>_<vcf_name>
#    saf files; the three-population call below replaced it.)

job_name = '_fst_pbs_folded'

vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
realSFS = '/home/jamcgirr/apps/angsd/misc/realSFS'
wk_dir = '/home/jamcgirr/ph/data/angsd/SFS/fst_pbs/maf05/'
saf_dir = '/home/jamcgirr/ph/data/angsd/SFS/vcf/maf05/'

# Each entry is "<pop 0>_<pop 1>_<pop 2>".
infiles = ["BC17_WA17_CA17","PWS17_SS17_BC17","PWS17_SS17_CA17","PWS17_SS17_WA17","TB17_PWS17_BC17","TB17_PWS17_CA17","TB17_PWS17_SS17","TB17_PWS17_WA17","TB06_PWS07_SS06","TB96_PWS96_SS96","PWS91_PWS07_PWS17","PWS91_PWS96_PWS07","PWS91_PWS96_PWS17","TB91_TB06_TB17","TB91_TB96_TB06","TB91_TB96_TB17","SS96_SS06_SS17"]

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    pops = infile.split("_")
    trio = pops[0] + '_' + pops[1] + '_' + pops[2]

    # one saf index per population, in trio order
    saf_args = ' '.join([saf_dir + pops[0] + '.saf.idx', saf_dir + pops[1] + '.saf.idx', saf_dir + pops[2] + '.saf.idx'])
    # pairwise 2d SFS in the order (0,1), (0,2), (1,2)
    sfs_args = ' '.join([
        '-sfs ' + wk_dir + 'folded_' + pops[0] + '_' + pops[1] + '.sfs',
        '-sfs ' + wk_dir + 'folded_' + pops[0] + '_' + pops[2] + '.sfs',
        '-sfs ' + wk_dir + 'folded_' + pops[1] + '_' + pops[2] + '.sfs',
    ])
    persite = wk_dir + 'folded_' + trio + '_persite'

    sbatch_header_loop(job_name,'8','1','24', infile)
    o = io.open(script,'a+', newline='\n')
    o.writelines([
        realSFS + ' fst index ' + saf_args + ' ' + sfs_args + ' -fold 1 -fstout ' + persite + ' \n',
        realSFS + ' fst stats2 ' + persite + '.fst.idx -win 50000 -step 10000 > ' + wk_dir + 'fst_pbs_50kb_win_10kb_step_folded_' + trio + '.txt \n',
        '\n\n#command to run: sbatch ' + script,
    ])
    o.close()
    
# vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5'
# realSFS = '/home/jamcgirr/apps/angsd/misc/realSFS'
# wk_dir = '/home/jamcgirr/ph/data/angsd/SFS/fst_pbs/'
# saf_dir = '/home/jamcgirr/ph/data/angsd/SFS/split_pops/'
# 
# infiles = ["BC17_WA17_CA17","PWS17_SS17_BC17","PWS17_SS17_CA17","PWS17_SS17_WA17","TB17_PWS17_BC17","TB17_PWS17_CA17","TB17_PWS17_SS17","TB17_PWS17_WA17","TB06_PWS07_SS06","TB96_PWS96_SS96","PWS91_PWS07_PWS17","PWS91_PWS96_PWS07","PWS91_PWS96_PWS17","TB91_TB06_TB17","TB91_TB96_TB06","TB91_TB96_TB17","SS96_SS06_SS17"]
# 
# job_name = '_fst_pbs_unfolded'
# 
# for infile in infiles:
#     script = 'script_' + infile + job_name + '.sh'
#     sbatch_header_loop(job_name,'8','1','24', infile)
#     o = io.open(script,'a+', newline='\n')
#     
#     
#     pops = ''.join(infile).split("_")
#     #o.write(realSFS+' fst index '+saf_dir+'population_'+pops[0]+'_'+vcf_name+'.saf.idx '+saf_dir+'population_'+pops[1]+'_'+vcf_name+'.saf.idx -sfs '+wk_dir+'folded_'+pops[0]+'_'+pops[1]+'.sfs -fold 1 -fstout '+wk_dir+'folded_'+pops[0]+'_'+pops[1]+'persite \n')
#     #o.write(realSFS+' fst index '+saf_dir+'population_'+pops[0]+'_'+vcf_name+'.saf.idx '+saf_dir+'population_'+pops[2]+'_'+vcf_name+'.saf.idx -sfs '+wk_dir+'folded_'+pops[0]+'_'+pops[2]+'.sfs -fold 1 -fstout '+wk_dir+'folded_'+pops[0]+'_'+pops[2]+'persite \n')
#     #o.write(realSFS+' fst index '+saf_dir+'population_'+pops[1]+'_'+vcf_name+'.saf.idx '+saf_dir+'population_'+pops[2]+'_'+vcf_name+'.saf.idx -sfs '+wk_dir+'folded_'+pops[1]+'_'+pops[2]+'.sfs -fold 1 -fstout '+wk_dir+'folded_'+pops[1]+'_'+pops[2]+'persite \n\n')
# 
#     o.write(realSFS+' fst index '+saf_dir+'population_'+pops[0]+'_'+vcf_name+'.saf.idx '+saf_dir+'population_'+pops[1]+'_'+vcf_name+'.saf.idx '+saf_dir+'population_'+pops[2]+'_'+vcf_name+'.saf.idx -sfs '+wk_dir+'unfolded_'+pops[0]+'_'+pops[1]+'.sfs -sfs '+wk_dir+'unfolded_'+pops[0]+'_'+pops[2]+'.sfs -sfs '+wk_dir+'unfolded_'+pops[1]+'_'+pops[2]+'.sfs -fstout '+wk_dir+'unfolded_'+pops[0]+'_'+pops[1]+'_'+pops[2]+'_persite \n')
#     o.write(realSFS+' fst stats2 '+wk_dir+'unfolded_'+pops[0]+'_'+pops[1]+'_'+pops[2]+'_persite.fst.idx -win 50000 -step 10000 > '+wk_dir+'fst_pbs_50kb_win_10kb_step_unfolded_'+pops[0]+'_'+pops[1]+'_'+pops[2]+'.txt \n')
# 
#     
#     o.write('\n\n#command to run: sbatch '+script)
#     o.close()

In [22]:
# Thetas from the vcf-derived saf files (maf05), using the folded SFS of each
# population: genome-wide statistics plus 50 kb windows with a 10 kb step.
# EST run time ~ 1min
job_name = '_thetas_vcf'
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]

misc_dir = '/home/jamcgirr/apps/angsd_sep_20/angsd/misc/'
maf05_dir = '/home/jamcgirr/ph/data/angsd/SFS/vcf/maf05/'

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    theta_prefix = maf05_dir + 'thetas/' + infile

    sbatch_header_loop(job_name,'8','4','24', infile)
    o = io.open(script,'a+', newline='\n')
    o.writelines([
        misc_dir + 'realSFS saf2theta ' + maf05_dir + infile + '.saf.idx -outname ' + theta_prefix + ' -fold 1 -sfs ' + maf05_dir + infile + '_folded.sfs \n',
        misc_dir + 'thetaStat do_stat ' + theta_prefix + '.thetas.idx \n',
        misc_dir + 'thetaStat do_stat ' + theta_prefix + '.thetas.idx -win 50000 -step 10000 -outnames ' + theta_prefix + '_50kb_win_10kb_step \n',
        '\n\n#run: sbatch ' + script,
    ])
    o.close()

In [14]:
# create maf, geno, beagle for the full (all-population) VCF with angsd
# EST run time ~ 5 minutes

job_name = 'maf05'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
script = 'script_' + job_name + '.sh'

angsd_bin = '/home/jamcgirr/apps/angsd_sep_20/angsd/angsd'
# make mafs from vcf
maf_cmd = (angsd_bin
           + ' -out /home/jamcgirr/ph/data/angsd/SFS/vcf/maf05/' + vcf_name
           + ' -fai /home/jamcgirr/ph/data/c_harengus/c.harengus.fa.fai'
           + ' -doGlf 2 -doMaf 3 -doMajorMinor 1 -doPost 1 -doGeno 2'
           + ' -vcf-pl /home/jamcgirr/ph/data/vcfs/' + vcf_name + '.vcf.gz'
           + ' -P 8 \n')

sbatch_header(job_name,'16','8','24')
o = io.open(script,'a+', newline='\n')
o.write(maf_cmd + '\n\n#command to run: sbatch ' + script)
o.close()


# Same angsd maf / geno / beagle run for each per-population VCF (maf05 split),
# followed by decompressing the resulting .mafs.gz.
# EST run time ~ 5 minutes
job_name = '_maf'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]

angsd_bin = '/home/jamcgirr/apps/angsd_sep_20/angsd/angsd'

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    maf_prefix = '/home/jamcgirr/ph/data/angsd/SFS/vcf/maf05/' + infile
    pop_vcf = '/home/jamcgirr/ph/data/vcfs/split_pops/maf05/population_' + infile + '_' + vcf_name + '.vcf.gz'

    sbatch_header_loop(job_name,'8','4','1', infile)
    o = io.open(script,'a+', newline='\n')
    o.writelines([
        # make mafs from vcf
        angsd_bin + ' -out ' + maf_prefix + ' -fai /home/jamcgirr/ph/data/c_harengus/c.harengus.fa.fai -doGlf 2 -doMaf 3 -doMajorMinor 1 -doPost 1 -doGeno 2 -vcf-pl ' + pop_vcf + ' -P 4 \n',
        'gzip -d ' + maf_prefix + '.mafs.gz \n',
        '\n\n#command to run: sbatch ' + script,
    ])
    o.close()

In [10]:
# LD with ngsLD
#
# For every chromosome two kinds of scripts are written:
#   1. script_prune_<chrom>.sh  - extract the chromosome, downsample to 20000 SNPs,
#                                 convert to BEAGLE-PL (~1 hr)
#   2. script_<pop>_<n>_LD_<chrom>_20k_snps.sh - per-population BEAGLE-PL and ngsLD
#
# Between the two, create the position file for the --pos flag of ngsLD by hand:
# awk '{ print $1 }' chr1_thin20k_ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.BEAGLE.PL > /home/jamcgirr/ph/data/vcfs/split_pops/chr1_thin20k_sites.txt
# sed 's/:/\t/g' /home/jamcgirr/ph/data/vcfs/split_pops/chr1_thin20k_sites.txt > /home/jamcgirr/ph/data/vcfs/split_pops/test_pos_file2
# tail -n +2 /home/jamcgirr/ph/data/vcfs/split_pops/test_pos_file2 > /home/jamcgirr/ph/data/vcfs/split_pops/chr1_thin20k_sites.txt
# rm /home/jamcgirr/ph/data/vcfs/split_pops/test_pos_file2
#
# Afterwards, fit LD decay:
# Rscript --vanilla --slave /home/jamcgirr/apps/ngsLD/scripts/fit_LDdecay.R --ld_files /home/jamcgirr/ph/data/angsd/ngsLD/ld_files.list.txt --fit_level 3 --out test_plot.pdf
# Rscript --vanilla --slave /home/jamcgirr/apps/ngsLD/scripts/fit_LDdecay.R --ld_files /home/jamcgirr/ph/data/angsd/ngsLD/ld_files.list_lg.txt --max_kb_dist 100 --fit_level 3 --out test_plot_lg.pdf

# chr2..chr26 only: the full chr1..chr26 list was assigned first and then overridden.
chroms = ['chr' + str(n) for n in range(2, 27)]

vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm'
vcf_dir = '/home/jamcgirr/ph/data/vcfs/'
sample_list_dir = '/home/jamcgirr/ph/scripts/angsd/SFS/SFS_by_pop/'

# Entries are "<population>_<value passed to --n_ind>". Full list, currently overridden:
# ["BC17_64","CA17_70","PWS07_46","PWS17_56","PWS91_58","PWS96_72","SS06_41","SS17_64","SS96_78","TB06_52","TB17_72","TB91_74","TB96_73","WA17_72"]
infiles = ["BC17_64"]

for chrom in chroms:
    # --- 1. per-chromosome thinning script ---
    job_name = 'prune_' + chrom
    script = 'script_' + job_name + '.sh'
    chrom_vcf = vcf_dir + chrom + '_' + vcf_name + '.vcf'
    thin = vcf_dir + chrom + '_thin20k_' + vcf_name   # prefix shared by the thinned VCF and its BEAGLE output

    sbatch_header(job_name,'8','1','24')
    o = io.open(script,'a+', newline='\n')
    o.writelines([
        'module load bcftools \n',
        'bcftools view ' + vcf_dir + 'vince/' + vcf_name + '.vcf.gz -Ov --regions ' + chrom + ' > ' + chrom_vcf + ' \n',
        'java -jar /home/jamcgirr/apps/jvarkit/dist/downsamplevcf.jar -n 20000 ' + chrom_vcf + ' > ' + thin + '.vcf \n',
        'module load vcftools \n',
        'vcftools --vcf ' + thin + '.vcf --BEAGLE-PL --chr ' + chrom + ' --out ' + thin + ' \n',
        '\n\n#command to run: sbatch ' + script,
    ])
    o.close()

    # --- 2. per-population ngsLD scripts for this chromosome ---
    job_name = '_LD_' + chrom + '_20k_snps'
    for infile in infiles:
        script = 'script_' + infile + job_name + '.sh'
        pops = infile.split("_")
        pop_prefix = vcf_dir + 'split_pops/' + chrom + '_thin20k_' + pops[0] + '_' + vcf_name

        sbatch_header_loop(job_name,'8','1','24', infile)
        o = io.open(script,'a+', newline='\n')
        o.writelines([
            'module load vcftools \n',
            'vcftools --vcf ' + thin + '.vcf --keep ' + sample_list_dir + pops[0] + '_plates_1_through_5_rm.txt --BEAGLE-PL --chr ' + chrom + ' --out ' + pop_prefix + ' \n',
            'gzip ' + pop_prefix + '.BEAGLE.PL \n',
            '/home/jamcgirr/apps/ngsLD/ngsLD --geno ' + pop_prefix + '.BEAGLE.PL.gz --pos ' + vcf_dir + 'split_pops/' + chrom + '_thin20k_sites.txt --n_ind ' + pops[1] + ' --n_sites 20000 --out /home/jamcgirr/ph/data/angsd/ngsLD/' + chrom + '_thin20k_' + pops[0] + '_' + vcf_name + '_ld.txt \n',
            '\n\n#run: sbatch ' + script,
        ])
        o.close()


In [5]:
# LD with plink on the full dataset: pairwise r2 plus LD pruning with
# --indep-pairwise 100 10 0.8 (~2 min).

job_name = 'LD'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
script = 'script_' + job_name + '.sh'

plink_in = '/home/jamcgirr/ph/data/vcfs/' + vcf_name
plink_out = '/home/jamcgirr/ph/data/plink/' + vcf_name + '_indep_pairwise_100_10_0.8'

sbatch_header(job_name,'8','8','24')
o = io.open(script,'a+', newline='\n')
o.writelines([
    'module load plink \n',
    'plink --file ' + plink_in + ' --indep-pairwise 100 10 0.8 --r2 --out ' + plink_out + ' --threads 8 \n',
    # footer: how to submit the generated script
    '\n\n#command to run: sbatch ' + script,
])
o.close()

# LD for each population
job_name = '_LD'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]

# One submission script per population: convert that population's VCF to
# plink format with vcftools, then run the same pruning / r2 call on it.
for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'

    # File stem shared by the per-population VCF and every plink path.
    pop_stem = 'population_' + infile + '_' + vcf_name
    pop_vcf = '/home/jamcgirr/ph/data/vcfs/split_pops/maf05/' + pop_stem + '.vcf.gz'
    plink_prefix = '/home/jamcgirr/ph/data/plink/' + pop_stem

    commands = [
        'module load vcftools \n',
        'vcftools --gzvcf ' + pop_vcf + ' --plink --out ' + plink_prefix + ' \n\n',
        'module load plink \n',
        'plink --file ' + plink_prefix + ' --indep-pairwise 100 10 0.8 --r2 --out ' + plink_prefix + '_indep_pairwise_100_10_0.8 --threads 4 \n',
        '\n\n#command to run: sbatch ' + script,
    ]

    # Header first (mem=8G, ntasks=4, 1 h), then append the commands.
    sbatch_header_loop(job_name, '8', '4', '1', infile)
    o = io.open(script, 'a+', newline='\n')
    o.writelines(commands)
    o.close()
# ~ 5 min

In [20]:
'''
job_name = 'popvae_PWS'

sbatch_header(job_name,'32','4','24')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')  
   
    
o.write('module load samtools \n')
o.write('module load bcftools \n')
o.write('cat /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS91_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS96_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS07_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS17_bams_p_1_5_rm.txt > /home/jamcgirr/ph/data/popvae/pops_PWS.txt \n')
o.write('sed -i \'s/\/home\/eoziolor\/phpopg\/data\/align\///g\' /home/jamcgirr/ph/data/popvae/pops_PWS.txt \n')
o.write('sed -i \'s/.bam//g\' /home/jamcgirr/ph/data/popvae/pops_PWS.txt \n')

o.write('bcftools view -S /home/jamcgirr/ph/data/popvae/pops_PWS.txt -Oz /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz -R /home/jamcgirr/ph/data/moments/ld_prune/random_thin_keep.txt --threads 4 > /home/jamcgirr/ph/data/popvae/PWS_random_thin.vcf.gz \n')
o.write('source /home/jamcgirr/apps/my_python3.7/bin/activate \n')
o.write('popvae.py --infile /home/jamcgirr/ph/data/popvae/PWS_random_thin.vcf.gz --out /home/jamcgirr/ph/data/popvae/PWS_random_thin --seed 42 \n')


o.write('\n\n#run: sbatch '+script)
o.close()

###################################
job_name = 'popvae_TB'

sbatch_header(job_name,'32','4','24')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')  
   
    
o.write('module load samtools \n')
o.write('module load bcftools \n')
o.write('cat /home/jamcgirr/ph/data/angsd/SFS/thetas/TB91_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/TB96_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/TB06_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/TB17_bams_p_1_5_rm.txt > /home/jamcgirr/ph/data/popvae/pops_TB.txt \n')
o.write('sed -i \'s/\/home\/eoziolor\/phpopg\/data\/align\///g\' /home/jamcgirr/ph/data/popvae/pops_TB.txt \n')
o.write('sed -i \'s/.bam//g\' /home/jamcgirr/ph/data/popvae/pops_TB.txt \n')

o.write('bcftools view -S /home/jamcgirr/ph/data/popvae/pops_TB.txt -Oz /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz -R /home/jamcgirr/ph/data/moments/ld_prune/random_thin_keep.txt --threads 4 > /home/jamcgirr/ph/data/popvae/TB_random_thin.vcf.gz \n')
o.write('source /home/jamcgirr/apps/my_python3.7/bin/activate \n')
o.write('popvae.py --infile /home/jamcgirr/ph/data/popvae/TB_random_thin.vcf.gz --out /home/jamcgirr/ph/data/popvae/TB_random_thin --seed 42 \n')


o.write('\n\n#run: sbatch '+script)
o.close()

###################################
job_name = 'popvae_SS'

sbatch_header(job_name,'32','4','24')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')  
   
    
o.write('module load samtools \n')
o.write('module load bcftools \n')
o.write('cat /home/jamcgirr/ph/data/angsd/SFS/thetas/SS96_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/SS06_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/SS17_bams_p_1_5_rm.txt > /home/jamcgirr/ph/data/popvae/pops_SS.txt \n')
o.write('sed -i \'s/\/home\/eoziolor\/phpopg\/data\/align\///g\' /home/jamcgirr/ph/data/popvae/pops_SS.txt \n')
o.write('sed -i \'s/.bam//g\' /home/jamcgirr/ph/data/popvae/pops_SS.txt \n')

o.write('bcftools view -S /home/jamcgirr/ph/data/popvae/pops_SS.txt -Oz /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz -R /home/jamcgirr/ph/data/moments/ld_prune/random_thin_keep.txt --threads 4 > /home/jamcgirr/ph/data/popvae/SS_random_thin.vcf.gz \n')
o.write('source /home/jamcgirr/apps/my_python3.7/bin/activate \n')
o.write('popvae.py --infile /home/jamcgirr/ph/data/popvae/SS_random_thin.vcf.gz --out /home/jamcgirr/ph/data/popvae/SS_random_thin --seed 42 \n')


o.write('\n\n#run: sbatch '+script)
o.close()
###################################
job_name = 'popvae_NEPac17'

sbatch_header(job_name,'32','4','24')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')  
   
    
o.write('module load samtools \n')
o.write('module load bcftools \n')
o.write('cat /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/SS17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/BC17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/WA17_bams_p_1_5_rm.txt > /home/jamcgirr/ph/data/popvae/pops_NEPac17.txt \n')
o.write('sed -i \'s/\/home\/eoziolor\/phpopg\/data\/align\///g\' /home/jamcgirr/ph/data/popvae/pops_NEPac17.txt \n')
o.write('sed -i \'s/.bam//g\' /home/jamcgirr/ph/data/popvae/pops_NEPac17.txt \n')

o.write('bcftools view -S /home/jamcgirr/ph/data/popvae/pops_NEPac17.txt -Oz /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz -R /home/jamcgirr/ph/data/moments/ld_prune/random_thin_keep.txt --threads 4 > /home/jamcgirr/ph/data/popvae/NEPac17_random_thin.vcf.gz \n')
o.write('source /home/jamcgirr/apps/my_python3.7/bin/activate \n')
o.write('popvae.py --infile /home/jamcgirr/ph/data/popvae/NEPac17_random_thin.vcf.gz --out /home/jamcgirr/ph/data/popvae/NEPac17_random_thin --seed 42 \n')


o.write('\n\n#run: sbatch '+script)
o.close()

###################################
job_name = 'popvae_PWS_SS'

sbatch_header(job_name,'32','4','72')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')  
   
    
o.write('module load samtools \n')
o.write('module load bcftools \n')
o.write('cat /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/SS17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS91_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS96_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS07_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/SS96_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/SS06_bams_p_1_5_rm.txt > /home/jamcgirr/ph/data/popvae/pops_PWS_SS.txt \n')
o.write('sed -i \'s/\/home\/eoziolor\/phpopg\/data\/align\///g\' /home/jamcgirr/ph/data/popvae/pops_PWS_SS.txt \n')
o.write('sed -i \'s/.bam//g\' /home/jamcgirr/ph/data/popvae/pops_PWS_SS.txt \n')

o.write('bcftools view -S /home/jamcgirr/ph/data/popvae/pops_PWS_SS.txt -Oz /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz -R /home/jamcgirr/ph/data/moments/ld_prune/random_thin_keep.txt --threads 4 > /home/jamcgirr/ph/data/popvae/PWS_SS_random_thin.vcf.gz \n')
o.write('source /home/jamcgirr/apps/my_python3.7/bin/activate \n')
o.write('popvae.py --infile /home/jamcgirr/ph/data/popvae/PWS_SS_random_thin.vcf.gz --out /home/jamcgirr/ph/data/popvae/PWS_SS_random_thin --seed 42 \n')


o.write('\n\n#run: sbatch '+script)
o.close()
'''
###################################
job_name = 'popvae'
script = 'script_' + job_name + '.sh'

# Dataset stem used for both the input VCF and the popvae output prefix.
popvae_dataset = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'

# Header: mem=32G, ntasks=4, 144 h.
sbatch_header(job_name, '32', '4', '144')

popvae_lines = [
    # activate the python environment before calling popvae.py
    'source /home/jamcgirr/apps/my_python3.7/bin/activate \n',
    'popvae.py --infile /home/jamcgirr/ph/data/vcfs/' + popvae_dataset + '.vcf.gz --out /home/jamcgirr/ph/data/popvae/' + popvae_dataset + ' --seed 42 \n',
    '\n\n#run: sbatch ' + script,
]

o = io.open(script, 'a+', newline='\n')
o.writelines(popvae_lines)
o.close()




In [12]:
# This is slow and clunky. Don't use.
job_name = '_fst_perms'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm'
realSFS = '/home/jamcgirr/apps/angsd/misc/realSFS'
wk_dir = '/home/jamcgirr/ph/data/angsd/SFS/fst_pbs/perms/'
# number of sample IDs taken from the shuffled list for each population
pop_n = {"BC17":"64","CA17":"70","PWS07":"46","PWS17":"56","PWS91":"58","PWS96":"72","SS06":"41","SS17":"64","SS96":"78","TB06":"52","TB17":"72","TB91":"74","TB96":"73","WA17":"72"}
infiles = ["TB17_PWS17"]

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name, '8', '4', '72', infile)

    # "A_B" -> the two population labels being compared
    pops = infile.split("_")
    first, second = pops[0], pops[1]

    sample_lists = '/home/jamcgirr/ph/scripts/angsd/SFS/SFS_by_pop/'
    master_vcf = '/home/jamcgirr/ph/data/vcfs/' + vcf_name + '.vcf.gz'
    angsd_exe = '/home/jamcgirr/apps/angsd_sep_20/angsd/angsd'
    ancestral_fa = '/home/jamcgirr/ph/data/c_harengus/c.harengus.fa'

    pooled = wk_dir + 'pops' + first + second + '.txt'   # both sample lists concatenated
    shuffled = wk_dir + 'shufpops' + first + second      # pair-level prefix (no extension)
    perm_a = wk_dir + 'shufpop' + first                  # per-population prefixes (no extension)
    perm_b = wk_dir + 'shufpop' + second

    lines = [
        # pool the two sample lists, shuffle, and split back into two groups
        # sized by pop_n (head for the first label, tail for the second)
        'cat ' + sample_lists + first + '_plates_1_through_5_rm.txt ' + sample_lists + second + '_plates_1_through_5_rm.txt > ' + pooled + ' \n',
        'shuf ' + pooled + ' > ' + shuffled + '.txt \n',
        'head -' + pop_n[first] + ' ' + shuffled + '.txt > ' + perm_a + '.txt \n',
        'tail -' + pop_n[second] + ' ' + shuffled + '.txt > ' + perm_b + '.txt \n\n',

        # subset the master VCF to each permuted group
        'module load samtools \n',
        'module load bcftools \n',
        'bcftools view -S ' + perm_a + '.txt -Ov ' + master_vcf + ' > ' + perm_a + '.vcf \n',
        'bcftools view -S ' + perm_b + '.txt -Ov ' + master_vcf + ' > ' + perm_b + '.vcf \n\n',

        # saf per permuted group
        angsd_exe + ' -doSaf 1 -vcf-pl ' + perm_a + '.vcf -out ' + perm_a + ' -anc ' + ancestral_fa + ' \n',
        angsd_exe + ' -doSaf 1 -vcf-pl ' + perm_b + '.vcf -out ' + perm_b + ' -anc ' + ancestral_fa + ' \n',

        # joint folded sfs -> per-site fst index -> windowed stats
        realSFS + ' ' + perm_a + '.saf.idx ' + perm_b + '.saf.idx -P 4 -fold 1 > ' + shuffled + '.sfs \n',
        realSFS + ' fst index ' + perm_a + '.saf.idx ' + perm_b + '.saf.idx -sfs ' + shuffled + '.sfs -fold 1 -fstout ' + shuffled + '_persite \n',
        realSFS + ' fst stats2 ' + shuffled + '_persite.fst.idx -win 50000 -step 10000 > ' + shuffled + '_fst_pbs_50kb_win_10kb_step_folded.txt \n',

        '\n\n#command to run: sbatch ' + script,
    ]

    o = io.open(script, 'a+', newline='\n')
    o.writelines(lines)
    o.close()
    
#EST run time ~8hr

In [None]:
# using mpileup.bcf files with PL Format for angsd input
# for some unknown reason, angsd throws a segfault halfway through chr1 during doSaf
# so nevermind about this approach. 


In [3]:
# lets try downsampling populations to the smallest n (SS06 = 41)
# run angsd direct on bam files

# if using vcf sites, need to index sites.txt 
# /home/jamcgirr/apps/angsd/angsd sites index sites.txt

job_name = '_angsd_downsample_sfs'
wk_dir = "/home/jamcgirr/ph/data/angsd/SFS/downsample/"
#vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5'
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]
# the full list above is immediately narrowed to the populations still to be run
infiles = ["PWS07","SS96","TB06","TB17","TB91","TB96","WA17"]
#infiles = ["BC17"]

angsd_exe = '/home/jamcgirr/apps/angsd_sep_20/angsd/angsd'
realsfs_exe = '/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS'
reference_fa = '/home/jamcgirr/ph/data/c_harengus/c.harengus.fa'

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'

    bam_subset = wk_dir + 'downsample_bams_' + infile + '.txt'   # the 41 randomly drawn bams
    out_prefix = wk_dir + infile + '_minQ20_minMQ30'             # prefix for saf and sfs output

    steps = [
        # shuffle the population's bam list and keep the first 41 entries
        'shuf /home/jamcgirr/ph/data/angsd/SFS/bamlist_test/' + infile + '_bams_p_1_5_rm.txt | head -41 > ' + bam_subset + ' \n\n',
        # make saf from bams
        #o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -bam '+wk_dir+'downsample_bams_'+infile+'.txt -doSaf 1 -doMajorMinor 1 -doMaf 3 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -minMapQ 30 -minQ 20 -GL 1 -P 8 -out '+wk_dir+infile+'_minQ20_minMQ30 \n\n')
        # make saf from bams using filters (same fasta passed as -anc and -ref)
        angsd_exe + ' -bam ' + bam_subset + ' -doSaf 1 -doMajorMinor 1 -doMaf 3 -doCounts 1 -doGlf 3 -anc ' + reference_fa + ' -ref ' + reference_fa + ' -minMapQ 30 -minQ 20 -GL 1 -P 8 -uniqueOnly 1 -remove_bads 1 -only_proper_pairs 1 -trim 0 -C 50 -minInd 10 -setMinDepth 10 -setMaxDepth 100 -out ' + out_prefix + ' \n\n',
        # make folded sfs
        realsfs_exe + ' ' + out_prefix + '.saf.idx -P 8 -fold 1 -nSites 100000000 > ' + out_prefix + '_folded.sfs \n',
        # make unfolded sfs
        realsfs_exe + ' ' + out_prefix + '.saf.idx -P 8 -nSites 100000000 > ' + out_prefix + '_unfolded.sfs \n',
        '\n\n#run: sbatch ' + script,
    ]

    # Header (mem=40G, ntasks=8, 48 h) is written first; the steps are appended.
    sbatch_header_loop(job_name, '40', '8', '48', infile)
    o = io.open(script, 'a+', newline='\n')
    o.writelines(steps)
    o.close()

# EST run time ~ 1 day

# use the -nSites 100000000 flag due to memory limits
# this breaks up genome into about 7 chunks
# this gives you seven lines in the output.sfs
# need to sum columns to make final sfs for following step

job_name = '_angsd_downsample_thetas'
saf_dir = "/home/jamcgirr/ph/data/angsd/SFS/downsample/saf/"
fold_dir = "/home/jamcgirr/ph/data/angsd/SFS/downsample/sfs/folded/"
# Fixed: this used to repeat the folded/ path and was never used, so the
# unfolded thetas below were computed with the folded sfs as prior.
# NOTE(review): assumes the *_unfolded.sfs files live in sfs/unfolded/ next to
# sfs/folded/ -- confirm against the directory layout on the cluster.
unfold_dir = "/home/jamcgirr/ph/data/angsd/SFS/downsample/sfs/unfolded/"
thetas_dir = "/home/jamcgirr/ph/data/angsd/SFS/downsample/thetas/"
#vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5'
#infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]
#infiles = ["CA17"]

# `infiles` is not set here: it is whatever list was assigned earlier in this
# cell (or by one of the commented overrides above, when enabled).
for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    # Header: mem=30G, ntasks=8, 24 h. The job body is appended below.
    sbatch_header_loop(job_name,'30','8','24', infile)

    # `with` guarantees the script is flushed and closed even if a write fails.
    with io.open(script,'a+', newline='\n') as o:
        # folded: per-site thetas from the saf index with the folded sfs as
        # prior, then genome-wide and 50 kb window / 10 kb step summaries
        o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS saf2theta '+saf_dir+infile+'_minQ20_minMQ30.saf.idx -outname '+thetas_dir+'folded/'+infile+'_minQ20_minMQ30 -fold 1 -sfs '+fold_dir+infile+'_minQ20_minMQ30_folded.sfs \n')
        o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/thetaStat do_stat '+thetas_dir+'folded/'+infile+'_minQ20_minMQ30.thetas.idx \n')
        o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/thetaStat do_stat '+thetas_dir+'folded/'+infile+'_minQ20_minMQ30.thetas.idx -win 50000 -step 10000 -outnames '+thetas_dir+'folded/'+infile+'_minQ20_minMQ30_50kb_win_10kb_step \n\n')
        # unfolded: same three steps without -fold 1, using the unfolded sfs
        o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS saf2theta '+saf_dir+infile+'_minQ20_minMQ30.saf.idx -outname '+thetas_dir+'unfolded/'+infile+'_minQ20_minMQ30 -sfs '+unfold_dir+infile+'_minQ20_minMQ30_unfolded.sfs \n')
        o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/thetaStat do_stat '+thetas_dir+'unfolded/'+infile+'_minQ20_minMQ30.thetas.idx \n')
        o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/thetaStat do_stat '+thetas_dir+'unfolded/'+infile+'_minQ20_minMQ30.thetas.idx -win 50000 -step 10000 -outnames '+thetas_dir+'unfolded/'+infile+'_minQ20_minMQ30_50kb_win_10kb_step \n')

        o.write('\n\n#run: sbatch '+script)

# EST run time ~4hr

In [3]:
# sfs from downsample_41 saf using sites from plink pruned master vcf looks REAL dumb
# try using genome-wide LD with genotype likelihoods
# doGlf and ngsLD 
#https://github.com/fgvieira/ngsLD/issues/1

# ngsLD doesn't like .glf input. try angsd doGeno to get .geno.gz

# cool, geno works when you split by chr in a pop. Now LD prune in one pop across all chrs and use those sites to make sfs.

job_name = '_ngsLD_downsample'
glf_dir = "/home/jamcgirr/ph/data/angsd/SFS/downsample/glf/"
#infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]
infiles = ["PWS07"]

# One ngsLD submission script per population in `infiles`.
for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    # Header: mem=8G, ntasks=8, 24 h. The job body is appended below.
    sbatch_header_loop(job_name,'8','8','24', infile)

    # `with` guarantees the script is flushed and closed even if a write fails.
    with io.open(script,'a+', newline='\n') as o:
        # Disabled (leading '#') hint for unpacking the likelihood file by hand.
        # Fixed: it used `saf_dir`, which this cell never defines (NameError on a
        # fresh kernel) and which is not where the .glf.gz read below lives.
        o.write('#gzip -d '+glf_dir+infile+'_minQ20_minMQ30.glf.gz \n')
        # --n_ind 41 matches the 41-sample downsampling used for these inputs
        o.write('/home/jamcgirr/apps/ngsLD/ngsLD --geno '+glf_dir+infile+'_minQ20_minMQ30.glf.gz --n_ind 41 --max_kb_dist 5 --n_threads 8 --out /home/jamcgirr/ph/data/angsd/ngsLD/downsample/'+infile+'_minQ20_minMQ30_ld.txt \n')

        o.write('\n\n#run: sbatch '+script)

In [10]:
# Doesn't work: hits memory limits
# Fst and PBS using downsample 41 samples

# 1. Create 2d sfs from saf

job_name = '_2d_sfs_downsample'

realSFS = '/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS'
wk_dir = '/home/jamcgirr/ph/data/angsd/SFS/downsample/fst/'
saf_dir = '/home/jamcgirr/ph/data/angsd/SFS/downsample/saf/'

infiles = ["BC17_CA17","BC17_WA17","PWS07_PWS17","PWS07_SS06","PWS17_BC17","PWS17_CA17","PWS17_SS17","PWS17_WA17","PWS91_PWS07","PWS91_PWS17","PWS91_PWS96","PWS96_PWS07","PWS96_PWS17","PWS96_SS96","SS06_SS17","SS17_BC17","SS17_CA17","SS17_WA17","SS96_SS06","SS96_SS17","TB06_PWS07","TB06_SS06","TB06_TB17","TB17_BC17","TB17_CA17","TB17_PWS17","TB17_SS17","TB17_WA17","TB91_TB06","TB91_TB17","TB91_TB96","TB96_PWS96","TB96_SS96","TB96_TB06","TB96_TB17","WA17_CA17"]
# the full list of pairs above is immediately narrowed to a single test pair
infiles = ["BC17_CA17"]

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'

    # "A_B" -> the two population labels whose saf indexes are combined
    pops = infile.split("_")

    # Part of the realSFS call shared by both runs: the two saf indexes, folded, 8 threads.
    joint_call = realSFS + ' ' + saf_dir + pops[0] + '_minQ20_minMQ30.saf.idx ' + saf_dir + pops[1] + '_minQ20_minMQ30.saf.idx -fold 1 -P 8'
    folded_out = wk_dir + 'folded/' + infile

    # Header: mem=8G, ntasks=8, 24 h. The job body is appended below.
    sbatch_header_loop(job_name, '8', '8', '24', infile)
    o = io.open(script, 'a+', newline='\n')

    # once limited with -nSites, once over everything in a single pass
    o.write(joint_call + ' -nSites 100000000 > ' + folded_out + '_folded_chunks.sfs \n')
    o.write(joint_call + ' > ' + folded_out + '_folded_full.sfs \n')

    #o.write(realSFS+' '+saf_dir+pops[0]+'_minQ20_minMQ30.saf.idx '+saf_dir+pops[1]+'_minQ20_minMQ30.saf.idx -P 8 > '+wk_dir+'/unfolded/'+infile+'unfolded.sfs \n')

    o.write('\n\n#command to run: sbatch ' + script)
    o.close()

In [24]:
# get allele frequencies for arcsine transformed shifts
job_name = '_freqs'

vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]

# One submission script per population: dump CHROM, POS and AF for every
# record of that population's VCF into a text file.
for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'

    pop_stem = 'population_' + infile + '_' + vcf_name
    pop_vcf = '/home/jamcgirr/ph/data/vcfs/split_pops/maf05/' + pop_stem + '.vcf.gz'
    freq_txt = '/home/jamcgirr/ph/data/freqs/' + pop_stem + '_freqs.txt'

    # Header: mem=8G, ntasks=4, 1 h. The job body is appended below.
    sbatch_header_loop(job_name, '8', '4', '1', infile)
    o = io.open(script, 'a+', newline='\n')
    o.writelines([
        'module load bcftools\n',
        # "\\n" keeps a literal backslash-n in the script for bcftools to expand
        "bcftools query -f '%CHROM %POS %AF\\n' " + pop_vcf + ' > ' + freq_txt + ' \n',
        '\n\n#command to run: sbatch ' + script,
    ])
    o.close()
            
# calculate direct from AC and AN        
#bcftools query -f '%CHROM %POS %AN %AC{0}\n' /home/jamcgirr/ph/data/vcfs/split_pops/population_PWS07_ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_plates_1_5_rm.vcf > /home/jamcgirr/ph/data/freqs/PWS07_freqs_test.txt

In [13]:
job_name = '_freqs_perms'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_minQ20_minMQ30_NS0.5_maf0.05'
wk_dir = '/home/jamcgirr/ph/data/freqs/perms/'
pop_n = {"BC17":"64","CA17":"70","PWS07":"46","PWS17":"56","PWS91":"58","PWS96":"72","SS06":"41","SS17":"64","SS96":"78","TB06":"52","TB17":"72","TB91":"74","TB96":"73","WA17":"72"}
infiles = ["PWS91_PWS96","PWS96_PWS07","PWS07_PWS17","TB91_TB96","TB96_TB06","TB06_TB17","SS96_SS06","SS06_SS17"]

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'8','4','144', infile)
    o = io.open(script,'a+', newline='\n')
    
    o.write('for i in {1..1000} \n')
    o.write('do \n')
    
    pops = ''.join(infile).split("_")

    o.write('cat /home/jamcgirr/ph/scripts/angsd/SFS/SFS_by_pop/'+pops[0]+'_plates_1_through_5_rm.txt /home/jamcgirr/ph/scripts/angsd/SFS/SFS_by_pop/'+pops[1]+'_plates_1_through_5_rm.txt > '+wk_dir+infile+'/pops_'+pops[0]+pops[1]+'.txt \n')
    o.write('shuf '+wk_dir+infile+'/pops_'+pops[0]+pops[1]+'.txt > '+wk_dir+infile+'/shufpops_'+pops[0]+pops[1]+'.txt \n')
    o.write('head -'+pop_n[pops[0]]+' '+wk_dir+infile+'/shufpops_'+pops[0]+pops[1]+'.txt > '+wk_dir+infile+'/shufpop_'+pops[0]+'.txt \n')
    o.write('tail -'+pop_n[pops[1]]+' '+wk_dir+infile+'/shufpops_'+pops[0]+pops[1]+'.txt > '+wk_dir+infile+'/shufpop_'+pops[1]+'.txt \n\n')

    o.write('module load bcftools \n')
    o.write('bcftools view -S '+wk_dir+infile+'/shufpop_'+pops[0]+'.txt /home/jamcgirr/ph/data/vcfs/'+vcf_name+'.vcf.gz --threads 4 | bcftools +fill-tags -- -t AF | bcftools query -f \'%CHROM %POS %AF\\n\' > '+wk_dir+infile+'/shufpop_'+pops[0]+'_freqs.txt \n')
    o.write('bcftools view -S '+wk_dir+infile+'/shufpop_'+pops[1]+'.txt /home/jamcgirr/ph/data/vcfs/'+vcf_name+'.vcf.gz --threads 4 | bcftools +fill-tags -- -t AF | bcftools query -f \'%CHROM %POS %AF\\n\' > '+wk_dir+infile+'/shufpop_'+pops[1]+'_freqs.txt \n\n')
    
    o.write('Rscript '+wk_dir+'/AF_permutations.R --pop0 '+pops[0]+' --pop1 '+pops[1]+' \n')
    o.write('rm '+wk_dir+infile+'/shufpop_'+pops[0]+'_freqs.txt \n')
    o.write('rm '+wk_dir+infile+'/shufpop_'+pops[1]+'_freqs.txt \n')
    
    o.write('done \n\n')
