In [2]:
import io
import pandas as pd
import numpy as np

def sbatch_header(job,mem,tasks,hours):
    #sbatch submission script header
    script = 'script_' + job + '.sh'
    outfile = io.open(script,'w', newline='\n')    
    outfile.write('#!/bin/bash\n\n#SBATCH --job-name='+job+'\n')
    outfile.write('#SBATCH --mem='+mem+'G \n')
    outfile.write('#SBATCH --ntasks='+tasks+' \n')
    outfile.write('#SBATCH -e '+job+'_%A_%a.err \n')
    outfile.write('#SBATCH --time='+hours+':00:00  \n')
    outfile.write('#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n')
    outfile.write('#SBATCH -p high \n\n')
    outfile.close()
    
def sbatch_header_loop(job,mem,tasks,hours,infile):
    #sbatch submission script header
    script = 'script_' + infile + job + '.sh'
    outfile = io.open(script,'w', newline='\n') 
    jobname= infile + job   
    outfile.write('#!/bin/bash\n\n#SBATCH --job-name='+jobname+'\n')
    outfile.write('#SBATCH --mem='+mem+'G \n')
    outfile.write('#SBATCH --ntasks='+tasks+' \n')
    outfile.write('#SBATCH -e '+jobname+'_%A_%a.err \n')
    outfile.write('#SBATCH --time='+hours+':00:00 \n')
    outfile.write('#SBATCH --mail-user=jamcgirr@ucdavis.edu ##email you when job starts,ends,etc\n#SBATCH --mail-type=ALL\n')
    outfile.write('#SBATCH -p high \n\n')
    outfile.close()

In [None]:
# Fst persite PWS96 + PWS07 + PWS17 vs. PWS 91
# Fst persite PWS96 + PWS07 + PWS17 vs. NEPac (SS96,SS06,SS17,BC17,WA17)
# Fst persite PWS17 vs. NEPac (SS17,BC17,WA17)

job_name = 'fst_pops_PWS96_07_17_v_PWS91'
script_dir = '/home/jamcgirr/ph/scripts/angsd/SFS/fst_fixed_PWS/'
fst_dir = '/home/jamcgirr/ph/data/angsd/SFS/fst_fixed_PWS/'

sbatch_header(job_name,'16','4','144')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')

o.write('module load samtools \n')
o.write('module load bcftools \n')
o.write('cat /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS96_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS07_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS17_bams_p_1_5_rm.txt > '+fst_dir+'pops_PWS96_07_17.txt \n')
o.write('sed -i \'s/\/home\/eoziolor\/phpopg\/data\/align\///g\' '+fst_dir+'pops_PWS96_07_17.txt \n')
o.write('sed -i \'s/.bam//g\' '+fst_dir+'pops_PWS96_07_17.txt \n\n')

o.write('cp /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS91_bams_p_1_5_rm.txt '+fst_dir+'pops_PWS91.txt \n')
o.write('sed -i \'s/\/home\/eoziolor\/phpopg\/data\/align\///g\' '+fst_dir+'pops_PWS91.txt \n')
o.write('sed -i \'s/.bam//g\' '+fst_dir+'pops_PWS91.txt \n\n')

#subset pop vcfs
o.write('bcftools view -S '+fst_dir+'pops_PWS96_07_17.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_PWS96_07_17.vcf \n')
o.write('bcftools view -S '+fst_dir+'pops_PWS91.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_PWS91.vcf \n\n')

# make saf from vcfs
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_PWS96_07_17.vcf -out '+fst_dir+'pops_PWS96_07_17 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_PWS91.vcf -out '+fst_dir+'pops_PWS91 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n\n')
o.write('rm '+fst_dir+'pops_PWS96_07_17.vcf')
o.write('rm '+fst_dir+'pops_PWS91.vcf')

# make folded sfs
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS96_07_17.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_PWS96_07_17.sfs \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS91.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_PWS91.sfs \n\n')

# make 2dsfs
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS96_07_17.saf.idx '+fst_dir+'pops_PWS91.saf.idx -fold 1 -P 4 > '+fst_dir+'pops_PWS96_07_17_v_PWS91.sfs \n\n')

# estimate fst
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst index '+fst_dir+'pops_PWS96_07_17.saf.idx '+fst_dir+'pops_PWS91.saf.idx -sfs '+fst_dir+'pops_PWS96_07_17_v_PWS91.sfs -fold 1 -fstout '+fst_dir+'pops_PWS96_07_17_v_PWS91 \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst stats2 '+fst_dir+'pops_PWS96_07_17_v_PWS91.fst.idx -win 50000 -step 10000 > '+fst_dir+'fst_50kb_win_10kb_step_folded_pops_PWS96_07_17_v_PWS91.txt \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst print '+fst_dir+'pops_PWS96_07_17_v_PWS91.fst.idx > '+fst_dir+'fst_persite_folded_pops_PWS96_07_17_v_PWS91.txt')
    
o.write('\n\n#run: sbatch '+script)
o.close()
    
    
    
###################################################################
job_name = 'fst_pops_PWS96_07_17_v_NEPac'

sbatch_header(job_name,'16','4','144')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')

o.write('module load samtools \n')
o.write('module load bcftools \n')  
o.write('cat /home/jamcgirr/ph/data/angsd/SFS/thetas/SS96_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/SS06_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/SS17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/BC17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/WA17_bams_p_1_5_rm.txt > '+fst_dir+'pops_NEPac.txt \n')
o.write('sed -i \'s/\/home\/eoziolor\/phpopg\/data\/align\///g\' '+fst_dir+'pops_NEPac.txt \n')
o.write('sed -i \'s/.bam//g\' '+fst_dir+'pops_NEPac.txt \n\n')

o.write('# cat /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS96_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS07_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS17_bams_p_1_5_rm.txt > '+fst_dir+'pops_PWS96_07_17.txt \n')
o.write('# sed -i \'s/\/home\/eoziolor\/phpopg\/data\/align\///g\' '+fst_dir+'pops_PWS96_07_17.txt \n')
o.write('# sed -i \'s/.bam//g\' '+fst_dir+'pops_PWS96_07_17.txt \n\n')

#subset pop vcfs
o.write('#bcftools view -S '+fst_dir+'pops_PWS96_07_17.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_PWS96_07_17.vcf \n')
o.write('bcftools view -S '+fst_dir+'pops_NEPac.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_NEPac.vcf \n\n')

# make saf from vcfs
o.write('#/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_PWS96_07_17.vcf -out '+fst_dir+'pops_PWS96_07_17 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_NEPac.vcf -out '+fst_dir+'pops_NEPac -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n\n')
o.write('#rm '+fst_dir+'pops_PWS96_07_17.vcf')
o.write('rm '+fst_dir+'pops_NEPac.vcf')

# make folded sfs
o.write('#/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS96_07_17.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_PWS96_07_17.sfs \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_NEPac.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_NEPac.sfs \n\n')

# make 2dsfs
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS96_07_17.saf.idx '+fst_dir+'pops_NEPac.saf.idx -fold 1 -P 4 > '+fst_dir+'pops_PWS96_07_17_v_NEPac.sfs \n\n')

# estimate fst
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst index '+fst_dir+'pops_PWS96_07_17.saf.idx '+fst_dir+'pops_NEPac.saf.idx -sfs '+fst_dir+'pops_PWS96_07_17_v_NEPac.sfs -fold 1 -fstout '+fst_dir+'pops_PWS96_07_17_v_NEPac \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst stats2 '+fst_dir+'pops_PWS96_07_17_v_NEPac.fst.idx -win 50000 -step 10000 > '+fst_dir+'fst_50kb_win_10kb_step_folded_pops_PWS96_07_17_v_NEPac.txt \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst print '+fst_dir+'pops_PWS96_07_17_v_NEPac.fst.idx > '+fst_dir+'fst_persite_folded_pops_PWS96_07_17_v_NEPac.txt')
    
o.write('\n\n#run: sbatch '+script)
o.close()
   

    
###################################################################
job_name = 'fst_pops_PWS17_v_NEPac17'

sbatch_header(job_name,'16','4','144')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')

o.write('module load samtools \n')
o.write('module load bcftools \n')  
o.write('cat /home/jamcgirr/ph/data/angsd/SFS/thetas/SS17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/BC17_bams_p_1_5_rm.txt /home/jamcgirr/ph/data/angsd/SFS/thetas/WA17_bams_p_1_5_rm.txt  > '+fst_dir+'pops_NEPac17.txt \n')
o.write('sed -i \'s/\/home\/eoziolor\/phpopg\/data\/align\///g\' '+fst_dir+'pops_NEPac17.txt \n')
o.write('sed -i \'s/.bam//g\' '+fst_dir+'pops_NEPac17.txt \n\n')

o.write('cp /home/jamcgirr/ph/data/angsd/SFS/thetas/PWS17_bams_p_1_5_rm.txt '+fst_dir+'pops_PWS17.txt \n')
o.write('sed -i \'s/\/home\/eoziolor\/phpopg\/data\/align\///g\' '+fst_dir+'pops_PWS17.txt \n')
o.write('sed -i \'s/.bam//g\' '+fst_dir+'pops_PWS17.txt \n\n')

#subset pop vcfs
o.write('bcftools view -S '+fst_dir+'pops_PWS17.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_PWS17.vcf \n')
o.write('bcftools view -S '+fst_dir+'pops_NEPac17.txt -Ov /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz --threads 4 > '+fst_dir+'pops_NEPac17.vcf \n\n')

# make saf from vcfs
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_PWS17.vcf -out '+fst_dir+'pops_PWS17 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -doSaf 1 -vcf-pl '+fst_dir+'pops_NEPac17.vcf -out '+fst_dir+'pops_NEPac17 -anc /home/jamcgirr/ph/data/c_harengus/c.harengus.fa \n\n')
o.write('rm '+fst_dir+'pops_PWS17.vcf')
o.write('rm '+fst_dir+'pops_NEPac17.vcf')

# make folded sfs
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS17.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_PWS17.sfs \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_NEPac17.saf.idx -P 4 -fold 1 > '+fst_dir+'pops_NEPac17.sfs \n\n')

# make 2dsfs
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS '+fst_dir+'pops_PWS17.saf.idx '+fst_dir+'pops_NEPac17.saf.idx -fold 1 -P 4 > '+fst_dir+'pops_PWS17_v_NEPac17.sfs \n\n')

# estimate fst
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst index  '+fst_dir+'pops_PWS17.saf.idx '+fst_dir+'pops_NEPac17.saf.idx -sfs '+fst_dir+'pops_PWS17_v_NEPac17.sfs -fold 1 -fstout '+fst_dir+'pops_PWS17_v_NEPac17 \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst stats2 '+fst_dir+'pops_PWS17_v_NEPac17.fst.idx -win 50000 -step 10000 > '+fst_dir+'fst_50kb_win_10kb_step_folded_pops_PWS17_v_NEPac17.txt \n')
o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/misc/realSFS fst print '+fst_dir+'pops_PWS17_v_NEPac17.fst.idx > '+fst_dir+'fst_persite_folded_pops_PWS17_v_NEPac17.txt')
    
o.write('\n\n#run: sbatch '+script)
o.close()
    
#EST run time ~ 3 days



In [3]:
# output SNP table

job_name = 'snp_table'

sbatch_header(job_name,'8','4','24')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')

o.write('module load R \n')
o.write('module load maven \n')
o.write('module load java \n')
o.write('module load GATK/4.1.4.1 \n\n')

o.write('gatk VariantsToTable -V /home/jamcgirr/ph/data/vcfs/vince/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm.vcf.gz -F CHROM -F POS -F REF -F ALT -O /home/jamcgirr/ph/data/vcfs/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm_table.txt \n\n')

#sed 's/chr//g' /home/jamcgirr/ph/data/vcfs/ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_outliers_rm_table.txt > /home/jamcgirr/ph/data/LoF/snp_table_VEP.txt
#sed -i 's/\t/:/g' /home/jamcgirr/ph/data/LoF/snp_table_VEP.txt
#sed -i '1d' /home/jamcgirr/ph/data/LoF/snp_table_VEP.txt

o.write('\n\n#run: sbatch '+script)
o.close()
#EST run time ~ 1 hr

In [6]:
# parse SNP table with R script (LoF.rmd)
# run variant effect predictor (https://uswest.ensembl.org/info/docs/tools/vep/script/index.html)

job_name = 'vep'

sbatch_header(job_name,'8','4','24')
script = 'script_' + job_name + '.sh'
o = io.open(script,'a+', newline='\n')

o.write('/home/jamcgirr/ph/data/LoF/ensembl-vep/vep --input_file /home/jamcgirr/ph/data/LoF/snp_table_VEP_standard_format.txt --output_file /home/jamcgirr/ph/data/LoF/test_output.txt --cache --species clupea_harengus \n')
      
o.write('\n\n#run: sbatch '+script)
o.close()       
        
#EST run time ~ 1 hr 20 min      

In [10]:
# create maf file using -ref to polarize.
# this will allow us to assign frequencies to REF and ALT rather than major and minor
# the latter will vary at some sites by population

job_name = '_maf_for_vep'
vcf_name = 'ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5'
infiles = ["BC17","CA17","PWS07","PWS17","PWS91","PWS96","SS06","SS17","SS96","TB06","TB17","TB91","TB96","WA17"]

for infile in infiles:
    script = 'script_' + infile + job_name + '.sh'
    sbatch_header_loop(job_name,'8','4','1', infile)
    o = io.open(script,'a+', newline='\n')
    
    
    # make mafs from vcf
    o.write('/home/jamcgirr/apps/angsd_sep_20/angsd/angsd -out /home/jamcgirr/ph/data/LoF/mafs_REF_ALT/'+infile+'_vcf -fai /home/jamcgirr/ph/data/c_harengus/c.harengus.fa.fai -ref /home/jamcgirr/ph/data/c_harengus/c.harengus.fa -doMajorMinor 4 -doMaf 1 -vcf-pl /home/jamcgirr/ph/data/vcfs/split_pops/population_'+infile+'_ph_filtered_snps_minDP600_maxDP2000_maf0.05_minQ20_minMQ30_maxmiss0.5_plates_1_5_rm.vcf -P 4 \n')
    o.write('gzip -d /home/jamcgirr/ph/data/LoF/mafs_REF_ALT/'+infile+'_vcf.mafs.gz \n')
    
    
    o.write('\n\n#command to run: sbatch '+script)
    o.close()

# EST run time ~ 5 minutes

In [5]:
snps = pd.read_csv('C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/angsd/SFS/fst/fixed_pws_persite/snp_table_VEP.txt', sep = ':', header = None)
snps.head(10)

Unnamed: 0,0,1,2,3
0,1,2045,G,A
1,1,2079,T,C
2,1,2085,T,C
3,1,2583,C,T
4,1,2674,G,A
5,1,7452,G,T
6,1,7496,G,C
7,1,8834,A,T
8,1,8964,T,C
9,1,9009,G,C


In [7]:
maf = pd.read_csv('C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/angsd/SFS/dxy/mafs/PWS91_vcf.mafs', sep = '\t', header = 0)
maf.head(10)
maf = maf.rename(columns={0:'chrom', 3:'start', 4:'stop'})


Unnamed: 0,chromo,position,major,minor,knownEM,nInd
0,chr1,2079,T,C,0.022064,36
1,chr1,2085,T,C,5e-06,35
2,chr1,2583,C,T,0.176155,34
3,chr1,7452,G,T,0.182796,37
4,chr1,7496,G,C,0.168173,37
5,chr1,9070,T,A,0.029781,25
6,chr1,9098,A,G,0.032971,26
7,chr1,13381,A,C,3e-06,32
8,chr1,13904,C,A,0.50482,26
9,chr1,14280,C,A,0.631408,30


In [8]:
vep = pd.read_csv('C:/Users/jmcgirr/Documents/Whitehead_Lab/ph/angsd/SFS/fst/fixed_pws_persite/VEP_output_all_snps.txt', sep = '\t')
vep.head(10)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 31, saw 14


In [None]:
all_dna_snps = pd.read_table(snp_dir + "58_dna_maf_0.5_maxmiss_0.9_snp_table.txt")
all_dna_snps['snpIndex'] = all_dna_snps['CHROM'].astype(str) + ':' + all_dna_snps['POS'].astype(str)
#all_dna_snps.head(10)
features = pd.read_table(cts_dir +"ref_C_variegatus-1.0_scaffolds_no_header.gff3", header = None)
gene = features[features[2] == 'gene']
gene = gene.join(gene[8].str.split('Name=', 1, expand=True).rename(columns={0:'junk1', 1:'junk2'}))
gene = gene.join(gene['junk2'].str.split(';', 1, expand=True).rename(columns={0:'geneName', 1:'junk3'}))
gene = gene.join(gene['junk1'].str.split('ID=', 1, expand=True).rename(columns={0:'junk4', 1:'junk5'}))
gene = gene.join(gene['junk5'].str.split(';', 1, expand=True).rename(columns={0:'geneID', 1:'junk6'}))
gene['geneID'] = gene['geneID'].astype(str) + ';' + gene['geneName'].astype(str)
gene = gene.rename(columns={0:'chrom', 3:'start', 4:'stop'})
gene = gene[['chrom','start','stop','geneID']]
mrna = features[features[2] == 'mRNA']
mrna = mrna.join(mrna[8].str.split('gene=', 1, expand=True).rename(columns={0:'junk1', 1:'junk2'}))
mrna = mrna.join(mrna['junk2'].str.split(';', 1, expand=True).rename(columns={0:'geneName', 1:'junk3'}))
mrna = mrna.join(mrna['junk3'].str.split('transcript_id=', 1, expand=True).rename(columns={0:'junk7', 1:'transcriptID'}))
mrna = mrna.join(mrna['junk1'].str.split('ID=', 1, expand=True).rename(columns={0:'junk4', 1:'junk5'}))
mrna = mrna.join(mrna['junk5'].str.split(';', 1, expand=True).rename(columns={0:'rnaID', 1:'junk6'}))
mrna['mrnaID'] = mrna['transcriptID'].astype(str) + ';' + mrna['geneName'].astype(str)
mrna = mrna.rename(columns={0:'chrom', 3:'start', 4:'stop'})
mrna = mrna[['chrom','start','stop','mrnaID']]
exon = features[features[2] == 'exon']
exon = exon.join(exon[8].str.split('gene=', 1, expand=True).rename(columns={0:'junk1', 1:'junk2'}))
exon = exon.join(exon['junk2'].str.split(';', 1, expand=True).rename(columns={0:'geneName', 1:'junk3'}))
exon = exon.join(exon['junk1'].str.split('ID=', 1, expand=True).rename(columns={0:'junk4', 1:'junk5'}))
exon = exon.join(exon['junk5'].str.split(';', 1, expand=True).rename(columns={0:'rnaID', 1:'junk6'}))
exon['exonID'] = exon['rnaID'].astype(str) + ';' + exon['geneName'].astype(str)
exon = exon.rename(columns={0:'chrom', 3:'start', 4:'stop'})
exon = exon[['chrom','start','stop','exonID']]

cts = cts_dir +infile + "_counts.csv"
cts = pd.read_table(cts)
cts['snpIndex'] = cts['contig'].astype(str) + ':'+ cts['position'].astype(str) 
cts = cts[['snpIndex','refAllele','altAllele','refCount', 'altCount', 'totalCount']]
mom = mom_dict[infile] +".GT"
snps = all_dna_snps[[mom,'snpIndex']]

mom_kid = cts.merge(snps, on='snpIndex')
mom_kid = mom_kid.join(mom_kid[mom].str.split('/', 1, expand=True).rename(columns={0:'momAllele', 1:'a2'}))
#only analyze homozygous alleles in mom that are het in offspring
mom_kid = mom_kid[mom_kid['momAllele'] == mom_kid['a2']]
mom_kid = mom_kid[(mom_kid['momAllele'] == mom_kid['refAllele']) | (mom_kid['momAllele'] == mom_kid['altAllele'])]
mom_kid = mom_kid[(mom_kid['refCount'] >= 10) & (mom_kid['altCount'] >= 10)]
mom_kid.loc[mom_kid['refAllele'] == mom_kid['momAllele'], 'momCount'] = mom_kid['refCount']
mom_kid.loc[mom_kid['altAllele'] == mom_kid['momAllele'], 'momCount'] = mom_kid['altCount']
mom_kid = mom_kid.join(mom_kid['snpIndex'].str.split(':', 1, expand=True).rename(columns={0:'chrom', 1:'position'}))
mom_kid = mom_kid[['chrom','position','snpIndex','refAllele','altAllele','refCount','altCount','totalCount','momAllele', 'momCount']]

all_feature_snps = pd.DataFrame()
all_feature_snps['chrom'] = []
all_feature_snps['position'] = []
#feature_dfs = [gene, mrna, exon]
feature_dfs = [gene, mrna]
for feature_df in feature_dfs:
    

    #scaff = 'NW_015150454.1'
    #feature_df = gene
    
    feature = feature_df
    scaffs = mom_kid['chrom']
    scaffs1 = list(scaffs)
    scaffs_unique = set(scaffs1)
    scaffs = list(scaffs_unique)
    scaffolds = []  
    start_s = []    
    stop_s = []     
    feature_names = [] 
    snp_pss = [] 
    
    for scaff in scaffs:
        snp_table   = mom_kid[mom_kid['chrom'] == scaff]  
        ftable = feature[feature['chrom'] == scaff]
        fstart =   list(ftable["start"])
        fstop  =   list(ftable["stop"])  
        fname =   list(ftable[ftable.columns[3]]) 
        positions =   snp_table['position']
        
        run = list(range(1, len(fstart)))
        if len(feature_table) > 0:    
            for ps in positions:
                for i in run:
                     if (int(ps) >= int(fstart[i])) & (int(ps) <= int(fstop[i])):
                        scaffolds.append(scaff) 
                        start_s.append(fstart[i])   
                        stop_s.append(fstop[i])    
                        feature_names.append(fname[i])
                        snp_pss.append(ps)
    feature_df_name = ftable.columns[3]
    feature_snps = pd.DataFrame(
        {'chrom': scaffolds,
        'position': snp_pss,
        feature_df_name: feature_names,
        })
    
    all_feature_snps = feature_snps.merge(all_feature_snps,how='left', on = ['chrom','position'])
final = all_feature_snps.merge(mom_kid, on = ['chrom', 'position'])
final.to_csv(out_dir +infile+'maternal_counts_features.txt',index=False, sep = "\t")
