Skip to content
Permalink
Branch: master
Find file Copy path
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 220 lines (179 sloc) 9.5 KB
#!/usr/bin/env bash
export LC_ALL=en_US.UTF-8
echo "This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth,"
echo "quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation."
echo -e "The script assumes that loci and individuals with low call rates (or depth) have already been removed. \n"
echo -e "Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters \n"
#Checks for correct ususage
if [[ -z "$2" ]]; then
echo "Usage is bash dDocent_filters.sh VCF_file Output_prefix"
exit 1
fi
#Filteres out sites with that on average have heterzygotes with less than a 0.28 allele balance between reads from each allele and Quality / Depth < 0.5 and ratio of mapping quality for reference and alternate alleles
vcffilter -s -f "AB > 0.2 & AB < 0.8 | AB < 0.01 | AB > 0.99" -s -g "QR > 0 | QA > 0 " $1 | vcffilter -s -f "QUAL / DP > 0.1" | vcffilter -s -f "MQM / MQMR > 0.25 & MQM / MQMR < 1.75" > $2
FILTERED=$(mawk '!/#/' $2 | wc -l)
OLD=$(mawk '!/#/' $1 | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth\n" $NUMFIL "of" $OLD "\n" > $2.filterstats
#Asks about filting SNPs with reads from both strands filters out loci that have reads from both strands, with some leeway for a bad individual or two
if [[ -z "$3" ]]; then
echo -e "Are reads expected to overlap? In other words, is fragment size less than 2X the read length? Enter yes or no."
read OL
else
OL=$3
fi
if [ "$OL" != "yes" ]; then
vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 50" -s $2 > $2.filAB.vcf
FILTERED=$(mawk '!/#/' $2.filAB.vcf | wc -l)
OLD=$(mawk '!/#/' $2 | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of additional sites filtered based on overlapping forward and reverse reads\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of additional sites filtered based on overlapping forward and reverse reads\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
else
cp $2 $2.filAB.vcf
fi
#Filters out loci that have reads from both paired and unpaired reads
if [[ -z "$4" ]]; then
echo -e "Is this from a mixture of SE and PE libraries? Enter yes or no."
read PE
else
PE=$4
fi
if [ "$PE" != "yes" ]; then
vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s $2.filAB.vcf > $2.fil.vcf
FILTERED=$(mawk '!/#/' $2.fil.vcf | wc -l)
OLD=$(mawk '!/#/' $2.filAB.vcf | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
else
vcffilter -f "PAIRED < 0.005 & PAIREDR > 0.005 | PAIRED > 0.005 & PAIREDR < 0.005" -t NP -F PASS -A $2.filAB.vcf | mawk '!/NP/' > $2.fil.vcf
FILTERED=$(mawk '!/#/' $2.fil.vcf | wc -l)
OLD=$(mawk '!/#/' $2.filAB.vcf | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
fi
#Uses the VCF file to estimate the original number of individuals in the VCF file
#This is important because the INFO flags are based on this number
IND=$(grep -o -e 'NS=[0-9]*' $1 | sed s/NS=//g | sort | tail -1)
IND=$(($IND - 0 ))
#Creates a file with the original site depth and qual for each locus
cut -f8 $1 | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > $1.DEPTH
mawk '!/#/' $1 | cut -f1,2,6 > $1.loci.qual
#Calculates the average depth and standard deviation
DEPTH=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.DEPTH)
SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.DEPTH)
DEPTH=$(perl -e "print int("$DEPTH") + int("$SD")")
#DEPTH=$(perl -e "print int("$DEPTH"+100*("$DEPTH"**0.5))")
#Filters loci above the mean depth + 1 standard deviation that have quality scores that are less than 2*DEPTH
paste $1.loci.qual $1.DEPTH | mawk -v x=$DEPTH '$4 > x'| mawk '$3 < 2 * $4' > $1.lowQDloci
LQDL=$(cat $1.lowQDloci | wc -l)
OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
echo -e "Number of sites filtered based on high depth and lower than 2*DEPTH quality score\n" $LQDL "of" $OLD "\n"
echo -e "Number of sites filtered based on high depth and lower than 2*DEPTH quality score\n" $LQDL "of" $OLD "\n" >> $2.filterstats
#Recalculates site depth for sites that have not been previously filtered
vcftools --vcf $2.fil.vcf --remove-filtered NP --exclude-positions $1.lowQDloci --site-depth --out $1 2> /dev/null
cut -f3 $1.ldepth > $1.site.depth
DP=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.site.depth)
SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.site.depth)
#Calculates actual number of individuals in VCF file
#This is important because loci will now be filtered by mean depth calculated with individuals present in VCF
IND=$(mawk '/#/' $1 | tail -1 | wc -w)
IND=$(($IND - 9))
mawk '!/D/' $1.site.depth | mawk -v x=$IND '{print $1/x}' > meandepthpersite
#Calculates a mean depth cutoff to use for filtering
DP=$(perl -e "print ($DP+ 1.645*$SD) / $IND")
PP=$(mawk '!/SUM/' $1.site.depth | sort -rn | perl -e '$d=.05;@l=<>;print $l[int($d*$#l)]' )
PP=$(perl -e "print int($PP / $IND)")
GP=$(perl -e "print int($PP * 1.10)")
export GP
gnuplot << \EOF >> $2.filterstats
set terminal dumb size 120, 30
set autoscale
high=system("echo $GP")
set xrange [10:high]
unset label
set title "Histogram of mean depth per site"
set ylabel "Number of Occurrences"
set xlabel "Mean Depth"
#set yr [0:100000]
binwidth=1
bin(x,width)=width*floor(x/width) + binwidth/2.0
set xtics floor(high/20)
set lmargin 10
plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
EOF
gnuplot << \EOF
set terminal dumb size 120, 30
set autoscale
high=system("echo $GP")
set xrange [10:high]
unset label
set title "Histogram of mean depth per site"
set ylabel "Number of Occurrences"
set xlabel "Mean Depth"
#set yr [0:100000]
binwidth=1
bin(x,width)=width*floor(x/width) + binwidth/2.0
set xtics floor(high/20)
set lmargin 10
plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
EOF
if [[ -z "$5" ]]; then
echo "If distrubtion looks normal, a 1.645 sigma cutoff (~90% of the data) would be" $DP
echo "The 95% cutoff would be" $PP
echo "Would you like to use a different maximum mean depth cutoff than "$PP", yes or no"
read NEWCUTOFF
else
NEWCUTOFF=$5
fi
if [ "$NEWCUTOFF" != "yes" ]; then
echo -e "Maximum mean depth cutoff is" $PP >> $2.filterstats
#Combines all filters to create a final filtered VCF file
vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $PP --recode 2> /dev/null
FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf | wc -l)
OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
else
if [[ -z "$6" ]]; then
echo "Please enter new cutoff"
read PP
else
PP=$6
fi
echo -e "Maximum mean depth cutoff is" $PP >> $2.filterstats
#Combines all filters to create a final filtered VCF file
vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $PP --recode 2> /dev/null
FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf | wc -l)
OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
fi
vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL1 --max-meanDP $PP --exclude-positions $1.lowQDloci --recode 2> /dev/null
vcftools --vcf $2.FIL1.recode.vcf --site-depth --out $2.FIL1 2> /dev/null
mawk '!/CHR/' $2.FIL1.ldepth | mawk '{
if (NR == 1) {chrom=$1;i=1;pos[i]=$2;dp[i]=$3}
else if ($1 == chrom) {i++;pos[i]=$2;dp[i]=$3}
else if ($1 != chrom) {for (x in dp) dpp= dpp + dp[x]; adp=dpp / i; adp= adp / 10 * 6; for (j = 1; j <= i; j++) if (dp[j] < adp) print chrom"\t"pos[j];chrom=$1;i=1;delete pos;pos[i]=$2;delete dp;dpp=0;dp[i]=$3}
}' > $2.dpmismatch.loci
OLD=$(mawk '!/#/' $2.FIL1.recode.vcf | wc -l)
DPMM=$(cat $2.dpmismatch.loci | wc -l)
echo -e "Number of sites filtered based on within locus depth mismatch\n" $DPMM "of" $OLD "\n"
echo -e "Number of sites filtered based on within locus depth mismatch\n" $DPMM "of" $OLD "\n" >> $2.filterstats
vcftools --vcf $2.FIL1.recode.vcf --exclude-positions $2.dpmismatch.loci --recode --recode-INFO-all --out $2.FIL 2> /dev/null
OLD=$(mawk '!/#/' $1 | wc -l)
FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n"
echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
echo -e "Remaining sites\n" $FILTERED "\n"
echo -e "Remaining sites\n" $FILTERED "\n" >> $2.filterstats
echo -e "Filtered VCF file is called Output_prefix.FIL.recode.vcf\n"
echo "Filter stats stored in $2.filterstats"
You can’t perform that action at this time.