-
Notifications
You must be signed in to change notification settings - Fork 41
/
dDocent_filters
executable file
·219 lines (179 loc) · 9.49 KB
/
dDocent_filters
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#!/usr/bin/env bash
export LC_ALL=en_US.UTF-8
echo "This script will automatically filter a FreeBayes generated VCF file using criteria related to site depth,"
echo "quality versus depth, strand representation, allelic balance at heterzygous individuals, and paired read representation."
echo -e "The script assumes that loci and individuals with low call rates (or depth) have already been removed. \n"
echo -e "Contact Jon Puritz (jpuritz@gmail.com) for questions and see script comments for more details on particular filters \n"
#Checks for correct ususage
if [[ -z "$2" ]]; then
echo "Usage is bash dDocent_filters.sh VCF_file Output_prefix"
exit 1
fi
#Filteres out sites with that on average have heterzygotes with less than a 0.28 allele balance between reads from each allele and Quality / Depth < 0.5 and ratio of mapping quality for reference and alternate alleles
vcffilter -s -f "AB > 0.2 & AB < 0.8 | AB < 0.01 | AB > 0.99" -s -g "QR > 0 | QA > 0 " $1 | vcffilter -s -f "QUAL / DP > 0.1" | vcffilter -s -f "MQM / MQMR > 0.25 & MQM / MQMR < 1.75" > $2
FILTERED=$(mawk '!/#/' $2 | wc -l)
OLD=$(mawk '!/#/' $1 | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of sites filtered based on allele balance at heterozygous loci, locus quality, and mapping quality / Depth\n" $NUMFIL "of" $OLD "\n" > $2.filterstats
#Asks about filting SNPs with reads from both strands filters out loci that have reads from both strands, with some leeway for a bad individual or two
if [[ -z "$3" ]]; then
echo -e "Are reads expected to overlap? In other words, is fragment size less than 2X the read length? Enter yes or no."
read OL
else
OL=$3
fi
if [ "$OL" != "yes" ]; then
vcffilter -f "SAF / SAR > 100 & SRF / SRR > 100 | SAR / SAF > 100 & SRR / SRF > 50" -s $2 > $2.filAB.vcf
FILTERED=$(mawk '!/#/' $2.filAB.vcf | wc -l)
OLD=$(mawk '!/#/' $2 | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of additional sites filtered based on overlapping forward and reverse reads\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of additional sites filtered based on overlapping forward and reverse reads\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
else
cp $2 $2.filAB.vcf
fi
#Filters out loci that have reads from both paired and unpaired reads
if [[ -z "$4" ]]; then
echo -e "Is this from a mixture of SE and PE libraries? Enter yes or no."
read PE
else
PE=$4
fi
if [ "$PE" != "yes" ]; then
vcffilter -f "PAIRED > 0.05 & PAIREDR > 0.05 & PAIREDR / PAIRED < 1.75 & PAIREDR / PAIRED > 0.25 | PAIRED < 0.05 & PAIREDR < 0.05" -s $2.filAB.vcf > $2.fil.vcf
FILTERED=$(mawk '!/#/' $2.fil.vcf | wc -l)
OLD=$(mawk '!/#/' $2.filAB.vcf | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
else
vcffilter -f "PAIRED < 0.005 & PAIREDR > 0.005 | PAIRED > 0.005 & PAIREDR < 0.005" -t NP -F PASS -A $2.filAB.vcf | mawk '!/NP/' > $2.fil.vcf
FILTERED=$(mawk '!/#/' $2.fil.vcf | wc -l)
OLD=$(mawk '!/#/' $2.filAB.vcf | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of additional sites filtered based on properly paired status\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
fi
#Uses the VCF file to estimate the original number of individuals in the VCF file
#This is important because the INFO flags are based on this number
IND=$(grep -o -e 'NS=[0-9]*' $1 | sed s/NS=//g | sort | tail -1)
IND=$(($IND - 0 ))
#Creates a file with the original site depth and qual for each locus
cut -f8 $1 | grep -oe "DP=[0-9]*" | sed -s 's/DP=//g' > $1.DEPTH
mawk '!/#/' $1 | cut -f1,2,6 > $1.loci.qual
#Calculates the average depth and standard deviation
DEPTH=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.DEPTH)
SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.DEPTH)
DEPTH=$(perl -e "print int("$DEPTH") + int("$SD")")
#DEPTH=$(perl -e "print int("$DEPTH"+100*("$DEPTH"**0.5))")
#Filters loci above the mean depth + 1 standard deviation that have quality scores that are less than 2*DEPTH
paste $1.loci.qual $1.DEPTH | mawk -v x=$DEPTH '$4 > x'| mawk '$3 < 2 * $4' > $1.lowQDloci
LQDL=$(cat $1.lowQDloci | wc -l)
OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
echo -e "Number of sites filtered based on high depth and lower than 2*DEPTH quality score\n" $LQDL "of" $OLD "\n"
echo -e "Number of sites filtered based on high depth and lower than 2*DEPTH quality score\n" $LQDL "of" $OLD "\n" >> $2.filterstats
#Recalculates site depth for sites that have not been previously filtered
vcftools --vcf $2.fil.vcf --remove-filtered NP --exclude-positions $1.lowQDloci --site-depth --out $1 2> /dev/null
cut -f3 $1.ldepth > $1.site.depth
DP=$(mawk '{ sum += $1; n++ } END { if (n > 0) print sum / n; }' $1.site.depth)
SD=$(mawk '{delta = $1 - avg; avg += delta / NR; mean2 += delta * ($1 - avg); } END { print sqrt(mean2 / NR); }' $1.site.depth)
#Calculates actual number of individuals in VCF file
#This is important because loci will now be filtered by mean depth calculated with individuals present in VCF
IND=$(mawk '/#/' $1 | tail -1 | wc -w)
IND=$(($IND - 9))
mawk '!/D/' $1.site.depth | mawk -v x=$IND '{print $1/x}' > meandepthpersite
#Calculates a mean depth cutoff to use for filtering
DP=$(perl -e "print ($DP+ 1.645*$SD) / $IND")
PP=$(mawk '!/SUM/' $1.site.depth | sort -rn | perl -e '$d=.05;@l=<>;print $l[int($d*$#l)]' )
PP=$(perl -e "print int($PP / $IND)")
GP=$(perl -e "print int($PP * 1.10)")
export GP
gnuplot << \EOF >> $2.filterstats
set terminal dumb size 120, 30
set autoscale
high=system("echo $GP")
set xrange [10:high]
unset label
set title "Histogram of mean depth per site"
set ylabel "Number of Occurrences"
set xlabel "Mean Depth"
#set yr [0:100000]
binwidth=1
bin(x,width)=width*floor(x/width) + binwidth/2.0
set xtics floor(high/20)
set lmargin 10
plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
EOF
gnuplot << \EOF
set terminal dumb size 120, 30
set autoscale
high=system("echo $GP")
set xrange [10:high]
unset label
set title "Histogram of mean depth per site"
set ylabel "Number of Occurrences"
set xlabel "Mean Depth"
#set yr [0:100000]
binwidth=1
bin(x,width)=width*floor(x/width) + binwidth/2.0
set xtics floor(high/20)
set lmargin 10
plot 'meandepthpersite' using (bin($1,binwidth)):(1.0) smooth freq with boxes
pause -1
EOF
if [[ -z "$5" ]]; then
echo "If distrubtion looks normal, a 1.645 sigma cutoff (~90% of the data) would be" $DP
echo "The 95% cutoff would be" $PP
echo "Would you like to use a different maximum mean depth cutoff than "$PP", yes or no"
read NEWCUTOFF
else
NEWCUTOFF=$5
fi
if [ "$NEWCUTOFF" != "yes" ]; then
echo -e "Maximum mean depth cutoff is" $PP >> $2.filterstats
#Combines all filters to create a final filtered VCF file
vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $PP --recode 2> /dev/null
FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf | wc -l)
OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
else
if [[ -z "$6" ]]; then
echo "Please enter new cutoff"
read PP
else
PP=$6
fi
echo -e "Maximum mean depth cutoff is" $PP >> $2.filterstats
#Combines all filters to create a final filtered VCF file
vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL --max-meanDP $PP --recode 2> /dev/null
FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf | wc -l)
OLD=$(mawk '!/#/' $2.fil.vcf | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n"
echo -e "Number of sites filtered based on maximum mean depth\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
fi
vcftools --vcf $2.fil.vcf --remove-filtered NP --recode-INFO-all --out $2.FIL1 --max-meanDP $PP --exclude-positions $1.lowQDloci --recode 2> /dev/null
vcftools --vcf $2.FIL1.recode.vcf --site-depth --out $2.FIL1 2> /dev/null
mawk '!/CHR/' $2.FIL1.ldepth | mawk '{
if (NR == 1) {chrom=$1;i=1;pos[i]=$2;dp[i]=$3}
else if ($1 == chrom) {i++;pos[i]=$2;dp[i]=$3}
else if ($1 != chrom) {for (x in dp) dpp= dpp + dp[x]; adp=dpp / i; adp= adp / 10 * 6; for (j = 1; j <= i; j++) if (dp[j] < adp) print chrom"\t"pos[j];chrom=$1;i=1;delete pos;pos[i]=$2;delete dp;dpp=0;dp[i]=$3}
}' > $2.dpmismatch.loci
OLD=$(mawk '!/#/' $2.FIL1.recode.vcf | wc -l)
DPMM=$(cat $2.dpmismatch.loci | wc -l)
echo -e "Number of sites filtered based on within locus depth mismatch\n" $DPMM "of" $OLD "\n"
echo -e "Number of sites filtered based on within locus depth mismatch\n" $DPMM "of" $OLD "\n" >> $2.filterstats
vcftools --vcf $2.FIL1.recode.vcf --exclude-positions $2.dpmismatch.loci --recode --recode-INFO-all --out $2.FIL 2> /dev/null
OLD=$(mawk '!/#/' $1 | wc -l)
FILTERED=$(mawk '!/#/' $2.FIL.recode.vcf | wc -l)
NUMFIL=$(($OLD - $FILTERED))
echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n"
echo -e "Total number of sites filtered\n" $NUMFIL "of" $OLD "\n" >> $2.filterstats
echo -e "Remaining sites\n" $FILTERED "\n"
echo -e "Remaining sites\n" $FILTERED "\n" >> $2.filterstats
echo -e "Filtered VCF file is called $2.FIL.recode.vcf\n"
echo "Filter stats stored in $2.filterstats"