-
Notifications
You must be signed in to change notification settings - Fork 3
/
samToPolyA.pl
executable file
·291 lines (218 loc) · 9.69 KB
/
samToPolyA.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
#!/usr/bin/env perl
use strict;
use warnings;
use Getopt::Long;
use Pod::Usage;
use Bio::DB::Fasta;
$|=1;
my $message_text = "Error\n";
my $exit_status = 2; ## The exit status to use
my $verbose_level = 99; ## The verbose level to use
my $filehandle = \*STDERR; ## The filehandle to write to
my $sections = "NAME|SYNOPSIS|DESCRIPTION";
=head1 NAME
samToPolyA
=head1 SYNOPSIS
A utility to detect poly-adenylated sequencing reads, call on-genome polyA sites and infer the reads' strand based on reads-to-genome alignments in SAM format.
B<Usage example> (on a BAM file):
C<< samtools view $file.bam |samToPolyA.pl --minClipped=20 --minAcontent=0.9 - > ${file}_polyAsites.bed >>
=head2 INPUT
Read-to-genome alignments in SAM format, and the corresponding genome sequence in multifasta.
The script looks for terminal soft-clipped A/T sequences (marked as "S" in the CIGAR string).
=head2 OPTIONS
This script maps polyA sites on the genome based on read mappings in SAM format, and according to the following provided parameters:
=over
=item B<minClipped> (integer) = minimum length of A or T tail required to call a PolyA site.
Default: '10'.
=item B<minAcontent> (float) = required A (or T, if minus strand) content of the tail.
Default: '0.8'.
Note: minAcontent affects both the A tail and the upstream A stretch.
=item B<discardInternallyPrimed> = when enabled, the program will try to avoid outputting false polyA sites arising from internal mis-priming during the cDNA library construction. This option is particularly useful if your cDNA was oligo-dT primed.
Default: disabled.
Requires option B<genomeFasta> to be set.
=item B<minUpMisPrimeAlength> (integer) (ignored if B<discardInternallyPrimed> is not set) = minimum length of genomic A stretch immediately upstream a putative site required to call a false positive (presumably due to internal RT priming), and hence not report the corresponding site in the output.
Default: '10'.
=item B<genomeFasta> (string) (valid only if B<discardInternallyPrimed> is set)= path to multifasta of genome (+ spike-in sequences if applicable), used to extract upstream genomic sequence.
B<Note>: You need write access to the directory containing this file, as the included Bio::DB::Fasta module will create a genomeFasta.index file if it doesn't exist.
=back
=head2 OUTPUT
The script will output BED6 with the following columns:
=over
=item column 1: chromosome
=item column 2: start of polyA site (0-based)
=item column 3: end of polyA site
=item column 4: ID of the read containing a polyA tail
=item column 5: length of the polyA tail on read
=item column 6: genomic strand of the read (see DESCRIPTION below)
=back
=head1 DESCRIPTION
The script will search for read alignment patterns such as:
C<< XXXXXXXXXXXAAAAAAAAAAAAAAA(YYYY) [read] >>
C<< |||||||||||..................... [match] >>
C<< XXXXXXXXXXXZ-------------------- [reference sequence] >>
or
C<< (YYYY)TTTTTTTTTTTTTTTTXXXXXXXXXX [read] >>
C<< ......................|||||||||| [match] >>
C<< ---------------------ZXXXXXXXXXX [reference sequence] >>
Where:
=over
=item C<|> / C<.> = a position mapped / unmapped to the reference, respectively
=item C<X> = the mapped portion of the read or reference sequence
=item C<(Y)> = an optional soft-clipped, non-(A|T)-rich sequence (possibly a sequencing adapter)
=item C<Z> = the position on the reference sequence where the alignment breaks
=item The C<A> / C<T> streches are soft-clipped ('S' in CIGAR nomenclature) in the alignment
=item C<-> = the portion of the reference sequence unaligned to the read
=back
The genomic strand of the read + polyA site is inferred from the mapping of the read, I<i.e.>, reads where a polyA tail was detected at their 3' end are assigned a '+' genomic strand, whereas reads with a polyT tail at their 5' end are deduced to originate from the '-' strand. In that example, the first / second alignment would lead to a called polyA site at position Z on the '+' / '-' strand of the reference sequence, respectively.
=head1 DEPENDENCIES
CPAN: Bio::DB::Fasta
=head1 AUTHOR
Julien Lagarde, CRG, Barcelona, contact julienlag@gmail.com
=cut
my $minSoftClippedSeqLength=10;
my $minAcontent=0.8;
my $minUpMisPrimeAlength=10;
my $discardInternallyPrimed='';
my $genomeFa;
GetOptions ('minClipped=i' => \$minSoftClippedSeqLength,
'minAcontent:f' => \$minAcontent,
'minUpMisPrimeAlength=i' => \$minUpMisPrimeAlength,
'genomeFasta=s' => \$genomeFa,
'discardInternallyPrimed' => \$discardInternallyPrimed)
or pod2usage( { -message => "Error in command line arguments",
-exitval => $exit_status ,
-verbose => $verbose_level,
-output => $filehandle } );
unless(defined $minSoftClippedSeqLength && defined $minAcontent){
pod2usage( { -message => "Error in command line arguments",
-exitval => $exit_status ,
-verbose => $verbose_level,
-output => $filehandle } );
}
my $chrdb;
if($discardInternallyPrimed){
print STDERR "Will try to discard false polyA tails resulting from internal mis-priming.\n";
if(defined $genomeFa && defined $minUpMisPrimeAlength){
$chrdb = Bio::DB::Fasta->new("$genomeFa", -reindex => 0);
}
else{
pod2usage( { -message => "Error in command line arguments: --discardInternallyPrimed requires --genomeFasta to be set.\n",
-exitval => $exit_status ,
-verbose => $verbose_level,
-output => $filehandle } );
}
}
else{
# print STDERR "Will not try to discard false polyA tails resulting from internal mis-priming.\n";
if(defined $genomeFa){
pod2usage( { -message => "Error in command line arguments: --genomeFasta requires --discardInternallyPrimed to be set.\n",
-exitval => $exit_status ,
-verbose => $verbose_level,
-output => $filehandle } );
}
}
while (<STDIN>){
my $line=$_;
chomp;
next if($_=~/^\@SQ/); #skip sequence header
my @line=split "\t";
die "Invalid format (doesn't look like SAM)\n" unless ($#line>9);
next if($line[5] eq '*'); #skip unmapped reads
my @cigarNumbers=split (/[A-Z]/,$line[5]);
my @cigarLetters=split(/\d+/,$line[5]);
shift(@cigarLetters); #the first element is empty, remove it
#check if there's any soft-clipped sequence at the end(s) of the read (i.e. unmapped tail)
if($cigarLetters[0] eq 'S' && $cigarNumbers[0] > $minSoftClippedSeqLength){ #tail is likely at the beginning of the read
#it means strand is -
my $tailSeq=substr($line[9], 0, $cigarNumbers[0]);
#reverse string so that we start scanning the sequence where it drops from the genome
$tailSeq=reverse($tailSeq);
$tailSeq=~ tr/ACGTacgt/TGCAtgca/;
my $strand='-';
#there might be an exogenous sequence adapter at the end of the sequence.
my $aTailLength=countAs($tailSeq, $minSoftClippedSeqLength);
if ($aTailLength>0){
my $upstreamGenomeAsLength=0;
if($discardInternallyPrimed){
my $upstreamStop=$line[3]+$minUpMisPrimeAlength;
my $upstreamGenomeSeq=$chrdb->seq($line[2], $line[3], $upstreamStop);
$upstreamGenomeAsLength=0;
if(defined $upstreamGenomeSeq && length($upstreamGenomeSeq)>0){
$upstreamGenomeSeq=reverse($upstreamGenomeSeq);
$upstreamGenomeSeq=~ tr/ACGTacgt/TGCAtgca/;
$upstreamGenomeAsLength=countAs($upstreamGenomeSeq, $minUpMisPrimeAlength);
}
else{
warn "WARNING: Could not extract reference sequence in $line[2] (is it present in $genomeFa?). Assuming no A stretch upstream of putative site.\n"
}
}
unless ($upstreamGenomeAsLength >= $minUpMisPrimeAlength){
my $start=$line[3]-1;
print "$line[2]\t$start\t$line[3]\t$line[0]\t$aTailLength\t$strand\n";
next; # found, no need to look at the other end of the read
}
}
}
if($cigarLetters[$#cigarLetters] eq 'S' && $cigarNumbers[$#cigarNumbers] > $minSoftClippedSeqLength){ #tail is likely at the end of the read
# this is no ELSIF!! need to check second end if first fails
my $tailSeq=substr($line[9], length($line[9]) - $cigarNumbers[$#cigarNumbers], $cigarNumbers[$#cigarNumbers]);
my $strand='+';
#there might be an exogenous sequence adapter at the end of the sequence.
my $aTailLength=countAs($tailSeq, $minSoftClippedSeqLength);
if($aTailLength>0){
my $upstreamGenomeAsLength=0;
#calculate where the start of the tail is on the genome
my $genomicLength=0;
for (my $i=0; $i<=$#cigarLetters;$i++){
if($cigarLetters[$i] =~ /[MDNXP]/){
$genomicLength+=$cigarNumbers[$i];
}
}
my $start=$line[3]+$genomicLength-1;
my $end=$start+1;
if($discardInternallyPrimed){
my $upstreamStop=$start;
my $upstreamStart=$start-$minUpMisPrimeAlength;
my $upstreamGenomeSeq=$chrdb->seq($line[2], $upstreamStart, $upstreamStop);
my $upstreamGenomeAsLength=0;
if(defined $upstreamGenomeSeq && length($upstreamGenomeSeq)>0){
$upstreamGenomeAsLength=countAs($upstreamGenomeSeq, $minUpMisPrimeAlength);
}
else{
warn "WARNING: Could not extract reference sequence in $line[2] (is it present in $genomeFa?). Assuming no A stretch upstream of putative site.\n"
}
}
unless ($upstreamGenomeAsLength >= $minUpMisPrimeAlength){
print "$line[2]\t$start\t$end\t$line[0]\t$aTailLength\t$strand\n";
}
}
}
}
sub countAs{
my $minSeqLength=$_[1];
my $seq=lc($_[0]);
my @seq=split("", $seq);
my $countAs=0;
my $maxMismatches=$minSeqLength-($minAcontent*$minSeqLength);
my $SeqLength=0;
my $countMismatches=0;
for(my $i=0; $i<=$#seq;$i++){
my $nt=$seq[$i];
$SeqLength=$i+1;
if($countMismatches>$maxMismatches && $SeqLength>$minSeqLength){
last;
}
if($nt eq "a"){
$countAs++;
}
else{
$countMismatches++;
}
}
if($SeqLength >= $minSeqLength && $countAs > $minSeqLength-$maxMismatches){
return $SeqLength;
}
else{
return 0;
}
}