/
sscolor.pl
executable file
·518 lines (444 loc) · 12.4 KB
/
sscolor.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
#!/usr/bin/env perl -w
BEGIN {
use FindBin;
use lib $FindBin::Bin;
push @INC, $FindBin::Bin;
}
use strict;
use CGI qw/:standard *table *pre *span/;
my $PS = "PS";
my $SS = "SS";
my $SC = "SC";
my $PS_CONS = "PS_cons";
my $SS_CONS = "SS_cons";
my $SN = "SN";
my $ps_cons_hdr = "#=GC $PS_CONS";
my $ss_cons_hdr = "#=GC $SS_CONS";
my $sn_hdr = "#=GC $SN";
my $sc_hdr = "#=GF $SC";
# Basepair compatibility levels
my $BP_COMPAT_NO_SUB = 0;
my $BP_COMPAT_SGL_SUB = 1;
my $BP_COMPAT_DBL_SUB = 2;
my $BP_NON_COMPAT = 3;
# Create hash maps for secondary structure characters.
# Left to right character
my %rchar = ('('=>')','<'=>'>','['=>']');
# Right to left character
my %lchar = map (($rchar{$_}=>$_), keys(%rchar));
# Gap characters
my %gapchar = map (($_=>1), '-', '.', '_', ',');
my $refSeqNum = 1;
my $progname = $0;
$progname =~ s/^.*?([^\/]+)$/$1/; # Remove path
my $verbose = 0;
my $makeLinks = 0;
my $outFile = "out";
my $usage = "$progname: Color the secondary structure of alignments in Stockholm file.\n";
$usage .= " Secondary structure should be in a line prefixed by '#=GC SS_cons'.\n";
$usage .= " Substitutions are regarded with respect to the reference sequence.\n";
$usage .= " Coloring of nucleotides is as follows:\n";
$usage .= " Green = compatible Watson-Crick basepair with double substitution\n";
$usage .= " Blue = compatible W-C basepair with a single substitution\n";
$usage .= " Red = substitution resulting in a non-compatible basepair\n";
$usage .= " Black = no substitution\n";
$usage .= " Grey = non-basepair\n";
$usage .= " Light Purple = non-basepair with a substitution\n";
$usage .= "\n";
$usage .= "Usage: $progname <Stockholm file>\n";
$usage .= " [-h] print this message\n";
$usage .= " [-ref <n> ] reference sequence number (default = 1)\n";
$usage .= " [-link] add links so viewer can select reference sequence (use with sscolorMult.pl)\n";
$usage .= " [-out <file> ] output file basename used in links (default = 'out')\n";
$usage .= "\n";
$usage .= "For more help, type the following:\n";
$usage .= "perldoc $0\n";
$usage .= "\n";
# parse cmd-line opts
my @argv;
while (@ARGV) {
my $arg = shift;
if ($arg eq "-h")
{
die $usage;
}
elsif ($arg eq "-v")
{
$verbose = 1;
}
elsif ($arg eq "-ref")
{
defined ($refSeqNum = shift) or die $usage;
}
elsif ($arg eq "-link")
{
$makeLinks = 1;
}
elsif ($arg eq "-out")
{
defined ($outFile = shift) or die $usage;
}
else
{
push @argv, $arg;
}
}
@argv = ('-') unless @argv;
die $usage unless @argv == 1;
my ($stockfile) = @argv;
# Define the document style sheet and start printing out the html.
my $style=<<END;
.noPair {
color: #999999; /* med gray */
}
.noPairSub {
color: #9900CC; /* dark purple */
}
.compatSgl {
color: #0000FF; /* blue */
}
.compatDbl {
color: #00FF00; /* green */
}
.nonCompat {
color: #FF0000; /* red */
}
.noSub {
color: #000000; /* black */
}
END
print start_html(-title=>$stockfile, -style=>{-code=>$style}),"\n";
local *STOCK;
open STOCK, "<$stockfile" or die "Couldn't open '$stockfile': $!";
while (1)
{
my (@seqname, %seq, %ss, %gc, %gr, $score);
my ($ps_cons, $ss_cons, $preamble);
my $seqlen = 0;
my $found_separator = 0;
# Read in next alignment in Stockholm file.
while (<STOCK>)
{
# "#=GR...SS" by-seq secondary structure line
if (/^\s*\#=GR\s*(\S+)\s*$SS\s*(\S+)\s*$/)
{
$ss{$1} = "" unless exists $ss{$1};
$ss{$1} .= $2;
}
# miscellaneous "#=GR" lines
elsif (/^\s*\#=GR\s*(\S+)\s*(\S+)\s*(\S+)\s*$/)
{
$gr{$2} = {} unless exists $gr{$1};
# %gr will store a hash reference.
$gr{$1}->{$2} = "" unless exists $gr{$1}->{$2};
$gr{$1}->{$2} .= $3
}
# "#=GC...PS_cons" consensus primary sequence
elsif (/^\s*\#=GC\s*$PS_CONS\s*(\S+)\s*$/)
{
$ps_cons = "" unless defined $ps_cons;
$ps_cons .= $1;
}
# "#=GC...SS_cons" consensus secondary structure
elsif (/^\s*\#=GC\s*$SS_CONS\s*(\S+)\s*$/)
{
$ss_cons = "" unless defined $ss_cons;
$ss_cons .= $1
}
# miscellaneous "#=GC" lines
elsif (/^\s*\#=GC\s*(\S+)\s*(\S+)\s*$/)
{
$gc{$1} = "" unless exists $gc{$1};
$gc{$1} .= $2;
}
# score
elsif (/^\s*\#=GF\s*$SC\s*(\S+)\s*$/)
{
$score = $1;
}
# unrecognised line starting with '#'; append to preamble
elsif (/^\s*\#/)
{
$preamble = "" unless defined $preamble;
$preamble .= $_
}
# Found alignment separator; exit loop
elsif (/^\s*\/\//)
{
$found_separator = 1;
last;
}
elsif (/^\s*(\S+)\s*(\S+)\s*$/)
{
unless (exists $seq{$1})
{
$seq{$1} = "";
push @seqname, $1
}
$seq{$1} .= $2;
$seqlen = length($seq{$1})
}
elsif (/\S/)
{
warn "Ignoring line: $_"
}
} # while (<STOCK>)...
if (@seqname)
{
if (defined($ss_cons) && length($ss_cons))
{
my @pairs;
my $columns = length($ss_cons);
my $rows = @seqname;
my @lpos;
# Get coords of all basepairs.
for (my $i = 0; $i < length($ss_cons); ++$i)
{
my $c = substr ($ss_cons, $i, 1);
# Found left character.
if (exists $rchar{$c})
{
push @lpos, $i;
}
# Found right character
elsif (exists $lchar{$c})
{
unless (@lpos)
{
die "Too many >'s in $SS_CONS string: $ss_cons\n"
}
my $l = pop @lpos;
my $r = $i;
# warn "l=$l r=$r\n";
push @pairs, {coord => [$l,$r], level => $BP_COMPAT_NO_SUB};
}
}
if (@lpos)
{
die "Too many <'s in $SS_CONS string: $ss_cons\n"
}
# Determine maximum sequence name length
my $maxNameLen = max(map(length($_), $SS_CONS, @seqname));
my $fmt = join("", "%-", $maxNameLen, "s ");
# Set reference seq array
if ($refSeqNum > $rows )
{
die "Reference sequence number [$refSeqNum] is greater than number of sequences [$rows]\n";
}
my @refSeq = split("", $seq{$seqname[$refSeqNum-1]});
# Print alignment
print start_pre(),"\n";
for (my $row = 0; $row < $rows; ++$row)
{
my $isRefRow = ($row == 0 ? 1 : 0);
my $seqName = $seqname[$row];
print $makeLinks ? a({-href=>$outFile . "." . ($row+1) . ".html"}, $seqName) : $seqName;
printf(" " x ($maxNameLen-length($seqName) + 1) );
my $rowseq = $seq{$seqName};
# Loop thru each character in a row.
for (my $i=0; $i < length($rowseq); ++$i)
{
my $qChar = substr($rowseq, $i, 1);
my $refChar = $refSeq[$i];
my $class = "noPair";
my $level;
# Determine whether the character is a base pair member and
# the level of base pair compatibility.
# $posL, $posR refer to the left, right positions of a basepair.
my ($posLflag, $posRflag) = (0,0);
foreach my $pair (@pairs)
{
my $posL = $pair->{coord}[0];
my $posR = $pair->{coord}[1];
if ($i == $posL)
{
$posLflag = 1;
if ($isRefRow)
{
$level = $BP_COMPAT_NO_SUB;
}
else
{
print "[L:$qChar]" if $verbose;
my $qChar2 = substr($rowseq, $posR,1);
my $refChar2 = $refSeq[$posR];
$level = getBpCompat($qChar, $qChar2, $refChar, $refChar2);
print ("[$qChar$qChar2$refChar$refChar2:$level]") if $verbose;
}
$pair->{level} = $level;
last;
}
elsif ($i == $posR)
{
$posRflag = 1;
$level = $pair->{level};
print "[i=$i, posL=$posL, posR=$posR]" if $verbose;
print "[R:$qChar,$level]" if $verbose;
last;
}
}
# Character is a pair member.
if ( $posLflag || $posRflag)
{
if ($level == $BP_COMPAT_NO_SUB)
{
$class='noSub';
}
elsif ($level == $BP_COMPAT_SGL_SUB)
{
$class='compatSgl';
}
elsif ($level == $BP_COMPAT_DBL_SUB)
{
$class='compatDbl';
}
elsif ($level == $BP_NON_COMPAT)
{
$class='nonCompat';
}
}
else
{
if ($qChar ne $refChar)
{
$class ='noPairSub';
}
}
print span({-class=>$class}, $qChar);
}
print "\n";
}
printf(join("",$fmt,"%s"), $SS_CONS, $ss_cons);
print "\n", end_pre(),"\n";
} # end of block that looks at SS_cons line
} # if (@seqname) ...
# if no alignment separator, then quit the loop
last unless $found_separator;
}
close STOCK;
print end_html();
# Determine the compatibility of a query sequence base pair relative to
# a reference sequence base pair.
# Output: the level of compatibility.
sub getBpCompat
{
# q1, q2 are the query seq bases.
# r1, r2 are the reference seq bases.
my ($q1, $q2, $r1, $r2) = @_;
my $level = $BP_COMPAT_NO_SUB;
my $numSub = 0;
if (uc($q1) ne uc($r1))
{
$numSub++;
}
if (uc($q2) ne uc($r2))
{
$numSub++;
}
if ($numSub > 0)
{
if (isValidPair($q1, $q2))
{
if ($numSub == 1)
{
$level = $BP_COMPAT_SGL_SUB;
}
else
{
$level = $BP_COMPAT_DBL_SUB;
}
}
else
{
$level = $BP_NON_COMPAT;
}
}
return $level;
}
# Determines whether a DNA/RNA basepair is valid.
sub isValidPair
{
my ($b1, $b2) = @_;
my %validPairs = ( A_T => 1, A_U => 1, G_C => 1, G_U => 1);
my $isValid = 0;
my $pair1 = $b1."_".$b2;
my $pair2 = $b2."_".$b1;
if (exists($validPairs{uc($pair1)}) || exists($validPairs{uc($pair2)}))
{
$isValid = 1;
}
return $isValid;
}
# Determine the list maximum value.
sub max
{
my ($x, @y) = @_;
foreach my $y (@y)
{
$x = $y if $y > $x
}
return $x;
}
=head1 NAME
sscolor.pl - colorize RNA multiple alignments using HTML
=head1 SYNOPSIS
sscolor.pl -h
sscolor.pl FILENAME [-ref N] [-out PREFIX]
cat FILENAME | sscolor.pl [options]
=head1 DESCRIPTION
The program uses HTML colors to highlight compensatory mutations in a multiple alignment of noncoding RNA.
The input alignment must be in Stockholm format.
The output is in HTML.
For the purposes of this program,
a position is defined to display a compensatory mutation if (i) it is base-paired with another position;
(ii) the pair of nucleotides at those two positions comprises a Watson-Crick or wobble base-pair;
(iii) at least one of the two nucleotides is different from the homologous nucleotide in a reference sequence.
=head1 INPUT FORMAT
The input file(s) must be in Stockholm format, defined here:
L<http://biowiki.org/StockholmFormat>
A list of utilities for format conversions and other operations on Stockholm files can be found here:
L<http://biowiki.org/StockholmTools>
=head1 OUTPUT FORMAT
Output in HTML is printed on the standard output.
The C<-link> and C<-out> options can be used to create a family of HTML pages,
with each one using a different reference sequence.
Use the accompanying sscolorMult.pl script to do this automatically.
Coloring of nucleotides is as follows:
=over 2
=item Green
Consensus structure implies Watson-Crick or wobble base-pair.
Double substitution relative to reference sequence.
=item Blue
Consensus structure implies Watson-Crick or wobble base-pair.
Single substitution relative to reference sequence.
=item Black
Consensus structure implies Watson-Crick or wobble base-pair.
No substitution relative to reference sequence.
=item Red
Consensus structure implies non-canonical base-pair.
=item Grey
Consensus structure implies position is single-stranded.
No substitution relative to reference sequence.
=item Light purple
Consensus structure implies position is single-stranded.
Single substitution relative to reference sequence.
=back
=head1 OPTIONS
=over 12
=item C<-h>
Prints a short help message.
=item C<-ref N>
Allows the user to select which row number will be used for the reference sequence.
=item C<-link>
Creates internal links for each sequence name, which the user can click on (in the web browser) to select different reference sequences.
This option presupposes that a separate HTML file has been created for each row of the alignment, wherein that row has been selected as the reference sequence.
To generate these files automatically, use the sscolorMult.pl script.
=item C<-out PREFIX>
Sets the filename prefix for the internal links generated by the C<-link> option.
Used by the sscolorMult.pl script.
=back
=head1 AUTHOR
Yuri Bendana - <ybendana@berkeley.edu>
=head1 LICENSE
This code is released under the GNU Public License v3.0.
=cut