/
correct_gff_feature_order.pl
executable file
·342 lines (259 loc) · 10.1 KB
/
correct_gff_feature_order.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
#!/usr/bin/env perl
=head1 NAME
correct_gff_feature_order.pl - Reorders rows of a GFF file to conform with sane, expected norms.
=head1 SYNOPSIS
USAGE: correct_gff_feature_order.pl
--input=/path/to/some_file.gff
--output=/path/to/new_file.gff
=head1 OPTIONS
B<--input,-i>
This input GFF expects the convention of one CDS per gene.
B<--output,-o>
Output file to be created.
B<--log,-l>
Log file
B<--help,-h>
This help message
=head1 DESCRIPTION
While the GFF3 specification doesn't necessarily impose a restriction on the order of features
within the document, a more defined ordering can help downstream software because of practical
considerations. GFF entries are often processed in groups where each 'group' contains the features
that define a gene and its children. If these are scattered all over the document parsing
scripts have to first load the whole thing in memory.
=head1 INPUT and OUTPUT
This script makes very few assumptions on the overall contents of the input file. The following
columns are used for the final ordering (and applied in this order:)
1 - molecule
3 - feature type
4 - min location (of gene feature in each group)
Parental relationships assumed/supported:
mRNA parent of exon
mRNA parent of CDS
gene parent of mRNA
Comment lines are retained, but will all be at the top of the document. This is conventional for
much GFF header information but will completely foobar any file where comments embedded between
feature lines is important.
WARNING: this script does not currently handle multiple features on different lines with the same
ID value. They'll simply be collapsed, which is doubtfully the desired behavior.
Remember that if you have a file which is nothing but match features with no parental
relationships you can easily do a sort on the command line like this:
sort -t $'\t' -k 1,1 -k 4n,4 -k 5n,5 foo.gff3 > foo.sorted.gff3
Any other FASTA or non-columnar lines will be placed at the end of the output file.
=head1 CONTACT
Joshua Orvis
jorvis@gmail.com
=cut
use strict;
use Getopt::Long qw(:config no_ignore_case no_auto_abbrev pass_through);
use Pod::Usage;
use FindBin;
use lib "$FindBin::Bin/../lib";
use bioUtils;
my %options = ();
my $results = GetOptions (\%options,
'input|i=s',
'output|o=s',
'log|l=s',
'help|h') || pod2usage();
## display documentation
if( $options{'help'} ){
pod2usage( {-exitval => 0, -verbose => 2, -output => \*STDERR} );
}
## make sure everything passed was peachy
&check_parameters(\%options);
## open the log if requested
my $logfh;
if (defined $options{log}) {
open($logfh, ">$options{log}") || die "can't create log file: $!";
}
open(my $ifh, "<$options{input}") || die "can't read input file: $!";
open(my $ofh, ">$options{output}") || die "can't read output file: $!";
## structure like:
my @comment_lines = ();
my @other_lines = ();
my @other_feats = ();
## for all gene graph features [ gene, mRNA, CDS, exon ]
## data structure like:
# $h{$id} => {
# type => 'gene',
# parent => $id || undef,
# cols => [columns],
#
# ## only gene type have this
# # hashref of ids of child elements of this gene
# children => { cds => [], exons => [], mRNAs => [] },
# }
my %features = ();
## keeps all the row information for segmented features.
## data structure like:
# $h{$id} = [ [columns], [columns], ... ]
my %segmented_features = ();
## used to keep references of genes on molecules for sorting purposes
my %molecules = ();
while (my $line = <$ifh>) {
chomp $line;
if ( $line =~ /^\#/ ) {
push @comment_lines, $line;
next;
}
my @cols = split("\t", $line);
if ( scalar @cols < 9 ) {
push @other_lines, $line;
next;
}
my $feat_type = $cols[2];
my $id = gff3_get_column_9_value( $cols[8], 'ID' );
#my ($parent) = $cols[8] =~ /Parent\=(.+?)\;/;
my $parent = gff3_get_column_9_value( $cols[8], 'Parent' );
if ( exists $features{$id} ) {
push @{$segmented_features{$id}}, \@cols;
} else {
$features{$id} = {
type => $feat_type,
parent => $parent,
cols => \@cols,
};
}
if ( $feat_type eq 'gene' ) {
## make sure the molecule doesn't have a gene txhere already
if ( exists $molecules{$cols[0]} && $molecules{$cols[0]}{$cols[3]} ) {
die "found more than one gene at position $cols[3] on molecule $cols[0]\n";
} else {
$molecules{$cols[0]}{$cols[3]} = $id;
}
} elsif ( $feat_type eq 'mRNA' ||
$feat_type eq 'CDS' ||
$feat_type eq 'exon' ||
$feat_type eq 'five_prime_UTR' ||
$feat_type eq 'polypeptide' ||
$feat_type eq 'three_prime_UTR' ||
$feat_type eq 'tRNA' ||
$feat_type eq 'rRNA' ||
$feat_type eq 'non_canonical_five_prime_splice_site' ||
$feat_type eq 'non_canonical_three_prime_splice_site' ||
$feat_type eq 'expressed_sequence_match' ||
$feat_type eq 'repeat_region' ||
$feat_type eq 'tandem_repeat' ||
$feat_type eq 'inverted_repeat' ||
$feat_type eq 'assembly_component' ||
$feat_type eq 'protein_match' ||
$feat_type eq 'nucleotide_match' ||
$feat_type eq 'polypeptide_motif' ||
$feat_type eq 'intron' ||
$feat_type eq 'transcript_region' ||
$feat_type eq 'pseudogenic_transcript' ||
$feat_type eq 'SL1_acceptor_site' ||
$feat_type eq 'protein_coding_primary_transcript' ||
$feat_type eq 'translated_nucleotide_match' ||
$feat_type eq 'pre-miRNA' ||
$feat_type eq 'low_complexity_region'
) {
## nothing to see here, carry on
} elsif ( $feat_type eq 'contig' ||
$feat_type eq 'region' ) {
push @other_feats, $line;
} else {
die("panic! don't know what to do with feat type: $feat_type");
}
}
## first write the comments to the output file
for ( @comment_lines ) {
print $ofh "$_\n";
}
## pair genes with their child features
for my $feat_id ( keys %features ) {
my $parent_id;
if ( $features{$feat_id}{type} eq 'gene' ) {
$parent_id = undef;
} else {
$parent_id = ultimate_parent_of($features{$feat_id}{parent});
}
my $type = $features{$feat_id}{type};
## if it has a parent, make sure we know about it
if (defined $parent_id && ! exists $features{ $parent_id } ) {
die "failed to find parent ID ($parent_id) for feature $feat_id";
}
if ( $features{$feat_id}{type} ne 'gene' ) {
#print "adding: features{$parent_id}{children}{$type} }, $feat_id;\n";
push @{ $features{$parent_id}{children}{$type} }, $feat_id;
}
}
for my $molecule_id ( sort keys %molecules ) {
print "DEBUG: Found " . scalar(keys %{$molecules{$molecule_id}}) . " genes to export on molecule $molecule_id\n";
for my $start ( sort { $a<=>$b } keys %{$molecules{$molecule_id}} ) {
my $feat_id = $molecules{$molecule_id}{$start};
## print the gene first
print $ofh join("\t", @{$features{$feat_id}{cols}}), "\n";
## 5' UTRs
for my $utr_id ( @{$features{$feat_id}{children}{five_prime_UTR}} ) {
print $ofh join("\t", @{$features{$utr_id}{cols}}), "\n";
}
## then the mRNAs
for my $mRNA_id ( @{$features{$feat_id}{children}{mRNA}} ) {
print $ofh join("\t", @{$features{$mRNA_id}{cols}}), "\n";
}
## then the rRNAs
for my $rRNA_id ( @{$features{$feat_id}{children}{rRNA}} ) {
print $ofh join("\t", @{$features{$rRNA_id}{cols}}), "\n";
}
## then the tRNAs
for my $tRNA_id ( @{$features{$feat_id}{children}{tRNA}} ) {
print $ofh join("\t", @{$features{$tRNA_id}{cols}}), "\n";
}
for my $child_type ( qw(exon CDS polypeptide ) ) {
if ( exists $features{$feat_id}{children}{$child_type} ) {
for my $child_id ( sort { $features{$a}{cols}[3] <=> $features{$b}{cols}[3] } @{$features{$feat_id}{children}{$child_type}} ) {
print $ofh join("\t", @{$features{$child_id}{cols}}), "\n";
## if this is a segmented feature, print those too
if (exists $segmented_features{$child_id}) {
for my $child ( $segmented_features{$child_id} ) {
for my $elem ( sort {$$a[3] <=> $$b[3]} @$child ) {
print $ofh join("\t", @$elem), "\n";
}
}
}
}
}
}
## 3' UTRs
for my $utr_id ( @{$features{$feat_id}{children}{three_prime_UTR}} ) {
print $ofh join("\t", @{$features{$utr_id}{cols}}), "\n";
}
}
}
## write features not handled in this script
for ( @other_feats ) {
print $ofh "$_\n";
}
## finally write 'other' lines out (usually FASTA)
for ( @other_lines ) {
print $ofh "$_\n";
}
exit(0);
sub ultimate_parent_of {
my $child_id = shift;
if ( $features{$child_id}{parent} ) {
return ultimate_parent_of( $features{$child_id}{parent} );
} else {
return $child_id;
}
}
sub _log {
my $msg = shift;
print $logfh "$msg\n" if $logfh;
}
sub check_parameters {
my $options = shift;
## make sure required arguments were passed
my @required = qw( input output );
for my $option ( @required ) {
unless ( defined $$options{$option} ) {
die "--$option is a required option";
}
}
##
## you can do other things here, such as checking that files exist, etc.
##
## handle some defaults
#$options{optional_argument2} = 'foo' unless ($options{optional_argument2});
}