/
countTE.xml
executable file
·184 lines (164 loc) · 9.38 KB
/
countTE.xml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
<!--
Copyright (C) 2015 Laurent Modolo
This file is part of TEtools suite for galaxy.
TEtools is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
TEtools is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with TEtools. If not, see <http://www.gnu.org/licenses/>.
-->
<tool id="countTE" name="countTE" version="1.0.0" hidden="false">
<!-- One-line summary of the tool; the previous text ("fastq quality trimming")
     described a different tool and did not match what countTE's help documents. -->
<description>count reads mapping on TE copies</description>
<!-- Command used to report the version of the wrapped countTE.py script. -->
<version_command interpreter="python3">countTE.py -version</version_command>
<!--
Cheetah template that builds the countTE.py command line.

Always passed: rosette file, column to count, TE fasta file, MAPQ threshold
and the path of the main count table.
Depending on the selected input type, the template then appends either the
single-end fastq files (-RNA), the paired-end fastq files (-RNA and -RNApair)
or the sam files (-sam), followed by the optional -QC, -bowtie2 and -insert
flags. -siRNA is appended when a separate siRNA count table is requested.

Dataset paths are single-quoted so that a path containing spaces or shell
metacharacters reaches countTE.py as a single argument. Parameter values are
passed through str() before being compared, so each test is a plain string
comparison whatever wrapper object Galaxy hands to the template.
-->
<command interpreter="python3">${__root_dir__}/tools/TEtools/countTE.py
    -rosette '$rosette_file'
    -column $count_column
    -TE_fasta '$fasta_file'
    -MAPQ $MAPQ
    -count '$output_file'
    ## Single-end reads: every fastq dataset is listed after -RNA.
    #if str($type_of_input['condition']) == "fastq":
        -RNA
        #for $file in $type_of_input['fastq_files']:
            '${file['fastq_file']}'
        #end for
        #if str($type_of_input['qc_step']) == "run_qc":
            -QC
        #end if
        #if str($type_of_input['mapper']) == "bowtie2":
            -bowtie2
        #end if
    #end if
    ## Paired-end reads: first mates after -RNA, second mates after -RNApair.
    #if str($type_of_input['condition']) == "fastq_pair":
        -RNA
        #for $file in $type_of_input['fastq_files']:
            '${file['fastq_file']}'
        #end for
        -RNApair
        #for $file in $type_of_input['fastq_files_pair']:
            '${file['fastq_file_pair']}'
        #end for
        #if str($type_of_input['qc_step']) == "run_qc":
            -QC
        #end if
        ## The insert size is only collected (and passed) for bowtie2.
        #if str($type_of_input['type_of_mapper']['mapper']) == "bowtie2":
            -bowtie2
            -insert $type_of_input['type_of_mapper']['insert_size']
        #end if
    #end if
    ## Already-mapped reads: the sam files are passed as-is after -sam.
    #if str($type_of_input['condition']) == "sam_file":
        -sam
        #for $sam in $type_of_input['sam_files']:
            '${sam['sam_file']}'
        #end for
    #end if
    ## Optional second count table restricted to siRNA reads.
    #if str($type_of_output['count_sirna']) == "yes":
        -siRNA '$output_sirna_file'
    #end if
</command>
<!--
Form parameters of countTE.

- rosette_file / count_column / fasta_file / MAPQ are always shown.
- type_of_input switches between single-end fastq, paired-end fastq and sam
  input; each case exposes exactly the parameters the command template reads
  for that case.
- type_of_output toggles the extra siRNA count table.

Every repeat requires at least one entry (min="1"), because the command
template would otherwise emit its input flag with no file after it.
Every conditional declares one "when" per select option, including the
options that add no parameter.
-->
<inputs>
    <param name="rosette_file" type="data" format="tabular" label="rosette file. Each column correspond to a variable and the first one to the TE copy names" />
    <param name="count_column" type="integer" value="2" label="rosette file column to count"/>
    <param name="fasta_file" type="data" format="fasta" label="list of TE copies in fasta format" />
    <conditional name="type_of_input">
        <param name="condition" type="select" label="input data type">
            <option value="fastq">fastq</option>
            <option value="fastq_pair">fastq paired</option>
            <option value="sam_file">sam</option>
        </param>
        <!-- Single-end reads: the mapper is a plain select, no insert size is needed. -->
        <when value="fastq">
            <repeat name="fastq_files" title="fastq file" min="1">
                <param name="fastq_file" type="data" format="fastq" label="fastq file"/>
            </repeat>
            <param name="qc_step" type="boolean" checked="false" truevalue="run_qc" falsevalue="skip_qc" label="Run UrQt quality trimmer on the data"/>
            <param name="mapper" type="select" label="Mapper to use: bowtie (smallRNASeq) or bowtie2 (RNASeq)">
                <option value="bowtie">bowtie</option>
                <option value="bowtie2">bowtie2</option>
            </param>
        </when>
        <!-- Paired-end reads: two parallel lists of files, plus a nested
             conditional so the insert size is only asked for bowtie2. -->
        <when value="fastq_pair">
            <repeat name="fastq_files" title="fastq file" min="1">
                <param name="fastq_file" type="data" format="fastq" label="fastq file"/>
            </repeat>
            <repeat name="fastq_files_pair" title="fastq pair file" min="1">
                <param name="fastq_file_pair" type="data" format="fastq" label="fastq file"/>
            </repeat>
            <param name="qc_step" type="boolean" checked="false" truevalue="run_qc" falsevalue="skip_qc" label="Run UrQt quality trimmer on the data"/>
            <conditional name="type_of_mapper">
                <param name="mapper" type="select" label="Mapper to use: bowtie (smallRNASeq) or bowtie2 (RNASeq)">
                    <option value="bowtie">bowtie</option>
                    <option value="bowtie2">bowtie2</option>
                </param>
                <when value="bowtie"/>
                <when value="bowtie2">
                    <param name="insert_size" type="integer" value="500" min="0" label="insert size for the paired-end library"/>
                </when>
            </conditional>
        </when>
        <!-- Reads already mapped on the TE copies: the mapping step is skipped. -->
        <when value="sam_file">
            <repeat name="sam_files" title="sam file" min="1">
                <param name="sam_file" type="data" format="sam" label="sam alignment file if the reads were already mapped on the list of TE copies"/>
            </repeat>
        </when>
    </conditional>
    <param name="MAPQ" type="integer" value="255" min="0" max="255" label="maximum MAPQ mapping quality value to count a read as mapped (the lower the number the better the quality)"/>
    <conditional name="type_of_output">
        <param name="count_sirna" type="select" label="count siRNA (21pb reads) in a different file">
            <option value="no">No</option>
            <option value="yes">Yes</option>
        </param>
        <!-- Neither option adds a parameter; the cases are declared explicitly
             so each select option has a matching "when". -->
        <when value="no"/>
        <when value="yes"/>
    </conditional>
</inputs>
<outputs>
    <!-- Main count table, written to the path passed with -count.
         Files found in the "alignment" directory are additionally collected as
         visible datasets (presumably the per-input sam files mentioned in the
         help; confirm against countTE.py). -->
    <data name="output_file" format="tabular" label="${tool.name} on ${on_string}: reads count table">
        <discover_datasets directory="alignment" pattern="__designation_and_ext__" visible="true"/>
    </data>
    <!-- siRNA count table, written to the path passed with -siRNA; only
         created when the user selected "Yes" for count_sirna. -->
    <data name="output_sirna_file" format="tabular" label="${tool.name} on ${on_string}: reads count table for siRNA">
        <filter>(type_of_output['count_sirna']=='yes')</filter>
    </data>
</outputs>
<!-- Packages that must be available in the job environment.
     NOTE(review): "nice" is declared as a package; presumably countTE.py
     invokes the nice utility, confirm against the script. -->
<requirements>
    <requirement type="package">python3</requirement>
    <requirement type="package">nice</requirement>
</requirements>
<help>
## countTE
Compute a count table file from NGS data file(s), a fasta file containing a list of TE copy sequences and a rosette file.
### rosette file
The rosette file contains at least 2 columns. The first column corresponds to the list of TE copy names from the fasta file, and the second column corresponds to a variable associated with these TE copy names on which we want to compute the counts.
For example, we can write the following rosette file:
```
2L|(3071416..3071503,3071708..3071841)|DNA/P|PROTOP PROTOP
2L|(5363113..5363154,5363819..5363952)|DNA/P|PROTOP PROTOP
2L|c(9889960..9890093,9890313..9890400)|DNA/P|PROTOP PROTOP
2L|(20948958..20949699)|DNA/RC|DNAREP1_DM DNAREP1_DM
2L|c(20958914..20959207)|DNA/RC|DNAREP1_DM DNAREP1_DM
2L|c(20966385..20966456)|DNA/RC|DNAREP1_DM DNAREP1_DM
2L|(20976274..20976387)|DNA/RC|DNAREP1_DM DNAREP1_DM
```
which will allow us to count reads mapping on the `PROTOP` and the `DNAREP1_DM` elements.
The rosette file can contain more TE copy names than the fasta file, but we cannot map a read on a TE copy not present in the fasta file.
And the fasta file can contain copies not present in the rosette file, but reads mapping on these copies will be ignored.
The rosette file can contain as many variable columns as necessary.
countTE will group together the count of reads mapping on TE copies according to the variable column defined by `count_column`.
### NGS Data file
The NGS data set can be of two types: fastq sequence files or sam alignment files.
#### fastq files
You can add any number of **fastq files** to be mapped on the fasta file; for paired-end data you must add the same number of paired fastq files.
When fastq files are provided, countTE computes an index of the fasta file and then maps the reads using [bowtie](http://bowtie-bio.sourceforge.net/index.shtml) or [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml).
For smallRNA sequencing data we recommend using [bowtie](http://bowtie-bio.sourceforge.net/index.shtml), which seems to perform better than [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml).
When using RNA sequencing data we recommend using [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) and specifying the correct insert size used to build the library.
#### sam file
countTE outputs the sam alignment files corresponding to each fastq file or pair of fastq files in the case of paired-end data.
You can also directly use sam alignment files instead of fastq files to skip the mapping step of countTE.
This is particularly useful to compute a count table according to another column in the rosette file, for example.
### output file
countTE reports a space delimited tabular text file of the counts.
The first columns correspond to the columns of the rosette file without the first one and with the `count_column` in first position.
The following column(s) correspond to the mapped read counts for each fastq file or sam file, and the last column corresponds to the total of these counts.
</help>
</tool>