<!-- countTE.xml -->

<!-- 
    Copyright (C) 2015 Laurent Modolo

    This file is part of TEtools suite for galaxy.

    TEtools is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    TEtools is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with TEtools.  If not, see <http://www.gnu.org/licenses/>.
-->
<tool id="countTE" name="countTE" version="1.0.0" hidden="false">
    <!-- Short summary of the tool. The previous text ("fastq quality trimming")
         described a different tool; countTE builds a count table of reads
         mapping on TE copies (see the help section below). -->
    <description>count reads mapping on TE copies</description>
    <!-- Command used to query the version of the wrapped script. -->
    <version_command interpreter="python3">countTE.py -version</version_command>
    <!-- Cheetah template that assembles the countTE.py command line from the
         tool parameters. The script path must stay the first token of the
         template. Dataset paths are single-quoted so that they reach the
         script as one argument each, and parameter values are converted with
         str() before being compared to string literals. -->
    <command interpreter="python3">${__root_dir__}/tools/TEtools/countTE.py
        ## Options passed for every input type.
        -rosette '$rosette_file'
        -column $count_column
        -TE_fasta '$fasta_file'
        -MAPQ $MAPQ
        -count '$output_file'
        ## Exactly one of the three branches below matches the selected input type.
        #if str($type_of_input['condition']) == "fastq":
            ## Single-end reads: every fastq file of the repeat follows -RNA.
            -RNA
            #for $reads in $type_of_input['fastq_files']:
                '${reads.fastq_file}'
            #end for
            #if str($type_of_input['qc_step']) == "run_qc":
                -QC
            #end if
            #if str($type_of_input['mapper']) == "bowtie2":
                -bowtie2
            #end if
        #elif str($type_of_input['condition']) == "fastq_pair":
            ## Paired-end reads: first files follow -RNA, second files follow -RNApair.
            -RNA
            #for $reads in $type_of_input['fastq_files']:
                '${reads.fastq_file}'
            #end for
            -RNApair
            #for $mates in $type_of_input['fastq_files_pair']:
                '${mates.fastq_file_pair}'
            #end for
            #if str($type_of_input['qc_step']) == "run_qc":
                -QC
            #end if
            ## The insert size parameter only exists in the bowtie2 case.
            #if str($type_of_input['type_of_mapper']['mapper']) == "bowtie2":
                -bowtie2
                -insert $type_of_input['type_of_mapper']['insert_size']
            #end if
        #elif str($type_of_input['condition']) == "sam_file":
            ## Existing alignments: every sam file of the repeat follows -sam.
            -sam
            #for $alignment in $type_of_input['sam_files']:
                '${alignment.sam_file}'
            #end for
        #end if
        ## Optional second count table, requested through type_of_output.
        #if str($type_of_output['count_sirna']) == "yes":
            -siRNA '$output_sirna_file'
        #end if
    </command>

    <inputs>
        <!-- Rosette table and the column used to group the counts. -->
        <param name="rosette_file" type="data" format="tabular"
               label="rosette file. Each column correspond to a variable and the first one to the TE copy names" />
        <param name="count_column" type="integer" value="2" label="rosette file column to count"/>
        <!-- Reference sequences passed to the script with -TE_fasta. -->
        <param name="fasta_file" type="data" format="fasta" label="list of TE copies in fasta format" />

        <!-- Kind of read data: single-end fastq, paired-end fastq or sam files. -->
        <conditional name="type_of_input">
            <param name="condition" type="select" label="input data type">
                <option value="fastq">fastq</option>
                <option value="fastq_pair">fastq paired</option>
                <option value="sam_file">sam</option>
            </param>
            <when value="fastq">
                <repeat name="fastq_files" title="fastq file">
                    <param name="fastq_file" type="data" format="fastq" label="fastq file"/>
                </repeat>
                <param name="qc_step" type="boolean" checked="false"
                       truevalue="run_qc" falsevalue="skip_qc"
                       label="Run UrQt quality trimmer on the data"/>
                <param name="mapper" type="select" label="Mapper to use: bowtie (smallRNASeq) or bowtie2 (RNASeq)">
                    <option value="bowtie">bowtie</option>
                    <option value="bowtie2">bowtie2</option>
                </param>
            </when>
            <when value="fastq_pair">
                <repeat name="fastq_files" title="fastq file">
                    <param name="fastq_file" type="data" format="fastq" label="fastq file"/>
                </repeat>
                <repeat name="fastq_files_pair" title="fastq pair file">
                    <param name="fastq_file_pair" type="data" format="fastq" label="fastq file"/>
                </repeat>
                <param name="qc_step" type="boolean" checked="false"
                       truevalue="run_qc" falsevalue="skip_qc"
                       label="Run UrQt quality trimmer on the data"/>
                <!-- The insert size is only asked for when bowtie2 is selected. -->
                <conditional name="type_of_mapper">
                    <param name="mapper" type="select" label="Mapper to use: bowtie (smallRNASeq) or bowtie2 (RNASeq)">
                        <option value="bowtie">bowtie</option>
                        <option value="bowtie2">bowtie2</option>
                    </param>
                    <when value="bowtie"/>
                    <when value="bowtie2">
                        <param name="insert_size" type="integer" value="500" min="0"
                               label="insert size for the paired-end library"/>
                    </when>
                </conditional>
            </when>
            <when value="sam_file">
                <repeat name="sam_files" title="sam file">
                    <param name="sam_file" type="data" format="sam"
                           label="sam alignment file if the reads were already mapped on the list of TE copies"/>
                </repeat>
            </when>
        </conditional>

        <param name="MAPQ" type="integer" value="255" min="0" max="255"
               label="maximum MAPQ mapping quality value to count a read as mapped (the lower the number the better the quality)"/>

        <!-- Switch for the optional siRNA count table. Neither choice adds
             parameters, so both cases are declared explicitly as empty. -->
        <conditional name="type_of_output">
            <param name="count_sirna" type="select" label="count siRNA (21pb reads) in a different file">
                <option value="no">No</option>
                <option value="yes">Yes</option>
            </param>
            <when value="no"/>
            <when value="yes"/>
        </conditional>
    </inputs>

    <outputs>
        <!-- Count table whose path is given to the script with the -count option.
             Further datasets are discovered in the "alignment" directory
             (presumably the sam files produced by the mapping step; confirm
             against countTE.py). -->
        <data name="output_file"
              format="tabular"
              label="${tool.name} on ${on_string}: reads count table">
            <discover_datasets directory="alignment"
                               pattern="__designation_and_ext__"
                               visible="true" />
        </data>
        <!-- siRNA count table, only created when count_sirna is set to "yes". -->
        <data name="output_sirna_file"
              format="tabular"
              label="${tool.name} on ${on_string}: reads count table for siRNA">
            <filter>(type_of_output['count_sirna']=='yes')</filter>
        </data>
    </outputs>

    <!-- Packages that must be available for the tool to run. -->
    <requirements>
        <!-- Interpreter named in the command and version_command elements. -->
        <requirement type="package">python3</requirement>
        <requirement type="package">nice</requirement>
    </requirements>

    <help>
## countTE

Compute a count table file from NGS data file(s), a fasta file containing a list of TE copy sequences and a rosette file.

### rosette file
The rosette file contains at least 2 columns. The first column corresponds to the list of TE copy names from the fasta file, and the second column corresponds to a variable associated with these TE copy names, on which we want to compute the counts.

For example, we can write the following rosette file:
```
2L|(3071416..3071503,3071708..3071841)|DNA/P|PROTOP   PROTOP
2L|(5363113..5363154,5363819..5363952)|DNA/P|PROTOP   PROTOP
2L|c(9889960..9890093,9890313..9890400)|DNA/P|PROTOP  PROTOP
2L|(20948958..20949699)|DNA/RC|DNAREP1_DM             DNAREP1_DM
2L|c(20958914..20959207)|DNA/RC|DNAREP1_DM            DNAREP1_DM
2L|c(20966385..20966456)|DNA/RC|DNAREP1_DM            DNAREP1_DM
2L|(20976274..20976387)|DNA/RC|DNAREP1_DM             DNAREP1_DM
```

which will allow us to count reads mapping on the `PROTOP` and the `DNAREP1_DM` elements.
The rosette file can contain more TE copy names than the fasta file, but we cannot map a read on a TE copy not present in the fasta file.
Conversely, the fasta file can contain copies not present in the rosette file, but reads mapping on these copies will be ignored.

The rosette file can contain as many variable columns as necessary.
countTE will group together the count of reads mapping on TE copies according to the variable column defined by `count_column`.

### NGS Data file

The NGS data set can be of two types: fastq sequence files or sam alignment files.

#### fastq files
You can add any number of **fastq files** to be mapped on the fasta file, for paired-end data you must add the same number of paired fastq files.

When fastq files are provided, countTE computes an index of the fasta file and then maps the reads using [bowtie](http://bowtie-bio.sourceforge.net/index.shtml) or [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml).
For smallRNA sequencing data we recommend using [bowtie](http://bowtie-bio.sourceforge.net/index.shtml), which seems to perform better than [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml).
When using RNA sequencing data we recommend using [bowtie2](http://bowtie-bio.sourceforge.net/bowtie2/index.shtml) and specifying the correct insert size used to build the library.

#### sam file
countTE outputs the sam alignment files corresponding to each fastq file, or to each pair of fastq files in the case of paired-end data.
You can also directly use sam alignment files instead of fastq files to skip the mapping step of countTE.
This is particularly useful to compute a count table according to another column in the rosette file, for example.

### output file
countTE reports a space delimited tabular text file of the counts.
The first columns correspond to the columns of the rosette file without the first one and with the `count_column` in first position.
The following column(s) correspond to the mapped read counts for each fastq file or sam file, and the last column corresponds to the total of these counts.
    </help>
</tool>