Smolak_fve (#18)

* Big development commit * Added config and snakefile * add conda calling * fixed Fast_virome_explorer * changing parameters of fve * Merge branch 'smolak_fve' # Conflicts: # example/config.yaml # example/config_viral_identification.yaml # example/mhv/config_viral_identification.yaml * Minor fixes * Minor fixes * Referenece added * Edited classificatin_readbased pipeline to use Conda enviroment. * Updated assembly/assembler/spades.snake rule.
jbudis · Mar 16, 2019 · 8efcd2d · 8efcd2d
1 parent 0792f61
commit 8efcd2d
Show file tree

Hide file tree

Showing 18 changed files with 5,518 additions and 55 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -26,7 +26,7 @@
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'0.2.0'
+release = u'0.3.0'
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/example/config.yaml b/example/config.yaml
@@ -0,0 +1,86 @@
+samples:                            # List of sample categories to be analysed
+    - name: example.*               # Regex expression of sample names to be analysed (reads/original/example.*_R1.fastq.gz)
+
+report_dir: report/public/01-viral         # Generated reports and essential output files would be stored there
+threads: 16                         # Number of threads to use in analysis
+
+reads:                              # Prepare reads and quality reports for downstream analysis
+    preprocess:                     # Pre-process of reads, eliminate sequencing artifacts, contamination ...
+
+        trimmed:                    # Remove low quality parts of reads
+            method: trimmomatic     # Supported values: trimmomatic
+            temporary: False        # If True, generated files would be removed after successful analysis
+            crop: 500               # Maximal number of bases in read to keep. Longer reads would be truncated.
+            quality: 20             # Minimal average quality of read bases to keep (inside sliding window of length 5)
+            headcrop: 20            # Number of bases to remove from the start of read
+            minlen: 35              # Minimal length of trimmed read. Shorter reads would be removed.
+
+        decontaminated:             # Eliminate fragments from known artificial source, e.g. contamination by human
+            method: bowtie2         # Supported values: bowtie2
+            temporary: False        # If True, generated files would be removed after successful analysis
+            references:             # List of reference genomes
+                - mhv
+            keep: True              # Keep reads mapped to references (True) or remove them as contamination (False)
+
+        deduplicated:               # Remove fragments with the same sequence (PCR duplicated)
+            method: fastuniq        # Supported values: fastuniq
+            temporary: False        # If True, generated files would be removed after successful analysis
+
+    report:                         # Summary reports of read characteristics to assess their quality
+        quality_report:             # HTML summary report of read quality
+            method: fastqc          # Supported values: fastqc
+            read_types:             # List of preprocess steps for quality reports
+                - original
+                - trimmed
+                - decontaminated
+                - deduplicated
+
+assembly:                           # Join reads into longer sequences (contigs) based on their overlaps
+    assembler:                      # Method for joining reads
+        method: spades              # Supported values: spades
+        mode: standard              # Supported values: standard, meta, plasmid, rna, iontorrent
+        careful: True               # Tries to reduce number of mismatches and short indels, longer runtime
+
+    report:                         # Summary reports for assembly process and results
+        quality_report:             # Quality of assembled contigs
+            method: quast           # Supported values: quast
+
+        assembly_graph:             # Visualisation of overlaps between assembled contigs
+            method: bandage         # Supported values: bandage
+
+classification:                               # Identify genomic source of sequenced reads
+    contig_based:                             # Find homologue sequences based on assembled contigs
+        method: blast                         # Supported values: blast
+        reference:                            # List of reference genomes to search for homology
+            mhv:                              # Name of reference genome (reference/mhv/mhv.fa)
+                query_type: nucleotide        # Nucleotide or protein, according to sequence type in input .fa files
+                target_type: nucleotide       # Nucleotide or protein, according to sequence type in blast database
+                max_target_seqs: 10           # Number of best hit reference sequences from blast database for each input sequence
+
+    viral:                                    # Customized methods for identification of viruses
+        identification:                       # Identification of contigs with similarity to viral genomes
+            method: virfinder                 # Supported values: virfinder
+
+    report:                                   # Summary reports of classification results
+        summary:                              # Aggregated HTML table with summarized attributes of contigs and homology
+            method: fasta_summary             # Supported values: fasta_summary
+            max_query_seqs: 20000             # Maximal number of contigs to report (ordered by their length)
+            max_target_seqs: 5                # Maximal number of homologues from reference genomes to report
+            min_query_coverage: 0.01          # Show only hits that have at least this proportion of contig mapped to reference
+            include:                          # Optional attributes of contigs to report
+                - virfinder                   # Probability that contig is from virus
+#                - coverage                   # Number of aligned reads per contig
+                - blast                       # Homologues identified by Blast against specified reference databases
+
+            html:                             # Attributes applicable only for the HTML report, would NOT be used in the TSV table
+                seqs_per_page: 100            # Number of table rows (sequences) per page
+                sort_by: 'Sequence'           # Rows would be sorted according to values in this column
+                sort_how: 'asc'               # Values would be sorted in desc(ending) or asc(ending) order
+                columns:                      # Show only these attributes, in this order
+                    - Sequence                # Names of contigs with link to fasta files with its sequence
+                    - Length                  # Number of bases
+                    - Compress ratio          # Complexity of contig sequence, may be used to filter repetitive sequences
+                    - Coverage                # Average number of reads covering each base of contig
+                    - VirFinder pvalue        # Probability that contig is from viral genome
+                    - Homologue link          # Reference sequences with homology
+                    - Mapped reads            # Number of mapped reads to contigs
diff --git a/example/mhv/config_classification_readbased.yaml b/example/mhv/config_classification_readbased.yaml
@@ -0,0 +1,61 @@
+samples:                             # List of sample categories to be analysed
+    - name: .*                       # Regex expression of sample names to be analysed (reads/original/.*_R1.fastq.gz)
+      reference: mhv_classification  # Reference genome or database, important to create kallisto index for used reference
+
+report_dir: report/public/01-viral   # Generated reports and essential output files would be stored there
+threads: 16                          # Number of threads to use in analysis
+
+reads:                               # Prepare reads and quality reports for downstream analysis
+    preprocess:                      # Pre-process of reads, eliminate sequencing artifacts, contamination ...
+        trimmed:                     # Remove low quality parts of reads
+            method: trimmomatic      # Supported values: trimmomatic
+            temporary: False         # If True, generated files would be removed after successful analysis
+            crop: 500                # Maximal number of bases in read to keep. Longer reads would be truncated.
+            quality: 20              # Minimal average quality of read bases to keep (inside sliding window of length 5)
+            headcrop: 20             # Number of bases to remove from the start of read
+            minlen: 35               # Minimal length of trimmed read. Shorter reads would be removed.
+#        decontaminated:            # Eliminate fragments from known artificial source, e.g. contamination by human
+#            method: bowtie2        # Supported values: bowtie2
+#            temporary: False       # If True, generated files would be removed after successful analysis
+#            references:            # List of reference genomes
+#                - mhv
+#            keep: True             # Keep reads mapped to references (True) or remove them as contamination (False)
+        deduplicated:                # Remove fragments with the same sequence (PCR duplicated)
+            method: fastuniq         # Supported values: fastuniq
+            temporary: False         # If True, generated files would be removed after successful analysis
+
+    report:                         # Summary reports of read characteristics to assess their quality
+        quality_report:             # HTML summary report of read quality
+            method: fastqc          # Supported values: fastqc
+            read_types:             # List of preprocess steps for quality reports (supported values: original, trimmed, decontaminated, deduplicated)
+                - original
+                - trimmed
+#                - decontaminated
+                - deduplicated
+
+classification:
+    read_based:                      # viral identification from the reads
+        method: fast_virome_explorer # Tool for viral identification from the reads, supported values: fast_virome_explorer
+        read_type: deduplicated      # Type of reads used as input for fast_virome_explorer
+        kmer_size: 31                # Size of analyzed k-mers (smaller size = higher sensitivity and lower specificity of the method, and vice versa)
+        count_type: tpm              # Supported metrics: read_count, tpm (transcripts per million)
+        ratio_of_coverages: 0.02     # based on the ratio of the observed extent of genome coverage with the expected extent of genome coverage, if a virus has ratio_of_coverage < {set_value}, FastViromeExplorer discards the virus (default value is 0.3)
+        genome_coverage: 0.024       # observed extent of genome coverage by the mapped reads, if a virus has genome_coverage < {set_value}, FastViromeExplorer discards the virus (default value is 0.1, which means coverage of 10% of particular genome)
+        mapped_reads: 5              # the minimal number of mapped reads to the genome, if a virus has mapped_reads < {set_value}, FastViromeExplorer discards the virus (default value is 10)
+    report:                          # Summary reports of classification results
+        taxonomic_counts:            # Number of reads mapped to each taxonomic unit
+            pieplot:                 # Visualisation in pie plot form
+                method: krona        # Supported values: krona
+            count_table:             # Summary table with number of reads per taxonomic unit
+                method: custom       # Supported values: custom
+                tax_levels:          # List of taxonomic levels for which tables would be generated
+                - class
+                - genus
+            barplot:                 # Visualisation in bar plot form
+                method: custom       # Supported values: custom
+                formats:             # Output format of the resulting images (supported values: png, svg)
+                - png
+                - svg
+                tax_levels:          # List of taxonomic levels for which plots would be generated (supported values: class, genus)
+                - class
+                - genus
diff --git a/example/mhv/reference/mhv_classification/dbsnp/dbsnp.vcf.gz b/example/mhv/reference/mhv_classification/dbsnp/dbsnp.vcf.gz
diff --git a/...mhv/reference/mhv_classification/kallisto_index/mhv_classification-kallisto-index-k31.idx b/...mhv/reference/mhv_classification/kallisto_index/mhv_classification-kallisto-index-k31.idx
diff --git a/example/mhv/reference/mhv_classification/kallisto_index/mhv_classification.lens.txt b/example/mhv/reference/mhv_classification/kallisto_index/mhv_classification.lens.txt
@@ -0,0 +1,5 @@
+NC_034484.2	N/A	N/A	119451
+NC_012532.1	N/A	N/A	10794
+NC_038294.1	N/A	N/A	30111
+NC_027983.1	N/A	N/A	167435
+NC_000867.1	N/A	N/A	10079