Skip to content

Commit

Permalink
Smolak_fve (#18)
Browse files Browse the repository at this point in the history
* Big development commit

* Added config and snakefile

* add conda calling

* fixed Fast_virome_explorer

* changing parameters of fve

* Merge branch 'smolak_fve'

# Conflicts:
#	example/config.yaml
#	example/config_viral_identification.yaml
#	example/mhv/config_viral_identification.yaml

* Minor fixes

* Minor fixes

* Reference added

* Edited classification_readbased pipeline to use Conda environment.

* Updated assembly/assembler/spades.snake rule.
  • Loading branch information
sm0XY authored and wernerkrampl committed Mar 16, 2019
1 parent 0792f61 commit 8efcd2d
Show file tree
Hide file tree
Showing 18 changed files with 5,518 additions and 55 deletions.
2 changes: 1 addition & 1 deletion docs/conf.py
Expand Up @@ -26,7 +26,7 @@
# The short X.Y version
version = u''
# The full version, including alpha/beta/rc tags
release = u'0.2.0'
release = u'0.3.0'


# -- General configuration ---------------------------------------------------
Expand Down
86 changes: 86 additions & 0 deletions example/config.yaml
@@ -0,0 +1,86 @@
# Example pipeline configuration: read preprocessing, assembly, contig-based classification and reporting
samples: # List of sample categories to be analysed
- name: example.* # Regular expression of sample names to be analysed (reads/original/example.*_R1.fastq.gz)

report_dir: report/public/01-viral # Generated reports and essential output files would be stored there
threads: 16 # Number of threads to use in analysis

reads: # Prepare reads and quality reports for downstream analysis
preprocess: # Pre-process of reads, eliminate sequencing artifacts, contamination ...

trimmed: # Remove low quality parts of reads
method: trimmomatic # Supported values: trimmomatic
temporary: False # If True, generated files would be removed after successful analysis
crop: 500 # Maximal number of bases in read to keep. Longer reads would be truncated.
quality: 20 # Minimal average quality of read bases to keep (inside sliding window of length 5)
headcrop: 20 # Number of bases to remove from the start of read
minlen: 35 # Minimal length of trimmed read. Shorter reads would be removed.

decontaminated: # Eliminate fragments from known artificial source, e.g. contamination by human
method: bowtie2 # Supported values: bowtie2
temporary: False # If True, generated files would be removed after successful analysis
references: # List of reference genomes
- mhv
keep: True # Keep reads mapped to references (True) or remove them as contamination (False)

deduplicated: # Remove fragments with the same sequence (PCR duplicates)
method: fastuniq # Supported values: fastuniq
temporary: False # If True, generated files would be removed after successful analysis

report: # Summary reports of read characteristics to assess their quality
quality_report: # HTML summary report of read quality
method: fastqc # Supported values: fastqc
read_types: # List of preprocess steps for quality reports
- original
- trimmed
- decontaminated
- deduplicated

assembly: # Join reads into longer sequences (contigs) based on their overlaps
assembler: # Method for joining reads
method: spades # Supported values: spades
mode: standard # Supported values: standard, meta, plasmid, rna, iontorrent
careful: True # Tries to reduce number of mismatches and short indels, longer runtime

report: # Summary reports for assembly process and results
quality_report: # Quality of assembled contigs
method: quast # Supported values: quast

assembly_graph: # Visualisation of overlaps between assembled contigs
method: bandage # Supported values: bandage

classification: # Identify genomic source of sequenced reads
contig_based: # Find homologue sequences based on assembled contigs
method: blast # Supported values: blast
reference: # List of reference genomes to search for homology
mhv: # Name of reference genome (reference/mhv/mhv.fa)
query_type: nucleotide # Nucleotide or protein, according to sequence type in input .fa files
target_type: nucleotide # Nucleotide or protein, according to sequence type in blast database
max_target_seqs: 10 # Number of best hit reference sequences from blast database for each input sequence

viral: # Customized methods for identification of viruses
identification: # Identification of contigs with similarity to viral genomes
method: virfinder # Supported values: virfinder

report: # Summary reports of classification results
summary: # Aggregated HTML table with summarized attributes of contigs and homology
method: fasta_summary # Supported values: fasta_summary
max_query_seqs: 20000 # Maximal number of contigs to report (ordered by their length)
max_target_seqs: 5 # Maximal number of homologues from reference genomes to report
min_query_coverage: 0.01 # Show only hits that have at least this proportion of contig mapped to reference
include: # Optional attributes of contigs to report
- virfinder # Probability that contig is from virus
# - coverage # Number of aligned reads per contig
- blast # Homologues identified by Blast against specified reference databases

html: # Attributes applicable only for the HTML report, would NOT be used in the TSV table
seqs_per_page: 100 # Number of table rows (sequences) per page
sort_by: 'Sequence' # Rows would be sorted according to values in this column
sort_how: 'asc' # Values would be sorted in desc(ending) or asc(ending) order
columns: # Show only these attributes, in this order
- Sequence # Names of contigs with link to fasta files with its sequence
- Length # Number of bases
- Compress ratio # Complexity of contig sequence, may be used to filter repetitive sequences
- Coverage # Average number of reads covering each base of contig
- VirFinder pvalue # Probability that contig is from viral genome
- Homologue link # Reference sequences with homology
- Mapped reads # Number of mapped reads to contigs
61 changes: 61 additions & 0 deletions example/mhv/config_classification_readbased.yaml
@@ -0,0 +1,61 @@
# Example configuration for read-based viral classification with preprocessing and taxonomic reports
samples: # List of sample categories to be analysed
- name: .* # Regular expression of sample names to be analysed (reads/original/<name>_R1.fastq.gz); '.*' matches every sample
reference: mhv_classification # Reference genome or database; a kallisto index has to be created for the used reference

report_dir: report/public/01-viral # Generated reports and essential output files would be stored there
threads: 16 # Number of threads to use in analysis

reads: # Prepare reads and quality reports for downstream analysis
preprocess: # Pre-process of reads, eliminate sequencing artifacts, contamination ...
trimmed: # Remove low quality parts of reads
method: trimmomatic # Supported values: trimmomatic
temporary: False # If True, generated files would be removed after successful analysis
crop: 500 # Maximal number of bases in read to keep. Longer reads would be truncated.
quality: 20 # Minimal average quality of read bases to keep (inside sliding window of length 5)
headcrop: 20 # Number of bases to remove from the start of read
minlen: 35 # Minimal length of trimmed read. Shorter reads would be removed.
# decontaminated: # Eliminate fragments from known artificial source, e.g. contamination by human
# method: bowtie2 # Supported values: bowtie2
# temporary: False # If True, generated files would be removed after successful analysis
# references: # List of reference genomes
# - mhv
# keep: True # Keep reads mapped to references (True) or remove them as contamination (False)
deduplicated: # Remove fragments with the same sequence (PCR duplicates)
method: fastuniq # Supported values: fastuniq
temporary: False # If True, generated files would be removed after successful analysis

report: # Summary reports of read characteristics to assess their quality
quality_report: # HTML summary report of read quality
method: fastqc # Supported values: fastqc
read_types: # List of preprocess steps for quality reports (supported values: original, trimmed, decontaminated, deduplicated)
- original
- trimmed
# - decontaminated
- deduplicated

classification: # Identify genomic source of sequenced reads
read_based: # Viral identification from the reads
method: fast_virome_explorer # Tool for viral identification from the reads, supported values: fast_virome_explorer
read_type: deduplicated # Type of reads used as input for fast_virome_explorer
kmer_size: 31 # Size of analyzed k-mers (lower size = higher sensitivity and lower specificity of the method, and vice versa)
count_type: tpm # Supported metrics: read_count, tpm (transcripts per million)
ratio_of_coverages: 0.02 # Based on the ratio of the observed extent of genome coverage to the expected extent of genome coverage; if a virus has ratio_of_coverages < {set_value}, FastViromeExplorer discards the virus (default value is 0.3)
genome_coverage: 0.024 # Observed extent of genome coverage by the mapped reads; if a virus has genome_coverage < {set_value}, FastViromeExplorer discards the virus (default value is 0.1, which means coverage of 10% of the particular genome)
mapped_reads: 5 # Minimal number of reads mapped to the genome; if a virus has mapped_reads < {set_value}, FastViromeExplorer discards the virus (default value is 10)
report: # Summary reports of classification results
taxonomic_counts: # Number of reads mapped to each taxonomic unit
pieplot: # Visualisation in pie plot form
method: krona # Supported values: krona
count_table: # Summary table with number of reads per taxonomic unit
method: custom # Supported values: custom
tax_levels: # List of taxonomic levels for which tables would be generated
- class
- genus
barplot: # Visualisation in bar plot form
method: custom # Supported values: custom
formats: # Output format of the resulting images (supported values: png, svg)
- png
- svg
tax_levels: # List of taxonomic levels for which plots would be generated (supported values: class, genus)
- class
- genus
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,5 @@
NC_034484.2 N/A N/A 119451
NC_012532.1 N/A N/A 10794
NC_038294.1 N/A N/A 30111
NC_027983.1 N/A N/A 167435
NC_000867.1 N/A N/A 10079

0 comments on commit 8efcd2d

Please sign in to comment.