From 0234c13aa6a464a858f7f30d4575cc10af4e75c8 Mon Sep 17 00:00:00 2001 From: "Le (Lena) Huang" Date: Thu, 18 Jan 2024 11:02:28 -0500 Subject: [PATCH] 3.x (#154) * dbcan 3.x development (#141) * move cli to the pkg * add .editorconfig * add pre-commit-config.yaml * add .dockerignore * reformat * rm setup script and conf * change ValueError to KeyError * add readthedocs conf * add .toml to manage the proj * add sphinx docs * .dockerignore * add user guide doc * simplify readme * reorder authors * :Update 4.1.0: 1. Update dbCAN 2. Harmonizing codes from Jinfang's updates * fix small bugs * update dbCAN version * revise sha256 for 4.1.0 * add yml file to install environment for dbcan-protocol * remove usage * :add: readthedoc badge * [fix] fix bug in meta.yaml * [Update docs/user_guide/database_preparation.rst] Installation method * [Update] rename sub.prediction.out into substrate.out * 1. rename dbcan_cli into dbcan.cli in plots.py 2. add seaborn in dbcan.yml 3. update run from raw reads rst file 4. fix pyproject.toml for typo * move Dockerfile to root * add cazyme annotation Dockerfile * remove git-lfs db * [Fix] reformat readTheDoc docs/user_guide/run_from_raw_reads.rst * [Fix] docs/user_guide/run_from_raw_reads.rst :fix some grammar issue for rst format: * 1. [update] dbcan/cli/run_dbcan.py 2. [add] add dbcan/cli/dbcan_build.py for database build, add entrypoint in pyproject and meta.yaml * delete redundant file * [fix] remove number after reference * add refs for datasets * add Wastyk2021 dataset to doc (#150) * fix typo * add dbcan_build * revise meta.yaml * fix bug in installation * add database * [fix] fix command error in installation.rst * revise the doc * revise * 1. fix bug in dbcan_sub; 2. add packages matplotlib, openpyxl * update dbcan_build * fix but for gff judgement * revise dependencies * fix bug in the docs * revise run from protein sequence to run from DNA sequence * remove the database installation page. 
remove pip install part * update docker * 1. add requests package 2. fix bug in dbcan_build * Fix issues in readthedoc * polish the catalog page --------- Co-authored-by: HD Yi --- conda-recipe/meta.yaml | 6 +- dbcan/cli/run_dbcan.py | 121 ++++++++++++++++------- dbcan/utils/dbcan_build.py | 4 +- docs/installation.rst | 61 +++++++----- docs/user_guide/database_preparation.rst | 1 + docs/user_guide/index.rst | 6 +- docs/user_guide/run_from_raw_reads.rst | 15 ++- pyproject.toml | 3 +- 8 files changed, 148 insertions(+), 69 deletions(-) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index 7be1bbfb..b95151f7 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -1,5 +1,5 @@ {% set name = "dbcan" %} -{% set version = "4.1.2" %} +{% set version = "4.1.3" %} package: name: "{{ name|lower }}" @@ -9,7 +9,8 @@ source: # the sha256 sum is generated by doing # wget -0- [URL] | shasum -a 256 url: https://github.com/linnabrown/run_dbcan/releases/download/{{ version }}/dbcan-{{ version }}.tar.gz - sha256: 3a675683379d1afc9f3444fc9894272f1485956df266a6ee4fc11a8f628e6d51 + sha256: 6346e0b6b2c810e2f808bc0bc901643c16f3eda2652d50b8f8470eb722fb419b + build: number: 0 @@ -46,6 +47,7 @@ requirements: - openpyxl - matplotlib-base - session-info + - blast test: imports: diff --git a/dbcan/cli/run_dbcan.py b/dbcan/cli/run_dbcan.py index e45ca9e1..a0e6e270 100755 --- a/dbcan/cli/run_dbcan.py +++ b/dbcan/cli/run_dbcan.py @@ -13,7 +13,7 @@ import time # Recent updated information: -# Jan/01/23: Add doc code [Haidong Yi, Le Huang] +# Jan/01/23: Add doc code [Haidong Yi, Le Huang] [Haidong Yi, Le Huang] # Oct/10/23: Recontructed the run_dbcan [Haidong Yi] # Sep/07/23: Replace hmmscan with hmmsearch. Update perl code [Le Huang, Yanbin Yin] # Dec/15/22: 1.adding function to convert cgc_standard.out to json format. 2. 
adding function cgc_[Jinfang Zheng] @@ -60,8 +60,21 @@ def runHmmer(outPath, hmm_cpu, dbDir, hmm_eval, hmm_cov, db_name): hmm_file = f"{dbDir}{db_name}.hmm" uniInput_file = f"{outPath}uniInput" - # hmmer = Popen( - # [ + # # hmmer = Popen( + # # [ + # "hmmsearch", + # "--domtblout", + # domtblout_file, + # "--cpu", + # str(hmm_cpu), + # "-o", + # "/dev/null", + # hmm_file, + # uniInput_file, + # ] + # ) + # hmmer.wait() + hmmer_list = [ # "hmmsearch", # "--domtblout", # domtblout_file, @@ -88,6 +101,9 @@ def runHmmer(outPath, hmm_cpu, dbDir, hmm_eval, hmm_cov, db_name): cmd_str = " ".join(hmmer_list) os.system(cmd_str) + cmd_str = " ".join(hmmer_list) + os.system(cmd_str) + parsed_hmm_output = hmmer_parser.run(input_file=f"{outPath}h{db_name}.out", eval_num=hmm_eval, coverage=hmm_cov) with open(f"{outPath}{db_name}.out", "w") as f: f.write(parsed_hmm_output) @@ -138,6 +154,36 @@ def split_uniInput(uniInput, dbcan_thread, outPath, dbDir, hmm_eval, hmm_cov, hm # ) # dbsub.wait() + dbsub_list = [ + "hmmsearch", + "--domtblout", + f"{outPath}d.txt", + "--cpu", + str(hmm_cpu), + "-o", + "/dev/null", + f"{dbDir}dbCAN_sub.hmm", + f"{outPath}uniInput", + ] + + dbsub_str = " ".join(dbsub_list) + os.system(dbsub_str) + + # dbsub = Popen( + # [ + # "hmmsearch", + # "--domtblout", + # f"{outPath}d.txt", + # "--cpu", + # str(hmm_cpu), + # "-o", + # "/dev/null", + # f"{dbDir}dbCAN_sub.hmm", + # f"{outPath}uniInput", + # ] + # ) + # dbsub.wait() + dbsub_list = [ "hmmsearch", "--domtblout", @@ -646,6 +692,9 @@ def run_dbCAN( # End CAZyme Extraction ###################### # Begin GFF preperation + + #union for CAZyme, tf, tp, stp + candidate_gene_set = cazyme.union(tf, tp, stp) if inputType in ["prok", "meta"]: # use Prodigal GFF output with open(outDir + prefix + "prodigal.gff") as f: @@ -679,38 +728,40 @@ def run_dbCAN( gff = True break if gff: # user file was in GFF format - with open(auxFile) as f: - with open(outDir + prefix + "cgc.gff", "w") as out: - for line in f: - row = 
line.rstrip().split("\t") - if (not line.startswith("#")) and len(row) >= 9: - if row[2] == "CDS": - note = row[8].strip().rstrip(";").split(";") - gene = "" - notes = {} - for x in note: - temp = x.split("=") - notes[temp[0]] = temp[1] - if "ID" in notes: - gene = notes["ID"] - else: - continue - if gene in cazyme: - row[2] = "CAZyme" - row[8] = "DB=" + cazyme_genes[gene] - elif gene in tf: - row[2] = "TF" - row[8] = "DB=" + tf_genes[gene] - elif gene in tp: - row[2] = "TC" - row[8] = "DB=" + tp_genes[gene] - elif gene in stp: - row[2] = "STP" - row[8] = "DB=" + stp_genes[gene] - else: - row[8] = "" - row[8] += ";ID=" + gene - out.write("\t".join(row) + "\n") + with open(auxFile) as f, open(outDir + prefix + "cgc.gff", "w") as out: + for line in f: + row = line.rstrip().split("\t") + if (not line.startswith("#")) and len(row) >= 9: + if row[2] == "CDS": + note = row[8].strip().rstrip(";").split(";") + gene1 = "" + gene2 = "" + notes = {} + for x in note: + temp = x.split("=") + notes[temp[0]] = temp[1] + # fix it tomorrow + if "ID" in notes: + gene1 = notes["ID"] + if "Name" in notes: + gene2 = notes["Name"] + # fix it tomorrow + if gene in cazyme: + row[2] = "CAZyme" + row[8] = "DB=" + cazyme_genes[gene] + elif gene in tf: + row[2] = "TF" + row[8] = "DB=" + tf_genes[gene] + elif gene in tp: + row[2] = "TC" + row[8] = "DB=" + tp_genes[gene] + elif gene in stp: + row[2] = "STP" + row[8] = "DB=" + stp_genes[gene] + else: + row[8] = "" + row[8] += ";ID=" + gene + out.write("\t".join(row) + "\n") else: # user file was in BED format with open(auxFile) as f: with open(outDir + prefix + "cgc.gff", "w") as out: diff --git a/dbcan/utils/dbcan_build.py b/dbcan/utils/dbcan_build.py index ef09f442..31d613cd 100644 --- a/dbcan/utils/dbcan_build.py +++ b/dbcan/utils/dbcan_build.py @@ -94,12 +94,12 @@ def main(): "mv fam-substrate-mapping-08012023.tsv fam-substrate-mapping.tsv", "makeblastdb -in PUL.faa -dbtype prot", "mv dbCAN-PUL_12-12-2023.xlsx dbCAN-PUL.xlsx", - "tar xzvf 
dbCAN-PUL.tar.gz", + "tar xzf dbCAN-PUL.tar.gz", "hmmpress -f dbCAN_sub.hmm", "mv CAZyDB.07262023.fa CAZyDB.fa", "diamond makedb --in CAZyDB.fa -d CAZy", "mv dbCAN-HMMdb-V12.txt dbCAN.txt", - "hmmpress dbCAN.txt" + "hmmpress dbCAN.txt", "diamond makedb --in tcdb.fa -d tcdb", "hmmpress -f tf-1.hmm", "hmmpress -f tf-2.hmm", diff --git a/docs/installation.rst b/docs/installation.rst index 4fba40dd..9b084c3d 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -57,46 +57,59 @@ You can remove this parameter if you don't want to clean, but we recommend you a away from index contamination. -Installing with PyPI --------------------- +Installing SignalP (Optional) +-------------------------------- -To install the `dbcan`_ package via ``pip``, you first need to install a few executable -dependencies: -- `NCBI-BLAST+ `_; -- `HMMER `_ (:cite:`2011:hmmer`); -- `DIAMOND `_ (:cite:`2021:diamond`); -- `SignalP `_ (:cite:`2017:nielsen`) (Optional). +- `SignalP` is optional and not essential for the core functionality of our software. Users requiring its specific features can integrate it as follows: + 1. Visit the `SignalP website `_. + 2. Submit a download `request `_. + 3. Post-download, add `SignalP` to your system's environmental variables to make it executable. -.. warning:: +- For installation assistance, refer to the :doc:`faq/signalp_installation`. - **SignalP Integration Notice** - Due to the specific licensing terms of `SignalP`, it is not included directly as a dependency in our package. This requires users to undertake a separate installation process. - **Installing SignalP (Optional)**: +.. Installing with PyPI +.. -------------------- - - `SignalP` is optional and not essential for the core functionality of our software. Users requiring its specific features can integrate it as follows: - 1. Visit the `SignalP website `_. - 2. Submit a download `request `_. - 3. Post-download, add `SignalP` to your system's environmental variables to make it executable. 
+.. To install the `dbcan`_ package via ``pip``, you first need to install a few executable +.. dependencies: + +.. - `NCBI-BLAST+ `_; +.. - `HMMER `_ (:cite:`2011:hmmer`); +.. - `DIAMOND `_ (:cite:`2021:diamond`); +.. - `SignalP `_ (:cite:`2017:nielsen`) (Optional). + +.. .. warning:: + +.. **SignalP Integration Notice** + +.. Due to the specific licensing terms of `SignalP`, it is not included directly as a dependency in our package. This requires users to undertake a separate installation process. + +.. **Installing SignalP (Optional)**: + +.. - `SignalP` is optional and not essential for the core functionality of our software. Users requiring its specific features can integrate it as follows: +.. 1. Visit the `SignalP website `_. +.. 2. Submit a download `request `_. +.. 3. Post-download, add `SignalP` to your system's environmental variables to make it executable. - - For installation assistance, refer to the :doc:`faq/signalp_installation`. +.. - For installation assistance, refer to the :doc:`faq/signalp_installation`. - This approach ensures compliance with `SignalP`'s licensing while offering the tool's functionality to those who need it. +.. This approach ensures compliance with `SignalP`'s licensing while offering the tool's functionality to those who need it. -After the dependencies are installed, `dbcan`_ can be installed via `PyPI `_: +.. After the dependencies are installed, `dbcan`_ can be installed via `PyPI `_: -.. code-block:: shell +.. .. code-block:: shell - pip install dbcan +.. pip install dbcan -.. note:: +.. .. note:: - Since ``PyPI`` doesn't have an independent build system, the dependencies of dbcan need to be installed seperatedly. - Therefore, we recommended users to install ``dbcan`` via ``Conda`` which can resolve all dependencies automatically. +.. Since ``PyPI`` doesn't have an independent build system, the dependencies of dbcan need to be installed seperatedly. +.. 
Therefore, we recommended users to install ``dbcan`` via ``Conda`` which can resolve all dependencies automatically. Installing with Docker ---------------------- diff --git a/docs/user_guide/database_preparation.rst b/docs/user_guide/database_preparation.rst index 368f123a..07752f92 100644 --- a/docs/user_guide/database_preparation.rst +++ b/docs/user_guide/database_preparation.rst @@ -20,3 +20,4 @@ Database Installation Command && cd ../ && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.fna \ && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.faa \ && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.gff + diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst index a7c5979e..da7ca52d 100644 --- a/docs/user_guide/index.rst +++ b/docs/user_guide/index.rst @@ -1,10 +1,14 @@ User Guide ========== +Hint: If you want to run from metagenomic raw reads, please refer to Run from Raw Reads: Automated CAZyme and Glycan Substrate Annotation in Microbiomes: A Step-by-Step Protocol. +Otherwise, please refer to any of the following instructions. + + + .. toctree:: :maxdepth: 1 - database_preparation quick_start run_from_protein_sequence run_with_CGCFinder diff --git a/docs/user_guide/run_from_raw_reads.rst b/docs/user_guide/run_from_raw_reads.rst index 0d7d2ea6..52a59f0d 100644 --- a/docs/user_guide/run_from_raw_reads.rst +++ b/docs/user_guide/run_from_raw_reads.rst @@ -203,6 +203,13 @@ To install the databases, execute the following commands: Download database required by Kraken2 (very slow; can be skipped if users do not intend to run Kraken2): +.. code-block:: shell + + dbcan_build --cpus 8 --db-dir db --clean + +Download database required by Kraken2 (very slow; can be skipped +if users do not intend to run Kraken2): + .. code-block:: shell kraken2-build --standard --db K2 @@ -329,8 +336,8 @@ Use Megahit for assembling reads into contigs: .. 
code-block:: shell - megahit -m 0.5 -t 32 -o megahit_ Wet2014 -1 Wet2014_1_val_1.fq.gz -2 Wet2014_2_val_2.fq.gz --out-prefix Wet2014 --min-contig-len 1000 - megahit -m 0.5 -t 32 -o megahit_ Dry2014 -1 Dry2014_1_val_1.fq.gz -2 Dry2014_2_val_2.fq.gz --out-prefix Dry2014 --min-contig-len 1000 + megahit -m 0.5 -t 32 -o megahit_Wet2014 -1 Wet2014_1_val_1.fq.gz -2 Wet2014_2_val_2.fq.gz --out-prefix Wet2014 --min-contig-len 1000 + megahit -m 0.5 -t 32 -o megahit_Dry2014 -1 Dry2014_1_val_1.fq.gz -2 Dry2014_2_val_2.fq.gz --out-prefix Dry2014 --min-contig-len 1000 ``MEGAHIT`` generates two output folders. Each contains five files and one sub-folder (Box 3). @@ -361,8 +368,8 @@ P4. Predict genes by `Prokka` (TIMING ~21h) .. code-block:: shell - prokka --kingdom Bacteria --cpus 32 --outdir prokka_ Wet2014 --prefix Wet2014 --addgenes --addmrna --locustag Wet2014 megahit_ Wet2014/Wet2014.contigs.fa - prokka --kingdom Bacteria --cpus 32 --outdir prokka_ Dry2014 --prefix Dry2014 --addgenes --addmrna --locustag Dry2014 megahit_ Dry2014/Dry2014.contigs.fa + prokka --kingdom Bacteria --cpus 32 --outdir prokka_Wet2014 --prefix Wet2014 --addgenes --addmrna --locustag Wet2014 megahit_Wet2014/Wet2014.contigs.fa + prokka --kingdom Bacteria --cpus 32 --outdir prokka_Dry2014 --prefix Dry2014 --addgenes --addmrna --locustag Dry2014 megahit_Dry2014/Dry2014.contigs.fa The parameter ``--kingdom Bacteria`` is required for bacterial gene prediction. diff --git a/pyproject.toml b/pyproject.toml index 202e1846..8409605c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ requires = ["hatchling"] [project] name = "dbcan" -version = "4.1.2" +version = "4.1.3" description = "Standalone version of dbCAN annotation tool for automated CAZyme annotation" readme = "README.md" requires-python = ">=3.6" @@ -29,6 +29,7 @@ dependencies = [ "openpyxl", "matplotlib", "pyhmmer", + "requests", # for debug logging (referenced from the issue template) "session-info" ]