From 0234c13aa6a464a858f7f30d4575cc10af4e75c8 Mon Sep 17 00:00:00 2001
From: "Le (Lena) Huang" <lehuang@unc.edu>
Date: Thu, 18 Jan 2024 11:02:28 -0500
Subject: [PATCH] 3.x (#154)

* dbcan 3.x development (#141)

* move cli to the pkg

* add .editorconfig

* add pre-commit-config.yaml

* add .dockerignore

* reformat

* rm setup script and conf

* change ValueError to KeyError

* add readthedocs conf

* add .toml to manage the proj

* add sphinx docs

* .dockerignore

* add user guide doc

* simplify readme

* reorder authors

* :Update 4.1.0: 1. Update dbCAN 2. Harmonizing codes from Jinfang's updates

* fix small bugs

* update dbCAN version

* revise sha256 for 4.1.0

* add yml file to install environment for dbcan-protocol

* remove usage

* :add: readthedoc badge

* [fix] fix bug in meta.yaml

* [Update docs/user_guide/database_preparation.rst] Installation method

* [Update] rename sub.prediction.out into substrate.out

* 1. rename dbcan_cli into dbcan.cli in plots.py 2. add seaborn in dbcan.yml 3. update run from raw reads rst file 4. fix pyproject.toml for typo

* move Dockerfile to root

* add cazyme annotation Dockerfile

* remove git-lfs db

* [Fix] reformat readTheDoc docs/user_guide/run_from_raw_reads.rst

* [Fix] docs/user_guide/run_from_raw_reads.rst :fix some grammar issue for rst format:

* 1. [update] dbcan/cli/run_dbcan.py 2. [add] add dbcan/cli/dbcan_build.py for database build, add entrypoint in pyproject and meta.yaml

* delete redundant file

* [fix] remove number after reference

* add refs for datasets

* add Wastyk2021 dataset to doc (#150)

* fix typo

* add dbcan_build

* revise meta.yaml

* fix bug in installation

* add database

* [fix] fix command error in installation.rst

* revise the doc

* revise

* 1. fix bug in dbcan_sub; 2. add packages matplotlib, openpyxl

* update dbcan_build

* fix bug for gff judgement

* revise dependencies

* fix bug in the docs

* revise run from protein sequence to run from DNA sequence

* remove the database installation page. remove pip install part

* update docker

* 1. add requests package 2. fix bug in dbcan_build

* Fix issues in readthedoc

* polish the catalog page

---------

Co-authored-by: HD Yi <haidyi@cs.unc.edu>
---
 conda-recipe/meta.yaml                   |   6 +-
 dbcan/cli/run_dbcan.py                   | 121 ++++++++++++++++-------
 dbcan/utils/dbcan_build.py               |   4 +-
 docs/installation.rst                    |  61 +++++++-----
 docs/user_guide/database_preparation.rst |   1 +
 docs/user_guide/index.rst                |   6 +-
 docs/user_guide/run_from_raw_reads.rst   |  15 ++-
 pyproject.toml                           |   3 +-
 8 files changed, 148 insertions(+), 69 deletions(-)

diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
index 7be1bbfb..b95151f7 100644
--- a/conda-recipe/meta.yaml
+++ b/conda-recipe/meta.yaml
@@ -1,5 +1,5 @@
 {% set name = "dbcan" %}
-{% set version = "4.1.2" %}
+{% set version = "4.1.3" %}
 
 package:
   name: "{{ name|lower }}"
@@ -9,7 +9,8 @@ source:
   # the sha256 sum is generated by doing 
   # wget -0- [URL] | shasum -a 256
   url: https://github.com/linnabrown/run_dbcan/releases/download/{{ version }}/dbcan-{{ version }}.tar.gz
-  sha256: 3a675683379d1afc9f3444fc9894272f1485956df266a6ee4fc11a8f628e6d51
+  sha256: 6346e0b6b2c810e2f808bc0bc901643c16f3eda2652d50b8f8470eb722fb419b
+
 
 build:
   number: 0
@@ -46,6 +47,7 @@ requirements:
     - openpyxl
     - matplotlib-base
     - session-info
+    - blast
 
 test:
   imports:
diff --git a/dbcan/cli/run_dbcan.py b/dbcan/cli/run_dbcan.py
index e45ca9e1..a0e6e270 100755
--- a/dbcan/cli/run_dbcan.py
+++ b/dbcan/cli/run_dbcan.py
@@ -13,7 +13,7 @@
 import time
 
 # Recent updated information:
-#   Jan/01/23: Add doc code [Haidong Yi, Le Huang]
+#   Jan/01/23: Add doc code [Haidong Yi, Le Huang]
 #   Oct/10/23: Recontructed the run_dbcan [Haidong Yi]
 #   Sep/07/23: Replace hmmscan with hmmsearch. Update perl code [Le Huang, Yanbin Yin]
 #   Dec/15/22: 1.adding function to convert cgc_standard.out to json format. 2. adding function cgc_[Jinfang Zheng]
@@ -60,8 +60,21 @@ def runHmmer(outPath, hmm_cpu, dbDir, hmm_eval, hmm_cov, db_name):
     hmm_file = f"{dbDir}{db_name}.hmm"
     uniInput_file = f"{outPath}uniInput"
 
-    # hmmer = Popen(
-    #     [
+    # # hmmer = Popen(
+    # #     [
+    #         "hmmsearch",
+    #         "--domtblout",
+    #         domtblout_file,
+    #         "--cpu",
+    #         str(hmm_cpu),
+    #         "-o",
+    #         "/dev/null",
+    #         hmm_file,
+    #         uniInput_file,
+    #     ]
+    # )
+    # hmmer.wait()
+    hmmer_list = [
     #         "hmmsearch",
     #         "--domtblout",
     #         domtblout_file,
@@ -88,6 +101,9 @@ def runHmmer(outPath, hmm_cpu, dbDir, hmm_eval, hmm_cov, db_name):
     cmd_str = " ".join(hmmer_list)
     os.system(cmd_str)
     
+    cmd_str = " ".join(hmmer_list)
+    os.system(cmd_str)
+    
     parsed_hmm_output = hmmer_parser.run(input_file=f"{outPath}h{db_name}.out", eval_num=hmm_eval, coverage=hmm_cov)
     with open(f"{outPath}{db_name}.out", "w") as f:
         f.write(parsed_hmm_output)
@@ -138,6 +154,36 @@ def split_uniInput(uniInput, dbcan_thread, outPath, dbDir, hmm_eval, hmm_cov, hm
     # )
     # dbsub.wait()
 
+    dbsub_list = [
+            "hmmsearch",
+            "--domtblout",
+            f"{outPath}d.txt",
+            "--cpu",
+            str(hmm_cpu),
+            "-o",
+            "/dev/null",
+            f"{dbDir}dbCAN_sub.hmm",
+            f"{outPath}uniInput",
+        ]
+    
+    dbsub_str = " ".join(dbsub_list)
+    os.system(dbsub_str)
+    
+    # dbsub = Popen(
+    #     [
+    #         "hmmsearch",
+    #         "--domtblout",
+    #         f"{outPath}d.txt",
+    #         "--cpu",
+    #         str(hmm_cpu),
+    #         "-o",
+    #         "/dev/null",
+    #         f"{dbDir}dbCAN_sub.hmm",
+    #         f"{outPath}uniInput",
+    #     ]
+    # )
+    # dbsub.wait()
+
     dbsub_list = [
             "hmmsearch",
             "--domtblout",
@@ -646,6 +692,9 @@ def run_dbCAN(
         # End CAZyme Extraction
         ######################
         # Begin GFF preperation
+            
+        #union for CAZyme, tf, tp, stp
+        candidate_gene_set = cazyme.union(tf, tp, stp)
 
         if inputType in ["prok", "meta"]:  # use Prodigal GFF output
             with open(outDir + prefix + "prodigal.gff") as f:
@@ -679,38 +728,40 @@ def run_dbCAN(
                             gff = True
                             break
             if gff:  # user file was in GFF format
-                with open(auxFile) as f:
-                    with open(outDir + prefix + "cgc.gff", "w") as out:
-                        for line in f:
-                            row = line.rstrip().split("\t")
-                            if (not line.startswith("#")) and len(row) >= 9:
-                                if row[2] == "CDS":
-                                    note = row[8].strip().rstrip(";").split(";")
-                                    gene = ""
-                                    notes = {}
-                                    for x in note:
-                                        temp = x.split("=")
-                                        notes[temp[0]] = temp[1]
-                                    if "ID" in notes:
-                                        gene = notes["ID"]
-                                    else:
-                                        continue
-                                    if gene in cazyme:
-                                        row[2] = "CAZyme"
-                                        row[8] = "DB=" + cazyme_genes[gene]
-                                    elif gene in tf:
-                                        row[2] = "TF"
-                                        row[8] = "DB=" + tf_genes[gene]
-                                    elif gene in tp:
-                                        row[2] = "TC"
-                                        row[8] = "DB=" + tp_genes[gene]
-                                    elif gene in stp:
-                                        row[2] = "STP"
-                                        row[8] = "DB=" + stp_genes[gene]
-                                    else:
-                                        row[8] = ""
-                                    row[8] += ";ID=" + gene
-                                    out.write("\t".join(row) + "\n")
+                with open(auxFile) as f, open(outDir + prefix + "cgc.gff", "w") as out:
+                    for line in f:
+                        row = line.rstrip().split("\t")
+                        if (not line.startswith("#")) and len(row) >= 9:
+                            if row[2] == "CDS":
+                                note = row[8].strip().rstrip(";").split(";")
+                                gene1 = ""
+                                gene2 = ""
+                                notes = {}
+                                for x in note:
+                                    temp = x.split("=")
+                                    notes[temp[0]] = temp[1]
+                                 # fix it tomorrow
+                                if "ID" in notes:
+                                    gene1 = notes["ID"]
+                                if "Name" in notes:
+                                    gene2 = notes["Name"]
+                                # fix it tomorrow
+                                if gene in cazyme:
+                                    row[2] = "CAZyme"
+                                    row[8] = "DB=" + cazyme_genes[gene]
+                                elif gene in tf:
+                                    row[2] = "TF"
+                                    row[8] = "DB=" + tf_genes[gene]
+                                elif gene in tp:
+                                    row[2] = "TC"
+                                    row[8] = "DB=" + tp_genes[gene]
+                                elif gene in stp:
+                                    row[2] = "STP"
+                                    row[8] = "DB=" + stp_genes[gene]
+                                else:
+                                    row[8] = ""
+                                row[8] += ";ID=" + gene
+                                out.write("\t".join(row) + "\n")
             else:  # user file was in BED format
                 with open(auxFile) as f:
                     with open(outDir + prefix + "cgc.gff", "w") as out:
diff --git a/dbcan/utils/dbcan_build.py b/dbcan/utils/dbcan_build.py
index ef09f442..31d613cd 100644
--- a/dbcan/utils/dbcan_build.py
+++ b/dbcan/utils/dbcan_build.py
@@ -94,12 +94,12 @@ def main():
         "mv fam-substrate-mapping-08012023.tsv fam-substrate-mapping.tsv",
         "makeblastdb -in PUL.faa -dbtype prot",
         "mv dbCAN-PUL_12-12-2023.xlsx dbCAN-PUL.xlsx",
-        "tar xzvf dbCAN-PUL.tar.gz",
+        "tar xzf dbCAN-PUL.tar.gz",
         "hmmpress -f dbCAN_sub.hmm",
         "mv CAZyDB.07262023.fa CAZyDB.fa",
         "diamond makedb --in CAZyDB.fa -d CAZy",
         "mv dbCAN-HMMdb-V12.txt dbCAN.txt",
-        "hmmpress dbCAN.txt"
+        "hmmpress dbCAN.txt",
         "diamond makedb --in tcdb.fa -d tcdb",
         "hmmpress -f tf-1.hmm",
         "hmmpress -f tf-2.hmm",
diff --git a/docs/installation.rst b/docs/installation.rst
index 4fba40dd..9b084c3d 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -57,46 +57,59 @@ You can remove this parameter if you don't want to clean, but we recommend you a
 away from index contamination.
 
 
-Installing with PyPI
---------------------
+Installing SignalP (Optional)
+--------------------------------
 
-To install the `dbcan`_ package via ``pip``, you first need to install a few executable
-dependencies:
 
-- `NCBI-BLAST+ <https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html>`_;
-- `HMMER <http://hmmer.org/>`_ (:cite:`2011:hmmer`);
-- `DIAMOND <https://github.com/bbuchfink/diamond>`_ (:cite:`2021:diamond`);
-- `SignalP <https://services.healthtech.dtu.dk/services/SignalP-4.1/>`_ (:cite:`2017:nielsen`) (Optional).
+- `SignalP` is optional and not essential for the core functionality of our software. Users requiring its specific features can integrate it as follows:
+   1. Visit the `SignalP website <https://services.healthtech.dtu.dk/services/SignalP-4.1/>`_.
+   2. Submit a download `request <https://services.healthtech.dtu.dk/cgi-bin/sw_request?software=signalp&version=4.1&packageversion=4.1g&platform=Linux>`_.
+   3. After downloading, add `SignalP` to your system's environment variables to make it executable.
 
-.. warning::
+- For installation assistance, refer to the :doc:`faq/signalp_installation`.
 
-   **SignalP Integration Notice**
 
-   Due to the specific licensing terms of `SignalP`, it is not included directly as a dependency in our package. This requires users to undertake a separate installation process.
 
-   **Installing SignalP (Optional)**:
+.. Installing with PyPI
+.. --------------------
 
-      - `SignalP` is optional and not essential for the core functionality of our software. Users requiring its specific features can integrate it as follows:
-         1. Visit the `SignalP website <https://services.healthtech.dtu.dk/services/SignalP-4.1/>`_.
-         2. Submit a download `request <https://services.healthtech.dtu.dk/cgi-bin/sw_request?software=signalp&version=4.1&packageversion=4.1g&platform=Linux>`_.
-         3. Post-download, add `SignalP` to your system's environmental variables to make it executable.
+.. To install the `dbcan`_ package via ``pip``, you first need to install a few executable
+.. dependencies:
+
+.. - `NCBI-BLAST+ <https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html>`_;
+.. - `HMMER <http://hmmer.org/>`_ (:cite:`2011:hmmer`);
+.. - `DIAMOND <https://github.com/bbuchfink/diamond>`_ (:cite:`2021:diamond`);
+.. - `SignalP <https://services.healthtech.dtu.dk/services/SignalP-4.1/>`_ (:cite:`2017:nielsen`) (Optional).
+
+.. .. warning::
+
+..    **SignalP Integration Notice**
+
+..    Due to the specific licensing terms of `SignalP`, it is not included directly as a dependency in our package. This requires users to undertake a separate installation process.
+
+..    **Installing SignalP (Optional)**:
+
+..       - `SignalP` is optional and not essential for the core functionality of our software. Users requiring its specific features can integrate it as follows:
+..          1. Visit the `SignalP website <https://services.healthtech.dtu.dk/services/SignalP-4.1/>`_.
+..          2. Submit a download `request <https://services.healthtech.dtu.dk/cgi-bin/sw_request?software=signalp&version=4.1&packageversion=4.1g&platform=Linux>`_.
+..          3. Post-download, add `SignalP` to your system's environmental variables to make it executable.
       
-      - For installation assistance, refer to the :doc:`faq/signalp_installation`.
+..       - For installation assistance, refer to the :doc:`faq/signalp_installation`.
 
-   This approach ensures compliance with `SignalP`'s licensing while offering the tool's functionality to those who need it.
+..    This approach ensures compliance with `SignalP`'s licensing while offering the tool's functionality to those who need it.
 
 
 
-After the dependencies are installed, `dbcan`_ can be installed via `PyPI <https://pypi.org/>`_:
+.. After the dependencies are installed, `dbcan`_ can be installed via `PyPI <https://pypi.org/>`_:
 
-.. code-block:: shell
+.. .. code-block:: shell
 
-    pip install dbcan
+..     pip install dbcan
 
-.. note::
+.. .. note::
 
-   Since ``PyPI`` doesn't have an independent build system, the dependencies of dbcan need to be installed seperatedly.
-   Therefore, we recommended users to install ``dbcan`` via ``Conda`` which can resolve all dependencies automatically.
+..    Since ``PyPI`` doesn't have an independent build system, the dependencies of dbcan need to be installed separately.
+..    Therefore, we recommend that users install ``dbcan`` via ``Conda``, which can resolve all dependencies automatically.
 
 Installing with Docker
 ----------------------
diff --git a/docs/user_guide/database_preparation.rst b/docs/user_guide/database_preparation.rst
index 368f123a..07752f92 100644
--- a/docs/user_guide/database_preparation.rst
+++ b/docs/user_guide/database_preparation.rst
@@ -20,3 +20,4 @@ Database Installation Command
         && cd ../ && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.fna \
         && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.faa \
         && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.gff
+
diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst
index a7c5979e..da7ca52d 100644
--- a/docs/user_guide/index.rst
+++ b/docs/user_guide/index.rst
@@ -1,10 +1,14 @@
 User Guide
 ==========
 
+Hint: If you want to start from raw metagenomic reads, please refer to "Run from Raw Reads: Automated CAZyme and Glycan Substrate Annotation in Microbiomes: A Step-by-Step Protocol".
+Otherwise, please refer to any of the following instructions.
+
+
+
 .. toctree::
    :maxdepth: 1
 
-   database_preparation
    quick_start
    run_from_protein_sequence
    run_with_CGCFinder
diff --git a/docs/user_guide/run_from_raw_reads.rst b/docs/user_guide/run_from_raw_reads.rst
index 0d7d2ea6..52a59f0d 100644
--- a/docs/user_guide/run_from_raw_reads.rst
+++ b/docs/user_guide/run_from_raw_reads.rst
@@ -203,6 +203,13 @@ To install the databases, execute the following commands:
 Download database required by Kraken2 (very slow; can be skipped
 if users do not intend to run Kraken2):
 
+.. code-block:: shell
+    
+    dbcan_build --cpus 8 --db-dir db --clean
+
+Download database required by Kraken2 (very slow; can be skipped
+if users do not intend to run Kraken2):
+
 .. code-block:: shell
 
         kraken2-build --standard --db K2
@@ -329,8 +336,8 @@ Use Megahit for assembling reads into contigs:
 
 .. code-block:: shell
 
-    megahit -m 0.5 -t 32 -o megahit_ Wet2014 -1 Wet2014_1_val_1.fq.gz -2 Wet2014_2_val_2.fq.gz --out-prefix Wet2014 --min-contig-len 1000
-    megahit -m 0.5 -t 32 -o megahit_ Dry2014 -1 Dry2014_1_val_1.fq.gz -2 Dry2014_2_val_2.fq.gz --out-prefix Dry2014 --min-contig-len 1000
+    megahit -m 0.5 -t 32 -o megahit_Wet2014 -1 Wet2014_1_val_1.fq.gz -2 Wet2014_2_val_2.fq.gz --out-prefix Wet2014 --min-contig-len 1000
+    megahit -m 0.5 -t 32 -o megahit_Dry2014 -1 Dry2014_1_val_1.fq.gz -2 Dry2014_2_val_2.fq.gz --out-prefix Dry2014 --min-contig-len 1000
 
 
 ``MEGAHIT`` generates two output folders. Each contains five files and one sub-folder (Box 3).
@@ -361,8 +368,8 @@ P4. Predict genes by `Prokka` (TIMING ~21h)
 
 .. code-block:: shell
 
-    prokka --kingdom Bacteria --cpus 32 --outdir prokka_ Wet2014 --prefix Wet2014 --addgenes --addmrna --locustag Wet2014 megahit_ Wet2014/Wet2014.contigs.fa
-    prokka --kingdom Bacteria --cpus 32 --outdir prokka_ Dry2014 --prefix Dry2014 --addgenes --addmrna --locustag Dry2014 megahit_ Dry2014/Dry2014.contigs.fa
+    prokka --kingdom Bacteria --cpus 32 --outdir prokka_Wet2014 --prefix Wet2014 --addgenes --addmrna --locustag Wet2014 megahit_Wet2014/Wet2014.contigs.fa
+    prokka --kingdom Bacteria --cpus 32 --outdir prokka_Dry2014 --prefix Dry2014 --addgenes --addmrna --locustag Dry2014 megahit_Dry2014/Dry2014.contigs.fa
 
 
 The parameter ``--kingdom Bacteria`` is required for bacterial gene prediction.
diff --git a/pyproject.toml b/pyproject.toml
index 202e1846..8409605c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ requires = ["hatchling"]
 
 [project]
 name = "dbcan"
-version = "4.1.2"
+version = "4.1.3"
 description = "Standalone version of dbCAN annotation tool for automated CAZyme annotation"
 readme = "README.md"
 requires-python = ">=3.6"
@@ -29,6 +29,7 @@ dependencies = [
     "openpyxl",
     "matplotlib",
     "pyhmmer",
+    "requests",
     # for debug logging (referenced from the issue template)
     "session-info"
 ]