Skip to content

Commit

Permalink
3.x (#154)
Browse files Browse the repository at this point in the history
* dbcan 3.x development (#141)

* move cli to the pkg

* add .editorconfig

* add pre-commit-config.yaml

* add .dockerignore

* reformat

* rm setup script and conf

* change ValueError to KeyError

* add readthedocs conf

* add .toml to manage the proj

* add sphinx docs

* .dockerignore

* add user guide doc

* simplify readme

* reorder authors

* :Update 4.1.0: 1. Update dbCAN 2. Harmonizing codes from Jinfang's updates

* fix small bugs

* update dbCAN version

* revise sha256 for 4.1.0

* add yml file to install environment for dbcan-protocol

* remove usage

* :add: readthedoc badge

* [fix] fix bug in meta.yaml

* [Update docs/user_guide/database_preparation.rst] Installation method

* [Update] rename sub.prediction.out into substrate.out

* 1. rename dbcan_cli into dbcan.cli in plots.py 2. add seaborn in dbcan.yml 3. update run from raw reads rst file 4. fix pyproject.toml for typo

* move Dockerfile to root

* add cazyme annotation Dockerfile

* remove git-lfs db

* [Fix] reformat readTheDoc docs/user_guide/run_from_raw_reads.rst

* [Fix] docs/user_guide/run_from_raw_reads.rst :fix some grammar issue for rst format:

* 1. [update] dbcan/cli/run_dbcan.py 2. [add] add dbcan/cli/dbcan_build.py for database build, add entrypoint in pyproject and meta.yaml

* delete redundant file

* [fix] remove number after reference

* add refs for datasets

* add Wastyk2021 dataset to doc (#150)

* fix typo

* add dbcan_build

* revise meta.yaml

* fix bug in installation

* add database

* [fix] fix command error in installation.rst

* revise the doc

* revise

* 1. fix bug in dbcan_sub; 2. add packages matplotlib, openpyxl

* update dbcan_build

* fix bug for gff judgement

* revise dependencies

* fix bug in the docs

* revise run from protein sequence to run from DNA sequence

* remove the database installation page. remove pip install part

* update docker

* 1. add requests package 2. fix bug in dbcan_build

* Fix issues in readthedoc

* polish the catalog page

---------

Co-authored-by: HD Yi <haidyi@cs.unc.edu>
  • Loading branch information
linnabrown and HaidYi committed Jan 18, 2024
1 parent a40d583 commit 0234c13
Show file tree
Hide file tree
Showing 8 changed files with 148 additions and 69 deletions.
6 changes: 4 additions & 2 deletions conda-recipe/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{% set name = "dbcan" %}
{% set version = "4.1.2" %}
{% set version = "4.1.3" %}

package:
name: "{{ name|lower }}"
Expand All @@ -9,7 +9,8 @@ source:
# the sha256 sum is generated by doing
# wget -O- [URL] | shasum -a 256
url: https://github.com/linnabrown/run_dbcan/releases/download/{{ version }}/dbcan-{{ version }}.tar.gz
sha256: 3a675683379d1afc9f3444fc9894272f1485956df266a6ee4fc11a8f628e6d51
sha256: 6346e0b6b2c810e2f808bc0bc901643c16f3eda2652d50b8f8470eb722fb419b


build:
number: 0
Expand Down Expand Up @@ -46,6 +47,7 @@ requirements:
- openpyxl
- matplotlib-base
- session-info
- blast

test:
imports:
Expand Down
121 changes: 86 additions & 35 deletions dbcan/cli/run_dbcan.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
import time

# Recent updated information:
# Jan/01/23: Add doc code [Haidong Yi, Le Huang]
# Jan/01/23: Add doc code [Haidong Yi, Le Huang]
# Oct/10/23: Reconstructed the run_dbcan [Haidong Yi]
# Sep/07/23: Replace hmmscan with hmmsearch. Update perl code [Le Huang, Yanbin Yin]
# Dec/15/22: 1.adding function to convert cgc_standard.out to json format. 2. adding function cgc_[Jinfang Zheng]
Expand Down Expand Up @@ -60,8 +60,21 @@ def runHmmer(outPath, hmm_cpu, dbDir, hmm_eval, hmm_cov, db_name):
hmm_file = f"{dbDir}{db_name}.hmm"
uniInput_file = f"{outPath}uniInput"

# hmmer = Popen(
# [
# # hmmer = Popen(
# # [
# "hmmsearch",
# "--domtblout",
# domtblout_file,
# "--cpu",
# str(hmm_cpu),
# "-o",
# "/dev/null",
# hmm_file,
# uniInput_file,
# ]
# )
# hmmer.wait()
hmmer_list = [
# "hmmsearch",
# "--domtblout",
# domtblout_file,
Expand All @@ -88,6 +101,9 @@ def runHmmer(outPath, hmm_cpu, dbDir, hmm_eval, hmm_cov, db_name):
cmd_str = " ".join(hmmer_list)
os.system(cmd_str)

cmd_str = " ".join(hmmer_list)
os.system(cmd_str)

parsed_hmm_output = hmmer_parser.run(input_file=f"{outPath}h{db_name}.out", eval_num=hmm_eval, coverage=hmm_cov)
with open(f"{outPath}{db_name}.out", "w") as f:
f.write(parsed_hmm_output)
Expand Down Expand Up @@ -138,6 +154,36 @@ def split_uniInput(uniInput, dbcan_thread, outPath, dbDir, hmm_eval, hmm_cov, hm
# )
# dbsub.wait()

dbsub_list = [
"hmmsearch",
"--domtblout",
f"{outPath}d.txt",
"--cpu",
str(hmm_cpu),
"-o",
"/dev/null",
f"{dbDir}dbCAN_sub.hmm",
f"{outPath}uniInput",
]

dbsub_str = " ".join(dbsub_list)
os.system(dbsub_str)

# dbsub = Popen(
# [
# "hmmsearch",
# "--domtblout",
# f"{outPath}d.txt",
# "--cpu",
# str(hmm_cpu),
# "-o",
# "/dev/null",
# f"{dbDir}dbCAN_sub.hmm",
# f"{outPath}uniInput",
# ]
# )
# dbsub.wait()

dbsub_list = [
"hmmsearch",
"--domtblout",
Expand Down Expand Up @@ -646,6 +692,9 @@ def run_dbCAN(
# End CAZyme Extraction
######################
# Begin GFF preparation

# Union of the CAZyme, TF, TP and STP gene sets
candidate_gene_set = cazyme.union(tf, tp, stp)

if inputType in ["prok", "meta"]: # use Prodigal GFF output
with open(outDir + prefix + "prodigal.gff") as f:
Expand Down Expand Up @@ -679,38 +728,40 @@ def run_dbCAN(
gff = True
break
if gff: # user file was in GFF format
with open(auxFile) as f:
with open(outDir + prefix + "cgc.gff", "w") as out:
for line in f:
row = line.rstrip().split("\t")
if (not line.startswith("#")) and len(row) >= 9:
if row[2] == "CDS":
note = row[8].strip().rstrip(";").split(";")
gene = ""
notes = {}
for x in note:
temp = x.split("=")
notes[temp[0]] = temp[1]
if "ID" in notes:
gene = notes["ID"]
else:
continue
if gene in cazyme:
row[2] = "CAZyme"
row[8] = "DB=" + cazyme_genes[gene]
elif gene in tf:
row[2] = "TF"
row[8] = "DB=" + tf_genes[gene]
elif gene in tp:
row[2] = "TC"
row[8] = "DB=" + tp_genes[gene]
elif gene in stp:
row[2] = "STP"
row[8] = "DB=" + stp_genes[gene]
else:
row[8] = ""
row[8] += ";ID=" + gene
out.write("\t".join(row) + "\n")
with open(auxFile) as f, open(outDir + prefix + "cgc.gff", "w") as out:
for line in f:
row = line.rstrip().split("\t")
if (not line.startswith("#")) and len(row) >= 9:
if row[2] == "CDS":
note = row[8].strip().rstrip(";").split(";")
gene1 = ""
gene2 = ""
notes = {}
for x in note:
temp = x.split("=")
notes[temp[0]] = temp[1]
# fix it tomorrow
if "ID" in notes:
gene1 = notes["ID"]
if "Name" in notes:
gene2 = notes["Name"]
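# TODO: only gene1 (ID) and gene2 (Name) are assigned above; `gene`, used in the lookups below, does not appear to be set in this block -- pick the identifier to match on before using it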
if gene in cazyme:
row[2] = "CAZyme"
row[8] = "DB=" + cazyme_genes[gene]
elif gene in tf:
row[2] = "TF"
row[8] = "DB=" + tf_genes[gene]
elif gene in tp:
row[2] = "TC"
row[8] = "DB=" + tp_genes[gene]
elif gene in stp:
row[2] = "STP"
row[8] = "DB=" + stp_genes[gene]
else:
row[8] = ""
row[8] += ";ID=" + gene
out.write("\t".join(row) + "\n")
else: # user file was in BED format
with open(auxFile) as f:
with open(outDir + prefix + "cgc.gff", "w") as out:
Expand Down
4 changes: 2 additions & 2 deletions dbcan/utils/dbcan_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,12 +94,12 @@ def main():
"mv fam-substrate-mapping-08012023.tsv fam-substrate-mapping.tsv",
"makeblastdb -in PUL.faa -dbtype prot",
"mv dbCAN-PUL_12-12-2023.xlsx dbCAN-PUL.xlsx",
"tar xzvf dbCAN-PUL.tar.gz",
"tar xzf dbCAN-PUL.tar.gz",
"hmmpress -f dbCAN_sub.hmm",
"mv CAZyDB.07262023.fa CAZyDB.fa",
"diamond makedb --in CAZyDB.fa -d CAZy",
"mv dbCAN-HMMdb-V12.txt dbCAN.txt",
"hmmpress dbCAN.txt"
"hmmpress dbCAN.txt",
"diamond makedb --in tcdb.fa -d tcdb",
"hmmpress -f tf-1.hmm",
"hmmpress -f tf-2.hmm",
Expand Down
61 changes: 37 additions & 24 deletions docs/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,46 +57,59 @@ You can remove this parameter if you don't want to clean, but we recommend you a
away from index contamination.


Installing with PyPI
--------------------
Installing SignalP (Optional)
--------------------------------

To install the `dbcan`_ package via ``pip``, you first need to install a few executable
dependencies:

- `NCBI-BLAST+ <https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html>`_;
- `HMMER <http://hmmer.org/>`_ (:cite:`2011:hmmer`);
- `DIAMOND <https://github.com/bbuchfink/diamond>`_ (:cite:`2021:diamond`);
- `SignalP <https://services.healthtech.dtu.dk/services/SignalP-4.1/>`_ (:cite:`2017:nielsen`) (Optional).
- `SignalP` is optional and not essential for the core functionality of our software. Users requiring its specific features can integrate it as follows:
1. Visit the `SignalP website <https://services.healthtech.dtu.dk/services/SignalP-4.1/>`_.
2. Submit a download `request <https://services.healthtech.dtu.dk/cgi-bin/sw_request?software=signalp&version=4.1&packageversion=4.1g&platform=Linux>`_.
3. After downloading, add `SignalP` to your system's ``PATH`` environment variable so that it can be executed.

.. warning::
- For installation assistance, refer to the :doc:`faq/signalp_installation`.

**SignalP Integration Notice**

Due to the specific licensing terms of `SignalP`, it is not included directly as a dependency in our package. This requires users to undertake a separate installation process.

**Installing SignalP (Optional)**:
.. Installing with PyPI
.. --------------------
- `SignalP` is optional and not essential for the core functionality of our software. Users requiring its specific features can integrate it as follows:
1. Visit the `SignalP website <https://services.healthtech.dtu.dk/services/SignalP-4.1/>`_.
2. Submit a download `request <https://services.healthtech.dtu.dk/cgi-bin/sw_request?software=signalp&version=4.1&packageversion=4.1g&platform=Linux>`_.
3. After downloading, add `SignalP` to your system's ``PATH`` environment variable so that it can be executed.
.. To install the `dbcan`_ package via ``pip``, you first need to install a few executable
.. dependencies:
.. - `NCBI-BLAST+ <https://blast.ncbi.nlm.nih.gov/doc/blast-help/downloadblastdata.html>`_;
.. - `HMMER <http://hmmer.org/>`_ (:cite:`2011:hmmer`);
.. - `DIAMOND <https://github.com/bbuchfink/diamond>`_ (:cite:`2021:diamond`);
.. - `SignalP <https://services.healthtech.dtu.dk/services/SignalP-4.1/>`_ (:cite:`2017:nielsen`) (Optional).
.. .. warning::

.. **SignalP Integration Notice**
.. Due to the specific licensing terms of `SignalP`, it is not included directly as a dependency in our package. This requires users to undertake a separate installation process.
.. **Installing SignalP (Optional)**:
.. - `SignalP` is optional and not essential for the core functionality of our software. Users requiring its specific features can integrate it as follows:
.. 1. Visit the `SignalP website <https://services.healthtech.dtu.dk/services/SignalP-4.1/>`_.
.. 2. Submit a download `request <https://services.healthtech.dtu.dk/cgi-bin/sw_request?software=signalp&version=4.1&packageversion=4.1g&platform=Linux>`_.
.. 3. After downloading, add `SignalP` to your system's ``PATH`` environment variable so that it can be executed.
- For installation assistance, refer to the :doc:`faq/signalp_installation`.
.. - For installation assistance, refer to the :doc:`faq/signalp_installation`.
This approach ensures compliance with `SignalP`'s licensing while offering the tool's functionality to those who need it.
.. This approach ensures compliance with `SignalP`'s licensing while offering the tool's functionality to those who need it.
After the dependencies are installed, `dbcan`_ can be installed via `PyPI <https://pypi.org/>`_:
.. After the dependencies are installed, `dbcan`_ can be installed via `PyPI <https://pypi.org/>`_:
.. code-block:: shell
.. .. code-block:: shell
pip install dbcan
.. pip install dbcan
.. note::
.. .. note::

Since ``PyPI`` doesn't have an independent build system, the dependencies of dbcan need to be installed separately.
Therefore, we recommend that users install ``dbcan`` via ``Conda``, which can resolve all dependencies automatically.
.. Since ``PyPI`` doesn't have an independent build system, the dependencies of dbcan need to be installed separately.
.. Therefore, we recommend that users install ``dbcan`` via ``Conda``, which can resolve all dependencies automatically.
Installing with Docker
----------------------
Expand Down
1 change: 1 addition & 0 deletions docs/user_guide/database_preparation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ Database Installation Command
&& cd ../ && wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.fna \
&& wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.faa \
&& wget http://bcb.unl.edu/dbCAN2/download/Samples/EscheriaColiK12MG1655.gff
6 changes: 5 additions & 1 deletion docs/user_guide/index.rst
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
User Guide
==========

Hint: If you want to run from raw metagenomic reads, please refer to Run from Raw Reads: Automated CAZyme and Glycan Substrate Annotation in Microbiomes: A Step-by-Step Protocol.
Otherwise, please refer to any of the following instructions.



.. toctree::
:maxdepth: 1

database_preparation
quick_start
run_from_protein_sequence
run_with_CGCFinder
Expand Down
15 changes: 11 additions & 4 deletions docs/user_guide/run_from_raw_reads.rst
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,13 @@ To install the databases, execute the following commands:
Download database required by Kraken2 (very slow; can be skipped
if users do not intend to run Kraken2):

.. code-block:: shell
dbcan_build --cpus 8 --db-dir db --clean
Download database required by Kraken2 (very slow; can be skipped
if users do not intend to run Kraken2):

.. code-block:: shell
kraken2-build --standard --db K2
Expand Down Expand Up @@ -329,8 +336,8 @@ Use Megahit for assembling reads into contigs:

.. code-block:: shell
megahit -m 0.5 -t 32 -o megahit_ Wet2014 -1 Wet2014_1_val_1.fq.gz -2 Wet2014_2_val_2.fq.gz --out-prefix Wet2014 --min-contig-len 1000
megahit -m 0.5 -t 32 -o megahit_ Dry2014 -1 Dry2014_1_val_1.fq.gz -2 Dry2014_2_val_2.fq.gz --out-prefix Dry2014 --min-contig-len 1000
megahit -m 0.5 -t 32 -o megahit_Wet2014 -1 Wet2014_1_val_1.fq.gz -2 Wet2014_2_val_2.fq.gz --out-prefix Wet2014 --min-contig-len 1000
megahit -m 0.5 -t 32 -o megahit_Dry2014 -1 Dry2014_1_val_1.fq.gz -2 Dry2014_2_val_2.fq.gz --out-prefix Dry2014 --min-contig-len 1000
``MEGAHIT`` generates two output folders. Each contains five files and one sub-folder (Box 3).
Expand Down Expand Up @@ -361,8 +368,8 @@ P4. Predict genes by `Prokka` (TIMING ~21h)

.. code-block:: shell
prokka --kingdom Bacteria --cpus 32 --outdir prokka_ Wet2014 --prefix Wet2014 --addgenes --addmrna --locustag Wet2014 megahit_ Wet2014/Wet2014.contigs.fa
prokka --kingdom Bacteria --cpus 32 --outdir prokka_ Dry2014 --prefix Dry2014 --addgenes --addmrna --locustag Dry2014 megahit_ Dry2014/Dry2014.contigs.fa
prokka --kingdom Bacteria --cpus 32 --outdir prokka_Wet2014 --prefix Wet2014 --addgenes --addmrna --locustag Wet2014 megahit_Wet2014/Wet2014.contigs.fa
prokka --kingdom Bacteria --cpus 32 --outdir prokka_Dry2014 --prefix Dry2014 --addgenes --addmrna --locustag Dry2014 megahit_Dry2014/Dry2014.contigs.fa
The parameter ``--kingdom Bacteria`` is required for bacterial gene prediction.
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ requires = ["hatchling"]

[project]
name = "dbcan"
version = "4.1.2"
version = "4.1.3"
description = "Standalone version of dbCAN annotation tool for automated CAZyme annotation"
readme = "README.md"
requires-python = ">=3.6"
Expand All @@ -29,6 +29,7 @@ dependencies = [
"openpyxl",
"matplotlib",
"pyhmmer",
"requests",
# for debug logging (referenced from the issue template)
"session-info"
]
Expand Down

0 comments on commit 0234c13

Please sign in to comment.