# Create standard Kraken/Bracken databases
 
The aim of this notebook is to create standard `Kraken 2` and `Bracken 2` databases of RefSeq *Archaea* and *Bacteria* genomes. 

# Init

In [1]:
import os

# Var

In [30]:
# Conda env
Bracken_env = "Bracken2"

In [31]:
# Scripts dir
scripts_dir = "/ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/scripts"
SGE_dir = os.path.join(scripts_dir, "SGE_out")
if not os.path.exists(scripts_dir):
    os.makedirs(scripts_dir)
    os.makedirs(SGE_dir)

In [32]:
# Kraken dir
kraken_dbs = "/tmp/global/jdelacuesta/standard_DBs/kraken"
if not os.path.exists(kraken_dbs):
    os.makedirs(kraken_dbs)

# Build standard databases

## Kraken

In [54]:
# Download taxonomy
taxonomy_cmd = "kraken2-build --download-taxonomy --db {k_db}"
taxonomy_job = taxonomy_cmd.format(k_db = kraken_dbs)
taxonomy_job = 'bash -c "source activate {}; {}"'.format(Bracken_env, taxonomy_job)
print(taxonomy_job)

bash -c "source activate Bracken2; kraken2-build --download-taxonomy --db /tmp/global/jdelacuesta/standard_DBs/kraken"


In [61]:
# If there are errors downloading the taxonomy
# Use already dowloaded files
NCBI_taxonomy = "/ebio/abt3_projects/databases/Kraken/K2_Progenomes/Kraken/taxonomy"
rsync_cmd = "rsync -ah {0} {1}".format(NCBI_taxonomy, kraken_dbs)

# Only sync if there is no taxonomy folder
if not os.path.exists(os.path.join(kraken_dbs, "taxonomy")): 
    print(rsync_cmd)
    !$rsync_cmd

rsync -ah /ebio/abt3_projects/databases/Kraken/K2_Progenomes/Kraken/taxonomy /tmp/global/jdelacuesta/standard_DBs/kraken


In [37]:
# Download Kraken 2 databases
download_cmd = "kraken2-build --download-library {domain} --db {k_db}"

# Download Bacteria DB
download_bacteria = download_cmd.format(domain = "bacteria", k_db = kraken_dbs)
download_bacteria = 'bash -c "source activate {}; {}"'.format(Bracken_env, download_bacteria)
print(download_bacteria)

# Download Archaea DB
download_archaea = download_cmd.format(domain = "archaea", k_db = kraken_dbs)
download_archaea = 'bash -c "source activate {}; {}"'.format(Bracken_env, download_archaea)
print(download_archaea)

bash -c "source activate Bracken2; kraken2-build --download-library bacteria --db /tmp/global/jdelacuesta/standard_DBs/kraken"
bash -c "source activate Bracken2; kraken2-build --download-library archaea --db /tmp/global/jdelacuesta/standard_DBs/kraken"


In [48]:
# Build the database
kraken_build = """#!/bin/bash
#$ -N {name}
#$ -pe parallel {cpu}
#$ -l h_vmem=10G
#$ -l h_rt=200:0:0
#$ -o {SGE_dir}
#$ -j y
#$ -wd {workdir}
#$ -m ea
#$ -M jdelacuesta@tuebingen.mpg.de

export PATH='/ebio/abt3_projects/software/miniconda3_gt4.4/envs/Bracken2/bin':$PATH
/ebio/abt3_projects/software/miniconda3_gt4.4/envs/Bracken/bin/kmer2read_distr

kraken2-build --build --db {k_db} --threads {cpu}
"""

In [50]:
job_name = "kraken2_build.sh"
kraken_build_file = os.path.join(scripts_dir, job_name)
script_build = kraken_build.format(name = job_name,
                                         workdir = kraken_dbs,
                                         cpu = 30,
                                         k_db = kraken_dbs, 
                                         SGE_dir = SGE_dir)
   
print(script_build)

with open(kraken_build_file, "w") as f:
    f.write(script_build) 

#!/bin/bash
#$ -N kraken2_build.sh
#$ -pe parallel 30
#$ -l h_vmem=10G
#$ -l h_rt=200:0:0
#$ -o /ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/scripts/SGE_out
#$ -j y
#$ -wd /tmp/global/jdelacuesta/standard_DBs/kraken
#$ -m ea
#$ -M jdelacuesta@tuebingen.mpg.de

export PATH='/ebio/abt3_projects/software/miniconda3_gt4.4/envs/Bracken2/bin':$PATH

kraken2-build --build --db /tmp/global/jdelacuesta/standard_DBs/kraken --threads 30



In [62]:
!qsub $kraken_build_file

Your job 4199461 ("kraken2_build.sh") has been submitted


## Bracken

In [71]:
# Create output directories
dir_100mer = os.path.join(kraken_dbs, "100mers")
dir_150mer = os.path.join(kraken_dbs, "150mers")
if not os.path.exists(dir_100mer):
    os.makedirs(dir_100mer)
    os.makedirs(dir_150mer)

In [73]:
# Create symlinks to kraken files
k2d_files = ["hash.k2d",  "opts.k2d",  "taxo.k2d"]
for k2d in k2d_files:
    file_path = os.path.join("..", k2d)
    link_100mer_path = os.path.join(dir_100mer, k2d)
    link_150mer_path = os.path.join(dir_150mer, k2d)
    symlink_cmd = 'ln -s {file} {link}'
    symlink_100mer_job = symlink_cmd.format(file = file_path, link = link_100mer_path)
    symlink_150mer_job = symlink_cmd.format(file = file_path, link = link_150mer_path)
    print(symlink_100mer_job, symlink_150mer_job)
    !$symlink_100mer_job; $symlink_150mer_job

ln -s ../hash.k2d /tmp/global/jdelacuesta/standard_DBs/kraken/100mers/hash.k2d ln -s ../hash.k2d /tmp/global/jdelacuesta/standard_DBs/kraken/150mers/hash.k2d
ln -s ../opts.k2d /tmp/global/jdelacuesta/standard_DBs/kraken/100mers/opts.k2d ln -s ../opts.k2d /tmp/global/jdelacuesta/standard_DBs/kraken/150mers/opts.k2d
ln -s ../taxo.k2d /tmp/global/jdelacuesta/standard_DBs/kraken/100mers/taxo.k2d ln -s ../taxo.k2d /tmp/global/jdelacuesta/standard_DBs/kraken/150mers/taxo.k2d


In [95]:
# Build the database
bracken_build = """#!/bin/bash
#$ -N {name}
#$ -pe parallel {cpu}
#$ -l h_vmem=10G
#$ -l h_rt=200:0:0
#$ -o {SGE_dir}
#$ -j y
#$ -wd {workdir}
#$ -m ea
#$ -M jdelacuesta@tuebingen.mpg.de

export PATH='/ebio/abt3_projects/software/miniconda3_gt4.4/envs/Bracken2/bin':$PATH

# Create database.kraken
kraken2 --db={k_db} --threads={cpu} <( find -L {k_db} \( -name "*.fna" -o -name "*.fa" -o -name "*.fasta" \) -exec cat {{}} + ) > {kraken_db_file}

# database 100mer
# kmer2read
{kmer2read} --seqid2taxid {k_db}/seqid2taxid.map \
    --taxonomy {k_db}/taxonomy \
    --kraken {kraken_db_file} \
    --output {output_100mer_2read} \
    -k 35 \
    -l 100 \
    -t {cpu}
    
# kmer distribution    
generate_kmer_distribution.py -i {output_100mer_2read} -o {output_100mer_distr}

# database 150mer
# kmer2read
{kmer2read} --seqid2taxid {k_db}/seqid2taxid.map \
    --taxonomy {k_db}/taxonomy \
    --kraken {kraken_db_file} \
    --output {output_150mer_2read} \
    -k 35 \
    -l 150 \
    -t {cpu}
    
# kmer distribution    
generate_kmer_distribution.py -i {output_150mer_2read} -o {output_150mer_distr}
"""

In [96]:
# Files and paths
kmer2read_path = "/ebio/abt3_projects/software/miniconda3_gt4.4/envs/Bracken/bin/kmer2read_distr"

kraken_database_file = os.path.join(kraken_dbs, "database.kraken")

# 100mers
k2read_100mer = os.path.join(dir_100mer, "database100mers.kraken")
k2distr_100mer = os.path.join(dir_100mer, "database100mers.kmer_distrib")

# 150mers
k2read_150mer = os.path.join(dir_150mer, "database150mers.kraken")
k2distr_150mer = os.path.join(dir_150mer, "database150mers.kmer_distrib")

In [99]:
job_name = "bracken2_build.sh"
bracken_build_file = os.path.join(scripts_dir, job_name)
script_build = bracken_build.format(name = job_name,
                                    workdir = kraken_dbs,
                                    cpu = 30,
                                    k_db = kraken_dbs, 
                                    SGE_dir = SGE_dir, 
                                    kraken_db_file = kraken_database_file, 
                                    kmer2read = kmer2read_path, 
                                    output_100mer_2read = k2read_100mer, 
                                    output_100mer_distr = k2distr_100mer,
                                    output_150mer_2read = k2read_150mer, 
                                    output_150mer_distr = k2distr_150mer)
print(script_build)
with open(bracken_build_file, "w") as f:
    f.write(script_build) 

#!/bin/bash
#$ -N bracken2_build.sh
#$ -pe parallel 30
#$ -l h_vmem=10G
#$ -l h_rt=200:0:0
#$ -o /ebio/abt3_projects/small_projects/jdelacuesta/DBs_benchmark/scripts/SGE_out
#$ -j y
#$ -wd /tmp/global/jdelacuesta/standard_DBs/kraken
#$ -m ea
#$ -M jdelacuesta@tuebingen.mpg.de

export PATH='/ebio/abt3_projects/software/miniconda3_gt4.4/envs/Bracken2/bin':$PATH

# Create database.kraken
kraken2 --db=/tmp/global/jdelacuesta/standard_DBs/kraken --threads=30 <( find -L /tmp/global/jdelacuesta/standard_DBs/kraken \( -name "*.fna" -o -name "*.fa" -o -name "*.fasta" \) -exec cat {} + ) > /tmp/global/jdelacuesta/standard_DBs/kraken/database.kraken

# database 100mer
# kmer2read
/ebio/abt3_projects/software/miniconda3_gt4.4/envs/Bracken/bin/kmer2read_distr --seqid2taxid /tmp/global/jdelacuesta/standard_DBs/kraken/seqid2taxid.map     --taxonomy /tmp/global/jdelacuesta/standard_DBs/kraken/taxonomy     --kraken /tmp/global/jdelacuesta/standard_DBs/kraken/database.kraken     --output /tmp/global/jde

In [100]:
!qsub $bracken_build_file

Your job 4216709 ("bracken2_build.sh") has been submitted


# Move database to databases folder

In [105]:
databases_project = os.path.join("/ebio/abt3_projects/databases/Kraken/K2_Standard")
if not os.path.exists(databases_project):
    os.makedirs(databases_project)

rsync_kraken = "rsync -ah --bwlimit 300m {0}/ {1}".format(kraken_dbs, databases_project)
print(rsync_kraken)
#!$rsync_kraken

rsync -ah --bwlimit 300m /tmp/global/jdelacuesta/standard_DBs/kraken/ /ebio/abt3_projects/databases/Kraken/K2_Standard


# Session info

In [3]:
!conda list -n Bracken2

# packages in environment at /ebio/abt3_projects/software/miniconda3_gt4.4/envs/Bracken2:
#
# Name                    Version                   Build  Channel
appdirs                   1.4.3                      py_1    conda-forge
asn1crypto                0.24.0                     py_1    conda-forge
attrs                     18.1.0                     py_1    conda-forge
automat                   0.7.0                    py36_0    conda-forge
backcall                  0.1.0                      py_0    conda-forge
blas                      1.1                    openblas    conda-forge
blast                     2.7.1                h4422958_6    bioconda
bleach                    2.1.3                      py_0    conda-forge
boost                     1.67.0           py36h3e44d54_0    conda-forge
boost-cpp                 1.67.0               h3a22d5f_0    conda-forge
bracken                   2.2              py36h2d50403_0    bioconda/label/broken
bzip2            

r-dbplyr                  1.2.2                    r341_0    conda-forge
r-digest                  0.6.15           r341h470a237_1    conda-forge
r-dplyr                   0.7.6            r341h9d2a408_0    conda-forge
r-evaluate                0.10.1                 r3.4.1_0    conda-forge
r-glue                    1.3.0            r341h470a237_1    conda-forge
r-htmltools               0.3.6            r341hfc679d8_1    conda-forge
r-irdisplay               0.4.4                  r3.4.1_0    conda-forge
r-irkernel                0.8.12                   r341_0    conda-forge
r-jsonlite                1.5              r341h470a237_1    conda-forge
r-magrittr                1.5                    r3.4.1_0    conda-forge
r-memoise                 1.1.0                  r3.4.1_0    conda-forge
r-pbdzmq                  0.3_2                  r3.4.1_0    conda-forge
r-pillar                  1.2.2            r341h6115d3f_1    conda-forge
r-pkgconfig               2.0.1       