<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Goal" data-toc-modified-id="Goal-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Goal</a></span></li><li><span><a href="#Var" data-toc-modified-id="Var-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Var</a></span></li><li><span><a href="#Init" data-toc-modified-id="Init-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Init</a></span></li><li><span><a href="#Combining-all-antismash-GBKs" data-toc-modified-id="Combining-all-antismash-GBKs-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Combining all antismash GBKs</a></span><ul class="toc-item"><li><span><a href="#Writing-table-of-BGC-IDs" data-toc-modified-id="Writing-table-of-BGC-IDs-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Writing table of BGC IDs</a></span></li></ul></li><li><span><a href="#BiGSCAPE" data-toc-modified-id="BiGSCAPE-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>BiGSCAPE</a></span></li><li><span><a href="#Summary" data-toc-modified-id="Summary-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Summary</a></span></li><li><span><a href="#sessionInfo" data-toc-modified-id="sessionInfo-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>sessionInfo</a></span></li></ul></div>

# Goal

* Run `BiGSCAPE` on all BGCs identified by `antismash`

# Var

In [1]:
# Dereplicated MAGs
MAG_dir = file.path('/ebio', 'abt3_projects', 'Georg_animal_feces', 'data',
                     'metagenome', 'multi-study', 'BioProjects', 'summary',
                     'LLMGA', 'wGeorgAnimal', 'drep-0.995')

# de-rep'd MAG metadata
MAG_meta_file = file.path(MAG_dir, 'drep-MAG_metadata.tsv')

# antismash dir
antismash_dir = file.path(MAG_dir, 'BGCs', 'antismash_v5')

# working dir
work_dir = file.path(MAG_dir, 'BGCs', 'bigscape')

# pfam dir (hmms)
pfam_dir = '/ebio/abt3_projects/databases_no-backup/pfam/v32.0/'

# params
conda_env = 'bigscape'
threads = 32

# Init

In [3]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(doParallel)

library(future)
library(future.batchtools)
library(future.apply)
options(future.wait.interval = 2.0)

set.seed(3784)

source('/ebio/abt3_projects/Georg_animal_feces/code/misc_r_functions/init.R')

In [4]:
make_dir(work_dir)

Created directory: /ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/bigscape 


# Combining all antismash GBKs

In [15]:
antismash_gbk_dir = file.path(work_dir, 'antismash_gbks')
make_dir(antismash_gbk_dir)

Directory already exists: /ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/bigscape/antismash_gbks 


In [16]:
gbk_files = list.files(antismash_dir, '*.gbk', full.names=TRUE, recursive=TRUE)
gbk_files = gbk_files[grepl('.+\\.\\.\\.region[0-9]+\\.gbk$', gbk_files)]
gbk_files %>% length %>% print

[1] 3699


In [17]:
# creating an index of file names
gbk_files = data.frame(MAG = basename(dirname(gbk_files)),
                       gbk_file = gbk_files)

gbk_files %>% dfhead

[1] 3699    2


MAG,gbk_file
artificially_reared_2__maxbin2_low_prob_006,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/antismash_v5/artificially_reared_2__maxbin2_low_prob_006/c00016_artific...region001.gbk
artificially_reared_2__maxbin2_low_prob_006,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/antismash_v5/artificially_reared_2__maxbin2_low_prob_006/c00086_artific...region001.gbk
artificially_reared_2__metabat2_low_PE_017,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/antismash_v5/artificially_reared_2__metabat2_low_PE_017/c00005_artific...region001.gbk


In [18]:
# creating unique BGC IDs
gbk_files$BGC_ID = sapply(1:nrow(gbk_files), uuid::UUIDgenerate) %>%
    gsub('-', '', .) %>%
    gsub('^', 'BGC-', .) 

gbk_files %>% dfhead

[1] 3699    3


MAG,gbk_file,BGC_ID
artificially_reared_2__maxbin2_low_prob_006,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/antismash_v5/artificially_reared_2__maxbin2_low_prob_006/c00016_artific...region001.gbk,BGC-24345b46200411eaa82bacde48b9eeb0
artificially_reared_2__maxbin2_low_prob_006,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/antismash_v5/artificially_reared_2__maxbin2_low_prob_006/c00086_artific...region001.gbk,BGC-e03e16d421584bb1babb99548acd406f
artificially_reared_2__metabat2_low_PE_017,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/antismash_v5/artificially_reared_2__metabat2_low_PE_017/c00005_artific...region001.gbk,BGC-743bd64bdd4f45f083ffc9568cc09491


In [19]:
# creating output file names
gbk_files = gbk_files %>%
    mutate(X = antismash_gbk_dir,
           Y = gsub('$', '.gbk', BGC_ID)) %>%
    unite(out_file, X, Y, sep='/') 

gbk_files %>% dfhead

[1] 3699    4


MAG,gbk_file,BGC_ID,out_file
artificially_reared_2__maxbin2_low_prob_006,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/antismash_v5/artificially_reared_2__maxbin2_low_prob_006/c00016_artific...region001.gbk,BGC-24345b46200411eaa82bacde48b9eeb0,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/bigscape/antismash_gbks/BGC-24345b46200411eaa82bacde48b9eeb0.gbk
artificially_reared_2__maxbin2_low_prob_006,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/antismash_v5/artificially_reared_2__maxbin2_low_prob_006/c00086_artific...region001.gbk,BGC-e03e16d421584bb1babb99548acd406f,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/bigscape/antismash_gbks/BGC-e03e16d421584bb1babb99548acd406f.gbk
artificially_reared_2__metabat2_low_PE_017,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/antismash_v5/artificially_reared_2__metabat2_low_PE_017/c00005_artific...region001.gbk,BGC-743bd64bdd4f45f083ffc9568cc09491,/ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/bigscape/antismash_gbks/BGC-743bd64bdd4f45f083ffc9568cc09491.gbk


In [20]:
cp_file = function(in_file, out_file){
    if(out_file == in_file){
        stop('output == input')
    }
    file.copy(in_file, out_file, overwrite = TRUE)
}

ret = mapply(cp_file, 
             in_file=gbk_files$gbk_file %>% as.character, 
             out_file=gbk_files$out_file %>% as.character)
ret %>% length

In [21]:
# checking number of files
list.files(antismash_gbk_dir, '*.gbk') %>% length %>% print

[1] 7398


## Writing table of BGC IDs

In [22]:
BGC_id_file = file.path(antismash_dir, 'BGC_uuids.tsv')
gbk_files %>%
    dplyr::select(BGC_ID, MAG, gbk_file) %>%
    write.table(file=BGC_id_file, sep='\t', quote=FALSE, row.names=FALSE)
cat('File written:', BGC_id_file, '\n')

File written: /ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/antismash_v5/BGC_uuids.tsv 


# BiGSCAPE

In [23]:
exe = '/ebio/abt3_projects/Georg_animal_feces/bin/BiG-SCAPE/bigscape.py'
cmd = '{exe} -i {in_dir} -o {out_dir} --pfam_dir {pfam_dir} --cores {threads} --mibig --include_gbk_str \"*\"'
cmd = glue::glue(cmd, exe=exe,
                 in_dir=antismash_gbk_dir, 
                 out_dir=work_dir, pfam_dir=pfam_dir,
                 threads=threads)
cmd

In [24]:
bash_job(cmd, conda_env=conda_env, stderr=TRUE)

# Summary

In [25]:
cat('Output:', work_dir, '\n')

Output: /ebio/abt3_projects/Georg_animal_feces/data/metagenome/multi-study/BioProjects/summary/LLMGA/wGeorgAnimal/drep-0.995/BGCs/bigscape 


In [26]:
list.files(file.path(work_dir, 'html_content'))

# sessionInfo

In [27]:
sessionInfo()

R version 3.4.1 (2017-06-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.3 LTS

Matrix products: default
BLAS: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/py3_physeq/lib/R/lib/libRblas.so
LAPACK: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/py3_physeq/lib/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] bindrcpp_0.2.2          future.apply_1.0.0      future.batchtools_0.7.1
 [4] future_1.9.0            doParallel_1.0.11       iterators_1.0.10       
 [7] foreach_1.4.4           dat

In [28]:
condaInfo(conda_env)

# packages in environment at /ebio/abt3_projects/software/miniconda3_gt4.4/envs/bigscape:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                        main    conda-forge
biopython                 1.68                     py27_0    bioconda
blas                      1.1                    openblas    conda-forge
ca-certificates           2019.11.28           hecc5488_0    conda-forge
certifi                   2019.11.28               py27_0    conda-forge
decorator                 4.4.1                      py_0    conda-forge
fasttree                  2.1.10               h14c3975_3    bioconda
freetype                  2.10.0               he983fc9_1    conda-forge
hmmer                     3.2.1                he1b5a44_2    bioconda
jpeg                      9c                h14c3975_1001    conda-forge
libblas                   3.8.0               11_openblas    conda-forge
libcblas                  3.8.0               1