<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Goal" data-toc-modified-id="Goal-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Goal</a></span></li><li><span><a href="#Var" data-toc-modified-id="Var-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Var</a></span></li><li><span><a href="#Init" data-toc-modified-id="Init-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Init</a></span></li><li><span><a href="#Load" data-toc-modified-id="Load-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load</a></span></li><li><span><a href="#Run" data-toc-modified-id="Run-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Run</a></span><ul class="toc-item"><li><span><a href="#Determining-which-succeeded" data-toc-modified-id="Determining-which-succeeded-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Determining which succeeded</a></span></li><li><span><a href="#Re-run" data-toc-modified-id="Re-run-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Re-run</a></span><ul class="toc-item"><li><span><a href="#Creating-blank-summary.tsv-files" data-toc-modified-id="Creating-blank-summary.tsv-files-5.2.1"><span class="toc-item-num">5.2.1&nbsp;&nbsp;</span>Creating blank summary.tsv files</a></span></li></ul></li></ul></li><li><span><a href="#Summary" data-toc-modified-id="Summary-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Summary</a></span></li><li><span><a href="#sessionInfo" data-toc-modified-id="sessionInfo-7"><span class="toc-item-num">7&nbsp;&nbsp;</span>sessionInfo</a></span></li></ul></div>

# Goal

* run VALET on PasolliE-2019 MAGs (those selected for analysis with DeepMAsED)

# Var

In [1]:
# --- run parameters ---
# conda environment that is activated before each VALET command
conda_env = 'VALET'
# thread count (requested from the cluster for each job)
threads = 8

# --- paths ---
# table listing the MAGs used for the DeepMAsED analysis
MAGs_n143_file = '/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019//samples_Almeida-Pasolli.txt'

# directory of bam files (metagenome reads mapped to their corresponding MAGs)
map_dir = '/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/samples_Almeida-Pasolli/map/'

# base output directory for the VALET runs
work_dir = '/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/samples_Almeida-Pasolli/VALET/'

# Init

In [3]:
# Packages attached for this notebook.
# dplyr supplies the `%>%` pipe and `filter()` used in the cells below;
# tidyr, ggplot2, data.table and doParallel are attached but not referenced
# in the visible part of the notebook.
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(doParallel)

# future framework: `plan()` (future), the `batchtools_sge` backend
# (future.batchtools) and `future_lapply()` (future.apply) are used to
# submit the VALET commands as cluster jobs.
library(future)
library(future.batchtools)
library(future.apply)
# NOTE(review): presumably the polling interval (in seconds) used while
# waiting for futures to resolve -- confirm against the future docs.
options(future.wait.interval = 2.0)

In [4]:
#' Run a shell command with bash inside a conda environment
#'
#' The command is prefixed with `. ~/.bashrc; conda activate <conda_env>;`
#' and handed to `bash -c` via system2().
#'
#' @param cmd string; commandline job (eg., 'ls -thlc')
#' @param conda_env string; conda environment name
#' @param stdout,stderr passed unchanged to system2(); with the default
#'   `stdout=TRUE` the captured output lines are returned, and with the
#'   default `stderr=FALSE` stderr is discarded
#' @return the value of system2()
bash_job = function(cmd, conda_env, stdout=TRUE, stderr=FALSE){
    script = sprintf('. ~/.bashrc; conda activate %s; %s', conda_env, cmd)
    # system2() pastes its args into a shell command line. Quote the whole
    # script with shQuote() so that double quotes, `$` and backticks inside
    # `cmd` reach bash unchanged instead of being interpreted (or breaking
    # the quoting) in the outer shell.
    system2('bash', c('-c', shQuote(script)), stdout=stdout, stderr=stderr)
}

# Load

In [5]:
# read the tab-delimited MAG table and preview its first rows
MAGs_n143 = read.delim(MAGs_n143_file, sep='\t')
head(MAGs_n143, n=3)

Taxon,Fasta,Sample,Read1,Read2
VogtmannE_2016__MMRS65205033ST-27-0-0__bin.43,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/all_MAGs/SGB_genome_fastas_part1/1024/VogtmannE_2016__MMRS65205033ST-27-0-0__bin.43.fa.gz,ERR1293531,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293531/R1_final.fq.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293531/R2_final.fq.gz
VogtmannE_2016__MMRS51737257ST-27-0-0__bin.46,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/all_MAGs/SGB_genome_fastas_part1/1408/VogtmannE_2016__MMRS51737257ST-27-0-0__bin.46.fa.gz,ERR1293612,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293612/R1_final.fq.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293612/R2_final.fq.gz
VogtmannE_2016__MMRS51737257ST-27-0-0__bin.34,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/all_MAGs/SGB_genome_fastas_part1/1472/VogtmannE_2016__MMRS51737257ST-27-0-0__bin.34.fa.gz,ERR1293612,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293612/R1_final.fq.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293612/R2_final.fq.gz


# Run

In [8]:
#' creating valet commands
#'
#' Builds the shell command for one VALET run: recreate the per-MAG output
#' directory, decompress the reference next to the gzipped original, run
#' valet.py on it, then delete the decompressed copy.
#'
#' @param ref path to the gzipped reference fasta (must end in '.gz')
#' @param read1 path to the forward reads
#' @param read2 path to the reverse reads
#' @param outdir base output directory; the run writes to
#'   `<outdir>/<fasta basename without .fa.gz>`
#' @param threads value passed to `valet.py --threads`
#' @return a glue string holding the full command line
VALET_cmd = function(ref, read1, read2, outdir, threads){
    ref = as.character(ref)
    # Without a '.gz' suffix tmp_ref would equal ref, and the generated
    # command would truncate the reference (`gunzip -c ref > ref`) and then
    # delete it (`rm -f`). Refuse to build such a command.
    if(!grepl('\\.gz$', ref)){
        stop('Reference fasta must be gzipped (*.gz): ', ref)
    }
    # per-MAG output directory named after the fasta (suffix stripped)
    baseout = sub('\\.fa\\.gz$', '', basename(ref))
    outdir = file.path(outdir, baseout)
    # temporary uncompressed copy of the reference
    tmp_ref = sub('\\.gz$', '', ref)
    cmd = 'rm -rf {outdir}; mkdir -p {outdir}; gunzip -c {ref} > {tmp_ref}; valet.py --threads {threads} --assembly-names reference --skip-reapr --output-dir {outdir} -a {tmp_ref} -1 {read1} -2 {read2}; rm -f {tmp_ref}'
    cmd = glue::glue(cmd, threads=threads, outdir=outdir, 
                     ref=ref, read1=read1, read2=read2, tmp_ref=tmp_ref)
    return(cmd)
}

In [9]:
# creating commands for VALET runs
# (one command per row of the MAG table; `threads` comes from the params
#  cell so the commands agree with the slots requested from the cluster)
cmds = mapply(VALET_cmd, MAGs_n143$Fasta, MAGs_n143$Read1, 
              MAGs_n143$Read2, outdir=work_dir, threads=threads) %>%
    as.list()
cmds %>% length
cmds[[1]]

In [None]:
# test run: execute the first generated VALET command locally
# (`cmds` holds the commands; `res` is only created later, by the cluster run)
bash_job(cmds[[1]], conda_env = conda_env)

In [10]:
# resource request handed to the batchtools SGE backend for each job
resources = list(
    h_rt = '08:00:00',
    h_vmem = '8G',
    threads = threads,
    conda.env = 'py3_physeq'    # conda env with batchtools installed
)
# futures are dispatched via batchtools_sge, with `workers` set to 30
plan(batchtools_sge, resources=resources, workers=30)

In [None]:
# submit every command as a future; each one is executed by bash_job()
# inside the `conda_env` environment
res = future_lapply(cmds, FUN = bash_job, conda_env=conda_env)

length(res)

## Determining which succeeded

In [26]:
# summary.tsv files found under the VALET output directory.
# `pattern` is a regular expression: anchor it and escape the dot so only
# files named exactly 'summary.tsv' are picked up.
sum_files = list.files(work_dir, pattern='^summary\\.tsv$', recursive=TRUE, full.names=TRUE)
sum_files %>% length

In [27]:
# MAG ID of each finished run = name of the directory two levels above
# its summary.tsv
taxa_complete = basename(dirname(dirname(as.character(sum_files))))

length(taxa_complete)

In [28]:
# keep only the MAGs for which no summary.tsv was found
MAGs_n143_f = filter(MAGs_n143, !(Taxon %in% taxa_complete))

nrow(MAGs_n143_f)

## Re-run

In [29]:
# creating commands for VALET runs (only the MAGs without a summary.tsv)
# (`threads` comes from the params cell so the commands agree with the
#  slots requested from the cluster)
cmds = mapply(VALET_cmd, MAGs_n143_f$Fasta, MAGs_n143_f$Read1, 
             MAGs_n143_f$Read2, outdir=work_dir, threads=threads) %>%
    as.list()
cmds %>% length
cmds[[1]]

In [30]:
# resource request for the re-run (h_rt is '36:00:00' here)
resources = list(
    h_rt = '36:00:00',
    h_vmem = '8G',
    threads = threads,
    conda.env = 'py3_physeq'    # conda env with batchtools installed
)
# futures are dispatched via batchtools_sge, with `workers` set to 30
plan(batchtools_sge, resources=resources, workers=30)

In [31]:
# submit the re-run commands as futures; each one is executed by
# bash_job() inside the `conda_env` environment
res = future_lapply(cmds, FUN = bash_job, conda_env=conda_env)

length(res)

### Creating blank summary.tsv files

* For those lacking reads

In [33]:
# F = file.path(work_dir, 'SRR1196604_bin.22', 'reference', 'summary.tsv')
# tmpl_sum = read.delim(F, sep='\t')
# tmpl_sum %>% head(n=3)
# tmpl_sum %>% tail(n=3)

In [34]:
#' getting sequence IDs from the fasta
#'
#' Reads the header lines (those starting with '>') of a fasta file and
#' returns a blank VALET-style summary table: one row per sequence, the
#' contig length parsed from the header, every metric column set to NA.
#' The file is read directly in R through gzfile() instead of spawning
#' `gunzip | grep` in a conda-activated shell.
#'
#' @param fasta path to the fasta file (gzipped; gzfile() also reads plain text)
#' @param conda_env unused; kept so existing callers keep working
#' @return data.frame with columns contig_name, contig_length, abundance,
#'   low_cov, low_cov_bps, high_cov, high_cov_bps, reapr, reapr_bps,
#'   breakpoints, breakpoints_bps (zero rows if the fasta has no headers)
get_seq_ids = function(fasta, conda_env){
    con = gzfile(as.character(fasta), 'rt')
    on.exit(close(con), add = TRUE)
    fa_lines = readLines(con, warn = FALSE)

    # header lines, without the leading '>'
    ids = sub('^>', '', fa_lines[startsWith(fa_lines, '>')])

    # contig length is taken from headers of the form
    # '<prefix>_length_<N>_cov<suffix>'; headers that do not match get NA
    # rather than the whole header text
    len_regex = '.+_length_([0-9]+)_cov.+'
    has_len = grepl(len_regex, ids)
    contig_length = rep(NA_integer_, length(ids))
    contig_length[has_len] = as.integer(sub(len_regex, '\\1', ids[has_len]))

    # NA columns are sized explicitly so a fasta without headers yields a
    # zero-row table instead of a data.frame() error
    blank = rep(NA, length(ids))
    ret = data.frame(contig_name = ids, contig_length = contig_length, 
                     abundance = blank, low_cov = blank, low_cov_bps = blank, 
                     high_cov = blank, high_cov_bps = blank, reapr = blank, 
                     reapr_bps = blank, breakpoints = blank, 
                     breakpoints_bps = blank, stringsAsFactors = FALSE)
    return(ret)
}

#' creating a VALET summary file 
#'
#' Writes a blank `summary.tsv` (contig names and lengths, NA metrics) to
#' `<out_dir>/<MAG_ID>/reference/`, where MAG_ID is the fasta file name
#' without its '.fa.gz' suffix.
#'
#' @param fasta path to the MAG fasta (*.fa.gz)
#' @param out_dir base output directory
#' @param conda_env conda environment name, forwarded to get_seq_ids()
#' @return path of the summary.tsv that was written
make_summary = function(fasta, out_dir, conda_env){
    # output directory structure
    # (basename + anchored suffix removal: works without a '/' in the path
    #  and does not treat the dot in '.gz' as a wildcard)
    MAG_ID = sub('\\.fa\\.gz$', '', basename(as.character(fasta)))
    ref_dir = file.path(out_dir, MAG_ID, 'reference')
    # recursive: also creates out_dir / the MAG directory when missing
    dir.create(ref_dir, showWarnings = FALSE, recursive = TRUE) 
    
    # sequence IDs
    seqIDs = get_seq_ids(fasta, conda_env=conda_env)
    
    # writing table
    outF = file.path(ref_dir, 'summary.tsv')
    write.table(seqIDs, file=outF, sep='\t', quote=FALSE, row.names=FALSE)
    cat('File written:', outF, '\n')
    return(outF)
}

# write a blank summary.tsv for each MAG listed in MAGs_n143_f
sum_tbls = lapply(as.list(MAGs_n143_f$Fasta), make_summary, 
                  out_dir=work_dir, conda_env=conda_env)

# status: number of files written and the first output path
length(sum_tbls)
sum_tbls[[1]]

File written: /ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/samples_Almeida-Pasolli/VALET//NielsenHB_2014__MH0032__bin.6/reference/summary.tsv 
File written: /ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/samples_Almeida-Pasolli/VALET//NielsenHB_2014__MH0107__bin.3/reference/summary.tsv 
File written: /ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/samples_Almeida-Pasolli/VALET//NielsenHB_2014__O2_UC47_2__bin.42/reference/summary.tsv 
File written: /ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/samples_Almeida-Pasolli/VALET//NielsenHB_2014__MH0017__bin.5/reference/summary.tsv 
File written: /ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/samples_Almeida-Pasolli/VALET//NielsenHB_2014__MH0032__bin.3/reference/summary.tsv 
File written: /ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/samples_Almeida-Pasolli/VALET/

# Summary

In [6]:
# all summary.tsv files under the VALET output directory.
# `pattern` is a regular expression: anchor it and escape the dot so only
# files named exactly 'summary.tsv' are picked up.
sum_files = list.files(work_dir, pattern='^summary\\.tsv$', recursive=TRUE, full.names=TRUE)
sum_files %>% length

In [7]:
# loading all tables
# fail early with a clear message instead of a cryptic error further down
if(length(sum_files) == 0){
    stop('No summary.tsv files found under: ', work_dir)
}
valet = lapply(sum_files, function(sum_file){
    tbl = read.delim(sum_file, sep='\t')
    # MAG ID = name of the directory two levels above summary.tsv;
    # rep() keeps the assignment valid for tables with zero rows
    tbl$MAG = rep(basename(dirname(dirname(sum_file))), nrow(tbl))
    tbl
})

# one combined table with plain 1..n row names
valet = do.call(rbind, valet)
rownames(valet) = NULL

# status
valet %>% nrow
valet %>% head(n=3)

contig_name,contig_length,abundance,low_cov,low_cov_bps,high_cov,high_cov_bps,reapr,reapr_bps,breakpoints,breakpoints_bps,MAG
NODE_1121_length_5579_cov_11.9477,5579,25,0,0,1,52,0,0,0,0,BackhedF_2015__SID201_4M__bin.1
NODE_118_length_39319_cov_15.9866,39319,34,1,165,0,0,0,0,0,0,BackhedF_2015__SID201_4M__bin.1
NODE_123_length_37699_cov_19.1645,37699,40,0,0,2,87,0,0,0,0,BackhedF_2015__SID201_4M__bin.1


In [9]:
# % of contigs marked as mis-assemblies
n_contigs = valet %>% nrow 
# NOTE(review): this keeps rows where low_cov, high_cov and breakpoints are
# all non-NA (i.e. it drops the NA rows of the blank summaries); it does not
# test whether any of these counts is > 0 -- confirm this is the intended
# definition of "mis-assembled".
n_misass = valet %>% filter(!is.na(low_cov),
                            !is.na(high_cov),
                            !is.na(breakpoints)) %>% nrow

# compute the percentage explicitly rather than referring to sibling glue
# arguments inside the glue() call, and do not overwrite `x`
pct_misass = round(n_misass / n_contigs * 100, 2)
msg = glue::glue('{x} of {y} ({z} %) contigs identified as miassembled', 
                 x=n_misass, y=n_contigs, z=pct_misass)
cat(msg)

23118 of 232308 (9.95 %) contigs identified as miassembled

# sessionInfo

In [36]:
sessionInfo()

R version 3.4.1 (2017-06-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.3 LTS

Matrix products: default
BLAS: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/py3_physeq/lib/R/lib/libRblas.so
LAPACK: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/py3_physeq/lib/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] bindrcpp_0.2.2          future.apply_1.0.0      future.batchtools_0.7.1
 [4] future_1.9.0            doParallel_1.0.11       iterators_1.0.10       
 [7] foreach_1.4.4           dat