<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Goal" data-toc-modified-id="Goal-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Goal</a></span></li><li><span><a href="#Var" data-toc-modified-id="Var-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Var</a></span></li><li><span><a href="#Init" data-toc-modified-id="Init-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Init</a></span></li><li><span><a href="#Load" data-toc-modified-id="Load-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load</a></span></li><li><span><a href="#Running-ALE" data-toc-modified-id="Running-ALE-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Running ALE</a></span></li><li><span><a href="#Summarizing-results" data-toc-modified-id="Summarizing-results-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Summarizing results</a></span><ul class="toc-item"><li><span><a href="#Writing-combined-ALE-results" data-toc-modified-id="Writing-combined-ALE-results-6.1"><span class="toc-item-num">6.1&nbsp;&nbsp;</span>Writing combined ALE results</a></span></li></ul></li><li><span><a href="#---WAITING---" data-toc-modified-id="---WAITING----7"><span class="toc-item-num">7&nbsp;&nbsp;</span>-- WAITING --</a></span></li><li><span><a href="#sessionInfo" data-toc-modified-id="sessionInfo-8"><span class="toc-item-num">8&nbsp;&nbsp;</span>sessionInfo</a></span></li></ul></div>

# Goal

* run ALE on simulated test dataset for DeepMAsED 
  * the ground truth results can be used for selecting cutoffs for ALE and assessing accuracy

# Var

In [5]:
work_dir = '/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/ALE/'

# MAGs for genome analysis
asmbl_dir = '/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/assembly/'

# bam file dir (mapping of metagenome reads to corresponding MAGs)
map_dir = '/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/map/'

# ALE executable
ALE_exe = '/ebio/abt3_projects/databases_no-backup/DeepMAsED/bin/ALE/src/ALE'

# params
conda_env = 'base'
threads = 12

# Init

In [2]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(doParallel)

library(future)
library(future.batchtools)
library(future.apply)
options(future.wait.interval = 2.0)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last

Loading required package: foreach
Loading required package: iterators
Loading required package: parallel

Attaching package: ‘future.apply’

The following object is masked from ‘package:future’:

    future_lapply



# Load

In [3]:
# adding bam files
fasta_files = list.files(asmbl_dir, '*.fasta$', recursive=TRUE, full.names=TRUE)
fasta_files = data.frame(Assembler = fasta_files %>% dirname %>% basename,
                         Rep = fasta_files %>% dirname %>% dirname %>% basename,
                         Fasta = fasta_files)

fasta_files %>% nrow %>% print
fasta_files %>% head(n=3)

[1] 60


Assembler,Rep,Fasta
megahit,1,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/assembly//1/megahit/contigs_filtered.fasta
metaspades,1,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/assembly//1/metaspades/contigs_filtered.fasta
megahit,10,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/assembly//10/megahit/contigs_filtered.fasta


In [6]:
# adding bam files
bam_files = list.files(map_dir, '*.bam$', recursive=TRUE, full.names=TRUE)
bam_files = data.frame(Assembler = bam_files %>% basename %>% gsub('\\.bam$', '', .),
                       Rep = bam_files %>% dirname %>% basename,
                       Bam = bam_files)

bam_files %>% nrow %>% print
bam_files %>% head(n=3)

[1] 58


Assembler,Rep,Bam
megahit,1,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/map//1/megahit.bam
metaspades,1,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/map//1/metaspades.bam
megahit,10,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/map//10/megahit.bam


In [7]:
# joining
MAGs = fasta_files %>%
    inner_join(bam_files, c('Assembler', 'Rep'))
MAGs %>% nrow %>% print
MAGs %>% head(n=3)

“Column `Rep` joining factors with different levels, coercing to character vector”

[1] 58


Assembler,Rep,Fasta,Bam
megahit,1,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/assembly//1/megahit/contigs_filtered.fasta,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/map//1/megahit.bam
metaspades,1,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/assembly//1/metaspades/contigs_filtered.fasta,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/map//1/metaspades.bam
megahit,10,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/assembly//10/megahit/contigs_filtered.fasta,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/map//10/megahit.bam


In [8]:
# ALE output file name
MAGs = MAGs %>%
    mutate(ALE = mapply(function(A, R) file.path(work_dir, R, paste0(A, '.txt')),
                        A=Assembler, R=Rep))
                        
MAGs %>% head(n=3)

Assembler,Rep,Fasta,Bam,ALE
megahit,1,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/assembly//1/megahit/contigs_filtered.fasta,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/map//1/megahit.bam,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/ALE//1/megahit.txt
metaspades,1,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/assembly//1/metaspades/contigs_filtered.fasta,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/map//1/metaspades.bam,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/ALE//1/metaspades.txt
megahit,10,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/assembly//10/megahit/contigs_filtered.fasta,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/map//10/megahit.bam,/ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/ALE//10/megahit.txt


In [10]:
# making output directories
ret = MAGs$ALE %>% 
    dirname() %>%
    unique() %>%
    sapply(., function(x) dir.create(x, showWarnings=FALSE))

# Running ALE

In [11]:
# cluster resources
resources = list(h_rt = '00:59:00',
                 h_vmem = '8G',
                 threads = '1',
                 conda.env = 'py3_physeq')     # conda env with batchtools installed
plan(batchtools_sge, resources=resources, workers=30)

In [12]:
# creating commands for calling ALE 
ALE_params = function(x, exe){
    cmd = '{exe} {bam} {fasta} {outfile}'
    cmd = glue::glue(cmd, exe=exe, bam=x[['Bam']], fasta=x[['Fasta']], outfile=x[['ALE']])
    return(cmd)
}

ALE_cmds = apply(MAGs, 1, ALE_params, exe=ALE_exe) %>%
    as.list

ALE_cmds %>% length %>% print
ALE_cmds[[1]]

[1] 58


In [13]:
# general bash job function
bash_job = function(cmd, conda_env, stdout=TRUE, stderr=FALSE){
    # cmd : string; commandline job (eg., 'ls -thlc')
    # conda_env : string; conda environment name
    cmd = sprintf('. ~/.bashrc; conda activate %s; %s', conda_env, cmd)
    cmd = sprintf('-c "%s"', cmd)
    system2('bash', cmd, stdout=stdout, stderr=stderr)
}

In [14]:
# apply function on cluster
job_ret = future_lapply(ALE_cmds, FUN = function(x) bash_job(x, conda_env=conda_env))
job_ret

# Summarizing results

In [15]:
ALE_res_files = list.files(work_dir, '*.txt$', recursive=TRUE, full.names=TRUE)
ALE_res_files %>% length %>% print
ALE_res_files %>% head

[1] 58


In [16]:
read_ALE_files = function(in_file){
    # input command
    cmd = 'egrep -v "^#" {in_file}'
    cmd = glue::glue(cmd, in_file=in_file)
    # reading in table
    df = fread(cmd, sep=' ', header=FALSE)
    # formatting
    colnames(df) = c('contig', 'position', 'depth', 
                     'ln__depthLike', 'ln__placeLike', 
                     'ln__insertLike', 'ln__kmerLike')
    df$Assembler = in_file %>% basename %>% gsub('\\.txt$', '', .)
    df$Rep = in_file %>% dirname %>% basename
    return(df)
}

In [17]:
doParallel::registerDoParallel(threads)
ALE_res = plyr::llply(as.list(ALE_res_files), read_ALE_files, .parallel=TRUE) %>%
    do.call(rbind, .)

ALE_res %>% nrow %>% print
ALE_res %>% head(n=3)

[1] 1641203714


contig,position,depth,ln__depthLike,ln__placeLike,ln__insertLike,ln__kmerLike,Assembler,Rep
0,0,1,-2.601,-0.003,0.044,0.001,megahit,1
0,1,1,-2.601,-0.003,0.044,0.076,megahit,1
0,2,1,-2.601,-0.003,0.044,0.246,megahit,1


## Writing combined ALE results

In [18]:
out_file = file.path(work_dir, 'ALE_results.tsv')
ALE_res %>%
    write.table(file=out_file, sep='\t', quote=FALSE, row.names=FALSE)
cat('File written:', out_file, '\n')

File written: /ebio/abt3_projects/databases_no-backup/DeepMAsED/test_runs/n100_r25/ALE//ALE_results.tsv 


# sessionInfo

In [20]:
sessionInfo()

R version 3.4.1 (2017-06-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.1 LTS

Matrix products: default
BLAS: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/py3_physeq/lib/R/lib/libRblas.so
LAPACK: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/py3_physeq/lib/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] bindrcpp_0.2.2          future.apply_1.0.0      future.batchtools_0.7.1
 [4] future_1.9.0            doParallel_1.0.11       iterators_1.0.10       
 [7] foreach_1.4.4           dat