<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Goal" data-toc-modified-id="Goal-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Goal</a></span></li><li><span><a href="#Var" data-toc-modified-id="Var-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Var</a></span><ul class="toc-item"><li><span><a href="#Init" data-toc-modified-id="Init-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Init</a></span></li></ul></li><li><span><a href="#DeepMAsED-SM" data-toc-modified-id="DeepMAsED-SM-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>DeepMAsED-SM</a></span><ul class="toc-item"><li><span><a href="#Config" data-toc-modified-id="Config-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Config</a></span></li><li><span><a href="#Run" data-toc-modified-id="Run-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Run</a></span></li></ul></li><li><span><a href="#--WAITING--" data-toc-modified-id="--WAITING---4"><span class="toc-item-num">4&nbsp;&nbsp;</span>--WAITING--</a></span></li><li><span><a href="#Summary" data-toc-modified-id="Summary-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Summary</a></span><ul class="toc-item"><li><span><a href="#Communities" data-toc-modified-id="Communities-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Communities</a></span></li><li><span><a href="#Feature-tables" data-toc-modified-id="Feature-tables-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Feature tables</a></span><ul class="toc-item"><li><span><a href="#No.-of-contigs" data-toc-modified-id="No.-of-contigs-5.2.1"><span class="toc-item-num">5.2.1&nbsp;&nbsp;</span>No. of contigs</a></span></li><li><span><a href="#Misassembly-types" data-toc-modified-id="Misassembly-types-5.2.2"><span class="toc-item-num">5.2.2&nbsp;&nbsp;</span>Misassembly types</a></span></li></ul></li></ul></li><li><span><a href="#sessionInfo" data-toc-modified-id="sessionInfo-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>sessionInfo</a></span></li></ul></div>

# Goal

* Replicate metagenome assemblies using the intra-species training genome dataset
* Richness = 0.7 (70% of all ref genomes used)

# Var

In [1]:
# Directory of the intra-species ("intraSpec") GTDB reference genome set
ref_dir = '/ebio/abt3_projects/databases_no-backup/DeepMAsED/GTDB_ref_genomes/intraSpec/'
# Table of training reference genomes inside ref_dir
ref_file = file.path(ref_dir, 'GTDBr86_genome-refs_train_clean.tsv')
# Output directory of this run; config.yaml and the MGSIM/ and map/ results
# summarized further down are all read from here
work_dir = '/ebio/abt3_projects/databases_no-backup/DeepMAsED/train_runs/intra-species/diff_richness/n1000_r6_rich0p7/'

# params
# Location of the DeepMAsED-SM pipeline; passed to pipelineInfo() at the end
pipeline_dir = '/ebio/abt3_projects/databases_no-backup/bin/deepmased/DeepMAsED-SM/'

## Init

In [2]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
source('/ebio/abt3_projects/software/dev/DeepMAsED/bin/misc_r_functions/init.R')


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last



In [3]:
#' Print a text file to the console, like `cat {file}` in a shell
#'
#' The file is read by R itself rather than by spawning a `cat` process, so
#' paths containing spaces or shell metacharacters work and a missing file
#' raises an R error instead of silently printing nothing.
#'
#' @param file_name Path(s) of the file(s) to print; several files are
#'   printed one after another.
#' @return `NULL`, invisibly; called for its side effect of printing.
cat_file = function(file_name){
    stopifnot(is.character(file_name), length(file_name) > 0)
    # warn=FALSE: a file without a final newline is not worth a warning here
    file_lines = unlist(lapply(file_name, readLines, warn=FALSE))
    # Lines are joined with '\n'; as before, no trailing newline is written
    cat(paste(file_lines, collapse='\n'))
}

# DeepMAsED-SM

## Config

In [4]:
# Print the pipeline configuration stored in the run's output directory
config_file = file.path(work_dir, 'config.yaml')
cat_file(config_file)

# Input
genomes_file: /ebio/abt3_projects/databases_no-backup/DeepMAsED/GTDB_ref_genomes/intraSpec/GTDBr86_genome-refs_train_clean.tsv

# Output location
output_dir: /ebio/abt3_projects/databases_no-backup/DeepMAsED/train_runs/intra-species/diff_richness/n1000_r6_rich0p7/


# software parameters
# Use "Skip" to skip  steps. If no params for rule, use ""
params:
  # simulating metagenomes
  reps: 6
  MGSIM:
    genome_download: ""
    communities: --richness 0.7
    reads: --sr-seq-depth 1e6 --art-paired --art-mflen 250
  # coverage
  nonpareil: -T kmer
  nonpareil_summary: 1e9   # this is target seq. depth    
  # assemblying metagenomes
  assemblers:
    metaspades: -k auto --only-assembler
    megahit: --min-count 3 --min-contig-len 1000 --presets meta-sensitive
  # assembly filtering
  contig_length_cutoff: 1000       # length in bp 
  # assessing assembly errors
  minimap2: ""
  metaquast: --min-identity 95 --extensive-mis-size 100 --no-icarus --max-ref-number 0
  # mapping reads t

## Run

```
(snakemake_dev) @ rick:/ebio/abt3_projects/databases_no-backup/bin/deepmased/DeepMAsED-SM
$ screen -L -S DM-intraS-rich0.7 ./snakemake_sge.sh /ebio/abt3_projects/databases_no-backup/DeepMAsED/train_runs/intra-species/diff_richness/n1000_r6_rich0p7/config.yaml cluster.json /ebio/abt3_projects/databases_no-backup/DeepMAsED/train_runs/intra-species/diff_richness/n1000_r6_rich0p7/SGE_log 48
```

# --WAITING--

# Summary

## Communities

In [None]:
# Collect the community abundance tables found anywhere below <work_dir>/MGSIM
comm_files = list.files(path=file.path(work_dir, 'MGSIM'),
                        pattern='comm_wAbund.txt',
                        recursive=TRUE,
                        full.names=TRUE)
# Number of tables found, followed by the first few paths
print(length(comm_files))
head(comm_files)

In [None]:
# Read every community abundance table and stack them into one data frame.
# Each row is tagged with the name of the directory holding its file (`Rep`).
# An empty file list would make rbind() return NULL, so fail early and clearly.
stopifnot(length(comm_files) > 0)
# lapply() builds the list in one pass; the original loop variable `F` is
# avoided because it masks the `F` (FALSE) shorthand in the global environment.
comms = lapply(comm_files, function(comm_file){
    df = read.delim(comm_file, sep='\t')
    df$Rep = basename(dirname(comm_file))
    df
})
comms = do.call(rbind, comms)
# seq_len() is safe even for a zero-row result, unlike 1:nrow()
rownames(comms) = seq_len(nrow(comms))
comms %>% dfhead

In [None]:
# Mean +/- SD of the % relative abundance of each taxon across all loaded
# communities, with taxa ordered from most to least abundant.
# Zero abundances and non-positive lower bounds are replaced by 1e-5.
p = comms %>%
    mutate(Perc_rel_abund = ifelse(Perc_rel_abund == 0, 1e-5, Perc_rel_abund)) %>%
    group_by(Taxon) %>%
    summarise(
        mean_perc_abund = mean(Perc_rel_abund),
        sd_perc_abund = sd(Perc_rel_abund)
    ) %>%
    ungroup() %>%
    mutate(
        # error-bar limits: one SD either side of the mean
        neg_sd_perc_abund = mean_perc_abund - sd_perc_abund,
        pos_sd_perc_abund = mean_perc_abund + sd_perc_abund,
        neg_sd_perc_abund = ifelse(neg_sd_perc_abund <= 0, 1e-5, neg_sd_perc_abund),
        # order the x axis by decreasing mean abundance
        Taxon = reorder(Taxon, -mean_perc_abund)
    ) %>%
    ggplot(aes(x=Taxon, y=mean_perc_abund)) +
    geom_linerange(aes(ymin=neg_sd_perc_abund, ymax=pos_sd_perc_abund),
                   alpha=0.3, size=0.3) +
    geom_point(color='red', alpha=0.4, size=0.5) +
    labs(y='% abundance') +
    theme_bw() +
    theme(
        # no taxon labels and no grid lines
        axis.text.x = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor.x = element_blank(),
        panel.grid.major.y = element_blank(),
        panel.grid.minor.y = element_blank()
    )

dims(10, 2.5)
plot(p)

In [None]:
# Same abundance plot, redrawn with a log10-scaled y axis
dims(10,2.5)
plot(p + scale_y_log10())

## Feature tables

In [None]:
# Collect the gzipped feature tables found anywhere below <work_dir>/map
feat_files = list.files(path=file.path(work_dir, 'map'),
                        pattern='features.tsv.gz',
                        recursive=TRUE,
                        full.names=TRUE)
# Number of tables found, followed by the first few paths
print(length(feat_files))
head(feat_files)

In [None]:
# Read every gzipped feature table, keep one row per unique
# contig/assembler/misassembly combination, and stack the results.
# `Rep` is the name of the directory two levels above each file.
# An empty file list would make rbind() return NULL, so fail early and clearly.
stopifnot(length(feat_files) > 0)
# lapply() builds the list in one pass; the original loop variable `F` is
# avoided because it masks the `F` (FALSE) shorthand in the global environment.
feats = lapply(feat_files, function(feat_file){
    # shQuote() keeps paths with spaces/metacharacters intact in the shell;
    # `cmd=` is fread's explicit argument for reading a command's output
    gunzip_cmd = paste('gunzip -c', shQuote(feat_file))
    df = fread(cmd=gunzip_cmd, sep='\t') %>%
        distinct(contig, assembler, Extensive_misassembly)
    df$Rep = basename(dirname(dirname(feat_file)))
    df
})
feats = do.call(rbind, feats)
# seq_len() is safe even for a zero-row result, unlike 1:nrow()
rownames(feats) = seq_len(nrow(feats))
feats %>% dfhead

### No. of contigs

In [None]:
# Number of distinct contigs per assembler and replicate
feats_s = feats %>%
    group_by(assembler, Rep) %>%
    summarise(n_contigs = length(unique(contig))) %>%
    ungroup()

# Distribution of those contig counts
summary(feats_s$n_contigs)

### Misassembly types

In [None]:
# Boxplots of how many contigs carry each extensive mis-assembly label,
# counted per assembler and replicate; an empty label is shown as 'None'.
p = feats %>%
    mutate(Extensive_misassembly = ifelse(Extensive_misassembly == '',
                                          'None',
                                          Extensive_misassembly)) %>%
    # one row per label/assembler/replicate with its row count in `n`
    count(Extensive_misassembly, assembler, Rep) %>%
    ggplot(aes(x=Extensive_misassembly, y=n, color=assembler)) +
    geom_boxplot() +
    labs(x='metaQUAST extensive mis-assembly', y='Count') +
    scale_y_log10() +
    coord_flip() +
    theme_bw() +
    theme(axis.text.x = element_text(angle=45, hjust=1))

dims(8, 4)
plot(p)

# sessionInfo

In [None]:
# Record the R version and attached/loaded packages for reproducibility
sessionInfo()

In [None]:
# NOTE(review): pipelineInfo() is not defined in this notebook; presumably it
# comes from the sourced init.R and reports on the pipeline checkout in
# pipeline_dir. Confirm against init.R.
pipelineInfo(pipeline_dir)