<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Goal" data-toc-modified-id="Goal-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Goal</a></span></li><li><span><a href="#Var" data-toc-modified-id="Var-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Var</a></span></li><li><span><a href="#Init" data-toc-modified-id="Init-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Init</a></span></li><li><span><a href="#Load" data-toc-modified-id="Load-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load</a></span></li><li><span><a href="#Overlapping-samples" data-toc-modified-id="Overlapping-samples-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Overlapping samples</a></span><ul class="toc-item"><li><span><a href="#Creating-samples-file-for-DeepMAsED" data-toc-modified-id="Creating-samples-file-for-DeepMAsED-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Creating samples file for DeepMAsED</a></span></li><li><span><a href="#DeepMAsED" data-toc-modified-id="DeepMAsED-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>DeepMAsED</a></span></li><li><span><a href="#Summary" data-toc-modified-id="Summary-5.3"><span class="toc-item-num">5.3&nbsp;&nbsp;</span>Summary</a></span></li></ul></li><li><span><a href="#sessionInfo" data-toc-modified-id="sessionInfo-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>sessionInfo</a></span></li></ul></div>

# Goal

* create feature tables for [PasolliE et al., 2019 dataset](https://doi.org/10.1016/j.cell.2019.01.001)
    


# Var

In [1]:
work_dir = '/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/'

# NCBI accessions for all metagenome samples used
acc_file = file.path(work_dir, 'NCBIaccession_long.txt')

# path to all ~150k MAGs
MAG_dir = file.path(work_dir, 'all_MAGs')

# AlmeidaA-2019 MAGs (random selection)
AlmeidaA_n143_file = '/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/samples_n143_w-MAGs.txt'



# Init

In [2]:
library(dplyr)
library(tidyr)
library(ggplot2)
set.seed(18734)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



# Load

In [3]:
# NCBI accessions
acc = read.delim(acc_file, sep='\t')
acc %>% nrow %>% print
acc %>% head(n=3)

[1] 25676


sampleID,NCBI_accession
AsnicarF_2017__MV_FEI1_t1Q14,SRR4052021
AsnicarF_2017__MV_FEI2_t1Q14,SRR4052022
AsnicarF_2017__MV_FEI3_t1Q14,SRR4052033


In [11]:
# MAG fasta files
MAG_fasta = list.files(MAG_dir, '*.fa.gz$', recursive=TRUE, full.names=TRUE) 
MAG_fasta = data.frame(MAG = MAG_fasta %>% basename %>% gsub('\\.fa.gz$', '', .),                       
                       sample = MAG_fasta %>% basename %>% gsub('__bin\\.[0-9]+\\.fa\\.gz$', '', .),
                       fasta = MAG_fasta)
MAG_fasta %>% nrow %>% print
MAG_fasta %>% head(n=3)

[1] 154723


MAG,sample,fasta
BritoIL_2016__M1.26.ST__bin.30,BritoIL_2016__M1.26.ST,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019//all_MAGs/SGB_genome_fastas_part1/1001/BritoIL_2016__M1.26.ST__bin.30.fa.gz
BritoIL_2016__W3.17.ST__bin.61,BritoIL_2016__W3.17.ST,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019//all_MAGs/SGB_genome_fastas_part1/1001/BritoIL_2016__W3.17.ST__bin.61.fa.gz
CosteaPI_2017__SID713A004-11-0-0__bin.78,CosteaPI_2017__SID713A004-11-0-0,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019//all_MAGs/SGB_genome_fastas_part1/1002/CosteaPI_2017__SID713A004-11-0-0__bin.78.fa.gz


In [5]:
# MAGs selected from AlmeidaA analysis
AlmeidaA_n143 = read.delim(AlmeidaA_n143_file, sep='\t') 
AlmeidaA_n143 %>% nrow %>% print
AlmeidaA_n143 %>% head(n=3)

[1] 1519


Taxon,Completeness,Contamination,Strain_heterogeneity,CheckM_lineage,Sample,Fasta,Read1,Read2
SRR1039533_bin.12,99.29,0.95,0,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Actinomyces,SRR1039533,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/mags/SRR1039533_bin.12.fa.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/SRR1039533/R1_final.fq.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/SRR1039533/R2_final.fq.gz
SRR1039533_bin.14,69.19,0.0,0,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Actinomycetaceae;g__Actinomyces_2;s__Actinomyces_neuii,SRR1039533,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/mags/SRR1039533_bin.14.fa.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/SRR1039533/R1_final.fq.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/SRR1039533/R2_final.fq.gz
SRR1039533_bin.19,94.15,0.15,0,k__Bacteria;p__Actinobacteria;c__Actinobacteria;o__Actinomycetales;f__Dermabacteraceae,SRR1039533,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/mags/SRR1039533_bin.19.fa.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/SRR1039533/R1_final.fq.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/SRR1039533/R2_final.fq.gz


# Overlapping samples

* Justing using samples overlapping with Almeida-2019
  * These samples are already QC'ed

In [13]:
MAG_fasta = MAG_fasta %>%
    inner_join(acc, c('sample'='sampleID')) 

MAG_fasta %>% nrow %>% print
MAG_fasta %>% head(n=3)

“Column `sample`/`sampleID` joining factors with different levels, coercing to character vector”

[1] 475618


MAG,sample,fasta,NCBI_accession
BritoIL_2016__M1.26.ST__bin.30,BritoIL_2016__M1.26.ST,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019//all_MAGs/SGB_genome_fastas_part1/1001/BritoIL_2016__M1.26.ST__bin.30.fa.gz,SRR2250547
BritoIL_2016__M1.26.ST__bin.30,BritoIL_2016__M1.26.ST,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019//all_MAGs/SGB_genome_fastas_part1/1001/BritoIL_2016__M1.26.ST__bin.30.fa.gz,SRR2248241
BritoIL_2016__M1.26.ST__bin.30,BritoIL_2016__M1.26.ST,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019//all_MAGs/SGB_genome_fastas_part1/1001/BritoIL_2016__M1.26.ST__bin.30.fa.gz,SRR2249667


In [14]:
# number of overlapping NCBI accessions
samp_overlap = intersect(AlmeidaA_n143$Sample, MAG_fasta$NCBI_accession) 
samp_overlap %>% length %>% print

[1] 60


In [17]:
# MAGs in Pasolli for overlapping samples
MAG_fasta_j = MAG_fasta %>%
    inner_join(AlmeidaA_n143, c('NCBI_accession'='Sample')) 

# overlapping samples
cat('Number of overlapping samples:', 
    MAG_fasta_j$NCBI_accession %>% unique %>% length, '\n')

# number of MAGs
cat('PasolliE-2019 MAGs:', MAG_fasta_j$MAG %>% unique %>% length, '\n')
cat('Almeida-2019 MAGs:', MAG_fasta_j$Taxon %>% unique %>% length, '\n')

MAG_fasta_j %>% nrow %>% print
MAG_fasta_j %>% head(n=3)

“Column `NCBI_accession`/`Sample` joining factors with different levels, coercing to character vector”

Number of overlapping samples: 60 
PasolliE-2019 MAGs: 1090 
Almeida-2019 MAGs: 536 
[1] 12538


MAG,sample,fasta,NCBI_accession,Taxon,Completeness,Contamination,Strain_heterogeneity,CheckM_lineage,Fasta,Read1,Read2
VogtmannE_2016__MMRS65205033ST-27-0-0__bin.43,VogtmannE_2016__MMRS65205033ST-27-0-0,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019//all_MAGs/SGB_genome_fastas_part1/1024/VogtmannE_2016__MMRS65205033ST-27-0-0__bin.43.fa.gz,ERR1293531,ERR1293531_bin.6,80.98,2.02,0.0,k__Bacteria;p__Firmicutes;c__Negativicutes;o__Selenomonadales;f__Acidaminococcaceae,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/mags/ERR1293531_bin.6.fa.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293531/R1_final.fq.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293531/R2_final.fq.gz
VogtmannE_2016__MMRS65205033ST-27-0-0__bin.43,VogtmannE_2016__MMRS65205033ST-27-0-0,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019//all_MAGs/SGB_genome_fastas_part1/1024/VogtmannE_2016__MMRS65205033ST-27-0-0__bin.43.fa.gz,ERR1293531,ERR1293531_bin.9,79.73,0.12,100.0,k__Bacteria;p__Bacteroidetes;c__Bacteroidia;o__Bacteroidales;f__Prevotellaceae;g__Prevotella,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/mags/ERR1293531_bin.9.fa.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293531/R1_final.fq.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293531/R2_final.fq.gz
VogtmannE_2016__MMRS65205033ST-27-0-0__bin.43,VogtmannE_2016__MMRS65205033ST-27-0-0,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019//all_MAGs/SGB_genome_fastas_part1/1024/VogtmannE_2016__MMRS65205033ST-27-0-0__bin.43.fa.gz,ERR1293531,ERR1293531_bin.12,61.9,0.59,15.38,k__Bacteria;p__Firmicutes;c__Clostridia;o__Clostridiales,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/mags/ERR1293531_bin.12.fa.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293531/R1_final.fq.gz,/ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/AlmeidaA-2019/LLMGQC/final/ERR1293531/R2_final.fq.gz


## Creating samples file for DeepMAsED

* Making feature tables for each MAG

In [19]:
samples_file = file.path(work_dir, 'samples_Almeida-Pasolli.txt')
MAG_fasta_j  %>%
    distinct(MAG, NCBI_accession, fasta, Read1, Read2) %>%
    rename('Taxon' = MAG,
           'Sample' = NCBI_accession,
           'Fasta' = fasta) %>%
    write.table(file=samples_file, sep='\t', quote=FALSE, row.names=FALSE)
cat('File written:', samples_file, '\n')

File written: /ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019//samples_Almeida-Pasolli.txt 


## DeepMAsED

* Creating feature tables for each MAG

```
(snakemake) @ rick:/ebio/abt3_projects/software/dev/DeepMAsED/DeepMAsED-SM
$ ./snakemake_sge.sh /ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/samples_Almeida-Pasolli/config.yaml cluster.json /ebio/abt3_projects/databases_no-backup/DeepMAsED/MAG_datasets/PasolliE-2019/samples_Almeida-Pasolli/SGE_log 20 --dryrun
```

## Summary

In [21]:
P = file.path(work_dir, 'samples_Almeida-Pasolli', 'map')
feat_files = list.files(P, 'features.tsv.gz', recursive=TRUE)
feat_files %>% length

# sessionInfo

In [23]:
sessionInfo()

R version 3.4.1 (2017-06-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.2 LTS

Matrix products: default
BLAS: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/py3_physeq/lib/R/lib/libRblas.so
LAPACK: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/py3_physeq/lib/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] bindrcpp_0.2.2 ggplot2_3.0.0  tidyr_0.8.1    dplyr_0.7.6   

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.17     bindr_0.1.1      magrittr_1.5     munsell_0.5.0   
 [5] tidyselect_0.2