# Goal
Jacobo de la Cuesta-Zuluaga, July 2019.

The aim of this notebook is to generate the tables to be included in the vadinCA11 manuscript.

# Init

In [65]:
library(tidyverse)
library(stringr)

“package ‘stringr’ was built under R version 3.4.3”

In [10]:
# R util functions
source("/ebio/abt3_projects/vadinCA11/data/V11/R_utils/utils.R")

# Var

In [4]:
# Root directory of the project data
work_dir <- "/ebio/abt3_projects/vadinCA11/data/V11"
# Directory that receives the manuscript tables written by this notebook
tables_dir <- file.path(work_dir, "manuscript_tables")

# Tables

## Included genomes data

In [99]:
# Clade membership
# Each included genome is assigned to one of three clades:
# HA = Host Associated, FL = Free Living, EX = EXternal.
HA_clade <- c("GCA_002504405", "GCA_002506175", "GCA_002504495", "GCA_002506425",
              "GCA_002503925", "GCA_002494805", "GCA_002498545", "GCA_002497475",
              "GCA_002508595", "GCA_000800805", "GCA_002506905", "GCA_002496945",
              "GCA_002495325", "GCA_002503545", "GCA_000300255", "V11_RL001",
              "GCA_001481295", "GCA_900313075", "GCA_900314325", "GCA_002505245",
              "GCA_002503785", "GCA_002506865", "GCA_002502545", "GCA_002509415",
              "GCA_002498765", "GCA_002498785", "GCA_001421175", "GCA_002502965",
              "GCA_002497155", "GCA_002509425", "GCA_002496785", "GCA_002505275",
              "GCA_002506325", "GCA_002498605", "GCA_002509405", "GCA_002505345",
              "GCA_002504645", "GCA_002502765", "GCA_002495665", "GCA_002506565",
              "GCA_002508625", "GCA_002498425", "GCA_002498805", "GCA_002502925",
              "GCA_002502465", "GCA_002498365", "GCA_002506995", "GCA_002494585",
              "GCA_002509465", "GCA_002495495")

FL_clade <- c("GCA_003153895", "GCA_003135935", "GCA_002506985", "GCA_002497075",
              "GCA_000404225", "GCA_000308215", "GCA_002508545", "GCA_002494705",
              "GCA_001421185", "GCA_002498285", "GCA_002495585", "GCA_002497995",
              "GCA_002504525", "GCA_002496345", "GCA_002508585", "GCA_002508555",
              "GCA_002502005", "GCA_002503495", "GCA_002505225", "GCA_002503645")

EX_clade <- c("GCA_002496385", "GCA_002499085")

# One two-column table (Accession_short, clade) per clade.
# The clade label is a plain constant that data.frame() recycles over the
# accessions; no regex substitution is needed to produce it.
# stringsAsFactors = FALSE keeps both columns as character, so bind_rows()
# does not have to coerce factors with different levels (and warn about it).
df_HA <- data.frame(Accession_short = HA_clade, clade = "HA",
                    stringsAsFactors = FALSE)
df_FL <- data.frame(Accession_short = FL_clade, clade = "FL",
                    stringsAsFactors = FALSE)
df_EX <- data.frame(Accession_short = EX_clade, clade = "EX",
                    stringsAsFactors = FALSE)

# Stack the three clades into a single lookup table keyed by Accession_short
df_Clade <- bind_rows(df_HA, df_FL, df_EX)

df_Clade %>% dfhead

“binding character and factor vector, coercing into character vector”

[1] 72  2


Accession_short,clade
GCA_002504405,HA
GCA_002506175,HA
GCA_002504495,HA


In [47]:
# Accession, assembly name and organism name of the included genomes.
# The CSV's first column has no header; read_csv names it X1 and it is dropped.
included_accessions <- read_csv(
    file.path(work_dir, "genomes/Methanomassiliicoccales.csv")
) %>%
    select(-X1)

included_accessions %>% dfhead()

“Missing column names filled in: 'X1' [1]”Parsed with column specification:
cols(
  X1 = col_integer(),
  Accession = col_character(),
  Assembly = col_character(),
  Organism = col_character()
)


[1] 78  3


Accession,Assembly,Organism
GCA_000308215.1,ASM30821v1,Methanomassiliicoccus luminyensis B10
GCA_000300255.2,ASM30025v2,Candidatus Methanomethylophilus alvus Mx1201
GCA_000404225.1,ASM40422v1,Candidatus Methanomassiliicoccus intestinalis Issoire-Mx1


In [82]:
# Curated per-assembly metadata: GC, Len, Genes, source_1..3 and location.
# The path is built from work_dir, like the other input files of this notebook,
# instead of repeating the absolute project path.
raw_methanomassilii_data <- file.path(work_dir, "assemblies_metadata/Assembly_metadata_curated.txt") %>%
    read_tsv() %>%
    as.data.frame() %>%
    # The first column has no header (read_tsv names it X1) and holds the
    # organism name; Assembly holds the accession without version suffix.
    rename("Organism" = "X1", "Accession_short" = "Assembly") %>%
    # Source and location descriptors are categorical
    mutate_at(vars(starts_with("source"), location), factor) %>%
    # Organism names use underscores as word separators in the input file
    mutate(Organism = str_replace_all(Organism, "\\_", " ")) %>%
    # GC is scaled by 100 and rounded to two decimals
    # (presumably a fraction in the input, reported as a percentage)
    mutate(GC = round(GC*100, 2))

raw_methanomassilii_data %>% dfhead()

“Missing column names filled in: 'X1' [1]”Parsed with column specification:
cols(
  X1 = col_character(),
  Assembly = col_character(),
  GC = col_double(),
  Len = col_integer(),
  Genes = col_integer(),
  source_1 = col_character(),
  source_2 = col_character(),
  source_3 = col_character(),
  location = col_character()
)


[1] 72  9


Organism,Accession_short,GC,Len,Genes,source_1,source_2,source_3,location
Candidatus Methanomethylophilus alvus Mx1201,GCA_000300255,55.59,1666795,1636,Host-associated,Feces,Human,
Methanomassiliicoccus luminyensis B10,GCA_000308215,60.48,2620233,2607,Host-associated,Feces,Human,
Candidatus Methanomassiliicoccus intestinalis Issoire Mx1,GCA_000404225,41.26,1931651,1855,Host-associated,Feces,Human,


In [83]:
# Completeness and contamination of each included genome.
# The stats file has one row per statistic (column Stat) and one column per
# genome, so it is reshaped to one row per genome with one column per statistic.
# gather()/spread() are kept because the tidyr version in this environment
# predates pivot_longer()/pivot_wider().
assembly_stats <- file.path(work_dir, "genomes/included_stats.txt") %>%
    read_tsv() %>%
    filter(Stat %in% c("Completeness", "Contamination")) %>%
    gather(variable, value, -Stat) %>%
    spread(Stat, value) %>%
    rename("Accession" = "variable") %>%
    # All columns are read as character; convert and round to two decimals
    mutate_at(vars(Completeness, Contamination),
              function(x) round(as.numeric(x), 2)) %>%
    # Strip the assembly version suffix to obtain the join key.
    # "[0-9]+" removes the whole version number: with a single "[0-9]" a
    # version such as ".10" would leave a stray "0" glued to the accession
    # and the genome would silently fail to match in later joins.
    mutate(Accession_short = str_replace(Accession, "\\.[0-9]+", ""))

assembly_stats %>% dfhead()

Parsed with column specification:
cols(
  .default = col_character()
)
See spec(...) for full column specifications.


[1] 72  4


Accession,Completeness,Contamination,Accession_short
GCA_000300255.2,98.39,0.81,GCA_000300255
GCA_000308215.1,98.39,0.0,GCA_000308215
GCA_000404225.1,98.79,0.81,GCA_000404225


In [101]:
# Combine quality stats, curated metadata and clade membership per genome.
# Accession_short is only the join key, so it is dropped afterwards;
# Organism and Accession are moved to the front of the table.
Included_Genomes_Table <- assembly_stats %>%
    left_join(raw_methanomassilii_data, by = "Accession_short") %>%
    left_join(df_Clade, by = "Accession_short") %>%
    select(-Accession_short) %>%
    select(Organism, Accession, everything())


In [102]:
# Write the included-genomes table, with a header row, to the tables directory
Included_Genomes_File <- file.path(tables_dir, "Included_Genomes.tsv")
write_tsv(Included_Genomes_Table, path = Included_Genomes_File, col_names = TRUE)

# Session info

In [3]:
# Print the R version, platform and attached packages used to run this notebook
sessionInfo()

R version 3.4.1 (2017-06-30)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.1 LTS

Matrix products: default
BLAS: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/UpSetR/lib/R/lib/libRblas.so
LAPACK: /ebio/abt3_projects/software/miniconda3_gt4.4/envs/UpSetR/lib/R/lib/libRlapack.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_0.7.4     purrr_0.2.4     readr_1.1.1     tidyr_0.7.2    
[5] tibble_1.4.1    ggplot2_2.2.1   tidyverse_1.1.1

loaded via a namespace (and not attached):
 [1] Rcpp_0.12.14     cellranger_1.1.0 plyr_