# Goal
Jacobo de la Cuesta-Zuluaga, July 2018.

Previously, I downloaded all the available *Methanomassiliicoccales* assemblies from NCBI, and ran `CheckM` to determine which were considered at least *substantially complete genomes* (>= 70%) *with low contamination* (<= 5%). I started doing several genomic and phylogenomic comparisons between the available genomes and the draft of *Ca.* Methanomethylophilus alvus RL001, but for the analysis of phylogenetic signal I require some metadata of the available genomes, in particular the source from which each genome was obtained.

# Var 

In [196]:
# Directory containing the assembly metadata tables read and written below
metadata_folder <- "/ebio/abt3_projects/vadinCA11/data/V11/assemblies_metadata"

# Init

In [197]:
library(dplyr)

# Create metadata table

## CheckM genome quality data

Load table with organism name and NCBI assembly accession

In [198]:
# Load the table that pairs each organism label (`name`) with the path to its
# contigs database (`contigs_db_path`).
label_table <- read.table(
  "/ebio/abt3_projects/vadinCA11/data/V11/anvio_output/external_genomes.txt",
  header = TRUE, sep = "\t", as.is = TRUE
)

# Reduce each database path to its file name without the trailing ".db".
# sub() is vectorised and returns exactly one value per path, so the column
# keeps its length even when ".db" occurs more than once in a file name
# (splitting on ".db" and flattening would produce extra elements there).
label_table$contigs_db_path <- sub(
  "\\.db$", "", basename(label_table$contigs_db_path)
)


Generate table with CheckM data, including GC content, number of genes and genome length

In [199]:
# Load CheckM metadata. The first column supplies the row names (one row per
# statistic) and the remaining columns are assemblies. comment.char is
# disabled because one of the row names used below ("# predicted genes")
# starts with "#".
checkm_metadata <- read.table(
  "/ebio/abt3_projects/vadinCA11/data/V11/genomes/included_stats.txt",
  sep = "\t", dec = ".", header = TRUE, row.names = 1,
  as.is = TRUE, comment.char = ""
)

# Change assembly names: keep only the text before the first "." that is
# followed by a digit (e.g. a trailing ".1"). sub() returns exactly one name
# per column, so the assignment cannot fail on a length mismatch the way a
# flattened strsplit() result can (e.g. for a ".10" suffix).
colnames(checkm_metadata) <- sub(
  "\\.[0-9].*$", "", colnames(checkm_metadata), perl = TRUE
)

# Extract one statistic (a row of the CheckM table) as a plain numeric vector
# with one value per assembly. Each cell is converted on its own, so columns
# of different types do not coerce one another.
checkm_stat_numeric <- function(stat) {
  vapply(checkm_metadata[stat, ], as.numeric, numeric(1), USE.NAMES = FALSE)
}

# Create a data frame with GC content, genome length and number of genes
Assembly <- as.character(colnames(checkm_metadata))
GC <- checkm_stat_numeric("GC")
Len <- checkm_stat_numeric("Genome size")
Genes <- checkm_stat_numeric("# predicted genes")
methanomassilii_data <- data.frame(Assembly, GC = GC, Len = Len,
                                   Genes, stringsAsFactors = FALSE)

# Put the rows in the order of label_table and name them after the organisms.
# match() returns NA for a genome that has no CheckM column, which would
# silently create an all-NA row, so report such genomes explicitly.
assembly_rows <- match(label_table$contigs_db_path,
                       methanomassilii_data$Assembly)
missing_assemblies <- label_table$contigs_db_path[is.na(assembly_rows)]
if (length(missing_assemblies) > 0) {
  warning("No CheckM statistics found for: ",
          paste(missing_assemblies, collapse = ", "), call. = FALSE)
}
methanomassilii_data <- methanomassilii_data[assembly_rows, ]
row.names(methanomassilii_data) <- label_table$name

In [200]:
# Preview the first rows of the genome table
head(methanomassilii_data)

Unnamed: 0,Assembly,GC,Len,Genes
Methanomassiliicoccus_luminyensis_B10,GCA_000308215,0.6047886,2620233,2607
Candidatus_Methanomethylophilus_alvus_Mx1201,GCA_000300255,0.5559482,1666795,1636
Candidatus_Methanomassiliicoccus_intestinalis_Issoire_Mx1,GCA_000404225,0.4125543,1931651,1855
Candidatus_Methanoplasma_termitum,GCA_000800805,0.492057,1488669,1419
Candidatus_Methanomethylophilus_sp_1R26,GCA_001481295,0.6039454,1723106,2079
Methanomassiliicoccales_archaeon_RumEn_M2,GCA_001421175,0.5460069,1280797,1497


## Source of microorganism

In [201]:
# Add the three annotation columns, each filled with NA for every genome;
# they are populated in later steps
for (new_column in c("source_1", "source_2", "location")) {
  methanomassilii_data[[new_column]] <- rep(NA, nrow(methanomassilii_data))
}

### UBA genomes
These genomes were reported by Parks et al. and the source can be found in tables S1 and S2 of the paper

In [202]:
# Extract UBA codes from organism names
# Genomes whose row name contains "UBA"
UBA_genomes <- grep("UBA", rownames(methanomassilii_data),
                    fixed = TRUE, value = TRUE)

# Take exactly one code per genome: the first "_"-delimited token of the name
# that contains "UBA". Working per genome keeps UBA_codes the same length as
# UBA_genomes even if a name has several such tokens, which would otherwise
# make the data.frame() call below fail on differing lengths.
UBA_codes <- vapply(
  strsplit(UBA_genomes, "_", fixed = TRUE),
  function(tokens) tokens[grepl("UBA", tokens, fixed = TRUE)][1],
  character(1),
  USE.NAMES = FALSE
)

# Lookup table: organism name <-> UBA code (kept as character, not factor)
UBA_table <- data.frame(name = UBA_genomes, UBA = UBA_codes,
                        stringsAsFactors = FALSE)
UBA_codes

In [203]:
# Load tables from Parks et al.
# Table S1: tab-separated, read without quote handling; empty fields become NA
# and short rows are padded (fill = TRUE)
parks_s1 <- read.table(
  file = file.path(metadata_folder, "Parks_S1.txt"),
  header = TRUE, sep = "\t", quote = "", dec = ",",
  na.strings = "", fill = TRUE
)

head(parks_s1)

SRA.Experiment.Accession,Experiment.Title,Library.Source,Sample.Attribute,Study.Accession,Study.Title
DRX003681,Seabass metagenomes as control2,METAGENOMIC,,DRP000969,Asian seabass metagenomes in response to different stressors
DRX003682,Seabass metagenomes challenged with fast condition,METAGENOMIC,,DRP000969,Asian seabass metagenomes in response to different stressors
DRX011531,High-throughput sequencing of the metagenome extracted from AM-anode biofilm,METAGENOMIC,sample_name: DRS011376 || sample comment: Anode-biofilm in acetate-fed MFC || BioSampleModel: Generic,DRP001053,Comparative metagenome analyses of anode-associated microbial communities developed in rice paddy field-soil microbial fuel cells
DRX011532,High-throughput sequencing of the metagenome extracted from GM-anode biofilm,METAGENOMIC,sample_name: DRS011377 || sample comment: Anode-biofilm in glucose-fed MFC || BioSampleModel: Generic,DRP001053,Comparative metagenome analyses of anode-associated microbial communities developed in rice paddy field-soil microbial fuel cells
DRX012718,Metagenomic sequencing of anode biofilm,METAGENOMIC,sample_name: DRS012489 || sample comment: Total genomic DNA extracted from the anode biofilm in the methanol-fed MFC || BioSampleModel: Generic,DRP001235,Metagenomic analyses of microbial communities generating electricity from methanol
ERX1064483,Illumina HiSeq 2500 paired end sequencing,METAGENOMIC,,ERP011577,The rumen microbial metagenome associated with high methane production in cattle


In [204]:
# Table S2: read with the same settings as table S1 (tab-separated, no quote
# handling, empty fields as NA, short rows padded)
parks_s2 <- read.table(
  file = file.path(metadata_folder, "Parks_S2.txt"),
  header = TRUE, sep = "\t", quote = "", dec = ",",
  na.strings = "", fill = TRUE
)
head(parks_s2)

DDBJ.ENA.GenBank.Accession,UBA.Genome.ID,SRA.Bin.ID,Genome.Quality,CheckM.Completeness,CheckM.Contamination,CheckM.Strain.Heterogeneity,NCBI.Organism.Name
DAQG00000000,UBA1,SRX993396,Near complete,98.85,1.32,40,Methanosaeta harundinacea UBA1 [species]
DAQH00000000,UBA2,SRX993396,Medium,89.22,0.0,0,Methanomicrobiaceae archaeon UBA2 [family]
DAMY00000000,UBA3,ERX556017,Partial,52.1,0.0,0,Euryarchaeota archaeon UBA3 [phylum]
DAFQ00000000,UBA4,ERX556017,Partial,55.47,0.0,0,Euryarchaeota archaeon UBA4 [phylum]
DAVG00000000,UBA5,DRX011531,Near complete,98.69,0.33,0,Methanosarcina sp. UBA5 [genus]
DAVH00000000,UBA6,DRX011531,Near complete,97.31,0.81,0,Methanomassiliicoccus sp. UBA6 [genus]


In [205]:
# Keep only the rows of table S2 whose UBA genome ID is one of the
# Methanomassiliicoccales UBA codes extracted above
UBA_bin_ID <- parks_s2[is.element(parks_s2$UBA.Genome.ID, UBA_codes), ]
head(UBA_bin_ID)

Unnamed: 0,DDBJ.ENA.GenBank.Accession,UBA.Genome.ID,SRA.Bin.ID,Genome.Quality,CheckM.Completeness,CheckM.Contamination,CheckM.Strain.Heterogeneity,NCBI.Organism.Name
6,DAVH00000000,UBA6,DRX011531,Near complete,97.31,0.81,0,Methanomassiliicoccus sp. UBA6 [genus]
48,DACS00000000,UBA48,SRX834663,Medium,85.89,0.94,50,Methanomassiliicoccaceae archaeon UBA48 [family]
71,DARO00000000,UBA71,ERX250271,Near complete,97.98,0.0,0,Methanomassiliicoccaceae archaeon UBA71 [family]
72,DAVO00000000,UBA72,ERX250271,Near complete,93.15,0.0,0,Methanomassiliicoccaceae archaeon UBA72 [family]
75,DASL00000000,UBA75,ERX250271,Near complete,96.91,0.0,0,Methanomassiliicoccaceae archaeon UBA75 [family]
78,DARY00000000,UBA78,ERX250271,Medium,88.38,0.94,0,Candidatus Methanomethylophilus sp. UBA78 [genus]


In [206]:
# For every selected genome, look up its SRA bin ID (table S2) among the
# experiment accessions of table S1 to obtain the source of the genome
s1_rows <- match(UBA_bin_ID$SRA.Bin.ID, parks_s1$SRA.Experiment.Accession)
UBA_source <- parks_s1[s1_rows, ]
# Keep the UBA genome ID next to the experiment/study description
UBA_source[["UBA"]] <- UBA_bin_ID$UBA.Genome.ID
UBA_source

Unnamed: 0,SRA.Experiment.Accession,Experiment.Title,Library.Source,Sample.Attribute,Study.Accession,Study.Title,UBA
3.0,DRX011531,High-throughput sequencing of the metagenome extracted from AM-anode biofilm,METAGENOMIC,sample_name: DRS011376 || sample comment: Anode-biofilm in acetate-fed MFC || BioSampleModel: Generic,DRP001053,Comparative metagenome analyses of anode-associated microbial communities developed in rice paddy field-soil microbial fuel cells,UBA6
1491.0,SRX834663,shotgun metagenomic sequencing of fecal sample from an adult baboon: Sample F29,METAGENOMIC,collection_date: 30-Jul-2012 || env_biome: mammalia-associated habitat || env_feature: mammalia-associated habitat || env_material: feces || geo_loc_name: Kenya || host: Papio cynocephalus || lat_lon: 2.717 S 37.1 E || host_age: 10.38 || host_body_product,SRP051834,Baboon feces Metagenome,UBA48
26.0,ERX250271,,METAGENOMIC,Failed to decode,ERP002363,Metagenomic study of the microbial genes abundance in treated palm oil mill effluent,UBA71
26.1,ERX250271,,METAGENOMIC,Failed to decode,ERP002363,Metagenomic study of the microbial genes abundance in treated palm oil mill effluent,UBA72
26.2,ERX250271,,METAGENOMIC,Failed to decode,ERP002363,Metagenomic study of the microbial genes abundance in treated palm oil mill effluent,UBA75
26.3,ERX250271,,METAGENOMIC,Failed to decode,ERP002363,Metagenomic study of the microbial genes abundance in treated palm oil mill effluent,UBA78
774.0,SRX498681,Metagenome raw reads obatined from feces of a six-years old elephant,METAGENOMIC,"env_package: MIGS/MIMS/MIMARKS.host-associated || investigation_type: metagenome || biome: Zoo || collection_date: Jun-2009 || feature: Zoo || geo_loc_name: Zoo of Hamburg || lat_lon: 53.596582, 9.938249 || material: feces || host: Elephas maximus indicus",SRP040073,Elephant feces Metagenome,UBA113
688.0,SRX327722,2012TP6_6m: suncor tailing pond 6 meters illumina sample,METAGENOMIC,"env_package: MIGS/MIMS/MIMARKS.microbial || investigation_type: metagenome || biome: Tailling pond || collection_date: 2012-07-01 || feature: Suncor Tailings Pond 6 || geo_loc_name: Suncor tailings pond 6 || lat_lon: 57.02, -111.55 || material: waste wate",SRP017582,Hydrocarbon Metagenome,UBA147
15.0,ERX1109758,Illumina HiSeq 2000 paired end sequencing,METAGENOMIC,,ERP012237,The Illumina HiSeq 2000 platform was used to sequence two metagenomes from two Danish biogas plants.,UBA238
633.0,SRX211001,NapDC illumina metagenome: Naphthaline degrading microbial community from Northern Alberta Oil Sands,METAGENOMIC,"env_package: MIGS/MIMS/MIMARKS.wastewater || investigation_type: metagenome || biome: Tailling pond || collection_date: 2011-07-21 || feature: Tailings Pond || geo_loc_name: Syncrude tailings pond, Wood Buffalo, Alberta, Canada || lat_lon: 57.02, -111.55",SRP017582,Hydrocarbon Metagenome,UBA248


In [207]:
# Match table with species name with source of genome: reorder the source rows
# to follow UBA_table, attach the organism name, and find the row of each
# organism in the genome table
UBA_source <- UBA_source[match(UBA_table$UBA, UBA_source$UBA), ]
UBA_source$name <- UBA_table$name
UBA_positions <- match(UBA_source$name, rownames(methanomassilii_data))

# Add source data to the Methanomassiliicoccales table.
# Vectorised assignment replaces an index loop over 1:length(UBA_positions),
# which also iterates (over 1 and 0) when there are no UBA genomes. Names that
# are not found in the genome table (NA position) are skipped.
matched <- !is.na(UBA_positions)
methanomassilii_data$source_1[UBA_positions[matched]] <-
  as.character(UBA_source$Sample.Attribute[matched])
methanomassilii_data$source_2[UBA_positions[matched]] <-
  as.character(UBA_source$Study.Title[matched])

In [208]:
# Preview the first 15 genomes, now including the source columns
head(methanomassilii_data, n = 15)

Unnamed: 0,Assembly,GC,Len,Genes,source_1,source_2,location
Methanomassiliicoccus_luminyensis_B10,GCA_000308215,0.6047886,2620233,2607,,,
Candidatus_Methanomethylophilus_alvus_Mx1201,GCA_000300255,0.5559482,1666795,1636,,,
Candidatus_Methanomassiliicoccus_intestinalis_Issoire_Mx1,GCA_000404225,0.4125543,1931651,1855,,,
Candidatus_Methanoplasma_termitum,GCA_000800805,0.492057,1488669,1419,,,
Candidatus_Methanomethylophilus_sp_1R26,GCA_001481295,0.6039454,1723106,2079,,,
Methanomassiliicoccales_archaeon_RumEn_M2,GCA_001421175,0.5460069,1280797,1497,,,
Methanomassiliicoccales_archaeon_RumEn_M1,GCA_001421185,0.620941,2121026,2515,,,
uncultured_Candidatus_Methanomethylophilus_sp_RUG779,GCA_900313075,0.6168057,1262884,1374,,,
uncultured_Candidatus_Methanomethylophilus_sp_hRUG898,GCA_900314325,0.6102892,1388681,1421,,,
Methanomassiliicoccaceae_archaeon_UBA409,GCA_002494585,0.5576718,1292133,1369,collection_date: 2013-05-04 || depth: 0.01m || elev: 400m || env_biome: anthropogenic terrestrial biome || env_feature: soil || env_material: mud || geo_loc_name: China: Chengdu || lat_lon: 30.65 N 104.06 E || label: Meta-7-1-30-B || BioSampleModel: MIMS.,Pit mud of Chinese liqour fermentation reactorMetagenome,


In [209]:
# Write the metadata table (row names included) as unquoted, tab-separated text
raw_metadata <- file.path(metadata_folder, "assemblies_metadata_raw.txt")
write.table(x = methanomassilii_data, file = raw_metadata,
            quote = FALSE, sep = "\t", dec = ".")

### Other genomes
The final format of the metadata table, as well as the information of the remaining genomes, will be organized manually

In [213]:
# Show the source record stored for genome UBA6
UBA_source[UBA_source[["UBA"]] == "UBA6", ]

Unnamed: 0,SRA.Experiment.Accession,Experiment.Title,Library.Source,Sample.Attribute,Study.Accession,Study.Title,UBA,name
3,DRX011531,High-throughput sequencing of the metagenome extracted from AM-anode biofilm,METAGENOMIC,sample_name: DRS011376 || sample comment: Anode-biofilm in acetate-fed MFC || BioSampleModel: Generic,DRP001053,Comparative metagenome analyses of anode-associated microbial communities developed in rice paddy field-soil microbial fuel cells,UBA6,Methanomassiliicoccus_sp_UBA6
