<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Goal" data-toc-modified-id="Goal-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Goal</a></span></li><li><span><a href="#Var" data-toc-modified-id="Var-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Var</a></span></li><li><span><a href="#Init" data-toc-modified-id="Init-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Init</a></span></li><li><span><a href="#Load" data-toc-modified-id="Load-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load</a></span><ul class="toc-item"><li><span><a href="#Checks" data-toc-modified-id="Checks-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Checks</a></span></li></ul></li><li><span><a href="#Merging-&amp;-pruning-tree" data-toc-modified-id="Merging-&amp;-pruning-tree-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Merging &amp; pruning tree</a></span><ul class="toc-item"><li><span><a href="#Checks" data-toc-modified-id="Checks-5.1"><span class="toc-item-num">5.1&nbsp;&nbsp;</span>Checks</a></span></li><li><span><a href="#Writing-tree" data-toc-modified-id="Writing-tree-5.2"><span class="toc-item-num">5.2&nbsp;&nbsp;</span>Writing tree</a></span></li></ul></li><li><span><a href="#sessionInfo" data-toc-modified-id="sessionInfo-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>sessionInfo</a></span></li></ul></div>

# Goal

* Create phylogeny for all genome reps used for the Struo2 database
  * merging & filtering GTDB MLSA phylogenies

# Var

In [98]:
work_dir = '/ebio/abt3_projects/databases_no-backup/GTDB/release95/Struo/phylogeny/'

# species-rep genomes selected
genomes_file = file.path(dirname(work_dir),'metadata_1per-GTDB-Spec_gte50comp-lt5cont_wtaxID_wPath.tsv')

# trees from gtdb
arc_tree_file = '/ebio/abt3_projects/databases_no-backup/GTDB/release95/phylogeny/ar122_r95.tree'
bac_tree_file = '/ebio/abt3_projects/databases_no-backup/GTDB/release95/phylogeny/bac120_r95.tree'

# full gtdb metadata
gtdb_meta_dir = '/ebio/abt3_projects/databases_no-backup/GTDB/release95/metadata/'
gtdb_meta_arc_file = file.path(gtdb_meta_dir, 'ar122_metadata_r95.tsv')
gtdb_meta_bac_file = file.path(gtdb_meta_dir, 'bac120_metadata_r95.tsv')

# Init

In [99]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(tidytable)
library(ape)
library(LeyLabRMisc)

In [100]:
df.dims()

# Load

In [113]:
tax_levs = c('Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species')

In [125]:
# genomes used for struo
genomes = Fread(genomes_file) %>%
    select.(ncbi_organism_name, accession, gtdb_taxonomy) %>%
    separate.(gtdb_taxonomy, tax_levs, sep = ';') %>%
    mutate.(Species = gsub(' ', '_', Species))
genomes %>% unique_n('genomes', ncbi_organism_name)
genomes

No. of unique genomes: 30989 


ncbi_organism_name,accession,Domain,Phylum,Class,Order,Family,Genus,Species
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
GB_GCA_002789635.1_Candidatus Aenigmarchaeota archaeon CG_4_9_14_3_um_filter_37_18,GB_GCA_002789635.1,d__Archaea,p__Aenigmatarchaeota,c__Aenigmatarchaeia,o__CG10238-14,f__CG10238-14,g__CG10238-14,s__CG10238-14_sp002789635
GB_GCA_001784635.1_Candidatus Micrarchaeota archaeon RBG_16_49_10,GB_GCA_001784635.1,d__Archaea,p__Aenigmatarchaeota,c__Aenigmatarchaeia,o__CG10238-14,f__CG10238-14,g__RBG-16-49-10,s__RBG-16-49-10_sp001784635
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
RS_GCF_001644955.1_Pseudomonas putida,RS_GCF_001644955.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Pseudomonadales,f__Pseudomonadaceae,g__Pseudomonas_E,s__Pseudomonas_E_putida_C
RS_GCF_003233695.1_Lysobacter maris,RS_GCF_003233695.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Xanthomonadales,f__Xanthomonadaceae,g__Lysobacter,s__Lysobacter_maris


In [102]:
# arc tree
arc_tree = read.tree(arc_tree_file)
arc_tree


Phylogenetic tree with 1672 tips and 1671 internal nodes.

Tip labels:
	RS_GCF_002286985.1, RS_GCF_003781945.1, RS_GCF_000739595.1, RS_GCF_002727125.1, RS_GCF_000337915.1, RS_GCF_004114995.1, ...
Node labels:
	d__Archaea, 100, 74, 100, 100:p__Halobacteriota, 84, ...

Rooted; includes branch lengths.

In [103]:
# bac tree
bac_tree = read.tree(bac_tree_file)
bac_tree


Phylogenetic tree with 30238 tips and 30237 internal nodes.

Tip labels:
	RS_GCF_000783395.1, RS_GCF_900580865.1, RS_GCF_900582195.1, RS_GCF_005938625.1, GB_GCA_002754355.1, RS_GCF_003732425.1, ...
Node labels:
	100.0:d__Bacteria, 9, 0, 0, 0, 0, ...

Rooted; includes branch lengths.

In [117]:
# metadata: archaea
gtdb_meta_arc = Fread(gtdb_meta_arc_file) %>%
    select.(accession, gtdb_taxonomy) %>%
    filter.(accession %in% arc_tree$tip.label) %>%
    separate.(gtdb_taxonomy, tax_levs, sep = ';')
gtdb_meta_arc

accession,Domain,Phylum,Class,Order,Family,Genus,Species
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
GB_GCA_000007185.1,d__Archaea,p__Methanobacteriota,c__Methanopyri,o__Methanopyrales,f__Methanopyraceae,g__Methanopyrus,s__Methanopyrus kandleri
GB_GCA_000007345.1,d__Archaea,p__Halobacteriota,c__Methanosarcinia,o__Methanosarcinales,f__Methanosarcinaceae,g__Methanosarcina,s__Methanosarcina acetivorans
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GB_GCA_013330595.1,d__Archaea,p__Nanoarchaeota,c__Nanoarchaeia,o__Woesearchaeales,f__UBA9642,g__UBA9642,s__UBA9642 sp12459u
GB_GCA_013330385.1,d__Archaea,p__Thermoplasmatota,c__Poseidoniia,o__Poseidoniales,f__Thalassarchaeaceae,g__MGIIb-O1,s__MGIIb-O1 sp12570u


In [118]:
# metadata: bacteria
gtdb_meta_bac = Fread(gtdb_meta_bac_file) %>%
    select.(accession, gtdb_taxonomy) %>%
    filter.(accession %in% bac_tree$tip.label) %>%
    separate.(gtdb_taxonomy, tax_levs, sep = ';')
gtdb_meta_bac

accession,Domain,Phylum,Class,Order,Family,Genus,Species
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
GB_GCA_000010565.1,d__Bacteria,p__Firmicutes_B,c__Desulfotomaculia,o__Desulfotomaculales,f__Pelotomaculaceae,g__Pelotomaculum,s__Pelotomaculum thermopropionicum
GB_GCA_000018565.1,d__Bacteria,p__Chloroflexota,c__Chloroflexia,o__Chloroflexales,f__Herpetosiphonaceae,g__Herpetosiphon,s__Herpetosiphon aurantiacus
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
RS_GCF_902159415.1,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Streptococcaceae,g__Streptococcus,s__Streptococcus mitis_BG
RS_GCF_902167325.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Chitinophagales,f__Chitinophagaceae,g__Chitinophaga,s__Chitinophaga pinensis_A


In [119]:
# combined
gtdb_meta = rbind(gtdb_meta_arc, gtdb_meta_bac)
gtdb_meta_arc = gtdb_meta_bac = NULL
gtdb_meta

accession,Domain,Phylum,Class,Order,Family,Genus,Species
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
GB_GCA_000007185.1,d__Archaea,p__Methanobacteriota,c__Methanopyri,o__Methanopyrales,f__Methanopyraceae,g__Methanopyrus,s__Methanopyrus kandleri
GB_GCA_000007345.1,d__Archaea,p__Halobacteriota,c__Methanosarcinia,o__Methanosarcinales,f__Methanosarcinaceae,g__Methanosarcina,s__Methanosarcina acetivorans
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
RS_GCF_902159415.1,d__Bacteria,p__Firmicutes,c__Bacilli,o__Lactobacillales,f__Streptococcaceae,g__Streptococcus,s__Streptococcus mitis_BG
RS_GCF_902167325.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Chitinophagales,f__Chitinophagaceae,g__Chitinophaga,s__Chitinophaga pinensis_A


## Checks

In [120]:
arc_tree$edge.length %>% summary
bac_tree$edge.length %>% summary

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.02860 0.06234 0.13371 0.14856 1.98299 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.01200 0.03000 0.05857 0.07000 1.35600 

# Merging & pruning tree

In [121]:
# binding trees at root
tree = ape::bind.tree(arc_tree, bac_tree)
tree


Phylogenetic tree with 31910 tips and 31907 internal nodes.

Tip labels:
	RS_GCF_002286985.1, RS_GCF_003781945.1, RS_GCF_000739595.1, RS_GCF_002727125.1, RS_GCF_000337915.1, RS_GCF_004114995.1, ...
Node labels:
	d__Archaea, 100, 74, 100, 100:p__Halobacteriota, 84, ...

Unrooted; includes branch lengths.

In [122]:
# renaming as species
idx = gtdb_meta %>%
    filter.(accession %in% tree$tip.label) %>%
    select.(accession, Species) %>%
    mutate.(Species = gsub(' ', '_', Species)) %>%
    as.data.frame
rownames(idx) = idx$accession
tree$tip.label = idx[tree$tip.label,'Species']
tree


Phylogenetic tree with 31910 tips and 31907 internal nodes.

Tip labels:
	s__Halorubrum_sp002286985, s__Halorubrum_sp003781945, s__Halorubrum_halophilum, s__Halorubrum_persicum, s__Halorubrum_saccharovorum, s__Halorubrum_sp004114995, ...
Node labels:
	d__Archaea, 100, 74, 100, 100:p__Halobacteriota, 84, ...

Unrooted; includes branch lengths.

In [126]:
# checking overlap
overlap(genomes$Species, tree$tip.label)

intersect(x,y): 30989 
setdiff(x,y): 0 
setdiff(y,x): 921 
union(x,y): 31910 


In [130]:
# purning
to_rm = setdiff(tree$tip.label, genomes$Species)
to_rm %>% length
tree_f = ape::drop.tip(tree, to_rm)
tree_f


Phylogenetic tree with 30989 tips and 30986 internal nodes.

Tip labels:
	s__Halorubrum_sp002286985, s__Halorubrum_sp003781945, s__Halorubrum_halophilum, s__Halorubrum_persicum, s__Halorubrum_saccharovorum, s__Halorubrum_sp004114995, ...
Node labels:
	d__Archaea, 100, 74, 100, 100:p__Halobacteriota, 84, ...

Unrooted; includes branch lengths.

## Checks

In [131]:
# branch lengths
tree_f$edge.length %>% summary

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.01200 0.03100 0.06276 0.07400 1.98299 

In [133]:
# checking overlap
overlap(genomes$Species, tree_f$tip.label)

intersect(x,y): 30989 
setdiff(x,y): 0 
setdiff(y,x): 0 
union(x,y): 30989 


## Writing tree

In [134]:
F = file.path(work_dir, 'ar122-bac120_r89_1per-GTDB-Spec_gte50comp-lt5cont.nwk')
write.tree(tree_f, F)
cat('File writen:', F, '\n')

File writen: /ebio/abt3_projects/databases_no-backup/GTDB/release95/Struo/phylogeny//ar122-bac120_r89_1per-GTDB-Spec_gte50comp-lt5cont.nwk 


# sessionInfo

In [62]:
sessionInfo()

R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Georg_animal_feces/envs/phyloseq/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] ape_5.4           LeyLabRMisc_0.1.6 tidytable_0.4.1   data.table_1.12.8
[5] ggplot2_3.3.1     tidyr_1.1.0       dplyr_1.0.0      

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6     pillar_1.4.4     compiler_3.6.3   base64enc_0.1-3 
 [5] tools_3.6.3      digest_0.6