<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Goal" data-toc-modified-id="Goal-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Goal</a></span></li><li><span><a href="#Var" data-toc-modified-id="Var-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Var</a></span></li><li><span><a href="#Init" data-toc-modified-id="Init-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Init</a></span></li><li><span><a href="#Load" data-toc-modified-id="Load-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Filtering-SSU-sequences" data-toc-modified-id="Filtering-SSU-sequences-4.0.1"><span class="toc-item-num">4.0.1&nbsp;&nbsp;</span>Filtering SSU sequences</a></span></li></ul></li></ul></li><li><span><a href="#STOP-HERE" data-toc-modified-id="STOP-HERE-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>STOP HERE</a></span></li><li><span><a href="#SessionInfo" data-toc-modified-id="SessionInfo-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>SessionInfo</a></span></li></ul></div>

# Goal

* Infer a phylogeny for all SSU sequences in the GTDB that overlap with the genomes used

# Var

In [10]:
# GTDBr89 metadata
meta_file = '/ebio/abt3_projects/databases_no-backup/GTDB/release89/LLMGP-DB/metadata_1per-GTDB-Spec_gte50comp-lt5cont_wtaxID_wPath.tsv'

# ssu seqs
ssu_dir = '/ebio/abt3_projects/databases_no-backup/GTDB/release89/ssu/'

# params
threads = 8

# Init

In [3]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(tidytable)
library(ape)
library(LeyLabRMisc)

In [7]:
df.dims()
setDTthreads(threads)

# Load

In [20]:
tax_levs = c('Domain', 'Class', 'Order', 'Family', 'Genus', 'Species', 'Strain')
meta = Fread(meta_file) %>%
    dt_separate(gtdb_taxonomy, tax_levs, sep=';')
meta

ncbi_organism_name,accession,ambiguous_bases,checkm_completeness,checkm_contamination,checkm_marker_count,checkm_marker_lineage,checkm_marker_set_count,checkm_strain_heterogeneity,coding_bases,⋯,SPECIES,gtdb_taxid,fasta_file_path,Domain,Class,Order,Family,Genus,Species,Strain
<chr>,<chr>,<int>,<dbl>,<dbl>,<int>,<chr>,<int>,<dbl>,<int>,⋯,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
GB_GCA_002789635.1_Candidatus Aenigmarchaeota archaeon CG_4_9_14_3_um_filter_37_18,GB_GCA_002789635.1,0,79.44,0,149,k__Archaea (UID2),107,0,794970,⋯,CG10238-14 sp002789635,599,/ebio/abt3_projects/databases_no-backup/GTDB/release89/LLMGP-DB/genomes/genbank/archaea/GCA_002789635.1/GCA_002789635.1_ASM278963v1_genomic.fna.gz,d__Archaea,p__Aenigmarchaeota,c__Aenigmarchaeia,o__CG10238-14,f__CG10238-14,g__CG10238-14,s__CG10238-14 sp002789635
GB_GCA_001784635.1_Candidatus Micrarchaeota archaeon RBG_16_49_10,GB_GCA_001784635.1,0,74.92,0,149,k__Archaea (UID2),107,0,773516,⋯,RBG-16-49-10 sp001784635,3320,/ebio/abt3_projects/databases_no-backup/GTDB/release89/LLMGP-DB/genomes/genbank/archaea/GCA_001784635.1/GCA_001784635.1_ASM178463v1_genomic.fna.gz,d__Archaea,p__Aenigmarchaeota,c__Aenigmarchaeia,o__CG10238-14,f__CG10238-14,g__RBG-16-49-10,s__RBG-16-49-10 sp001784635
⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋱,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮,⋮
GB_GCA_001775395.1_candidate division Zixibacteria bacterium RBG_16_53_22,GB_GCA_001775395.1,0,94.51,3.48,147,k__Bacteria (UID2495),91,0,3439424,⋯,UBA10806 sp001775395,146758,/ebio/abt3_projects/databases_no-backup/GTDB/release89/LLMGP-DB/genomes/genbank/bacteria/GCA_001775395.1/GCA_001775395.1_ASM177539v1_genomic.fna.gz,d__Bacteria,p__Zixibacteria,c__MSB-5A5,o__UBA10806,f__UBA10806,g__UBA10806,s__UBA10806 sp001775395
GB_GCA_002791595.1_candidate division Zixibacteria bacterium CG_4_9_14_3_um_filter_46_8,GB_GCA_002791595.1,0,92.06,2.75,147,k__Bacteria (UID2495),91,0,2413971,⋯,UM-FILTER-46-8 sp002791595,148815,/ebio/abt3_projects/databases_no-backup/GTDB/release89/LLMGP-DB/genomes/genbank/bacteria/GCA_002791595.1/GCA_002791595.1_ASM279159v1_genomic.fna.gz,d__Bacteria,p__Zixibacteria,c__MSB-5A5,o__UM-FILTER-46-8,f__UM-FILTER-46-8,g__UM-FILTER-46-8,s__UM-FILTER-46-8 sp002791595


In [21]:
# summary
meta %>% .$Domain %>% table

.
 d__Archaea d__Bacteria 
       1155       22205 

In [13]:
# Writing list of genomes
meta %>%
    dt_select(accession) %>%
    write_table(file.path(ssu_dir, 'GTDBr89_struo_genomes.txt'), col.names=FALSE)

File written: /ebio/abt3_projects/databases_no-backup/GTDB/release89/ssu//GTDBr89_struo_genomes.txt 


### Filtering SSU sequences

```
$ grep -f GTDBr89_struo_genomes.txt -A 1 ar122_ssu_r89.fna | perl -ne 'print if !/^--/' > ar122_ssu_r89_filt.fna

$ grep -f GTDBr89_struo_genomes.txt -A 1 bac120_ssu_r89.fna | perl -ne 'print if !/^--/' > bac120_ssu_r89_filt.fna
```

# STOP HERE

* Not all genomes have >=1 SSU sequence. Cannot make a phylogeny representing all members

# SessionInfo

In [22]:
sessionInfo()

R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: Ubuntu 18.04.5 LTS

Matrix products: default
BLAS/LAPACK: /ebio/abt3_projects/Georg_animal_feces/envs/phyloseq/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] LeyLabRMisc_0.1.6 ape_5.4           tidytable_0.4.1   data.table_1.12.8
[5] ggplot2_3.3.1     tidyr_1.1.0       dplyr_1.0.0      

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6     pillar_1.4.4     compiler_3.6.3   base64enc_0.1-3 
 [5] tools_3.6.3      digest_0.6