<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Goal" data-toc-modified-id="Goal-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Goal</a></span></li><li><span><a href="#Var" data-toc-modified-id="Var-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Var</a></span></li><li><span><a href="#Init" data-toc-modified-id="Init-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Init</a></span></li><li><span><a href="#Load" data-toc-modified-id="Load-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Load</a></span></li><li><span><a href="#Summarizing" data-toc-modified-id="Summarizing-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Summarizing</a></span></li><li><span><a href="#sessionInfo" data-toc-modified-id="sessionInfo-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>sessionInfo</a></span></li></ul></div>

# Goal

* Summarize GTDB metadata 
* The summary will guide the selection of genomes for the `llmgp-db` pipeline

# Var

In [1]:
# Locations of the GTDB release 86 metadata tables (one "bac", one "arc" file)
bac_metadata_url <- 'https://data.ace.uq.edu.au/public/gtdb/release86/bac_metadata_r86.tsv'
arc_metadata_url <- 'https://data.ace.uq.edu.au/public/gtdb/release86/arc_metadata_r86.tsv'

# Init

In [2]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last



# Load

In [3]:
# Fetch each GTDB metadata table and stack them into a single table
metadata_urls <- c(bac_metadata_url, arc_metadata_url)

df <- do.call(
    rbind,
    lapply(
        setNames(metadata_urls, metadata_urls),  # list entries are keyed by URL
        function(src) {
            # progress note goes to stderr rather than stdout
            write(sprintf('Reading in file: %s', src), stderr())
            # tab-separated input; check.names=TRUE requests syntactic column names
            fread(src, sep='\t', check.names=TRUE)
        }
    )
)

# total row count, followed by a preview of the first rows
print(nrow(df))
head(df)

[1] 127318


accession,scaffold_count,gc_count,longest_scaffold,gc_percentage,total_gap_length,genome_size,n50_contigs,n50_scaffolds,l50_scaffolds,⋯,ssu_silva_blast_evalue,ssu_silva_blast_perc_identity,lsu_5s_query_id,lsu_5s_length,lsu_5s_contig_len,gtdb_taxonomy,mimag_high_quality,gtdb_cluster_size,gtdb_clustered_genomes,gtdb_type_material
RS_GCF_001245025.1,157,2457823,329958,52.25851,0,4703201,115698,115698,13,⋯,0,100.0,NZ_LHKW01000019.1,110,45304,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Salmonella;s__Salmonella enterica,f,none,none,f
RS_GCF_000678935.1,4,2880443,3928455,65.59958,6678,4397625,1025092,3928455,1,⋯,0,100.0,none,none,none,d__Bacteria;p__Actinobacteriota;c__Actinobacteria;o__Corynebacteriales;f__Corynebacteriaceae;g__Mycobacterium;s__Mycobacterium tuberculosis,f,none,none,f
RS_GCF_000020485.1,1,976684,2578146,37.88319,0,2578146,2578146,2578146,1,⋯,0,100.0,NC_011899.1,110,2578146,d__Bacteria;p__Firmicutes_F;c__Halanaerobiia;o__Halanaerobiales;f__Halothermotrichaceae;g__Halothermothrix;s__Halothermothrix orenii,t,1,RS_GCF_000020485.1,t
RS_GCF_001206855.1,75,837175,119381,39.51367,35,2118732,64912,64913,13,⋯,0,99.935,NZ_CRLU01000016.1,108,45341,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__Streptococcus pneumoniae,t,none,none,f
RS_GCF_001085945.1,76,813113,115001,39.4667,101,2060352,47679,47680,14,⋯,0,99.935,NZ_CKTI01000046.1,108,14629,d__Bacteria;p__Firmicutes;c__Bacilli;o__Lactobacillales;f__Streptococcaceae;g__Streptococcus;s__Streptococcus pneumoniae,t,none,none,f
RS_GCF_000659565.1,4,2886094,3927593,65.58749,5261,4405634,762837,3927593,1,⋯,0,100.0,none,none,none,d__Bacteria;p__Actinobacteriota;c__Actinobacteria;o__Corynebacteriales;f__Corynebacteriaceae;g__Mycobacterium;s__Mycobacterium tuberculosis,f,none,none,f


In [5]:
# List every column available in the combined metadata table
colnames(df)

# Summarizing

In [4]:
# How many rows are flagged as GTDB representatives ('t' in gtdb_representative)
nrow(filter(df, gtdb_representative == 't'))

In [10]:
# Count representative genomes for each distinct gtdb_taxonomy string
# NOTE(review): grouping is on the full taxonomy string, so entries whose
# species field is blank ("s__") within one genus are counted together --
# confirm that this is the intended definition of "per species".
df_s <- filter(df, gtdb_representative == 't') %>%
    group_by(gtdb_taxonomy) %>%
    summarize(n = n()) %>%
    ungroup()

# distribution of the per-taxonomy representative counts
print(summary(df_s$n))

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   1.000   1.000   1.778   2.000 218.000 


In [12]:
# Preview (first 30 rows) of the taxonomy strings having two or more representatives
head(filter(df_s, n >= 2), n = 30)

gtdb_taxonomy,n
d__Archaea;p__Altiarchaeota;c__Altiarchaeia;o__Altiarchaeales;f__Altiarchaeaceae;g__Altiarchaeum;s__Altiarchaeum sp1,8
d__Archaea;p__Altiarchaeota;c__Altiarchaeia;o__IMC4;f__WOR-SM1-SCG;g__WOR-SM1-SCG;s__,2
d__Archaea;p__Asgardarchaeota;c__Heimdallarchaeia;o__UBA460;f__UBA460;g__UBA460;s__,2
d__Archaea;p__Asgardarchaeota;c__Lokiarchaeia;o__Thorarchaeales;f__Thorarchaeaceae;g__SMTZ1-45;s__,2
d__Archaea;p__Crenarchaeota;c__Bathyarchaeia;o__40CM-2-53-6;f__40CM-2-53-6;g__40CM-2-53-6;s__,4
d__Archaea;p__Crenarchaeota;c__Bathyarchaeia;o__40CM-2-53-6;f__40CM-2-53-6;g__40CM-2-53-6;s__40CM-2-53-6 sp,3
d__Archaea;p__Crenarchaeota;c__Bathyarchaeia;o__B26-1;f__B26-1;g__B26-1;s__,3
d__Archaea;p__Crenarchaeota;c__Bathyarchaeia;o__B26-1;f__UBA233;g__UBA11855;s__UBA11855 sp1,2
d__Archaea;p__Crenarchaeota;c__Bathyarchaeia;o__B26-1;f__UBA233;g__UBA233;s__,2
d__Archaea;p__Crenarchaeota;c__Bathyarchaeia;o__TCS64;f__TCS64;g__RBG-16-57-9;s__RBG-16-57-9 sp1,2


# sessionInfo

In [None]:
# Report the R version, platform and attached/loaded packages for this session
sessionInfo()