Revise scripts for processing reference databases

jedick · Jan 4, 2024 · 059c5e9 · 059c5e9
1 parent 7c02402
commit 059c5e9
Show file tree

Hide file tree

Showing 7 changed files with 30 additions and 16 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
-Date: 2023-12-29
+Date: 2024-01-04
 Package: chem16S
-Version: 1.0.0-10
+Version: 1.0.0-11
 Title: Chemical Metrics of Community Reference Proteomes
 Authors@R: c(
     person("Jeffrey", "Dick", , "j3ffdick@gmail.com", role = c("aut", "cre"),

diff --git a/inst/NEWS b/inst/NEWS
@@ -1,5 +1,8 @@
-CHANGES IN chem16S 1.0.0-1 (2023-07-23)
--------------------------------------
+CHANGES IN chem16S 1.0.0-10 (2023-12-29)
+----------------------------------------
+
+- Normalize file names in extdata/RefSeq and extdata/GTDB for easier
+  programmatic access.
 
 - Add 'xvar' and 'yvar' arguments to plot_metrics().
 

diff --git a/inst/extdata/GTDB/README.txt b/inst/extdata/GTDB/README.txt
@@ -1,4 +1,4 @@
 # Run R scripts to produce output files in this order:
-# 1. genome_AA.R --> genome_AA.csv (hosted in https://github.com/jedick/JMDplots because of its size)
-# 2. taxonomy.R  --> taxonomy.csv (hosted in https://github.com/jedick/JMDplots because of its size)
+# 1. taxonomy.R  --> taxonomy.csv (hosted in https://github.com/jedick/JMDplots because of its size)
+# 2. genome_AA.R --> genome_AA.csv (hosted in https://github.com/jedick/JMDplots because of its size)
 # 3. taxon_AA.R  --> taxon_AA.csv
diff --git a/inst/extdata/GTDB/genome_AA.R b/inst/extdata/GTDB/genome_AA.R
@@ -23,6 +23,10 @@ genome_AA <- function() {
   genome <- unlist(strsplit(basename(files[ifile]), "_protein.faa"))
   aa$organism <- genome
 
+  # Put in species names 20240104
+  taxonomy <- read.csv("taxonomy.csv")
+  aa$ref <- taxonomy$species
+
   # Save result
   write.csv(aa, "genome_AA.csv", row.names = FALSE, quote = FALSE)
 

diff --git a/inst/extdata/GTDB/taxon_AA.R b/inst/extdata/GTDB/taxon_AA.R
@@ -2,15 +2,15 @@
 taxon_AA <- function() {
 
   # Read the summed amino acid compositions of all proteins for each genome
-  genaa <- read.csv("genome_AA.csv.xz")
+  genome_AA <- read.csv("genome_AA.csv.xz")
   # Read the taxonomy 20231229
   taxonomy <- read.csv("taxonomy.csv")
   # Keep only genomes with at least 500 proteins
-  i500 <- genaa$chains >= 500
-  genaa <- genaa[i500, ]
+  i500 <- genome_AA$chains >= 500
+  genome_AA <- genome_AA[i500, ]
   taxonomy <- taxonomy[i500, ]
   # Normalize by number of proteins
-  genaa[, 5:25] <- genaa[, 5:25] / genaa$chains
+  genome_AA[, 5:25] <- genome_AA[, 5:25] / genome_AA$chains
 
   # Loop over taxonomic ranks
   ranks <- c("domain", "phylum", "class", "order", "family", "genus")
@@ -30,7 +30,7 @@ taxon_AA <- function() {
     for(i in 1:length(utaxa)) {
       # Sum amino acid compositions for all genomes in this taxon
       itaxa <- taxa == utaxa[i]
-      aa[i, 5:25] <- aasum(genaa[itaxa, ])[, 5:25]
+      aa[i, 5:25] <- aasum(genome_AA[itaxa, ])[, 5:25]
     }
     # Normalize by number of genomes (put the number in 'ref' column)
     aa$ref <- aa$chains

diff --git a/inst/extdata/GTDB/taxonomy.R b/inst/extdata/GTDB/taxonomy.R
@@ -1,22 +1,28 @@
-# Make taxonomy file for genomes 20231229
+# Make taxonomy file for genomes in GTDB 20231229
 taxonomy <- function() {
 
-  # Read the summed amino acid compositions of all proteins for each genome
-  genaa <- read.csv("genome_AA.csv")
+  # The protein_faa_reps directory was found in this archive:
+  # https://data.gtdb.ecogenomic.org/releases/release207/207.0/genomic_files_reps/gtdb_proteins_aa_reps_r207.tar.gz
+  bacfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/bacteria/", full.names = TRUE)
+  arcfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/archaea/", full.names = TRUE)
+  files <- c(bacfiles, arcfiles)
+  # Get full genome names (with version suffix .1, .2, etc.)
+  genome <- unlist(strsplit(basename(files), "_protein.faa"))
+
   # Read the GTDB taxonomy
   bactax <- read.table("207.0/bac120_taxonomy_r207.tsv.gz", sep = "\t")
   arctax <- read.table("207.0/ar53_taxonomy_r207.tsv.gz", sep = "\t")
   GTDBtax <- rbind(bactax, arctax)
 
   # Match genomes to taxonomy
-  itax <- match(genaa$organism, GTDBtax[, 1])
+  itax <- match(genome, GTDBtax[, 1])
   myGTDBtax <- GTDBtax[itax, ]
   # Get taxon names
   names <- strsplit(myGTDBtax[, 2], ";")
   # Remove d__, p__, c__, o__, f__, g__, s__ labels
   names <- lapply(names, function(x) gsub("^.__", "", x))
   taxonomy <- data.frame(
-    genome = genaa$organism,
+    genome = genome,
     domain = sapply(names, "[", 1),
     phylum = sapply(names, "[", 2),
     class = sapply(names, "[", 3),

diff --git a/inst/extdata/RefSeq/taxon_AA.R b/inst/extdata/RefSeq/taxon_AA.R
@@ -108,4 +108,5 @@ taxon_AA <- function(ranks = c("genus", "family", "order", "class", "phylum", "s
   # Replace NA parent with ""
   out$abbrv[is.na(out$abbrv)] <- ""
   write.csv(out, "taxon_AA.csv", row.names = FALSE, quote = FALSE)
+
 }