Normalize file names for RefSeq and GTDB

jedick · Dec 29, 2023 · 7c02402 · 7c02402
1 parent bea4bfc
commit 7c02402
Show file tree

Hide file tree

Showing 14 changed files with 105 additions and 120 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
-Date: 2023-12-06
+Date: 2023-12-29
 Package: chem16S
-Version: 1.0.0-9
+Version: 1.0.0-10
 Title: Chemical Metrics of Community Reference Proteomes
 Authors@R: c(
     person("Jeffrey", "Dick", , "j3ffdick@gmail.com", role = c("aut", "cre"),

diff --git a/R/get_metrics.R b/R/get_metrics.R
@@ -11,9 +11,7 @@ get_metrics <- function(RDP = NULL, map = NULL, refdb = "GTDB", taxon_AA = NULL,
   map <- na.omit(map)
   if(length(map) == 0) stop(paste("no available mappings to taxa in", refdb, "reference database"))
 
-  # Get amino acid compositions of taxa compiled from:
-  #   - RefSeq (no longer using precompiled metrics in taxon_metrics.csv 20220108) or
-  #   - GTDB 20221016
+  # Get amino acid compositions of taxa compiled from RefSeq or GTDB
   AApath <- file.path("extdata", refdb, "taxon_AA.csv.xz")
   AAfile <- system.file(AApath, package = "chem16S")
   if(is.null(taxon_AA)) taxon_AA <- read.csv(AAfile, as.is = TRUE)

diff --git a/inst/extdata/GTDB/README.txt b/inst/extdata/GTDB/README.txt
@@ -0,0 +1,4 @@
+# Run R scripts to produce output files in this order:
+# 1. genome_AA.R --> genome_AA.csv (hosted in https://github.com/jedick/JMDplots because of its size)
+# 2. taxonomy.R  --> taxonomy.csv (hosted in https://github.com/jedick/JMDplots because of its size)
+# 3. taxon_AA.R  --> taxon_AA.csv
diff --git a/inst/extdata/GTDB/genome_AA.R b/inst/extdata/GTDB/genome_AA.R
@@ -0,0 +1,29 @@
+# Read and sum amino acid composition of all proteins for each genome 20221022
+genome_AA <- function() {
+
+  # The protein_faa_reps directory was found in this archive:
+  # https://data.gtdb.ecogenomic.org/releases/release207/207.0/genomic_files_reps/gtdb_proteins_aa_reps_r207.tar.gz
+  bacfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/bacteria/", full.names = TRUE)
+  arcfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/archaea/", full.names = TRUE)
+  files <- c(bacfiles, arcfiles)
+
+  # Loop over FASTA files (one for each genome)
+  ifile <- seq_along(files)
+  aa <- lapply(ifile, function(i) {
+    # Print progress message
+    if(i %% 100 == 0) print(i)
+    # Read amino acid composition 
+    aa <- suppressMessages(read.fasta(files[i]))
+    # Sum amino acid composition
+    aasum(aa)
+  })
+  aa <- do.call(rbind, aa)
+
+  # Put in full genome names (with version suffix .1, .2, etc.)
+  genome <- unlist(strsplit(basename(files[ifile]), "_protein.faa"))
+  aa$organism <- genome
+
+  # Save result
+  write.csv(aa, "genome_AA.csv", row.names = FALSE, quote = FALSE)
+
+}
diff --git a/inst/extdata/GTDB/taxon_AA.R b/inst/extdata/GTDB/taxon_AA.R
@@ -1,71 +1,22 @@
-# Get amino acid compositions for bacterial and archaeal taxa in GTDB
-# 20221015 jmd
-
-# Read and sum amino acid composition of all proteins for each genome 20221022
-genome_AA <- function() {
-
-  # The protein_faa_reps directory was found in this archive:
-  # https://data.gtdb.ecogenomic.org/releases/release207/207.0/genomic_files_reps/gtdb_proteins_aa_reps_r207.tar.gz
-  bacfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/bacteria/", full.names = TRUE)
-  arcfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/archaea/", full.names = TRUE)
-  files <- c(bacfiles, arcfiles)
-
-  # Loop over FASTA files (one for each genome)
-  ifile <- seq_along(files)
-  aa <- lapply(ifile, function(i) {
-    # Print progress message
-    if(i %% 100 == 0) print(i)
-    # Read amino acid composition 
-    aa <- suppressMessages(read.fasta(files[i]))
-    # Sum amino acid composition
-    aasum(aa)
-  })
-  aa <- do.call(rbind, aa)
-
-  # Put in full genome names (with version suffix .1, .2, etc.)
-  genome <- unlist(strsplit(basename(files[ifile]), "_protein.faa"))
-  aa$organism <- genome
-
-  # Save result
-  write.csv(aa, "genome_AA.csv", row.names = FALSE, quote = FALSE)
-
-}
-
 # Combine amino acid composition of genomes for genus and higher levels 20221015
 taxon_AA <- function() {
 
   # Read the summed amino acid compositions of all proteins for each genome
-  genaa <- read.csv("genome_AA.csv")
+  genaa <- read.csv("genome_AA.csv.xz")
+  # Read the taxonomy 20231229
+  taxonomy <- read.csv("taxonomy.csv")
   # Keep only genomes with at least 500 proteins
-  genaa <- genaa[genaa$chains >= 500, ]
+  i500 <- genaa$chains >= 500
+  genaa <- genaa[i500, ]
+  taxonomy <- taxonomy[i500, ]
   # Normalize by number of proteins
   genaa[, 5:25] <- genaa[, 5:25] / genaa$chains
 
-  # Read the GTDB taxonomy
-  bactax <- read.table("207.0/bac120_taxonomy_r207.tsv.gz", sep = "\t")
-  arctax <- read.table("207.0/ar53_taxonomy_r207.tsv.gz", sep = "\t")
-  taxonomy <- rbind(bactax, arctax)
-
-  # Match genomes to taxonomy
-  itax <- match(genaa$organism, taxonomy[, 1])
-  mytaxonomy <- taxonomy[itax, ]
-  # Get taxon names
-  names <- strsplit(mytaxonomy[, 2], ";")
-  # Remove d__, p__, c__, o__, f__, g__, s__ labels
-  names <- lapply(names, function(x) gsub("^.__", "", x))
-  domain <- sapply(names, "[", 1)
-  phylum <- sapply(names, "[", 2)
-  class <- sapply(names, "[", 3)
-  order <- sapply(names, "[", 4)
-  family <- sapply(names, "[", 5)
-  genus <- sapply(names, "[", 6)
-  species <- sapply(names, "[", 7)
-
   # Loop over taxonomic ranks
   ranks <- c("domain", "phylum", "class", "order", "family", "genus")
   aa <- lapply(ranks, function(rank) {
     # Names of all taxa at this rank
-    taxa <- get(rank)
+    taxa <- taxonomy[, rank]
     # Names of unique taxa
     utaxa <- unique(taxa)
     # Print rank and number of unique taxa
@@ -88,7 +39,7 @@ taxon_AA <- function() {
     irank <- match(rank, ranks)
     if(irank > 1) {
       uprank <- ranks[irank - 1]
-      uptaxa <- get(uprank)
+      uptaxa <- taxonomy[, uprank]
       itaxa <- match(utaxa, taxa)
       aa$abbrv <- uptaxa[itaxa]
     }

diff --git a/inst/extdata/GTDB/taxonomy.R b/inst/extdata/GTDB/taxonomy.R
@@ -0,0 +1,30 @@
+# Make taxonomy file for genomes 20231229
+taxonomy <- function() {
+
+  # Read the summed amino acid compositions of all proteins for each genome
+  genaa <- read.csv("genome_AA.csv")
+  # Read the GTDB taxonomy
+  bactax <- read.table("207.0/bac120_taxonomy_r207.tsv.gz", sep = "\t")
+  arctax <- read.table("207.0/ar53_taxonomy_r207.tsv.gz", sep = "\t")
+  GTDBtax <- rbind(bactax, arctax)
+
+  # Match genomes to taxonomy
+  itax <- match(genaa$organism, GTDBtax[, 1])
+  myGTDBtax <- GTDBtax[itax, ]
+  # Get taxon names
+  names <- strsplit(myGTDBtax[, 2], ";")
+  # Remove d__, p__, c__, o__, f__, g__, s__ labels
+  names <- lapply(names, function(x) gsub("^.__", "", x))
+  taxonomy <- data.frame(
+    genome = genaa$organism,
+    domain = sapply(names, "[", 1),
+    phylum = sapply(names, "[", 2),
+    class = sapply(names, "[", 3),
+    order = sapply(names, "[", 4),
+    family = sapply(names, "[", 5),
+    genus = sapply(names, "[", 6),
+    species = sapply(names, "[", 7)
+  )
+  write.csv(taxonomy, "taxonomy.csv", row.names = FALSE, quote = FALSE)
+
+}
diff --git a/inst/extdata/RefSeq/README.txt b/inst/extdata/RefSeq/README.txt
@@ -1,14 +1,21 @@
+# Run R scripts to produce output files in this order:
+# 1. genome_AA.R --> genome_AA.csv (hosted in https://github.com/jedick/JMDplots because of its size)
+# 2. taxonomy.R  --> taxonomy.csv (mirrored in https://github.com/jedick/JMDplots)
+# 3. taxon_AA.R  --> taxon_AA.csv
+
+### Details for steps 1 and 2 ###
+
 # This document describes steps for processing the 
 # RefSeq database (release 206, 2021-05-21) to produce these files:
 
-protein_refseq.csv: total amino acid composition of all proteins for
+genome_AA.csv: total amino acid composition of all proteins for
   bacteria, archaea, and viral genomes in the RefSeq collection (n = 49448)
-taxon_names.csv: taxid, phylum name and species name for 49448 taxa
+taxonomy.csv: taxid, phylum name and species name for 49448 taxa
 
 # These functions/scripts have the following purpose (output files listed in parentheses):
 gencat.sh - extract accession, taxid, sequence length from RefSeq release catalog (accession.taxid.txt)
-protein_refseq.R - get average amino acid composition for each taxid from gzipped sequence files (protein_refseq.csv)
-taxon_names.R - get taxonomic names for each taxid represented (taxon_names.csv)
+genome_AA.R - get average amino acid composition for each taxid from gzipped sequence files (genome_AA.csv)
+taxonomy.R - get taxonomic names for each taxid represented (taxonomy.csv)
 
 ## Download stuff
 
@@ -35,20 +42,20 @@ wget -N -i urllist
 6. Use 'gencat.sh' to generate accession.taxid.txt for bacteria, archaea, and viral proteins in the catalog [11 minutes]
    for RefSeq 206, 'wc -l accession.taxid.txt' is 169527530
 
-7. Generate protein_refseq.csv
+7. Generate genome_AA.csv
    mkdir csv
    Then, run these R commands in the working directory that contains accession.taxid.txt, protein/, and csv/
-   > source("protein_refseq.R")
+   > source("genome_AA.R")
    > read.allfiles()    # ca. 19 hours (8 cores)
-   > protein.refseq()   # 5 minutes
+   > genome_AA()        # 5 minutes
 
 ## Taxonomy stuff
 
-8. Edit 'taxon_names.R' so that 'taxdir' points to the directory where the files
+8. Edit 'taxonomy.R' so that 'taxdir' points to the directory where the files
     'names.dmp' and 'nodes.dmp' are present. These files can be downloaded from
     https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz
 
     File info on server (accessed on 2021-05-21):
     taxdump.tar.gz            2021-05-21 21:27   53M
 
-9. Source 'taxon_names.R' to generate the file 'taxon_names.csv' [8 hours]
+9. Source 'taxonomy.R' to generate the file 'taxonomy.csv' [8 hours]
diff --git a/inst/extdata/RefSeq/protein_refseq.R → inst/extdata/RefSeq/genome_AA.R b/inst/extdata/RefSeq/protein_refseq.R → inst/extdata/RefSeq/genome_AA.R
@@ -1,4 +1,4 @@
-# chem16S/RefSeq/protein_refseq.R
+# chem16S/RefSeq/genome_AA.R
 # Calculate the overall amino acid composition of proteins for each taxid in RefSeq
 # 20100704 First version
 # 20130922 Deal with WP multispecies accessions (RefSeq 61)
@@ -152,7 +152,7 @@ read.allfiles <- function() {
   #invisible(aalist)
 }
 
-protein.refseq <- function() {
+genome_AA <- function() {
   # List all files in "csv" directory
   files <- file.path("csv", dir("csv"))
   # Read the CSV files into a single list
@@ -173,6 +173,6 @@ protein.refseq <- function() {
   aaall <- cbind(aadat[, 1:4], aasum)
   aaall$ref <- aaref
   # Save the result
-  write.csv(aaall, "protein_refseq.csv", row.names = FALSE, quote = 3)
+  write.csv(aaall, "genome_AA.csv", row.names = FALSE, quote = 3)
   invisible(aaall)
 }
diff --git a/inst/extdata/RefSeq/taxon_AA.R b/inst/extdata/RefSeq/taxon_AA.R
@@ -1,7 +1,7 @@
 # chem16S/RefSeq/taxon_AA.R
 
 # Functions to generate amino acid compositions and chemical metrics of
-# higher-level taxa from RefSeq species-level reference proteomes (protein_refseq.R)
+# higher-level taxa from RefSeq species-level reference proteomes
 
 # Notes moved from README.md on 20230708
 #* Only taxids classified at the species level are used, and archaeal and bacterial species with less than 500 reference protein sequences are excluded;
@@ -11,15 +11,13 @@
 
 # Calculate amino acid composition of taxonomic groups at genus and higher ranks --> taxon_AA.csv
 # taxon_AA()       
-# Calculate chemical metrics (nH2O, Zc) for each RefSeq group --> taxon_metrics.csv
-# taxon_metrics()         
 
 # Calculate average amino acid composition of genus and higher ranks in RefSeq 20200911
 taxon_AA <- function(ranks = c("genus", "family", "order", "class", "phylum", "superkingdom")) {
 
   # Read RefSeq amino acid compositions and taxon names
-  refseq <- read.csv(system.file("extdata/RefSeq/protein_refseq.csv.xz", package = "JMDplots"), as.is = TRUE)
-  taxa <- read.csv(system.file("extdata/RefSeq/taxon_names.csv.xz", package = "chem16S"), as.is = TRUE)
+  refseq <- read.csv(system.file("extdata/RefSeq/genome_AA.csv.xz", package = "JMDplots"), as.is = TRUE)
+  taxa <- read.csv(system.file("extdata/RefSeq/taxonomy.csv.xz", package = "chem16S"), as.is = TRUE)
   # Make sure the data tables have consistent taxids
   stopifnot(all(refseq$organism == taxa$taxid))
   # Keep taxids classified at species level 20220104
@@ -111,34 +109,3 @@ taxon_AA <- function(ranks = c("genus", "family", "order", "class", "phylum", "s
   out$abbrv[is.na(out$abbrv)] <- ""
   write.csv(out, "taxon_AA.csv", row.names = FALSE, quote = FALSE)
 }
-
-# Compute chemical metrics for each RefSeq group 20200927
-taxon_metrics <- function() {
-  # Read amino acid compositions of all groups
-  AA <- read.csv("taxon_AA.csv", as.is = TRUE)
-  # Build output data frame; rename columns for better readability
-  out <- data.frame(rank = AA$protein, group = AA$organism, ntaxa = AA$ref, parent = AA$abbrv, nH2O = NA, Zc = NA, nC = NA)
-  # Calculate metrics
-  out$nH2O <- round(canprot::nH2O(AA), 6)
-  out$Zc <- round(canprot::Zc(AA), 6)
-  out$nC <- round(CAA(AA), 6)
-  write.csv(out, "taxon_metrics.csv", row.names = FALSE, quote = FALSE)
-}
-
-# Function used in taxon_metrics() to calculate number of carbon atoms in amino acid compositions 20200927
-CAA <- function(AAcomp) {
-  # the number of carbons of the amino acids
-  nC_AA <- c(Ala = 3, Cys = 3, Asp = 4, Glu = 5, Phe = 9, Gly = 2, His = 6, 
-    Ile = 6, Lys = 6, Leu = 6, Met = 5, Asn = 4, Pro = 5, Gln = 5, 
-    Arg = 6, Ser = 3, Thr = 4, Val = 5, Trp = 11, Tyr = 9)
-  # find columns with names for the amino acids
-  isAA <- colnames(AAcomp) %in% c("Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys", 
-    "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val", "Trp", "Tyr")
-  iAA <- match(colnames(AAcomp)[isAA], names(nC_AA))
-  # calculate the nC for all occurrences of each amino acid
-  multC <- t(t(AAcomp[, isAA]) * nC_AA[iAA])
-  # calculate the total nC, then the per-residue nC
-  nCtot <- rowSums(multC)
-  nCtot / rowSums(AAcomp[, isAA])
-}
-al
diff --git a/inst/extdata/RefSeq/taxon_metrics.csv.xz b/inst/extdata/RefSeq/taxon_metrics.csv.xz
diff --git a/inst/extdata/RefSeq/taxon_names.R → inst/extdata/RefSeq/taxonomy.R b/inst/extdata/RefSeq/taxon_names.R → inst/extdata/RefSeq/taxonomy.R
@@ -8,8 +8,8 @@ require(CHNOSZ)
 # Change this to the location where names.dmp and nodes.dmp are located
 taxdir <- "./taxdump"
 
-# Get the taxids from protein_refseq.csv
-pr <- read.csv("protein_refseq.csv")
+# Get the taxids from genome_AA.csv
+pr <- read.csv("genome_AA.csv")
 taxid <- pr$organism
 
 # Read in the names and nodes
@@ -56,4 +56,4 @@ cat("done!\n")
 # Write results to a file
 out <- as.data.frame(out)
 out <- cbind(data.frame(taxid = taxid[ii], out))
-write.csv(out, "taxon_names.csv", row.names = FALSE, quote = FALSE)
+write.csv(out, "taxonomy.csv", row.names = FALSE, quote = FALSE)
diff --git a/inst/extdata/RefSeq/taxon_names.csv.xz → inst/extdata/RefSeq/taxonomy.csv.xz b/inst/extdata/RefSeq/taxon_names.csv.xz → inst/extdata/RefSeq/taxonomy.csv.xz
diff --git a/man/chem16S-package.Rd b/man/chem16S-package.Rd
@@ -64,20 +64,19 @@ History:
 
     \item{\file{README.txt}}{Description of steps to generate reference proteomes of species-level taxa (including downloads and shell commands).}
     \item{\file{gencat.sh}}{Helper script to extract microbial protein records from the RefSeq catalog.}
-    \item{\file{protein_refseq.R}}{
+    \item{\file{genome_AA.R}}{
       R code to sum the amino acid compositions of all proteins for each bacterial, archaeal, and viral species in the NCBI Reference Sequence database.
-      NOTE: To save space, the output file (\file{protein_refseq.csv}) is not stored here but is in the \code{extdata/RefSeq} directory of the JMDplots package on GitHub (\url{https://github.com/jedick/JMDplots}.
+      NOTE: To save space in this package, the output file (\file{genome_AA.csv}) is stored in the \code{extdata/RefSeq} directory of the JMDplots package on GitHub (\url{https://github.com/jedick/JMDplots}.
       The first five columns are: \code{protein} (\dQuote{refseq}), \code{organism} (taxonomic id), \code{ref} (organism name), \code{abbrv} (empty), \code{chains} (number of protein sequences for this organism).
       Columns 6 to 25 have the counts of amino acids.
     }
-    \item{\file{taxon_names.R}}{
-      R code for processing taxonomic IDs; the output file is \file{taxon_names.csv}.
+    \item{\file{taxonomy.R}}{
+      R code for processing taxonomic IDs; the output file is \file{taxonomy.csv}.
       The columns are NCBI taxonomic ID (taxid), and names at different taxonomic rank (species, genus, family, order, class, phylum, superkingdom).
     }
 
     \item{\file{taxon_AA.R}}{Functions to create the files listed below:}
     \item{\file{taxon_AA.csv.xz}}{Average amino acid composition of reference proteomes for all species in each genus, family, order, class, phylum, and superkingdom.}
-    \item{\file{taxon_metrics.csv.xz}}{Selected chemical metrics (\Zc and \nH2O) of reference proteomes for taxa at genus and higher ranks.}
 
   }
 
@@ -180,7 +179,7 @@ O'Leary NA et al. 2016. Reference sequence (RefSeq) database at NCBI: current st
 
 Parks DH, Chuvochina M, Rinke C, Mussig AJ, Chaumeil P-A, Hugenholtz P. 2022. GTDB: an ongoing census of bacterial and archaeal diversity through a phylogenetically consistent, rank normalized and complete genome-based taxonomy. \emph{Nucleic Acids Research} \bold{50}: D785--D794. \doi{10.1093/nar/gkab776}
 
-Swingley WD, Meyer-Dombard DR, Shock EL, Alsop EB, Falenski HD, Havig JR, Raymond J. 2012. Coordinating environmental genomics and geochemistry reveals metabolic transitions in a hot spring ecosystem. \emph{PLOS One} \bold{7}(6): e38108. \doi{doi.org/10.1371/journal.pone.0038108}
+Swingley WD, Meyer-Dombard DR, Shock EL, Alsop EB, Falenski HD, Havig JR, Raymond J. 2012. Coordinating environmental genomics and geochemistry reveals metabolic transitions in a hot spring ecosystem. \emph{PLOS One} \bold{7}(6): e38108. \doi{10.1371/journal.pone.0038108}
 
 Zhang H-S, Feng Q-D, Zhang D-Y, Zhu G-L, Yang L. 2023. Bacterial community structure in geothermal springs on the northern edge of Qinghai-Tibet plateau. \emph{Frontiers in Microbiology} \bold{13}: 994179. \doi{10.3389/fmicb.2022.994179}
 

diff --git a/vignettes/metrics.Rmd b/vignettes/metrics.Rmd
@@ -86,7 +86,7 @@ par(opar)
 * Define a function to get `r Zc` for named genera.
 * Calculate mean `r Zc` for genera in selected phyla.
 ```{r phylum_to_genus}
-taxnames <- read.csv(system.file("extdata/RefSeq/taxon_names.csv.xz", package = "chem16S"))
+taxnames <- read.csv(system.file("extdata/RefSeq/taxonomy.csv.xz", package = "chem16S"))
 phylum_to_genus <- function(phylum) na.omit(unique(taxnames$genus[taxnames$phylum == phylum]))
 get_Zc <- function(genera) na.omit(taxon_Zc[match(genera, taxon_AA$organism)])
 sapply(sapply(sapply(c("Crenarchaeota", "Euryarchaeota"), phylum_to_genus), get_Zc), mean)