Skip to content

Commit

Permalink
Normalize file names for RefSeq and GTDB
Browse files Browse the repository at this point in the history
  • Loading branch information
jedick committed Dec 29, 2023
1 parent bea4bfc commit 7c02402
Show file tree
Hide file tree
Showing 14 changed files with 105 additions and 120 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Date: 2023-12-06
Date: 2023-12-29
Package: chem16S
Version: 1.0.0-9
Version: 1.0.0-10
Title: Chemical Metrics of Community Reference Proteomes
Authors@R: c(
person("Jeffrey", "Dick", , "j3ffdick@gmail.com", role = c("aut", "cre"),
Expand Down
4 changes: 1 addition & 3 deletions R/get_metrics.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@ get_metrics <- function(RDP = NULL, map = NULL, refdb = "GTDB", taxon_AA = NULL,
map <- na.omit(map)
if(length(map) == 0) stop(paste("no available mappings to taxa in", refdb, "reference database"))

# Get amino acid compositions of taxa compiled from:
# - RefSeq (no longer using precompiled metrics in taxon_metrics.csv 20220108) or
# - GTDB 20221016
# Get amino acid compositions of taxa compiled from RefSeq or GTDB
AApath <- file.path("extdata", refdb, "taxon_AA.csv.xz")
AAfile <- system.file(AApath, package = "chem16S")
if(is.null(taxon_AA)) taxon_AA <- read.csv(AAfile, as.is = TRUE)
Expand Down
4 changes: 4 additions & 0 deletions inst/extdata/GTDB/README.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Run R scripts to produce output files in this order:
# 1. genome_AA.R --> genome_AA.csv (hosted in https://github.com/jedick/JMDplots because of its size)
# 2. taxonomy.R --> taxonomy.csv (hosted in https://github.com/jedick/JMDplots because of its size)
# 3. taxon_AA.R --> taxon_AA.csv
29 changes: 29 additions & 0 deletions inst/extdata/GTDB/genome_AA.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Read and sum amino acid composition of all proteins for each genome 20221022
genome_AA <- function() {

# The protein_faa_reps directory was found in this archive:
# https://data.gtdb.ecogenomic.org/releases/release207/207.0/genomic_files_reps/gtdb_proteins_aa_reps_r207.tar.gz
bacfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/bacteria/", full.names = TRUE)
arcfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/archaea/", full.names = TRUE)
files <- c(bacfiles, arcfiles)

# Loop over FASTA files (one for each genome)
ifile <- seq_along(files)
aa <- lapply(ifile, function(i) {
# Print progress message
if(i %% 100 == 0) print(i)
# Read amino acid composition
aa <- suppressMessages(read.fasta(files[i]))
# Sum amino acid composition
aasum(aa)
})
aa <- do.call(rbind, aa)

# Put in full genome names (with version suffix .1, .2, etc.)
genome <- unlist(strsplit(basename(files[ifile]), "_protein.faa"))
aa$organism <- genome

# Save result
write.csv(aa, "genome_AA.csv", row.names = FALSE, quote = FALSE)

}
65 changes: 8 additions & 57 deletions inst/extdata/GTDB/taxon_AA.R
Original file line number Diff line number Diff line change
@@ -1,71 +1,22 @@
# Get amino acid compositions for bacterial and archaeal taxa in GTDB
# 20221015 jmd

# Read and sum amino acid composition of all proteins for each genome 20221022
genome_AA <- function() {

# The protein_faa_reps directory was found in this archive:
# https://data.gtdb.ecogenomic.org/releases/release207/207.0/genomic_files_reps/gtdb_proteins_aa_reps_r207.tar.gz
bacfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/bacteria/", full.names = TRUE)
arcfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/archaea/", full.names = TRUE)
files <- c(bacfiles, arcfiles)

# Loop over FASTA files (one for each genome)
ifile <- seq_along(files)
aa <- lapply(ifile, function(i) {
# Print progress message
if(i %% 100 == 0) print(i)
# Read amino acid composition
aa <- suppressMessages(read.fasta(files[i]))
# Sum amino acid composition
aasum(aa)
})
aa <- do.call(rbind, aa)

# Put in full genome names (with version suffix .1, .2, etc.)
genome <- unlist(strsplit(basename(files[ifile]), "_protein.faa"))
aa$organism <- genome

# Save result
write.csv(aa, "genome_AA.csv", row.names = FALSE, quote = FALSE)

}

# Combine amino acid composition of genomes for genus and higher levels 20221015
taxon_AA <- function() {

# Read the summed amino acid compositions of all proteins for each genome
genaa <- read.csv("genome_AA.csv")
genaa <- read.csv("genome_AA.csv.xz")
# Read the taxonomy 20231229
taxonomy <- read.csv("taxonomy.csv")
# Keep only genomes with at least 500 proteins
genaa <- genaa[genaa$chains >= 500, ]
i500 <- genaa$chains >= 500
genaa <- genaa[i500, ]
taxonomy <- taxonomy[i500, ]
# Normalize by number of proteins
genaa[, 5:25] <- genaa[, 5:25] / genaa$chains

# Read the GTDB taxonomy
bactax <- read.table("207.0/bac120_taxonomy_r207.tsv.gz", sep = "\t")
arctax <- read.table("207.0/ar53_taxonomy_r207.tsv.gz", sep = "\t")
taxonomy <- rbind(bactax, arctax)

# Match genomes to taxonomy
itax <- match(genaa$organism, taxonomy[, 1])
mytaxonomy <- taxonomy[itax, ]
# Get taxon names
names <- strsplit(mytaxonomy[, 2], ";")
# Remove d__, p__, c__, o__, f__, g__, s__ labels
names <- lapply(names, function(x) gsub("^.__", "", x))
domain <- sapply(names, "[", 1)
phylum <- sapply(names, "[", 2)
class <- sapply(names, "[", 3)
order <- sapply(names, "[", 4)
family <- sapply(names, "[", 5)
genus <- sapply(names, "[", 6)
species <- sapply(names, "[", 7)

# Loop over taxonomic ranks
ranks <- c("domain", "phylum", "class", "order", "family", "genus")
aa <- lapply(ranks, function(rank) {
# Names of all taxa at this rank
taxa <- get(rank)
taxa <- taxonomy[, rank]
# Names of unique taxa
utaxa <- unique(taxa)
# Print rank and number of unique taxa
Expand All @@ -88,7 +39,7 @@ taxon_AA <- function() {
irank <- match(rank, ranks)
if(irank > 1) {
uprank <- ranks[irank - 1]
uptaxa <- get(uprank)
uptaxa <- taxonomy[, uprank]
itaxa <- match(utaxa, taxa)
aa$abbrv <- uptaxa[itaxa]
}
Expand Down
30 changes: 30 additions & 0 deletions inst/extdata/GTDB/taxonomy.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Make taxonomy file for genomes 20231229
taxonomy <- function() {

# Read the summed amino acid compositions of all proteins for each genome
genaa <- read.csv("genome_AA.csv")
# Read the GTDB taxonomy
bactax <- read.table("207.0/bac120_taxonomy_r207.tsv.gz", sep = "\t")
arctax <- read.table("207.0/ar53_taxonomy_r207.tsv.gz", sep = "\t")
GTDBtax <- rbind(bactax, arctax)

# Match genomes to taxonomy
itax <- match(genaa$organism, GTDBtax[, 1])
myGTDBtax <- GTDBtax[itax, ]
# Get taxon names
names <- strsplit(myGTDBtax[, 2], ";")
# Remove d__, p__, c__, o__, f__, g__, s__ labels
names <- lapply(names, function(x) gsub("^.__", "", x))
taxonomy <- data.frame(
genome = genaa$organism,
domain = sapply(names, "[", 1),
phylum = sapply(names, "[", 2),
class = sapply(names, "[", 3),
order = sapply(names, "[", 4),
family = sapply(names, "[", 5),
genus = sapply(names, "[", 6),
species = sapply(names, "[", 7)
)
write.csv(taxonomy, "taxonomy.csv", row.names = FALSE, quote = FALSE)

}
25 changes: 16 additions & 9 deletions inst/extdata/RefSeq/README.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,21 @@
# Run R scripts to produce output files in this order:
# 1. genome_AA.R --> genome_AA.csv (hosted in https://github.com/jedick/JMDplots because of its size)
# 2. taxonomy.R --> taxonomy.csv (mirrored in https://github.com/jedick/JMDplots)
# 3. taxon_AA.R --> taxon_AA.csv

### Details for steps 1 and 2 ###

# This document describes steps for processing the
# RefSeq database (release 206, 2021-05-21) to produce these files:

protein_refseq.csv: total amino acid composition of all proteins for
genome_AA.csv: total amino acid composition of all proteins for
bacteria, archaea, and viral genomes in the RefSeq collection (n = 49448)
taxon_names.csv: taxid, phylum name and species name for 49448 taxa
taxonomy.csv: taxid, phylum name and species name for 49448 taxa

# These functions/scripts have the following purpose (output files listed in parentheses):
gencat.sh - extract accession, taxid, sequence length from RefSeq release catalog (accession.taxid.txt)
protein_refseq.R - get average amino acid composition for each taxid from gzipped sequence files (protein_refseq.csv)
taxon_names.R - get taxonomic names for each taxid represented (taxon_names.csv)
genome_AA.R - get average amino acid composition for each taxid from gzipped sequence files (genome_AA.csv)
taxonomy.R - get taxonomic names for each taxid represented (taxonomy.csv)

## Download stuff

Expand All @@ -35,20 +42,20 @@ wget -N -i urllist
6. Use 'gencat.sh' to generate accession.taxid.txt for bacteria, archaea, and viral proteins in the catalog [11 minutes]
for RefSeq 206, 'wc -l accession.taxid.txt' is 169527530

7. Generate protein_refseq.csv
7. Generate genome_AA.csv
mkdir csv
Then, run these R commands in the working directory that contains accession.taxid.txt, protein/, and csv/
> source("protein_refseq.R")
> source("genome_AA.R")
> read.allfiles() # ca. 19 hours (8 cores)
> protein.refseq() # 5 minutes
> genome_AA() # 5 minutes

## Taxonomy stuff

8. Edit 'taxon_names.R' so that 'taxdir' points to the directory where the files
8. Edit 'taxonomy.R' so that 'taxdir' points to the directory where the files
'names.dmp' and 'nodes.dmp' are present. These files can be downloaded from
https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz

File info on server (accessed on 2021-05-21):
taxdump.tar.gz 2021-05-21 21:27 53M

9. Source 'taxon_names.R' to generate the file 'taxon_names.csv' [8 hours]
9. Source 'taxonomy.R' to generate the file 'taxonomy.csv' [8 hours]
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# chem16S/RefSeq/protein_refseq.R
# chem16S/RefSeq/genome_AA.R
# Calculate the overall amino acid composition of proteins for each taxid in RefSeq
# 20100704 First version
# 20130922 Deal with WP multispecies accessions (RefSeq 61)
Expand Down Expand Up @@ -152,7 +152,7 @@ read.allfiles <- function() {
#invisible(aalist)
}

protein.refseq <- function() {
genome_AA <- function() {
# List all files in "csv" directory
files <- file.path("csv", dir("csv"))
# Read the CSV files into a single list
Expand All @@ -173,6 +173,6 @@ protein.refseq <- function() {
aaall <- cbind(aadat[, 1:4], aasum)
aaall$ref <- aaref
# Save the result
write.csv(aaall, "protein_refseq.csv", row.names = FALSE, quote = 3)
write.csv(aaall, "genome_AA.csv", row.names = FALSE, quote = 3)
invisible(aaall)
}
39 changes: 3 additions & 36 deletions inst/extdata/RefSeq/taxon_AA.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# chem16S/RefSeq/taxon_AA.R

# Functions to generate amino acid compositions and chemical metrics of
# higher-level taxa from RefSeq species-level reference proteomes (protein_refseq.R)
# higher-level taxa from RefSeq species-level reference proteomes

# Notes moved from README.md on 20230708
#* Only taxids classified at the species level are used, and archaeal and bacterial species with less than 500 reference protein sequences are excluded;
Expand All @@ -11,15 +11,13 @@

# Calculate amino acid composition of taxonomic groups at genus and higher ranks --> taxon_AA.csv
# taxon_AA()
# Calculate chemical metrics (nH2O, Zc) for each RefSeq group --> taxon_metrics.csv
# taxon_metrics()

# Calculate average amino acid composition of genus and higher ranks in RefSeq 20200911
taxon_AA <- function(ranks = c("genus", "family", "order", "class", "phylum", "superkingdom")) {

# Read RefSeq amino acid compositions and taxon names
refseq <- read.csv(system.file("extdata/RefSeq/protein_refseq.csv.xz", package = "JMDplots"), as.is = TRUE)
taxa <- read.csv(system.file("extdata/RefSeq/taxon_names.csv.xz", package = "chem16S"), as.is = TRUE)
refseq <- read.csv(system.file("extdata/RefSeq/genome_AA.csv.xz", package = "JMDplots"), as.is = TRUE)
taxa <- read.csv(system.file("extdata/RefSeq/taxonomy.csv.xz", package = "chem16S"), as.is = TRUE)
# Make sure the data tables have consistent taxids
stopifnot(all(refseq$organism == taxa$taxid))
# Keep taxids classified at species level 20220104
Expand Down Expand Up @@ -111,34 +109,3 @@ taxon_AA <- function(ranks = c("genus", "family", "order", "class", "phylum", "s
out$abbrv[is.na(out$abbrv)] <- ""
write.csv(out, "taxon_AA.csv", row.names = FALSE, quote = FALSE)
}

# Compute chemical metrics for each RefSeq group 20200927
taxon_metrics <- function() {
# Read amino acid compositions of all groups
AA <- read.csv("taxon_AA.csv", as.is = TRUE)
# Build output data frame; rename columns for better readability
out <- data.frame(rank = AA$protein, group = AA$organism, ntaxa = AA$ref, parent = AA$abbrv, nH2O = NA, Zc = NA, nC = NA)
# Calculate metrics
out$nH2O <- round(canprot::nH2O(AA), 6)
out$Zc <- round(canprot::Zc(AA), 6)
out$nC <- round(CAA(AA), 6)
write.csv(out, "taxon_metrics.csv", row.names = FALSE, quote = FALSE)
}

# Function used in taxon_metrics() to calculate number of carbon atoms in amino acid compositions 20200927
CAA <- function(AAcomp) {
# the number of carbons of the amino acids
nC_AA <- c(Ala = 3, Cys = 3, Asp = 4, Glu = 5, Phe = 9, Gly = 2, His = 6,
Ile = 6, Lys = 6, Leu = 6, Met = 5, Asn = 4, Pro = 5, Gln = 5,
Arg = 6, Ser = 3, Thr = 4, Val = 5, Trp = 11, Tyr = 9)
# find columns with names for the amino acids
isAA <- colnames(AAcomp) %in% c("Ala", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile", "Lys",
"Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val", "Trp", "Tyr")
iAA <- match(colnames(AAcomp)[isAA], names(nC_AA))
# calculate the nC for all occurrences of each amino acid
multC <- t(t(AAcomp[, isAA]) * nC_AA[iAA])
# calculate the total nC, then the per-residue nC
nCtot <- rowSums(multC)
nCtot / rowSums(AAcomp[, isAA])
}
al
Binary file removed inst/extdata/RefSeq/taxon_metrics.csv.xz
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ require(CHNOSZ)
# Change this to the location where names.dmp and nodes.dmp are located
taxdir <- "./taxdump"

# Get the taxids from protein_refseq.csv
pr <- read.csv("protein_refseq.csv")
# Get the taxids from genome_AA.csv
pr <- read.csv("genome_AA.csv")
taxid <- pr$organism

# Read in the names and nodes
Expand Down Expand Up @@ -56,4 +56,4 @@ cat("done!\n")
# Write results to a file
out <- as.data.frame(out)
out <- cbind(data.frame(taxid = taxid[ii], out))
write.csv(out, "taxon_names.csv", row.names = FALSE, quote = FALSE)
write.csv(out, "taxonomy.csv", row.names = FALSE, quote = FALSE)
File renamed without changes.
11 changes: 5 additions & 6 deletions man/chem16S-package.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -64,20 +64,19 @@ History:
\item{\file{README.txt}}{Description of steps to generate reference proteomes of species-level taxa (including downloads and shell commands).}
\item{\file{gencat.sh}}{Helper script to extract microbial protein records from the RefSeq catalog.}
\item{\file{protein_refseq.R}}{
\item{\file{genome_AA.R}}{
R code to sum the amino acid compositions of all proteins for each bacterial, archaeal, and viral species in the NCBI Reference Sequence database.
NOTE: To save space, the output file (\file{protein_refseq.csv}) is not stored here but is in the \code{extdata/RefSeq} directory of the JMDplots package on GitHub (\url{https://github.com/jedick/JMDplots}.
NOTE: To save space in this package, the output file (\file{genome_AA.csv}) is stored in the \code{extdata/RefSeq} directory of the JMDplots package on GitHub (\url{https://github.com/jedick/JMDplots}.
The first five columns are: \code{protein} (\dQuote{refseq}), \code{organism} (taxonomic id), \code{ref} (organism name), \code{abbrv} (empty), \code{chains} (number of protein sequences for this organism).
Columns 6 to 25 have the counts of amino acids.
}
\item{\file{taxon_names.R}}{
R code for processing taxonomic IDs; the output file is \file{taxon_names.csv}.
\item{\file{taxonomy.R}}{
R code for processing taxonomic IDs; the output file is \file{taxonomy.csv}.
The columns are NCBI taxonomic ID (taxid), and names at different taxonomic rank (species, genus, family, order, class, phylum, superkingdom).
}
\item{\file{taxon_AA.R}}{Functions to create the files listed below:}
\item{\file{taxon_AA.csv.xz}}{Average amino acid composition of reference proteomes for all species in each genus, family, order, class, phylum, and superkingdom.}
\item{\file{taxon_metrics.csv.xz}}{Selected chemical metrics (\Zc and \nH2O) of reference proteomes for taxa at genus and higher ranks.}
}
Expand Down Expand Up @@ -180,7 +179,7 @@ O'Leary NA et al. 2016. Reference sequence (RefSeq) database at NCBI: current st

Parks DH, Chuvochina M, Rinke C, Mussig AJ, Chaumeil P-A, Hugenholtz P. 2022. GTDB: an ongoing census of bacterial and archaeal diversity through a phylogenetically consistent, rank normalized and complete genome-based taxonomy. \emph{Nucleic Acids Research} \bold{50}: D785--D794. \doi{10.1093/nar/gkab776}

Swingley WD, Meyer-Dombard DR, Shock EL, Alsop EB, Falenski HD, Havig JR, Raymond J. 2012. Coordinating environmental genomics and geochemistry reveals metabolic transitions in a hot spring ecosystem. \emph{PLOS One} \bold{7}(6): e38108. \doi{doi.org/10.1371/journal.pone.0038108}
Swingley WD, Meyer-Dombard DR, Shock EL, Alsop EB, Falenski HD, Havig JR, Raymond J. 2012. Coordinating environmental genomics and geochemistry reveals metabolic transitions in a hot spring ecosystem. \emph{PLOS One} \bold{7}(6): e38108. \doi{10.1371/journal.pone.0038108}

Zhang H-S, Feng Q-D, Zhang D-Y, Zhu G-L, Yang L. 2023. Bacterial community structure in geothermal springs on the northern edge of Qinghai-Tibet plateau. \emph{Frontiers in Microbiology} \bold{13}: 994179. \doi{10.3389/fmicb.2022.994179}

Expand Down
2 changes: 1 addition & 1 deletion vignettes/metrics.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ par(opar)
* Define a function to get `r Zc` for named genera.
* Calculate mean `r Zc` for genera in selected phyla.
```{r phylum_to_genus}
taxnames <- read.csv(system.file("extdata/RefSeq/taxon_names.csv.xz", package = "chem16S"))
taxnames <- read.csv(system.file("extdata/RefSeq/taxonomy.csv.xz", package = "chem16S"))
phylum_to_genus <- function(phylum) na.omit(unique(taxnames$genus[taxnames$phylum == phylum]))
get_Zc <- function(genera) na.omit(taxon_Zc[match(genera, taxon_AA$organism)])
sapply(sapply(sapply(c("Crenarchaeota", "Euryarchaeota"), phylum_to_genus), get_Zc), mean)
Expand Down

0 comments on commit 7c02402

Please sign in to comment.