Skip to content

Commit

Permalink
Revise scripts for processing reference databases
Browse files Browse the repository at this point in the history
  • Loading branch information
jedick committed Jan 4, 2024
1 parent 7c02402 commit 059c5e9
Show file tree
Hide file tree
Showing 7 changed files with 30 additions and 16 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Date: 2023-12-29
Date: 2024-01-04
Package: chem16S
Version: 1.0.0-10
Version: 1.0.0-11
Title: Chemical Metrics of Community Reference Proteomes
Authors@R: c(
person("Jeffrey", "Dick", , "j3ffdick@gmail.com", role = c("aut", "cre"),
Expand Down
7 changes: 5 additions & 2 deletions inst/NEWS
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
CHANGES IN chem16S 1.0.0-1 (2023-07-23)
-------------------------------------
CHANGES IN chem16S 1.0.0-10 (2023-12-29)
----------------------------------------

- Normalize file names in extdata/RefSeq and extdata/GTDB for easier
programmatic access.

- Add 'xvar' and 'yvar' arguments to plot_metrics().

Expand Down
4 changes: 2 additions & 2 deletions inst/extdata/GTDB/README.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Run R scripts to produce output files in this order:
# 1. genome_AA.R --> genome_AA.csv (hosted in https://github.com/jedick/JMDplots because of its size)
# 2. taxonomy.R --> taxonomy.csv (hosted in https://github.com/jedick/JMDplots because of its size)
# 1. taxonomy.R --> taxonomy.csv (hosted in https://github.com/jedick/JMDplots because of its size)
# 2. genome_AA.R --> genome_AA.csv (hosted in https://github.com/jedick/JMDplots because of its size)
# 3. taxon_AA.R --> taxon_AA.csv
4 changes: 4 additions & 0 deletions inst/extdata/GTDB/genome_AA.R
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ genome_AA <- function() {
genome <- unlist(strsplit(basename(files[ifile]), "_protein.faa"))
aa$organism <- genome

# Put in species names 20240104
taxonomy <- read.csv("taxonomy.csv")
aa$ref <- taxonomy$species

# Save result
write.csv(aa, "genome_AA.csv", row.names = FALSE, quote = FALSE)

Expand Down
10 changes: 5 additions & 5 deletions inst/extdata/GTDB/taxon_AA.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@
taxon_AA <- function() {

# Read the summed amino acid compositions of all proteins for each genome
genaa <- read.csv("genome_AA.csv.xz")
genome_AA <- read.csv("genome_AA.csv.xz")
# Read the taxonomy 20231229
taxonomy <- read.csv("taxonomy.csv")
# Keep only genomes with at least 500 proteins
i500 <- genaa$chains >= 500
genaa <- genaa[i500, ]
i500 <- genome_AA$chains >= 500
genome_AA <- genome_AA[i500, ]
taxonomy <- taxonomy[i500, ]
# Normalize by number of proteins
genaa[, 5:25] <- genaa[, 5:25] / genaa$chains
genome_AA[, 5:25] <- genome_AA[, 5:25] / genome_AA$chains

# Loop over taxonomic ranks
ranks <- c("domain", "phylum", "class", "order", "family", "genus")
Expand All @@ -30,7 +30,7 @@ taxon_AA <- function() {
for(i in 1:length(utaxa)) {
# Sum amino acid compositions for all genomes in this taxon
itaxa <- taxa == utaxa[i]
aa[i, 5:25] <- aasum(genaa[itaxa, ])[, 5:25]
aa[i, 5:25] <- aasum(genome_AA[itaxa, ])[, 5:25]
}
# Normalize by number of genomes (put the number in 'ref' column)
aa$ref <- aa$chains
Expand Down
16 changes: 11 additions & 5 deletions inst/extdata/GTDB/taxonomy.R
Original file line number Diff line number Diff line change
@@ -1,22 +1,28 @@
# Make taxonomy file for genomes 20231229
# Make taxonomy file for genomes in GTDB 20231229
taxonomy <- function() {

# Read the summed amino acid compositions of all proteins for each genome
genaa <- read.csv("genome_AA.csv")
# The protein_faa_reps directory was found in this archive:
# https://data.gtdb.ecogenomic.org/releases/release207/207.0/genomic_files_reps/gtdb_proteins_aa_reps_r207.tar.gz
bacfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/bacteria/", full.names = TRUE)
arcfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/archaea/", full.names = TRUE)
files <- c(bacfiles, arcfiles)
# Get full genome names (with version suffix .1, .2, etc.)
genome <- unlist(strsplit(basename(files), "_protein.faa"))

# Read the GTDB taxonomy
bactax <- read.table("207.0/bac120_taxonomy_r207.tsv.gz", sep = "\t")
arctax <- read.table("207.0/ar53_taxonomy_r207.tsv.gz", sep = "\t")
GTDBtax <- rbind(bactax, arctax)

# Match genomes to taxonomy
itax <- match(genaa$organism, GTDBtax[, 1])
itax <- match(genome, GTDBtax[, 1])
myGTDBtax <- GTDBtax[itax, ]
# Get taxon names
names <- strsplit(myGTDBtax[, 2], ";")
# Remove d__, p__, c__, o__, f__, g__, s__ labels
names <- lapply(names, function(x) gsub("^.__", "", x))
taxonomy <- data.frame(
genome = genaa$organism,
genome = genome,
domain = sapply(names, "[", 1),
phylum = sapply(names, "[", 2),
class = sapply(names, "[", 3),
Expand Down
1 change: 1 addition & 0 deletions inst/extdata/RefSeq/taxon_AA.R
Original file line number Diff line number Diff line change
Expand Up @@ -108,4 +108,5 @@ taxon_AA <- function(ranks = c("genus", "family", "order", "class", "phylum", "s
# Replace NA parent with ""
out$abbrv[is.na(out$abbrv)] <- ""
write.csv(out, "taxon_AA.csv", row.names = FALSE, quote = FALSE)

}

0 comments on commit 059c5e9

Please sign in to comment.