-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Normalize file names for RefSeq and GTDB
- Loading branch information
Showing
14 changed files
with
105 additions
and
120 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# Run R scripts to produce output files in this order: | ||
# 1. genome_AA.R --> genome_AA.csv (hosted in https://github.com/jedick/JMDplots because of its size) | ||
# 2. taxonomy.R --> taxonomy.csv (hosted in https://github.com/jedick/JMDplots because of its size) | ||
# 3. taxon_AA.R --> taxon_AA.csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
# Read and sum amino acid composition of all proteins for each genome 20221022
#
# Lists the per-genome protein FASTA files in the bacteria and archaea
# subdirectories of 207.0/genomic_files_reps/protein_faa_reps/ (relative to
# the working directory), reads each file with read.fasta(), sums it with
# aasum(), and writes one row per genome to genome_AA.csv.
#
# Takes no arguments and is called for its side effect (writing genome_AA.csv).
# Stops with an error if no FASTA files are found.
# NOTE(review): read.fasta() and aasum() are not defined in this file; they
# presumably come from an attached package (CHNOSZ/canprot?) -- confirm.
genome_AA <- function() {

  # The protein_faa_reps directory was found in this archive:
  # https://data.gtdb.ecogenomic.org/releases/release207/207.0/genomic_files_reps/gtdb_proteins_aa_reps_r207.tar.gz
  bacfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/bacteria/", full.names = TRUE)
  arcfiles <- dir("207.0/genomic_files_reps/protein_faa_reps/archaea/", full.names = TRUE)
  files <- c(bacfiles, arcfiles)

  # dir() returns character(0) for a missing or empty directory; fail loudly
  # instead of writing a meaningless CSV
  if (length(files) == 0) {
    stop(
      "No FASTA files found under 207.0/genomic_files_reps/protein_faa_reps/",
      call. = FALSE
    )
  }

  # Loop over FASTA files (one for each genome)
  aa <- lapply(seq_along(files), function(i) {
    # Print progress message
    if (i %% 100 == 0) print(i)
    # Read amino acid composition, then sum it over all proteins in the file
    aasum(suppressMessages(read.fasta(files[i])))
  })
  aa <- do.call(rbind, aa)

  # Put in full genome names (with version suffix .1, .2, etc.)
  # sub() with an escaped dot and end anchor removes only a literal trailing
  # "_protein.faa" and always yields exactly one name per file, so the names
  # stay aligned with the rows of aa
  genome <- sub("_protein\\.faa$", "", basename(files))
  aa$organism <- genome

  # Save result
  write.csv(aa, "genome_AA.csv", row.names = FALSE, quote = FALSE)

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Make taxonomy file for genomes 20231229
#
# Reads genome_AA.csv (for its `organism` column) and the GTDB r207 bacterial
# and archaeal taxonomy tables from the 207.0/ directory, looks up each genome
# in the first taxonomy column, splits the semicolon-separated lineage in the
# second column into seven ranks, and writes taxonomy.csv with the columns
# genome, domain, phylum, class, order, family, genus, species.
#
# Takes no arguments and is called for its side effect (writing taxonomy.csv).
# Genomes that are not found in the taxonomy tables are kept, with NA for
# every rank, and a warning reports how many there were.
taxonomy <- function() {

  # Read the summed amino acid compositions of all proteins for each genome
  genaa <- read.csv("genome_AA.csv")
  # Read the GTDB taxonomy
  # Quote and comment processing are turned off so that a taxon name
  # containing ', " or # is read literally instead of corrupting the parse
  read_tax <- function(file) {
    read.table(
      file, sep = "\t", quote = "", comment.char = "",
      stringsAsFactors = FALSE
    )
  }
  bactax <- read_tax("207.0/bac120_taxonomy_r207.tsv.gz")
  arctax <- read_tax("207.0/ar53_taxonomy_r207.tsv.gz")
  GTDBtax <- rbind(bactax, arctax)

  # Match genomes to taxonomy
  itax <- match(genaa$organism, GTDBtax[, 1])
  unmatched <- is.na(itax)
  if (any(unmatched)) {
    warning(
      sum(unmatched), " genome(s) in genome_AA.csv not found in the GTDB ",
      "taxonomy; their ranks are written as NA",
      call. = FALSE
    )
  }
  myGTDBtax <- GTDBtax[itax, ]

  # Get taxon names
  lineages <- strsplit(myGTDBtax[, 2], ";", fixed = TRUE)
  # Remove d__, p__, c__, o__, f__, g__, s__ labels
  lineages <- lapply(lineages, function(x) sub("^.__", "", x))

  # Extract each rank by position; vapply() guarantees a character vector
  # even when there are no genomes or a lineage is missing
  rank_names <- c("domain", "phylum", "class", "order", "family", "genus", "species")
  ranks <- lapply(seq_along(rank_names), function(k) {
    vapply(lineages, function(x) x[k], character(1))
  })
  ranks <- setNames(ranks, rank_names)

  tax_table <- data.frame(genome = genaa$organism, ranks, stringsAsFactors = FALSE)
  write.csv(tax_table, "taxonomy.csv", row.names = FALSE, quote = FALSE)

}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters