Skip to content

Commit

Permalink
bugfixes in SQMtools
Browse files Browse the repository at this point in the history
  • Loading branch information
fpusan committed Jul 10, 2019
1 parent fe7883d commit 1e3da24
Show file tree
Hide file tree
Showing 27 changed files with 106 additions and 99 deletions.
Binary file modified SQMtools_manual_v0.3.3.pdf
Binary file not shown.
4 changes: 2 additions & 2 deletions lib/SQMtools/R/Hadza.R
Expand Up @@ -20,7 +20,7 @@
#'
#' @examples
#' data(Hadza)
#' plotTaxonomy(Hadza, 'genus', rescale=T)
#' plotFunctions(Hadza, 'COG')
#' plotTaxonomy(Hadza, "genus", rescale=T)
#' plotFunctions(Hadza, "COG")
#'
"Hadza"
5 changes: 3 additions & 2 deletions lib/SQMtools/R/RecA.R
Expand Up @@ -23,6 +23,7 @@
#' data(RecA)
#' ### Let's calculate the average copy number of each function in our samples.
#' # We do it for COG annotations here, but we could also do it for KEGG or PFAMs.
#' COG.coverage = SQMtools::aggregate.fun(Hadza, 'COG', trusted_functions_only=T, ignore_unclassified_functions=F)$cov
#' COG.copynumber = t(t(COG.coverage) / COG.coverage[RecA,]) # Sample-wise division by RecA tpm.
#' COG.coverage = SQMtools:::aggregate.fun(Hadza, "COG", trusted_functions_only=T,
#' ignore_unclassified_functions=F)$cov
#' COG.copynumber = t(t(COG.coverage) / COG.coverage[RecA,]) # Sample-wise division by RecA coverage.
"RecA"
13 changes: 8 additions & 5 deletions lib/SQMtools/R/USiCGs.R
Expand Up @@ -27,13 +27,16 @@
#' KEGG.tpm = Hadza$functions$KEGG$tpm
#' all(USiCGs %in% rownames(KEGG.tpm)) # Are all the USiCGs present in our dataset?
#' # Plot a boxplot of USiCGs tpms and calculate median USiCGs tpm.
#' # This looks weird in the test dataset bc it's only a subset of the metagenomes.
#' # In a set of complete metagenomes USiCGs should have fairly similar TPM averages and low dispersion across samples.
#' boxplot(t(KEGG.tpm[USiCGs,]), names=USiCGs, ylab='TPM', col='slateblue2')
#' # This looks weird in the test dataset because it contains only a small subset of the metagenomes.
#' # In a set of complete metagenomes USiCGs should have fairly similar TPM averages
#' # and low dispersion across samples.
#' boxplot(t(KEGG.tpm[USiCGs,]), names=USiCGs, ylab="TPM", col="slateblue2")
#'
#' ### Now let's calculate the average copy numbers of each function.
#' # We do it for KEGG annotations here, but we could also do it for COGs or PFAMs.
#' KEGG.coverage = SQMtools::aggregate.fun(Hadza, 'KEGG', trusted_functions_only=T, ignore_unclassified_functions=F)$cov
#' KEGG.coverage = SQMtools:::aggregate.fun(Hadza, "KEGG", trusted_functions_only=T,
#' ignore_unclassified_functions=F)$cov
#' USiCGs.cov = apply(KEGG.coverage[USiCGs,], 2, median)
#' KEGG.copynumber = t(t(KEGG.coverage) / USiCGs.cov) # Sample-wise division by the median USiCG coverage.
#' # Sample-wise division by the median USiCG coverage.
#' KEGG.copynumber = t(t(KEGG.coverage) / USiCGs.cov)
"USiCGs"
1 change: 1 addition & 0 deletions lib/SQMtools/R/aggregate_methods.R
Expand Up @@ -67,6 +67,7 @@ aggregate.fun = function(SQM, fun, trusted_functions_only, ignore_unclassified_f
tpm = tpm [rownames(tpm) !='Unclassified',]
coverage = coverage[rownames(coverage)!='Unclassified',]
lengths = lengths[rownames(lengths) !='Unclassified',]
copies = copies[rownames(copies) !='Unclassified',]
}

stopifnot(identical(rownames(abund), rownames(tpm)))
Expand Down
20 changes: 11 additions & 9 deletions lib/SQMtools/R/combineSQM.R
Expand Up @@ -12,16 +12,16 @@
#' @examples
#' data(Hadza)
#' # Select Carbohydrate metabolism ORFs in Bacteroidetes, and Amino acid metabolism ORFs in Proteobacteria
#' bact = subsetTax(Hadza, 'phylum', 'Bacteroidetes <phylum>')
#' bact.carb = subsetFun(bact, 'Carbohydrate metabolism')
#' proteo = subsetTax(Hadza, 'phylum', 'Proteobacteria')
#' proteo.amins = subsetFun(proteo, 'Amino acid metabolism')
#' bact = subsetTax(Hadza, "phylum", "Bacteroidetes")
#' bact.carb = subsetFun(bact, "Carbohydrate metabolism")
#' proteo = subsetTax(Hadza, "phylum", "Proteobacteria")
#' proteo.amins = subsetFun(proteo, "Amino acid metabolism")
#' bact.carb_proteo.amins = combineSQM(bact.carb, proteo.amins, rescale_copy_number=F)
#' @export
combineSQM = function(..., tax_source = 'orfs', trusted_functions_only = F, ignore_unclassified_functions = F, rescale_tpm = T, rescale_copy_number = T)
{
# intermediate function so that we can pass extra args to combineSQM
myFun = function(SQM1, SQM2) combineSQM_(SQM1, SQM2, tax_source, trusted_functions_only, ignore_unclassified_functions, rescale_copy_number)
myFun = function(SQM1, SQM2) combineSQM_(SQM1, SQM2, tax_source, trusted_functions_only, ignore_unclassified_functions, rescale_tpm, rescale_copy_number)
return(Reduce(myFun, list(...)))
}

Expand All @@ -47,8 +47,8 @@ combineSQM_ = function(SQM1, SQM2, tax_source = 'orfs', trusted_functions_only =
combSQM$orfs$tpm = as.matrix(combSQM$orfs$table[,grepl('TPM', colnames(combSQM$orfs$table)),drop=F])
colnames(combSQM$orfs$tpm) = gsub('TPM ', '', colnames(combSQM$orfs$tpm), fixed=T)
# Sequences
combSQM$orfs$sequences = c(combSQM$orfs$sequences, SQM2$orfs$sequences[extraORFs])
combSQM$orfs$sequences = combSQM$orfs$sequences[rownames(combSQM$orfs$table)]
combSQM$orfs$seqs = c(combSQM$orfs$seqs, SQM2$orfs$seqs[extraORFs])
combSQM$orfs$seqs = combSQM$orfs$seqs[rownames(combSQM$orfs$table)]
# Taxonomy
combSQM$orfs$tax = rbind(combSQM$orfs$tax, SQM2$orfs$tax[extraORFs,,drop=F])
combSQM$orfs$tax = combSQM$orfs$tax[rownames(combSQM$orfs$table),,drop=F]
Expand All @@ -62,11 +62,13 @@ combineSQM_ = function(SQM1, SQM2, tax_source = 'orfs', trusted_functions_only =
# Abundances
combSQM$contigs$abund = as.matrix(combSQM$contigs$table[,grepl('Raw read count', colnames(combSQM$contigs$table)),drop=F])
colnames(combSQM$contigs$abund) = gsub('Raw read count ', '', colnames(combSQM$contigs$abund), fixed=T)
combSQM$contigs$cov = as.matrix(combSQM$contigs$table[,grepl('Coverage', colnames(combSQM$contigs$table)),drop=F])
colnames(combSQM$contigs$cov) = gsub('Coverage ', '', colnames(combSQM$contigs$abund), fixed=T)
combSQM$contigs$tpm = as.matrix(combSQM$contigs$table[,grepl('TPM', colnames(combSQM$contigs$table)),drop=F])
colnames(combSQM$contigs$tpm) = gsub('TPM ', '', colnames(combSQM$contigs$tpm), fixed=T)
# Sequences
combSQM$orfs$sequences = c(combSQM$contigs$sequences, SQM2$contigs$sequences[extraContigs])
combSQM$orfs$sequences = combSQM$contigs$sequences[rownames(combSQM$contigs$table)]
combSQM$contigs$seqs = c(combSQM$contigs$seqs, SQM2$contigs$seqs[extraContigs])
combSQM$contigs$seqs = combSQM$contigs$seqs[rownames(combSQM$contigs$table)]
# Taxonomy
combSQM$contigs$tax = rbind(combSQM$contigs$tax, SQM2$contigs$tax[extraContigs,,drop=F])
combSQM$contigs$tax = combSQM$contigs$tax[rownames(combSQM$contigs$table),,drop=F]
Expand Down
8 changes: 4 additions & 4 deletions lib/SQMtools/R/exportTable.R
Expand Up @@ -5,13 +5,13 @@
#' @param output_name character. Name of the output file.
#' @examples
#' data(Hadza)
#' Hadza.iron = subsetFun(Hadza, 'iron')
#' Hadza.iron = subsetFun(Hadza, "iron")
#' # Write the taxonomic distribution at the genus level of all the genes related to iron.
#' exportTable(Hadza.iron$taxa$genus$percent, 'Hadza.ironGenes.genus.tsv')
#' exportTable(Hadza.iron$taxa$genus$percent, "Hadza.ironGenes.genus.tsv")
#' # Now write the distribution of the different iron-related COGs (Clusters of Orthologous Groups) across samples.
#' exportTable(Hadza.iron$functions$COG$tpm, 'Hadza.ironGenes.COG.tsv')
#' exportTable(Hadza.iron$functions$COG$tpm, "Hadza.ironGenes.COG.tsv")
#' # Now write all the information contained in the ORF table.
#' exportTable(Hadza.iron$orfs$table, 'Hadza.ironGenes.orftable.tsv')
#' exportTable(Hadza.iron$orfs$table, "Hadza.ironGenes.orftable.tsv")
#' @export
exportTable = function(table, output_name)
{
Expand Down
15 changes: 6 additions & 9 deletions lib/SQMtools/R/figures.R
Expand Up @@ -15,13 +15,9 @@ require(reshape2)
#' @examples
#' data(Hadza)
#' topPFAM = mostAbundant(Hadza$functions$PFAM$tpm)
#' topPFAM = topPFAM[rownames(topPFAM) != 'Unclassified',] # Take out the Unclassified ORFs.
#' plotHeatmap(topPFAM, label_x = 'Samples', label_y = 'PFAMs', label_fill = 'TPM')
#' topPFAM = topPFAM[rownames(topPFAM) != "Unclassified",] # Take out the Unclassified ORFs.
#' plotHeatmap(topPFAM, label_x = "Samples", label_y = "PFAMs", label_fill = "TPM")
#' @export
#' @examples
#' data(Hadza)
#' phyla_percent = Hadza$taxa$phylum$percent
#' plotHeatmap(phyla_percent, label_y = 'Phylum', label_fill = 'Percentage')
plotHeatmap = function(data, label_x = 'Samples', label_y = 'Features', label_fill = 'Abundance', gradient_col = c('ghostwhite', 'dodgerblue4'), base_size = 11)
{
if (!is.data.frame(data) & !is.matrix(data)) { stop('The first argument must be a matrix or a data frame') }
Expand Down Expand Up @@ -67,9 +63,10 @@ plotHeatmap = function(data, label_x = 'Samples', label_y = 'Features', label_fi
#' @param base_size numeric. Base font size (default \code{11}).
#' @return a ggplot2 plot object.
#' @seealso \code{\link[plotTaxonomy]{plotTaxonomy}} for plotting the most abundant taxa of a SQM object; \code{\link[plotBars]{plotHeatmap}} for plotting a heatmap with arbitrary data; \code{\link[mostAbundant]{mostAbundant}} for selecting the most abundant rows in a dataframe or matrix.
#' @examples
#' data(Hadza)
#' sk = Hadza$taxa$superkingdom$abund
#' plotBars(sk, label_y = 'Raw reads', label_fill = 'Superkingdom')
#' plotBars(sk, label_y = "Raw reads", label_fill = "Superkingdom")
#' @export
plotBars = function(data, label_x = 'Samples', label_y = 'Abundances', label_fill = 'Features', color = NULL, base_size = 11)
{
Expand Down Expand Up @@ -196,9 +193,9 @@ plotFunctions = function(SQM, fun_level = 'KEGG', count = 'tpm', N = 25, fun = c
#' @seealso \code{\link[plotFunctions]{plotFunctions}} for plotting the most abundant functions of a SQM object; \code{\link[plotBars]{plotBars}} and \code{\link[plotBars]{plotHeatmap}} for plotting barplots or heatmaps with arbitrary data.
#' @examples
#' data(Hadza)
#' Hadza.amin = subsetFun(Hadza, 'Amino acid metabolism')
#' Hadza.amin = subsetFun(Hadza, "Amino acid metabolism")
#' # Taxonomic distribution of amino acid metabolism ORFs at the family level.
#' plotTaxonomy(Hadza.amin, 'family')
#' plotTaxonomy(Hadza.amin, "family")
#' @export
plotTaxonomy = function(SQM, rank = 'phylum', count = 'percent', N = 15, tax = NULL, others = T, ignore_unclassified = F, rescale = F, color = NULL, base_size = 11)
{
Expand Down
25 changes: 12 additions & 13 deletions lib/SQMtools/R/loadSQM.R
Expand Up @@ -90,17 +90,18 @@ require(reshape2)
#'
#' data(Hadza)
#' # Which are the ten most abundant KEGG IDs in our data?
#' topKEGG = sort(rowSums(Hadza$functions$KEGG$tpm), decreasing=T)[1:10]
#' topKEGG = sort(rowSums(Hadza$functions$KEGG$tpm), decreasing=T)[1:11]
#' topKEGG = topKEGG[names(topKEGG)!="Unclassified"]
#' # Which functions do those KEGG IDs represent?
#' Hadza$misc$KEGG_names[topKEGG]
#' What is the relative abundance of the Gammaproteobacteria class across samples?
#' Hadza$taxa$class$percent['Gammaproteobacteria',]
#' Hadza$taxa$class$percent["Gammaproteobacteria",]
#' # Which information is stored in the orf, contig and bin tables?
#' colnames(Hadza$orfs$table)
#' colnames(Hadza$contigs$table)
#' colnames(Hadza$bins$table)
#' # What is the GC content distribution of my metagenome?
#' boxplot(Hadza$contigs$table[,'GC perc']) # Not weighted by contig length or abundance!
#' boxplot(Hadza$contigs$table[,"GC perc"]) # Not weighted by contig length or abundance!
#' @export

loadSQM = function(project_path, tax_mode = 'allfilter')
Expand Down Expand Up @@ -195,8 +196,6 @@ loadSQM = function(project_path, tax_mode = 'allfilter')
SQM$contigs$bins = rbind(SQM$contigs$bins, notInBins)
SQM$contigs$bins = SQM$contigs$bins[rownames(SQM$contigs$table),,drop=F]
SQM$contigs$bins[is.na(SQM$contigs$bins)] = 'No_bin'
names(SQM$contigs$bins) = rownames(SQM$contigs$table)


cat('Loading bins\n')
cat(' table...\n')
Expand All @@ -207,7 +206,7 @@ loadSQM = function(project_path, tax_mode = 'allfilter')
SQM$bins$tpm = as.matrix(SQM$bins$table[,grepl('TPM', colnames(SQM$bins$table)),drop=F])
colnames(SQM$bins$tpm) = gsub('TPM ', '', colnames(SQM$bins$tpm), fixed=T)
cat(' taxonomy...\n')
SQM$bins$taxonomy = as.matrix(read.table(sprintf('%s/results/tables/%s.bin.tax.tsv', project_path, project_name),
SQM$bins$tax = as.matrix(read.table(sprintf('%s/results/tables/%s.bin.tax.tsv', project_path, project_name),
header=T, row.names=1, sep='\t'))
cat('Loading taxonomies\n')
SQM$taxa = list()
Expand Down Expand Up @@ -294,15 +293,15 @@ loadSQM = function(project_path, tax_mode = 'allfilter')
SQM$functions = list()
SQM$functions$KEGG = list()
SQM$functions$KEGG$abund = as.matrix(read.table(sprintf('%s/results/tables/%s.KO.abund.tsv', project_path, project_name),
header=T, sep='\t', row.names=1, check.names=F))
header=T, sep='\t', row.names=1, check.names=F, comment.char='', quote=''))
SQM$functions$KEGG$tpm = as.matrix(read.table(sprintf('%s/results/tables/%s.KO.tpm.tsv', project_path, project_name),
header=T, sep='\t', row.names=1, check.names=F))
header=T, sep='\t', row.names=1, check.names=F, comment.char='', quote=''))

SQM$functions$COG = list()
SQM$functions$COG$abund = as.matrix(read.table(sprintf('%s/results/tables/%s.COG.abund.tsv', project_path, project_name),
header=T, sep='\t', row.names=1, check.names=F))
header=T, sep='\t', row.names=1, check.names=F, comment.char='', quote=''))
SQM$functions$COG$tpm = as.matrix(read.table(sprintf('%s/results/tables/%s.COG.tpm.tsv', project_path, project_name),
header=T, sep='\t', row.names=1, check.names=F))
header=T, sep='\t', row.names=1, check.names=F, comment.char='', quote=''))


SQM$functions$PFAM = list()
Expand All @@ -314,11 +313,11 @@ loadSQM = function(project_path, tax_mode = 'allfilter')
if(file.exists(sprintf('%s/results/tables/%s.RecA.tsv', project_path, project_name)))
{
SQM$functions$KEGG$copy_number = as.matrix(read.table(sprintf('%s/results/tables/%s.KO.copyNumber.tsv', project_path, project_name),
header=T, sep='\t', row.names=1, check.names=F))
header=T, sep='\t', row.names=1, check.names=F, comment.char='', quote=''))
SQM$functions$COG$copy_number = as.matrix(read.table(sprintf('%s/results/tables/%s.COG.copyNumber.tsv', project_path, project_name),
header=T, sep='\t', row.names=1, check.names=F))
header=T, sep='\t', row.names=1, check.names=F, comment.char='', quote=''))
SQM$functions$PFAM$copy_number = as.matrix(read.table(sprintf('%s/results/tables/%s.PFAM.copyNumber.tsv', project_path, project_name),
header=T, sep='\t', row.names=1, check.names=F))
header=T, sep='\t', row.names=1, check.names=F, comment.char='', quote=''))
SQM$misc$RecA_cov = unlist (read.table(sprintf('%s/results/tables/%s.RecA.tsv', project_path, project_name),
header=T, row.names=1) ['COG0468',] )
}else
Expand Down
8 changes: 4 additions & 4 deletions lib/SQMtools/R/mostAbundant.R
Expand Up @@ -9,17 +9,17 @@
#' @return A matrix or data frame (same as input) with the selected rows.
#' @examples
#' data(Hadza)
#' Hadza.carb = subsetFun(Hadza, 'Carbohydrate metabolism')
#' Hadza.carb = subsetFun(Hadza, "Carbohydrate metabolism")
#' # Which are the 20 most abundant KEGG functions in the ORFs related to carbohydrate metabolism?
#' topCarb = mostAbundant(Hadza.carb$functions$KEGG$tpm, N=20)
#' # Now print them with nice names
#' rownames(topCarb) = paste(rownames(topCarb), Hadza.carb$misc$KEGG_names[rownames(topCarb)], sep='; ')
#' rownames(topCarb) = paste(rownames(topCarb), Hadza.carb$misc$KEGG_names[rownames(topCarb)], sep="; ")
#' topCarb
#' We can pass this to any R function
#' heatmap(topCarb)
#' But for convenience we provide wrappers for plotting ggplot2 heatmaps and barplots
#' plotHeatmap(topCarb, label_y='TPM')
#' plotBars(topCarb, label_y='TPM')
#' plotHeatmap(topCarb, label_y="TPM")
#' plotBars(topCarb, label_y="TPM")
#' @export
mostAbundant = function(data, N = 10, items = NULL, others = F, rescale = F)
{
Expand Down
18 changes: 9 additions & 9 deletions lib/SQMtools/R/subset_methods.R
Expand Up @@ -13,8 +13,8 @@
#' @return SQM object containing only the requested function.
#' @examples
#' data(Hadza)
#' Hadza.iron = subsetFun(Hadza, 'iron')
#' Hadza.carb = subsetFun(Hadza, 'Carbohydrate metabolism')
#' Hadza.iron = subsetFun(Hadza, "iron")
#' Hadza.carb = subsetFun(Hadza, "Carbohydrate metabolism")
#' @export
subsetFun = function(SQM, fun, ignore_case=T, fixed=F, trusted_functions_only = F, ignore_unclassified_functions = F, rescale_tpm = F, rescale_copy_number = F)
{
Expand Down Expand Up @@ -68,8 +68,8 @@ subsetFun = function(SQM, fun, ignore_case=T, fixed=F, trusted_functions_only =
#' @seealso \code{\link[subsetFun]{subsetFun}}, \code{\link[subsetContigs]{subsetContigs}}, \code{\link[combineSQM]{combineSQM}}. The most abundant items of a particular table contained in a SQM object can be eselected with \code{\link[mostAbundant]{mostAbundant}}.
#' @examples
#' data(Hadza)
#' Hadza.Escherichia = subsetTax(Hadza, 'genus', 'Escherichia')
#' Hadza.Bacteroidetes = subsetTax(Hadza, 'phylum', 'Bacteroidetes')
#' Hadza.Escherichia = subsetTax(Hadza, "genus", "Escherichia")
#' Hadza.Bacteroidetes = subsetTax(Hadza, "phylum", "Bacteroidetes")
#' @export
subsetTax = function(SQM, rank, tax, trusted_functions_only = F, ignore_unclassified_functions = F, rescale_tpm =T, rescale_copy_number = T)
{
Expand Down Expand Up @@ -98,14 +98,14 @@ subsetTax = function(SQM, rank, tax, trusted_functions_only = F, ignore_unclassi
#' @seealso \code{\link[subsetContigs]{subsetContigs}}, \code{\link[subsetORFs]{subsetORFs}}
#' @examples
#' data(Hadza)
#' # Which are the five most complete bin?
#' topBinNames = rownames(Hadza$bins$table)[order(Hadza$bins$table[,'Completeness'], decreasing=T)][1:5]
#' # Which are the two most complete bins?
#' topBinNames = rownames(Hadza$bins$table)[order(Hadza$bins$table[,"Completeness"], decreasing=T)][1:2]
#' topBins = subsetBins(Hadza, topBinNames)
#' @export
subsetBins = function(SQM, bins, trusted_functions_only = F, ignore_unclassified_functions = F, rescale_tpm = T, rescale_copy_number = T)
{
if(!class(SQM)=='SQM') { stop('The first argument must be a SQM object') }
goodContigs = names(SQM$contigs$bins)[SQM$contigs$bins %in% bins]
goodContigs = rownames(SQM$contigs$bins)[SQM$contigs$bins %in% bins]
return ( subsetContigs(SQM, goodContigs,
trusted_functions_only = trusted_functions_only,
ignore_unclassified_functions=ignore_unclassified_functions,
Expand All @@ -129,9 +129,9 @@ subsetBins = function(SQM, bins, trusted_functions_only = F, ignore_unclassified
#' @examples
#' data(Hadza)
#' # Which contigs have a GC content below 40?
#' lowGCcontigNames = rownames(Hadza$contigs$table[Hadza$contigs$table[,'GC perc']<40,])
#' lowGCcontigNames = rownames(Hadza$contigs$table[Hadza$contigs$table[,"GC perc"]<40,])
#' lowGCcontigs = subsetContigs(Hadza, lowGCcontigNames)
#' hist(lowGCcontigs$contigs$table[,'GC perc'])
#' hist(lowGCcontigs$contigs$table[,"GC perc"])
#' @export
subsetContigs = function(SQM, contigs, trusted_functions_only = F, ignore_unclassified_functions = F, rescale_tpm = F, rescale_copy_number = F)
{
Expand Down
Binary file modified lib/SQMtools/data/Hadza.RData
Binary file not shown.
4 changes: 2 additions & 2 deletions lib/SQMtools/man/Hadza.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 1e3da24

Please sign in to comment.