v0.99.24 added plot_sample_network, plot_richness_estimates

* Advanced network/graph plotting/visualization wrappers: make_sample_network(), plot_sample_network() * Advanced alpha diversity wrappers: plot_richness_estimates(), estimate_richness() * Added taxafilter() function for even more convenient filtering of species/taxa * Added reshape dependency for explicit use of melt() function.
joey711 · Mar 1, 2012 · 0ef911b · 0ef911b
1 parent 60b96bd
commit 0ef911b
Show file tree

Hide file tree

Showing 17 changed files with 965 additions and 159 deletions.
diff --git a/.DS_Store b/.DS_Store
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: phyloseq
-Version: 0.99.23
-Date: 2012-02-21
+Version: 0.99.24
+Date: 2012-02-29
 Title: Handling and analysis of high-throughput
     phylogenetic sequence data.
 Description: phyloseq is a set of classes, wrappers, and
@@ -15,6 +15,7 @@ Imports:
     vegan (>= 2.0),
     foreach (>= 1.3),
     plyr (>= 1.7),
+    reshape (>= 0.8.4),
     RJSONIO (>= 0.98)
 Depends:
     R (>= 2.14.0),
@@ -52,3 +53,4 @@ Collate:
     'assignment-methods.r'
     'sampleData-class.R'
     'extend_vegan.R'
+    'network-methods.R'
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,7 +3,9 @@ export("sampleData<-")
 export("taxTab<-")
 export(access)
 export(calcplot)
+export(estimate_richness)
 export(export_env_file)
+export(filter_taxa)
 export(filterfunSample)
 export(getslots.phyloseq)
 export(getTaxa)
@@ -16,10 +18,13 @@ export(import_pyrotagger_tab)
 export(import_qiime)
 export(import_RDP_cluster)
 export(import)
-export(makenetwork)
+export(make_sample_network)
 export(merge_phyloseq)
 export(phyloseq)
 export(plot_ordination_phyloseq)
+export(plot_richness_estimates)
+export(plot_sample_network)
+export(plot_taxa_bar)
 export(plot_tree_phyloseq)
 export(rank.names)
 export(rm_outlierf)
@@ -82,10 +87,18 @@ exportMethods(tre)
 exportMethods(UniFrac)
 exportMethods(vegdist)
 import(foreach)
-import(igraph)
 import(vegan)
+importFrom(igraph,degree)
+importFrom(igraph,delete.vertices)
+importFrom(igraph,get.edgelist)
+importFrom(igraph,graph.adjacency)
+importFrom(igraph,layout.fruchterman.reingold)
+importFrom(igraph,V)
 importFrom(multtest,mt.maxT)
 importFrom(multtest,mt.minP)
 importFrom(plyr,laply)
 importFrom(plyr,ldply)
+importFrom(reshape,melt)
 importFrom(RJSONIO,fromJSON)
+importFrom(vegan,diversity)
+importFrom(vegan,estimateR)
diff --git a/R/extend_vegan.R b/R/extend_vegan.R
@@ -41,4 +41,77 @@ setMethod("vegdist", "phyloseq", function(x, method = "bray", binary = FALSE,
 	x <- otuTable(x)
 	vegdist(x, method, binary, diag, upper, na.rm, ...)	
 })
+################################################################################
+################################################################################
+#' Summarize richness estimates
+#'
+#' Performs a number of standard richness estimates, and returns the results
+#' as a \code{data.frame}. Can operate on the cumulative population of all
+#' samples in the dataset, or by repeating the richness estimates for each
+#' sample individually.
+#' NOTE: You must use untrimmed datasets
+#' for meaningful results, as these estimates (and even the ``observed'' richness)
+#' are highly dependent on the number of singletons. You can always trim the data
+#' later on if needed, just not before using this function.
+#'
+#' @usage estimate_richness(physeq, split=TRUE)
+#' 
+#' @param physeq (Required). \code{\link{phyloseq-class}}, or alternatively, 
+#'  an \code{\link{otuTable-class}}. The data about which you want to estimate
+#'  the richness.
+#'
+#' @param split (Optional). Logical. Should a separate set of richness estimates
+#'  be performed for each sample? Or alternatively, pool all samples and 
+#'  estimate richness of the entire set.
+#'
+#' @return A \code{data.frame} of the richness estimates, and their standard error.
+#' 
+#' @seealso 
+#'  Check out the custom plotting function, \code{\link{plot_richness_estimates}},
+#'  for easily showing the results of different estimates, with method-specific
+#'  error-bars. Also check out the internal functions borrowed from the \code{vegan}
+#'  package:
+#'  \code{\link[vegan]{estimateR}},
+#'  \code{\link[vegan]{diversity}}
+#'
+#' @importFrom vegan estimateR
+#' @importFrom vegan diversity
+#' @export
+#' @examples 
+#'  data(GlobalPatterns)
+#'  ( S.GP <- estimate_richness(GlobalPatterns) )
+#'  # # Make the plots
+#'  # plot_richness_estimates(GlobalPatterns, "SampleType")
+#'  # plot_richness_estimates(GlobalPatterns, "SampleType", "SampleType")
+#'  # For more plotting examples, see plot_richness_estimates()
+#' 
+estimate_richness <- function(physeq, split=TRUE){
+	# Check for singletons, and then warning if they are missing.
+	# These metrics only really meaningful if singletons are included.
+	if( !any(otuTable(physeq)==1) ){
+		warning("The experiment object you have provided does not have\n",
+		"any singletons. This is highly suspicious. Results of richness\n",
+		"estimates are probably unreliable, or wrong, if you have already\n",
+		"trimmed low-abundance taxa from the data.\n",
+		"\n",
+		"It is recommended that you find the un-trimmed data and retry.",
+		)
+	}
+
+	# If we are not splitting sample-wise, sum the species. Else, enforce orientation.
+	if( !split ){
+		OTU <- speciesSums(physeq)		
+	} else if( split ){
+		OTU <- as(otuTable(physeq), "matrix")
+		if( speciesAreRows(physeq) ){ OTU <- t(OTU) }
+	}
+
+	# Some standard richness parameters
+	richness <- round(estimateR(OTU))
+	shannon	 <- round(diversity( OTU ), 2)
+	simpson  <- round(diversity( OTU, index="simpson"), 2)
+	# # fisher   <- round(fisher.alpha( OTU))
+
+	return( t(rbind(richness, shannon, simpson)) )
+}
 ################################################################################
diff --git a/R/network-methods.R b/R/network-methods.R
@@ -0,0 +1,139 @@
+################################################################################
+#' Make sample-wise microbiome network (igraph)
+#'
+#' A specialized function for creating graphical models of microbiome samples
+#' based on a user-defined ecological distance and threshold.
+#' The graph is ultimately built with tools from the 
+#' \code{\link[igraph]{igraph}}-package.
+#'
+#' @usage make_sample_network(physeq, keep.isolates=FALSE, threshold = 0, 
+#'    max.dist = 0.4, dist.fun = function(x) vegdist(x, "jaccard"), presence.absence=FALSE)
+#'
+#' @param physeq (Required). Default \code{NULL}. 
+#'  A \code{\link{phyloseq-class}} object on which \code{g} is based.
+#'
+#' @param keep.isolates (Optional). Default \code{FALSE}. Logical.
+#'  Whether to keep isolates (un-connected samples, not microbial isolates)
+#'  in the graphical model that is returned. Default results in isolates
+#'  being removed from the object.
+#' 
+#' @param threshold (Optional). Default \code{1}.
+#'  The minimum number of individuals of each taxa across all
+#'  samples. Taxa below this value are ignored during network construction.
+#'  Note that this is extremely simple filtering. More advanced filtering
+#'  can be performed prior to this function using the 
+#'  \code{\link{filterfunSample}} or \code{\link{filter_taxa}} functions.
+#' 
+#' @param max.dist (Optional). Default \code{0.4}. 
+#'  The maximum ecological distance (as defined by \code{dist.fun})
+#'  allowed between two samples to still consider them ``connected'' 
+#'  by an edge in the graphical model.
+#' 
+#' @param dist.fun (Optional). Default \code{function(x) vegdist(x, "jaccard")}.
+#'  A function that takes a \code{\link{phyloseq-class}} object
+#'  (or \code{\link{otuTable-class}}) and returns a sample-wise distance
+#'  matrix. Currently, any method in \code{\link{vegdist}} is supported,
+#'  as well as \code{\link{UniFrac}} (although this may take some time to run
+#'  you probably want to run this separately, save, and then provide the
+#'  distance-object directly. See below for alternatives).
+#'  Most ecological distance functions that take a matrix in \code{vegan-package}
+#'  format (samples are rows) are supported. 
+#'
+#'  Alternatively, if you have already calculated the sample-wise distance
+#'  object, this can be provided as \code{dist.fun} instead (see examples).
+#'  A third alternative is to provide a character string indicating the
+#'  desired method of distance calculation. For the moment, this is limited
+#'  to the distance methods available in \code{\link{vegdist}}.
+#' 
+#' @param presence.absence (Optional). Default \code{FALSE}.
+#'  Whether the abundances values should be transformed to binary 
+#'  presence-absence values prior to distance calculation and network
+#'  construction.
+#'
+#' @return A \code{\link[igraph]{igraph}}-class object. 
+#' 
+#' @seealso 
+#'  \code{\link{plot_sample_network}}
+#'
+#' @importFrom igraph graph.adjacency
+#' @importFrom igraph V
+#' @importFrom igraph delete.vertices
+#' @importFrom igraph degree
+#'
+#' @export
+#'
+#' @examples
+#' # # Example plots with Enterotype Dataset
+#' data(enterotype)
+#' 
+#' ig <- make_sample_network(enterotype, FALSE, max.dist=0.3)
+#' plot_sample_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+#' 
+#' # ig <- make_sample_network(enterotype, FALSE, max.dist=0.2)
+#' # plot_sample_network(ig, enterotype, color="SeqTech", shape="Enterotype", line_weight=0.3, label=NULL)
+#' 
+#' # # Three methods of choosing/providing distance/distance-method
+#' # Provide method name available to vegdist
+#' ig <- make_sample_network(enterotype, FALSE, max.dist=0.3, dist.fun="jaccard")
+#' # Provide distance object, already computed
+#' ih <- make_sample_network(enterotype, FALSE, max.dist=0.3, dist.fun=vegdist(enterotype, "jaccard"))
+#' # Provide explicit function. Can be custom function.
+#' ii <- make_sample_network(enterotype, FALSE, max.dist=0.3, dist.fun=function(x){vegdist(x, "jaccard")})
+#' # The have equal results:		
+#' all.equal(ig, ih)
+#' all.equal(ig, ii)
+#' #
+make_sample_network <- function(physeq, keep.isolates=FALSE, threshold = 0, 
+    max.dist = 0.4, dist.fun = function(x) vegdist(x, "jaccard"), presence.absence=FALSE){
+
+    abund <- otuTable(physeq)
+    if (speciesAreRows(abund)) {
+        abund <- t(abund)
+    }
+
+    # Keep only those taxa (columns) that are above the threshold
+    abundance <- abund[, colSums(abund) > threshold]
+
+    if( presence.absence ){
+	    # Convert to presence/absence matrix
+	    abundance <- (abundance > threshold) - 0
+    }
+
+    # Calculate or asign sample-wise distance matrix
+    if( class(dist.fun) == "dist" ){
+    	# If dist.fun a distance object, use it rather than re-calculate
+    	sample.dist <- dist.fun
+    	if( attributes(sample.dist)$Size != nsamples(physeq) ){
+    		stop("nsamples(physeq) does not match size of dist object in dist.fun")
+    	}
+    	if( !setequal(attributes(sample.dist)$Labels, sample.names(physeq)) ){
+    		stop("sample.names does not exactly match dist-indices")
+    	}
+    } else if( class(dist.fun) == "character" ){
+	    sample.dist <- vegdist(abundance, method=dist.fun)
+    } else {
+	    sample.dist <- dist.fun(abundance)    	
+    }
+
+    # coerce distance-matrix back into vanilla matrix, Sample Distance Matrix, SaDiMa
+    SaDiMa  <- as.matrix(sample.dist)
+
+    # Add Inf to the diagonal to avoid self-connecting edges (inefficient)
+    SaDiMa <- SaDiMa + diag(Inf, nsamples(physeq), nsamples(physeq))
+
+    # Convert distance matrix to coincidence matrix, CoMa, using max.dist
+	CoMa <- SaDiMa < max.dist
+
+    # Calculate the igraph-formatted network
+    ig <- graph.adjacency(CoMa, mode="lower")
+
+    # If keeping isolates, done. Else, remove them, then return igraph.
+    if( keep.isolates ){
+    	return(ig)
+    } else {
+	    isolates   <- V(ig)[degree(ig) == 0]
+	    ig.no.isol <- delete.vertices(ig, V(ig)[degree(ig) == 0])
+	    return(ig.no.isol)
+    }
+}
+################################################################################