lgatto · lgatto · Apr 29, 2017 · Apr 28, 2017 · Apr 28, 2017 · Apr 28, 2017
diff --git a/NEWS.md b/NEWS.md
@@ -3,6 +3,8 @@
 ## Changes in version 2.3.1
 
 - Introduce "NTR" method for `combineFeatures` <2017-04-26 Wed>.
+- Rewrite `nQuants` and `featureCV` to avoid returning of rows for empty
+    factors; see PR #208 for details <2017-04-28 Fri>.
 
 ## Changes in version 2.3.0
 

diff --git a/R/functions-MSnSet.R b/R/functions-MSnSet.R
@@ -81,12 +81,10 @@ featureCV <- function(x, groupBy, na.rm = TRUE,
   if (norm != "none")
     x <- normalise(x, method = norm)
 
-  ans <- utils.applyColumnwiseByGroup(exprs(x), groupBy=groupBy,
-                                      FUN=function(y, ...) {
-                                        utils.colSd(y, ...)/
-                                          colMeans(y, ...)}, na.rm=na.rm)
-  colnames(ans) <- paste("CV", colnames(ans), sep = ".")
-  ans
+  cv <- rowsd(exprs(x), group=groupBy, reorder=TRUE, na.rm=na.rm) /
+          rowmean(exprs(x), group=groupBy, reorder=TRUE, na.rm=na.rm)
+  colnames(cv) <- paste("CV", colnames(cv), sep=".")
+  cv
 }
 
 updateFvarLabels <- function(object, label, sep = ".") {
@@ -158,11 +156,9 @@ nQuants <- function(x, groupBy) {
   if (class(x) != "MSnSet")
     stop("'x' must be of class 'MSnSet'.")
 
-  ans <- utils.applyColumnwiseByGroup(exprs(x), groupBy=groupBy,
-                                      FUN=function(y) {
-                                        nrow(y)-colSums(is.na(y))})
-  colnames(ans) <- sampleNames(x)
-  ans
+  e <- !is.na(exprs(x))
+  mode(e) <- "numeric"
+  rowsum(e, group=groupBy, reorder=TRUE)
 }
 
 ##' Subsets \code{MSnSet} instances to their common feature names.

diff --git a/R/utils.R b/R/utils.R
@@ -877,54 +877,50 @@ compareMSnSets <- function(x, y, qual = FALSE, proc = FALSE) {
     all.equal(x, y)
 }
 
-##' Similar to colMeans but calculates the sd. Should be identical to
-##' apply(x, 2, sd, na.rm).
-##' based on: http://stackoverflow.com/questions/17549762/is-there-such-colsd-in-r/17551600#17551600
-##' @title colSd
-##' @param x matrix/data.frame
+##' Similar to rowsum but calculates the mean. It is slower than colMeans but
+##' supports grouping variables. See ?rowsum for details.
+##' @param x matrix
+##' @param group a vector/factor of grouping
+##' @param reorder if TRUE the rows are ordered by `sort(unique(group))`
 ##' @param na.rm logical. Should missing values (including ‘NaN’) be omitted
-##' from the calculations?
-##' @return double
+##' @return matrix
 ##' @author Sebastian Gibb <mail@@sebastiangibb.de>
 ##' @noRd
-utils.colSd <- function(x, na.rm = TRUE) {
+rowmean <- function(x, group, reorder=FALSE, na.rm=FALSE) {
   if (na.rm) {
-    n <- colSums(!is.na(x))
+    nna <- !is.na(x)
+    mode(nna) <- "numeric"
   } else {
-    n <- nrow(x)
+    nna <- x
+    nna[] <- 1
   }
-  colVar <- colMeans(x*x, na.rm = na.rm) - (colMeans(x, na.rm = na.rm))^2L
-  sqrt(colVar * n/(n - 1L))
+  nna <- rowsum(nna, group=group, reorder=reorder, na.rm=na.rm)
+  rs <- rowsum(x, group=group, reorder=reorder, na.rm=na.rm)
+  rs/nna
 }
 
-##' Apply a function groupwise. Similar to tapply but takes a matrix as input
-##' and preserve its structure and order.
-##' @title applyColumnwiseByGroup
+##' Similar to rowsum but calculates the sd.
+##' See ?rowsum for details.
 ##' @param x matrix
-##' @param groupBy factor/grouping index
-##' @param FUN function to be applied; must work on columns, e.g. colSums
-##' @param ... further arguments to FUN
-##' @return modified matrix
+##' @param group a vector/factor of grouping
+##' @param reorder if TRUE the rows are ordered by `sort(unique(group))`
+##' @param na.rm logical. Should missing values (including ‘NaN’) be omitted
+##' @return matrix
 ##' @author Sebastian Gibb <mail@@sebastiangibb.de>
 ##' @noRd
-utils.applyColumnwiseByGroup <- function(x, groupBy, FUN, ...) {
-  if (!is.matrix(x)) {
-    stop("x has to be a matrix!")
-  }
-
-  groupBy <- as.factor(groupBy)
-  FUN <- match.fun(FUN)
-
-  j <- split(1L:nrow(x), groupBy)
-  ans <- matrix(NA_real_, nrow = nlevels(groupBy), ncol = ncol(x),
-                dimnames = list(levels(groupBy), colnames(x)))
-
-  for (i in seq(along = j)) {
-    subexprs <- x[j[[i]], , drop = FALSE]
-    ans[i, ] <- do.call(FUN, list(subexprs, ...))
+rowsd <- function(x, group, reorder=FALSE, na.rm=FALSE) {
+  if (na.rm) {
+    nna <- !is.na(x)
+    mode(nna) <- "numeric"
+  } else {
+    nna <- x
+    nna[] <- 1
   }
-
-  ans
+  nna <- rowsum(nna, group=group, reorder=reorder, na.rm=na.rm)
+  nna[nna == 1] <- NA_real_            # return NA if n == 1 (similar to sd)
+  var <- rowmean(x*x, group=group, reorder=reorder, na.rm=na.rm) -
+    rowmean(x, group=group, reorder=reorder, na.rm=na.rm)^2L
+  sqrt(var * nna/(nna - 1L))
 }
 
 setMethod("trimws", "data.frame",

diff --git a/tests/testthat/test_MSnSet.R b/tests/testthat/test_MSnSet.R
@@ -212,6 +212,66 @@ test_that("Transpose and subset", {
     expect_identical(dim(bb), c(2L, 2L))
 })
 
+test_that("nQuants", {
+    m <- new("MSnSet",
+             exprs=matrix(1:10, nrow=5, ncol=2),
+             featureData=new("AnnotatedDataFrame",
+                             data=data.frame(accession=
+                                             factor(c("A", "A", "A", "B", "B")))))
+    qu <- matrix(3:2, nrow=2, ncol=2, dimnames=list(c("A", "B"), 1:2))
+    expect_equal(nQuants(m, group=fData(m)$accession), qu)
+
+    ## more levels than items present in the factor
+    m@featureData <- new("AnnotatedDataFrame",
+                         data=data.frame(accession=
+                                         factor(c("A", "A", "A", "B", "B"),
+                                                    levels=LETTERS[1:10])))
+    expect_equal(nQuants(m, group=fData(m)$accession), qu)
+
+    ## real world example
+    data(msnset)
+    pa <- fData(msnset)$ProteinAccession
+    upa <- unique(pa)
+    qu <- matrix(1, nrow=length(upa), ncol=ncol(msnset),
+                 dimnames=list(upa[order(upa)], sampleNames(msnset)))
+    qu[c("ECA0435", "ECA0469", "ECA3349", "ECA3566", "ECA4026"),] <- 2
+    qu["BSA",] <- 3
+    qu["ENO",] <- c(4, 4, 3, 4)
+    qu["ECA4514",] <- 6
+    expect_equal(nQuants(msnset, pa), qu)
+
+    ## more levels than items present in the factor
+    qu <- matrix(1, nrow=10, ncol=4,
+                 dimnames=list(as.character(pa[order(pa[1:10])]),
+                                            sampleNames(msnset)))
+    expect_equal(nQuants(msnset[1:10], pa[1:10]), qu)
+})
+
+test_that("featureCV", {
+    m <- new("MSnSet",
+             exprs=matrix(1:10, nrow=5, ncol=2),
+             featureData=new("AnnotatedDataFrame",
+                             data=data.frame(accession=
+                                             factor(c("A", "A", "A", "B", "B")))))
+    cv <- matrix(c(0.5, 1/4.5, 1/7, 1/9.5),
+                 nrow=2, ncol=2, dimnames=list(c("A", "B"), c("CV.1", "CV.2"))
+    expect_equal(featureCV(m, group=fData(m)$accession, norm="none"), cv)
+
+    ## more levels than items present in the factor
+    m@featureData <- new("AnnotatedDataFrame",
+                         data=data.frame(accession=
+                                         factor(c("A", "A", "A", "B", "B"),
+                                                    levels=LETTERS[1:10])))
+    expect_equal(featureCV(m, group=fData(m)$accession, norm="none"), cv)
+
+    i <- list(1:3, 4:5, 6:8, 9:10)
+    div <- rowSums(exprs(m))
+    cv <- matrix(sapply(i, function(ii) {
+                   sd((exprs(m)/div)[ii]/mean((exprs(m)/div)[ii])) }),
+                 nrow=2, ncol=2, dimnames=list(c("A", "B"), c("CV.1", "CV.2")))
+    expect_equal(featureCV(m, group=fData(m)$accession, norm="sum"), cv)
+})
+
 context("MSnSet identification data")
 
 test_that("addIdentificationData", {

diff --git a/tests/testthat/test_utils.R b/tests/testthat/test_utils.R
@@ -162,26 +162,42 @@ test_that("formatRt", {
     expect_warning(is.na(formatRt(TRUE)))
 })
 
-test_that("colSd", {
-    set.seed(1)
-    m <- matrix(rnorm(10), ncol=2)
+test_that("rowmean", {
+    m <- matrix(1:10, ncol=2)
     mna <- m
-    mna[c(1, 8)] <- NA
-    expect_equal(MSnbase:::utils.colSd(m), apply(m, 2, sd))
-    expect_equal(MSnbase:::utils.colSd(mna, na.rm=TRUE),
-                 apply(mna, 2, sd, na.rm=TRUE))
+    mna[c(2, 8)] <- NA
+    group <- c("B", "B", "A", "A", "B")
+    r <- matrix(c(8/3, 3.5, 23/3, 8.5), ncol=2, dimnames=list(c("B", "A"), c()))
+    rna <- r
+    rna[c(1, 4)] <- NA
+    rnarm <- matrix(c(3, 3.5, 23/3, 9), ncol=2, dimnames=list(c("B", "A"), c()))
+    expect_error(MSnbase:::rowmean(m, group=1:2))
+    expect_equal(MSnbase:::rowmean(m, group=group), r)
+    expect_equal(MSnbase:::rowmean(m, group=group, reorder=TRUE), r[2:1,])
+    expect_equal(MSnbase:::rowmean(mna, group=group), rna)
+    expect_equal(MSnbase:::rowmean(mna, group=group, reorder=TRUE), rna[2:1,])
+    expect_equal(MSnbase:::rowmean(mna, group=group, na.rm=TRUE), rnarm)
 })
 
-test_that("applyColumnwiseByGroup", {
-    m <- matrix(1:20, nrow=4, byrow=TRUE,
-                dimnames=list(1:4, LETTERS[1:5]))
-    r <- matrix(c(seq(7, 15, by=2), seq(27, 35, by=2)), nrow=2, byrow=TRUE,
-                dimnames=list(1:2, LETTERS[1:5]))
-    expect_error(MSnbase:::utils.applyColumnwiseByGroup(1:10, 1:2, sum),
-                 "x has to be a matrix")
-    expect_equal(MSnbase:::utils.applyColumnwiseByGroup(m,
-                                                        rep(1:2, each=2),
-                                                        colSums), r)
+test_that("rowsd", {
+    m <- matrix(1:10, ncol=2)
+    mna <- m
+    mna[c(2, 8)] <- NA
+    group <- c("B", "B", "A", "A", "B")
+    r <- matrix(c(sd(m[c(1, 2, 5), 1]), sd(m[3:4, 1]),
+                  sd(m[c(1, 2, 5), 2]), sd(m[3:4, 2])),
+                  ncol=2, dimnames=list(c("B", "A"), c()))
+    rna <- r
+    rna[c(1, 4)] <- NA
+    rnarm <- matrix(c(sd(mna[c(1, 2, 5), 1], na.rm=TRUE), r[2],
+                      r[3], sd(mna[3:4, 2], na.rm=TRUE)),
+          ncol=2, dimnames=list(c("B", "A"), c()))
+    expect_error(MSnbase:::rowsd(m, group=1:2))
+    expect_equal(MSnbase:::rowsd(m, group=group), r)
+    expect_equal(MSnbase:::rowsd(m, group=group, reorder=TRUE), r[2:1,])
+    expect_equal(MSnbase:::rowsd(mna, group=group), rna)
+    expect_equal(MSnbase:::rowsd(mna, group=group, reorder=TRUE), rna[2:1,])
+    expect_equal(MSnbase:::rowsd(mna, group=group, na.rm=TRUE), rnarm)
 })
 
 test_that("get.amino.acids", {