Skip to content
Browse files

Much more efficient aggregation

  • Loading branch information...
1 parent 6dd6bfc commit 33a06563d0d6d28706be3cf4aea23d471e4a8b96 @hadley committed Sep 5, 2010
Showing with 34 additions and 21 deletions.
  1. +15 −16 R/cast.r
  2. +1 −2 R/helper-guess-value.r
  3. +1 −1 README.md
  4. +3 −1 bench/bench.r
  5. +14 −1 inst/tests/test-cast.r
View
31 R/cast.r
@@ -111,24 +111,23 @@ cast <- function(data, formula, fun.aggregate = NULL, ..., subset = NULL, fill =
message("Aggregation function missing: defaulting to length")
fun.aggregate <- length
}
- if (is.null(fill)) {
- fill <- fun.aggregate(value[0])
- }
- value <- tapply(value, overall, fun.aggregate, ...)
- overall <- sort(unique(overall))
- }
-
- # Add in missing values, if necessary
- if (length(overall) < n) {
- overall <- match(seq_len(n), overall, nomatch = NA)
+ ordered <- vaggregate(.value = value, .group = overall,
+ .fun = fun.aggregate, ..., .default = fill, .n = n)
+ overall <- seq_len(n)
+
} else {
- overall <- order(overall)
- }
-
- ordered <- value[overall]
- if (!is.null(fill)) {
- ordered[is.na(ordered)] <- fill
+ # Add in missing values, if necessary
+ if (length(overall) < n) {
+ overall <- match(seq_len(n), overall, nomatch = NA)
+ } else {
+ overall <- order(overall)
+ }
+
+ ordered <- value[overall]
+ if (!is.null(fill)) {
+ ordered[is.na(ordered)] <- fill
+ }
}
list(
View
3 R/helper-guess-value.r
@@ -13,8 +13,7 @@ guess_value <- function(df) {
if ("(all)" %in% names(df)) return("(all)")
last <- names(df)[ncol(df)]
- message("Using ", last, " as value column.", "
- Use the value_var argument to cast to override this choice")
+ message("Using ", last, " as value column: use value_var to override.")
last
}
View
2 README.md
@@ -16,4 +16,4 @@ Compared to `reshape`, `reshape2`:
* supports a new cast syntax which allows you to reshape based on functions
of variables (based on the same underlying syntax as plyr)
-Initial benchmarking has shown `melt` to be up to 10x faster, and `cast` to be up to 100x faster.
+Initial benchmarking has shown `melt` to be up to 10x faster, pure reshaping `cast` up to 100x faster, and aggregating `cast()` up to 10x faster.
View
4 bench/bench.r
@@ -13,4 +13,6 @@ bdm <- subset(bdm, response != 0)
system.time(dcast(bdm, ... ~ question))
# Reshape1:
-# gave up after 40 minutes
+# gave up after 40 minutes
+
+dcast(bdm, question ~ state)
View
15 inst/tests/test-cast.r
@@ -53,12 +53,25 @@ test_that("margins are computed correctly", {
})
-
test_that("missing combinations filled correctly", {
s2am <- subset(s2m, !(X1 == 1 & X2 == 1))
expect_equal(acast(s2am, X1 ~ X2)[1, 1], NA_integer_)
expect_equal(acast(s2am, X1 ~ X2, length)[1, 1], 0)
expect_equal(acast(s2am, X1 ~ X2, length, fill = 1)[1, 1], 1)
+})
+
+test_that("aggregated values computed correctly", {
+ ffm <- melt(french_fries, id = 1:4)
+
+ count_c <- function(vars) as.table(acast(ffm, as.list(vars), length))
+ count_t <- function(vars) table(ffm[vars], useNA = "ifany")
+
+ combs <- matrix(names(ffm)[1:5][t(combn(5, 2))], ncol = 2)
+ a_ply(combs, 1, function(vars) {
+ expect_that(count_c(vars), is_equivalent_to(count_t(vars)),
+ label = paste(vars, collapse = ", "))
+ })
+
})

0 comments on commit 33a0656

Please sign in to comment.
Something went wrong with that request. Please try again.