Skip to content
This repository
Browse code

Generalise margins code to nD

  • Loading branch information...
commit cf4429a5dc853d2167b091de82a892b8ba6b0665 1 parent ed867c0
Hadley Wickham authored
12 R/cast.r
@@ -116,7 +116,7 @@ cast <- function(data, formula, fun.aggregate = NULL, ..., subset = NULL, fill =
116 116 ids <- lapply(vars, id, drop = drop)
117 117 labels <- mapply(split_labels, vars, ids, MoreArgs = list(drop = drop),
118 118 SIMPLIFY = FALSE, USE.NAMES = FALSE)
119   - overall <- id(rev(ids))
  119 + overall <- id(rev(ids), drop = FALSE)
120 120
121 121 ns <- vapply(ids, attr, 0, "n")
122 122 n <- attr(overall, "n")
@@ -160,11 +160,9 @@ dcast <- function(data, formula, fun.aggregate = NULL, ..., margins = NULL, subs
160 160 }
161 161
162 162 if (!is.null(margins)) {
163   - data <- add_margins(data, names(formula[[1]]), names(formula[[2]]),
164   - margins)
  163 + data <- add_margins(data, lapply(formula, names), margins)
165 164 }
166 165
167   -
168 166 res <- cast(data, formula, fun.aggregate, ...,
169 167 subset = subset, fill = fill, drop = drop,
170 168 value_var = value_var)
@@ -180,11 +178,7 @@ acast <- function(data, formula, fun.aggregate = NULL, ..., margins = NULL, subs
180 178 formula <- parse_formula(formula, names(data), value_var)
181 179
182 180 if (!is.null(margins)) {
183   - if (length(formula) > 2) {
184   - stop("Margins only work for up to two variables")
185   - }
186   - data <- add_margins(data, names(formula[[1]]), names(formula[[2]]),
187   - margins)
  181 + data <- add_margins(data, lapply(formula, names), margins)
188 182 }
189 183
190 184 res <- cast(data, formula, fun.aggregate, ...,
98 R/helper-margins.r
@@ -4,59 +4,45 @@
4 4 #' margins, works out which ones are possible. Variables that can't be
5 5 #' margined over are dropped silently.
6 6 #'
7   -#' @param col a character vector of column names
8   -#' @param row a character vector of row names
9   -#' @param margins a character vector of variable names to margin over. Can be
10   -#' any variable name in \code{col} or \code{row}, \code{"grand_row"} or
11   -#' \code{"grand_col"}. If \code{TRUE} will compute all possible margins.
  7 +#' @param vars a list of character vectors giving the variables in each
  8 +#' dimension
  9 +#' @param margins a character vector of variable names to compute margins for.
  10 +#' \code{TRUE} will compute all possible margins.
12 11 #' @keywords manip internal
13 12 #' @return list of margining combinations, or \code{NULL} if none. These are
14 13 #' the combinations of variables that should have their values set to
15   -#' `(all)`
16   -margins <- function(rows = NULL, cols = NULL, margins = NULL) {
  14 +#' \code{(all)}
  15 +margins <- function(vars, margins = NULL) {
17 16 if (is.null(margins) || identical(margins, FALSE)) return(NULL)
  17 +
  18 + all_vars <- unlist(vars)
18 19 if (isTRUE(margins)) {
19   - margins <- c(rows, cols, "grand_row", "grand_col")
20   - }
21   -
22   - # Nothing to margin over for last variable in column or row
23   - row.margins <- intersect(rows[-length(rows)], margins)
24   - if (length(row.margins) == 0 ) row.margins <- NULL
25   - col.margins <- intersect(cols[-length(cols)], margins)
26   - if (length(col.margins) == 0 ) col.margins <- NULL
27   -
28   - margin.intersect <- function(cols, col.margins, rows, row.margins) {
29   - unlist(lapply(col.margins, function(col) {
30   - c(lapply(row.margins, c, col), list(c(col, rows)))
31   - }), recursive = FALSE)
  20 + margins <- all_vars
32 21 }
33   -
34   - margin_vars <- c(
35   - margin.intersect(cols, col.margins, rows, row.margins),
36   - margin.intersect(rows, row.margins, cols, col.margins)
37   - )
38 22
39   - grand.row <- "grand_row" %in% margins
40   - grand.col <- "grand_col" %in% margins
41   - if (grand.row && !is.null(rows)) {
42   - margin_vars <- compact(c(margin_vars, list(cols), list(col.margins)))
43   - }
44   - if (grand.col && !is.null(cols)) {
45   - margin_vars <- compact(c(margin_vars, list(rows), list(row.margins)))
46   - }
  23 + # Start by grouping margins by dimension
  24 + dims <- lapply(vars, intersect, margins)
47 25
48   - # Do we need an overall total?
49   - rows_and_cols <- grand.col && grand.row && !is.null(rows) && !is.null(cols)
50   - just_rows <- grand.row && !is.null(rows) && is.null(cols)
51   - just_cols <- grand.col && !is.null(cols) && is.null(rows)
52   - if (rows_and_cols || just_rows || just_cols) {
53   - margin_vars <- c(margin_vars, list(c(rows, cols)))
54   - }
  26 + # Next, ensure high-level margins include lower-levels
  27 + dims <- mapply(function(vars, margin) {
  28 + lapply(margin, downto, vars)
  29 + }, vars, dims, SIMPLIFY = FALSE, USE.NAMES = FALSE)
  30 +
  31 + # Finally, find intersections across all dimensions
  32 + seq_0 <- function(x) c(0, seq_along(x))
  33 + indices <- expand.grid(lapply(dims, seq_0), KEEP.OUT.ATTRS = FALSE)
  34 + # indices <- indices[rowSums(indices) > 0, ]
  35 +
  36 + lapply(seq_len(nrow(indices)), function(i){
  37 + unlist(mapply("[", dims, indices[i, ], SIMPLIFY = FALSE))
  38 + })
  39 +}
55 40
56   - # Remove duplicates
57   - duplicates <- duplicated(lapply(lapply(margin_vars, sort), paste,
58   - collapse = ""))
59   - margin_vars[!duplicates]
  41 +upto <- function(a, b) {
  42 + b[seq_len(match(a, b, nomatch = 0))]
  43 +}
  44 +downto <- function(a, b) {
  45 + rev(upto(a, rev(b)))
60 46 }
61 47
62 48 #' Add margins to a data frame.
@@ -65,34 +51,34 @@ margins <- function(rows = NULL, cols = NULL, margins = NULL) {
65 51 #' to factors.
66 52 #'
67 53 #' @param df input data frame
68   -#' @param rows names of row margins
69   -#' @param cols names of column margins
70   -#' @param margins Either \code{TRUE} to compute all margins, or a character
71   -#' vector of margin names, which may include any name in \code{rows} or
72   -#' \code{cols}, or \code{"grand_row"} or \code{"grand_col"}.
  54 +#' @param vars a list of character vectors giving the variables in each
  55 +#' dimension
  56 +#' @param margins a character vector of variable names to compute margins for.
  57 +#' \code{TRUE} will compute all possible margins.
73 58 #' @export
74   -add_margins <- function(df, rows = NULL, cols = NULL, margins = TRUE) {
75   - margin_vars <- margins(rows, cols, margins)
  59 +add_margins <- function(df, vars, margins = TRUE) {
  60 + margin_vars <- margins(vars, margins)
76 61
77 62 # Return data frame if no margining necessary
78 63 if (length(margin_vars) == 0) return(df)
79 64
80 65 # Prepare data frame for addition of margins
81 66 addAll <- function(x) {
82   - if (!is.factor(x)) x <- factor(x)
  67 + x <- as.factor(x)
83 68 levels(x) <- c(levels(x), "(all)")
84 69 x
85 70 }
86 71 vars <- unique(unlist(margin_vars))
87 72 df[vars] <- lapply(df[vars], addAll)
  73 +
88 74 rownames(df) <- NULL
89 75
90 76 # Loop through all combinations of margin variables, setting
91 77 # those variables to (all)
92   - margin_dfs <- ldply(margin_vars, function(vars) {
93   - df[, vars] <- factor("(all)")
94   - unique(df)
  78 + margin_dfs <- llply(margin_vars, function(vars) {
  79 + df[vars] <- factor("(all)")
  80 + df
95 81 })
96 82
97   - rbind(df, margin_dfs)
  83 + rbind.fill(margin_dfs)
98 84 }
24 README.md
Source Rendered
@@ -2,26 +2,28 @@ Reshape2 is a reboot of the reshape package. It's been over five years since the
2 2
3 3 This version improves speed at the cost of functionality, so I have renamed it to `reshape2` to avoid causing problems for existing users. Based on user feedback I may reintroduce some of these features.
4 4
5   -Compared to `reshape`, `reshape2`:
  5 +What's new in `reshape2`:
6 6
7   - * is considerably faster and more memory efficient thanks to a much better
  7 + * considerably faster and more memory efficient thanks to a much better
8 8 underlying algorithm that uses the power and speed of subsetting to the
9 9 fullest extent, in most cases only making a single copy of the data.
10 10
11 11 * cast is replaced by two functions depending on the output type: `dcast`
12 12 produces data frames, and `acast` produces matrices/arrays.
  13 +
  14 + * multidimensional margins are now possible: `grand_row` and `grand_col` have
  15 + been dropped: now the name of the margin refers to the variable that has
  16 + its value set to (all).
13 17
14   - * lacks some features such as the `|` cast operator, and the ability to
15   - return multiple values from an aggregation function. I'm reasonable sure
16   - both these operations are better performed by plyr.
  18 + * some features have been removed such as the `|` cast operator, and the
  19 + ability to return multiple values from an aggregation function. I'm
  20 + reasonably sure both these operations are better performed by plyr.
17 21
18   - * supports a new cast syntax which allows you to reshape based on functions
  22 + * a new cast syntax which allows you to reshape based on functions
19 23 of variables (based on the same underlying syntax as plyr):
20   -
21   - * implements good development practices like namespaces, tests, and code that
22   - readable and understandable.
23 24
24   -Initial benchmarking has shown `melt` to be up to 10x faster, pure reshaping `cast` up to 100x faster, and aggregating `cast()` up to 10x faster.
  25 + * better development practices like namespaces and tests.
25 26
  27 +Initial benchmarking has shown `melt` to be up to 10x faster, pure reshaping `cast` up to 100x faster, and aggregating `cast()` up to 10x faster.
26 28
27   -This work has been generously supported by BD (Becton Dickinson).
  29 +This work has been generously supported by BD (Becton Dickinson).
32 inst/tests/test-cast.r
@@ -53,8 +53,8 @@ test_that("aggregation matches table", {
53 53 })
54 54
55 55 test_that("grand margins are computed correctly", {
56   - col <- acast(s2m, X1 ~ X2, mean, margins = "grand_col")[4, ]
57   - row <- acast(s2m, X1 ~ X2, mean, margins = "grand_row")[, 5]
  56 + col <- acast(s2m, X1 ~ X2, mean, margins = "X1")[4, ]
  57 + row <- acast(s2m, X1 ~ X2, mean, margins = "X2")[, 5]
58 58 grand <- acast(s2m, X1 ~ X2, mean, margins = T)[4, 5]
59 59
60 60 expect_equivalent(col, colMeans(s2))
@@ -62,15 +62,17 @@ test_that("grand margins are computed correctly", {
62 62 expect_equivalent(grand, mean(s2))
63 63 })
64 64 #
65   -# test_that("internal margins are computed correctly", {
66   -#
67   -# cast <- dcast(chick_m, diet + chick ~ time, length, margins="diet")
68   -# marg <- subset(cast, diet == "(all)")[-(1:2), ]
69   -#
70   -# joint <- subset(cast, diet != "(all)")[, 1:14]
71   -# expect_that(joint, equals(dcast(chick_m, diet + chick ~ time, length)))
72   -#
73   -# })
  65 +test_that("internal margins are computed correctly", {
  66 + cast <- dcast(chick_m, diet + chick ~ time, length, margins="diet")
  67 +
  68 + marg <- subset(cast, diet == "(all)")[-(1:2)]
  69 + expect_that(as.vector(as.matrix(marg)),
  70 + equals(as.vector(acast(chick_m, time ~ ., length))))
  71 +
  72 + joint <- subset(cast, diet != "(all)")
  73 + expect_that(joint,
  74 + is_equivalent_to(dcast(chick_m, diet + chick ~ time, length)))
  75 +})
74 76
75 77 test_that("missing combinations filled correctly", {
76 78 s2am <- subset(s2m, !(X1 == 1 & X2 == 1))
@@ -81,6 +83,14 @@ test_that("missing combinations filled correctly", {
81 83
82 84 })
83 85
  86 +test_that("drop = FALSE generates all combinations", {
  87 + df <- data.frame(x = c("a", "b"), y = c("a", "b"), value = 1:2)
  88 +
  89 + expect_that(as.vector(acast(df, x + y ~ ., drop = FALSE)),
  90 + is_equivalent_to(as.vector(acast(df, x ~ y))))
  91 +
  92 +})
  93 +
84 94 test_that("aggregated values computed correctly", {
85 95 ffm <- melt(french_fries, id = 1:4)
86 96
18 inst/tests/test-margins.r
... ... @@ -0,0 +1,18 @@
  1 +context("Margins")
  2 +
  3 +vars <- list(c("a", "b", "c"), c("d", "e", "f"))
  4 +test_that("margins expanded", {
  5 + expect_that(margins(vars, "c")[[2]], equals(c("c")))
  6 + expect_that(margins(vars, "b")[[2]], equals(c("b", "c")))
  7 + expect_that(margins(vars, "a")[[2]], equals(c("a", "b", "c")))
  8 +
  9 + expect_that(margins(vars, "f")[[2]], equals(c("f")))
  10 + expect_that(margins(vars, "e")[[2]], equals(c("e", "f")))
  11 + expect_that(margins(vars, "d")[[2]], equals(c("d", "e", "f")))
  12 +})
  13 +
  14 +test_that("margins intersect", {
  15 + expect_that(margins(vars, c("c", "f"))[-1],
  16 + equals(list("c", "f", c("c", "f"))))
  17 +
  18 +})

0 comments on commit cf4429a

Please sign in to comment.
Something went wrong with that request. Please try again.