New strip_splits function

hadley · Mar 2, 2011 · 1607d23 · 1607d23
1 parent d741061
commit 1607d23
Show file tree

Hide file tree

Showing 9 changed files with 58 additions and 11 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -25,10 +25,10 @@ Collate: 'dimensions.r' 'helper-arrange.r'
     'helper-each.r' 'helper-match-df.r' 'helper-mutate.r'
     'helper-quick-df.r' 'helper-rename.r'
     'helper-round-any.r' 'helper-splat.r'
-    'helper-summarise.r' 'helper-try.r'
-    'helper-vaggregate.r' 'id.r' 'immutable.r'
-    'indexed-array.r' 'indexed-data-frame.r' 'indexed.r'
-    'join.r' 'loop-apply.r' 'ply-array.r'
+    'helper-strip-splits.r' 'helper-summarise.r'
+    'helper-try.r' 'helper-vaggregate.r' 'id.r'
+    'immutable.r' 'indexed-array.r' 'indexed-data-frame.r'
+    'indexed.r' 'join.r' 'loop-apply.r' 'ply-array.r'
     'ply-data-frame.r' 'ply-iterator.r' 'ply-list.r'
     'ply-mapply.r' 'ply-null.r' 'ply-replicate.r'
     'progress.r' 'quote.r' 'rbind-matrix.r' 'rbind.r'

diff --git a/NAMESPACE b/NAMESPACE
@@ -12,6 +12,7 @@ export(quickdf)
 export(rename)
 export(round_any)
 export(splat)
+export(strip_splits)
 export(summarise, summarize)
 export(failwith)
 export(try_default)

diff --git a/NEWS b/NEWS
@@ -1,6 +1,9 @@
 Version 1.5 (2011-XX-XX)
 ------------------------------------------------------------------------------
 
+* new `strip_splits` function removes the splitting variables from each data
+  frame piece that `d*ply` passes to your function.
+
 * `join` now implements joins in a more SQL like way, returning all possible
    matches, not just the first one.  It is still a (little) faster than merge.
    The previous behaviour is accessible with `match = "first"`.

diff --git a/R/helper-strip-splits.r b/R/helper-strip-splits.r
@@ -0,0 +1,15 @@
+#' Remove splitting variables from a data frame.
+#'
+#' This is useful when you want to perform some operation to every column
+#' in the data frame, except the variables that you have used to split it.
+#' These variables will be automatically added back on to the result when
+#' combining all results together.
+#'
+#' @param df data frame produced by \code{d*ply}.
+#' @export
+#' @examples
+#' dlply(mtcars, c("vs", "am"))
+#' dlply(mtcars, c("vs", "am"), strip_splits)
+strip_splits <- function(df) {
+  df[setdiff(names(df), attr(df, "vars"))]  # drop columns named in "vars"
+}
diff --git a/R/indexed-data-frame.r b/R/indexed-data-frame.r
@@ -3,19 +3,20 @@
 #' 
 #' @param env environment containing data frame
 #' @param index list of indices
+#' @param vars a character vector giving the variables used for subsetting
 #' @keywords internal
 #' @aliases indexed_df length.indexed names.indexed as.list.indexed
 #'   [[.indexed_df [.indexed print.indexed
-indexed_df <- function(data, index) {
+indexed_df <- function(data, index, vars) {
 
   structure(
-    list(data = data, index = index),
+    list(data = data, index = index, vars = vars),
     class = c("indexed", "indexed_df")
   )
 }
 
 "[[.indexed_df" <- function(x, i) {
-  x$data[x$index[[i]], , drop = FALSE]
+  structure(x$data[x$index[[i]], , drop = FALSE], vars = x$vars)
   # x$env$data[x$index[[i]], , drop = FALSE]
   # slice(x, attr(x, "index")[[i]]) 
   # subset_rows(x$env$data, x$index[[i]])

diff --git a/R/split-data-frame.r b/R/split-data-frame.r
@@ -13,7 +13,7 @@
 #' 
 #' @seealso \code{\link{.}} for quoting variables, \code{\link{split}}
 #' @param data data frame
-#' @param .variables a \link{quoted} list of variables, a formula, or character vector.  \code{NULL} will not split the data
+#' @param .variables a \link{quoted} list of variables
 #' @param drop drop unnused factor levels?
 #' @return a list of data.frames, with attributes that record split details
 #' @keywords internal
@@ -31,21 +31,26 @@
 #' plyr:::splitter_d(mtcars, .(cyl3, vs))
 #' plyr:::splitter_d(mtcars, .(cyl3, vs), drop = FALSE)
 splitter_d <- function(data, .variables = NULL, drop = TRUE) {
+  stopifnot(is.quoted(.variables))
+
+
   if (length(.variables) == 0) {
     splitv <- rep(1, nrow(data))
     split_labels <- NULL
     attr(splitv, "n") <- max(splitv)
+    vars <- character(0)
   } else {
     splits <- eval.quoted(.variables, data)
 
     splitv <- id(splits, drop = drop)
     split_labels <- split_labels(splits, drop = drop, id = splitv)
+    vars <- unlist(lapply(.variables, all.vars))
   }
 
   index <- split_indices(seq_along(splitv), as.integer(splitv), 
     attr(splitv, "n"))
 
-  il <- indexed_df(data, index)
+  il <- indexed_df(data, index, vars)
 
   structure(
     il,

diff --git a/man/indexed_df.Rd b/man/indexed_df.Rd
@@ -1,6 +1,6 @@
 \name{indexed_df}
 \title{An indexed data frame.}
-\usage{indexed_df(data, index)}
+\usage{indexed_df(data, index, vars)}
 
 \description{
   An indexed data frame. Create a indexed list, a space
@@ -17,4 +17,5 @@
 \arguments{
   \item{env}{environment containing data frame}
   \item{index}{list of indices}
+  \item{vars}{a character vector giving the variables used for subsetting}
 }
diff --git a/man/splitter_d.Rd b/man/splitter_d.Rd
@@ -26,7 +26,7 @@
 \keyword{internal}
 \arguments{
   \item{data}{data frame}
-  \item{.variables}{a \link{quoted} list of variables, a formula, or character vector.  \code{NULL} will not split the data}
+  \item{.variables}{a \link{quoted} list of variables}
   \item{drop}{drop unnused factor levels?}
 }
 \examples{plyr:::splitter_d(mtcars, .(cyl))

diff --git a/man/strip_splits.Rd b/man/strip_splits.Rd
@@ -0,0 +1,21 @@
+\name{strip_splits}
+\alias{strip_splits}
+\title{Remove splitting variables from a data frame.}
+\usage{strip_splits(df)}
+
+\description{
+  Remove splitting variables from a data frame.
+}
+
+\details{
+  This is useful when you want to perform some operation to
+  every column in the data frame, except the variables that
+  you have used to split it. These variables will be
+  automatically added back on to the result when combining
+  all results together.
+}
+\arguments{
+  \item{df}{data frame produced by \code{d*ply}.}
+}
+\examples{dlply(mtcars, c("vs", "am"))
+dlply(mtcars, c("vs", "am"), strip_splits)}