Permalink
Browse files

Optimising row subsetting

  • Loading branch information...
1 parent 07a55af commit 0f5b667eaff595390ed873cd4845134af93b6474 @hadley committed Oct 8, 2012
Showing with 62 additions and 11 deletions.
  1. +2 −0 NEWS
  2. +29 −4 R/indexed-data-frame.r
  3. +31 −7 benchmark/get-rows.r
View
2 NEWS
@@ -1,6 +1,8 @@
Version 1.7.1.99
------------------------------------------------------------------------------
+* The subsetting in `d*ply` has been considerably optimised: this will have a small impact unless you have a very large number of groups, in which case it will be considerably faster.
+
* `d*ply` will now preserve factor levels input if `drop = FALSE` (#81)
* `name_rows` provides a convenient way of saving and then restoring row names so that you can preserve them if you need to. (#61)
View
@@ -16,8 +16,33 @@ indexed_df <- function(data, index, vars) {
#' @S3method [[ indexed_df
"[[.indexed_df" <- function(x, i) {
- structure(x$data[x$index[[i]], , drop = FALSE], vars = x$vars)
- # x$env$data[x$index[[i]], , drop = FALSE]
- # slice(x, attr(x, "index")[[i]])
- # subset_rows(x$env$data, x$index[[i]])
+ out <- extract_rows(x$data, x$index[[i]])
+ attr(out, "vars") <- x$vars
+ out
+}
+
+extract_rows <- function(x, i) {
+ n <- ncol(x)
+
+ out <- lapply(seq_len(n), extract_col_rows, df = x, i = i)
+
+ names(out) <- names(x)
+ class(out) <- "data.frame"
+ attr(out, "row.names") <- c(NA_integer_, -length(out[[1]]))
+
+ out
+}
+extract_col_rows <- function(df, i, j) {
+ col <- .subset2(df, j)
+ if (isS4(col)) return(col[i])
+
+ if (is.null(attr(col, "class"))) {
+ .subset(col, i)
+ } else if (inherits(col, "factor") || inherits(x, "POSIXt")) {
+ out <- .subset(col, i)
+ attributes(out) <- attributes(col)
+ out
+ } else {
+ col[i]
+ }
}
View
@@ -91,20 +91,44 @@ row7 <- function(x, i) {
out
}
+# Row7 but with quickdf - loses all gains by duplicating (once?)
+row8 <- function(x, i) {
+ n <- ncol(x)
+
+ sub_col <- function(df, i, j) {
+ col <- .subset2(df, j)
+ if (isS4(col)) return(col[i])
+
+ if (is.null(attr(col, "class"))) {
+ .subset(col, i)
+ } else if (inherits(col, "factor") || inherits(x, "POSIXt")) {
+ out <- .subset(col, i)
+ attributes(out) <- attributes(col)
+ out
+ } else {
+ col[i]
+ }
+ }
+
+ out <- lapply(seq_len(n), sub_col, df = x, i = i)
+ quickdf(out)
+}
+
if (FALSE) {
n <- 1000; p <- 10
df <- as.data.frame(replicate(p, runif(n)))
names(df) <- letters[1:ncol(df)]
microbenchmark(
- row1(df, 5000:5010),
- row2(df, 5000:5010),
- row3(df, 5000:5010),
- row4(df, 5000:5010),
- row5(df, 5000:5010),
- row6(df, 5000:5010),
- row7(df, 5000:5010)
+ row1(df, 5000:6000),
+ row2(df, 5000:6000),
+ row3(df, 5000:6000),
+ row4(df, 5000:6000),
+ row5(df, 5000:6000),
+ row6(df, 5000:6000),
+ row7(df, 5000:6000),
+ row8(df, 5000:6000)
)
}

0 comments on commit 0f5b667

Please sign in to comment.