Skip to content
This repository
Browse code

Optimising row subsetting

  • Loading branch information...
commit 0f5b667eaff595390ed873cd4845134af93b6474 1 parent 07a55af
Hadley Wickham authored

Showing 3 changed files with 62 additions and 11 deletions. Show diff stats Hide diff stats

  1. +2 0  NEWS
  2. +29 4 R/indexed-data-frame.r
  3. +31 7 benchmark/get-rows.r
2  NEWS
... ... @@ -1,6 +1,8 @@
1 1 Version 1.7.1.99
2 2 ------------------------------------------------------------------------------
3 3
  4 +* The subsetting in `d*ply` has been considerably optimised: this will have a small impact unless you have a very large number of groups, in which case it will be considerably faster.
  5 +
4 6 * `d*ply` will now preserve factor levels input if `drop = FALSE` (#81)
5 7
6 8 * `name_rows` provides a convenient way of saving and then restoring row names so that you can preserve them if you need to. (#61)
33 R/indexed-data-frame.r
@@ -16,8 +16,33 @@ indexed_df <- function(data, index, vars) {
16 16
17 17 #' @S3method [[ indexed_df
18 18 "[[.indexed_df" <- function(x, i) {
19   - structure(x$data[x$index[[i]], , drop = FALSE], vars = x$vars)
20   - # x$env$data[x$index[[i]], , drop = FALSE]
21   - # slice(x, attr(x, "index")[[i]])
22   - # subset_rows(x$env$data, x$index[[i]])
  19 + out <- extract_rows(x$data, x$index[[i]])
  20 + attr(out, "vars") <- x$vars
  21 + out
  22 +}
  23 +
  24 +extract_rows <- function(x, i) {
  25 + n <- ncol(x)
  26 +
  27 + out <- lapply(seq_len(n), extract_col_rows, df = x, i = i)
  28 +
  29 + names(out) <- names(x)
  30 + class(out) <- "data.frame"
  31 + attr(out, "row.names") <- c(NA_integer_, -length(out[[1]]))
  32 +
  33 + out
  34 +}
  35 +extract_col_rows <- function(df, i, j) {
  36 + col <- .subset2(df, j)
  37 + if (isS4(col)) return(col[i])
  38 +
  39 + if (is.null(attr(col, "class"))) {
  40 + .subset(col, i)
  41 + } else if (inherits(col, "factor") || inherits(x, "POSIXt")) {
  42 + out <- .subset(col, i)
  43 + attributes(out) <- attributes(col)
  44 + out
  45 + } else {
  46 + col[i]
  47 + }
23 48 }
38 benchmark/get-rows.r
@@ -91,6 +91,29 @@ row7 <- function(x, i) {
91 91 out
92 92 }
93 93
  94 +# Row7 but with quickdf - loses all gains by duplicating (once?)
  95 +row8 <- function(x, i) {
  96 + n <- ncol(x)
  97 +
  98 + sub_col <- function(df, i, j) {
  99 + col <- .subset2(df, j)
  100 + if (isS4(col)) return(col[i])
  101 +
  102 + if (is.null(attr(col, "class"))) {
  103 + .subset(col, i)
  104 + } else if (inherits(col, "factor") || inherits(x, "POSIXt")) {
  105 + out <- .subset(col, i)
  106 + attributes(out) <- attributes(col)
  107 + out
  108 + } else {
  109 + col[i]
  110 + }
  111 + }
  112 +
  113 + out <- lapply(seq_len(n), sub_col, df = x, i = i)
  114 + quickdf(out)
  115 +}
  116 +
94 117 if (FALSE) {
95 118 n <- 1000; p <- 10
96 119 df <- as.data.frame(replicate(p, runif(n)))
@@ -98,13 +121,14 @@ if (FALSE) {
98 121
99 122
100 123 microbenchmark(
101   - row1(df, 5000:5010),
102   - row2(df, 5000:5010),
103   - row3(df, 5000:5010),
104   - row4(df, 5000:5010),
105   - row5(df, 5000:5010),
106   - row6(df, 5000:5010),
107   - row7(df, 5000:5010)
  124 + row1(df, 5000:6000),
  125 + row2(df, 5000:6000),
  126 + row3(df, 5000:6000),
  127 + row4(df, 5000:6000),
  128 + row5(df, 5000:6000),
  129 + row6(df, 5000:6000),
  130 + row7(df, 5000:6000),
  131 + row8(df, 5000:6000)
108 132 )
109 133
110 134 }

0 comments on commit 0f5b667

Please sign in to comment.
Something went wrong with that request. Please try again.