Permalink
Browse files

output_template treats dims orthogonal to type

  • Loading branch information...
1 parent 33b69f8 commit 33aaf66c6811149b203100bd17781a6ce52da579 @crowding crowding committed Aug 23, 2013
Showing with 100 additions and 50 deletions.
  1. +10 −4 NEWS
  2. +42 −40 R/rbind-fill.r
  3. +48 −6 inst/tests/test-rbind.r
View
@@ -1,9 +1,15 @@
Version 1.8.0.99
------------------------------------------------------------------------------
-* `join(x,y)` works when the key column in X is character and Y is
- factor. Additionally `rbind.fill(x,y)` converts factor columns of Y
- to character when columns of X are character. (Thanks to Peter
- Meilstrup; #128)
+Improvements to `rbind.fill` and by extension `*dply` and `join`:
+
+* `rbind.fill` handles non-numeric matrix columns (i.e. factors, characters, lists)
+ (Contributed by Peter Meilstrup)
+
+* `rbind.fill(x,y)` converts factor columns of Y to
+ character when columns of X are character. (Contributed by Peter Meilstrup)
+
+* `join(x,y)` and `match_df(x,y)` now work when the key column in X is
+ character and Y is factor. (Contributed by Peter Meilstrup)
* Fix faulty array allocation which caused problems when using `split_indices`
with large (> 2^24) vectors. (Fixes #131)
View
@@ -56,22 +56,30 @@ rbind.fill <- function(...) {
for(var in names(df)) {
if (is.factor(output[[var]]) && is.character(df[[var]])) {
- output[[var]] <- as.character(output[[var]])
+ output[[var]] <- factor_to_char_preserving_attrs(output[[var]])
}
if (is.factor(df[[var]]) && is.character(output[[var]])) {
- df[[var]] <- as.character(df[[var]])
+ df[[var]] <- factor_to_char_preserving_attrs(df[[var]])
}
- if (!is.matrix(output[[var]])) {
- output[[var]][rng] <- df[[var]]
- } else {
+ if (is.matrix(output[[var]])) {
output[[var]][rng, ] <- df[[var]]
+ } else {
+ output[[var]][rng] <- df[[var]]
}
}
}
quickdf(output)
}
+factor_to_char_preserving_attrs <- function(x) {
+ a <- attributes(x)
+ a[c("levels", "class")] <- NULL
+ x <- as.character(x)
+ mostattributes(x) <- a
+ x
+}
+
output_template <- function(dfs, nrows) {
vars <- unique(unlist(lapply(dfs, base::names))) # ~ 125,000/s
output <- vector("list", length(vars))
@@ -81,32 +89,37 @@ output_template <- function(dfs, nrows) {
names(seen) <- vars
is_array <- seen
- is_matrix <- seen
is_factor <- seen
for(df in dfs) {
matching <- intersect(names(df), vars[!seen])
for(var in matching) {
value <- df[[var]]
- if (is.vector(value) && is.atomic(value)) {
- output[[var]] <- rep(NA, nrows)
- } else if (is.factor(value)) {
- output[[var]] <- factor(rep(NA, nrows), ordered = is.ordered(value))
- is_factor[var] <- TRUE
- } else if (is.matrix(value)) {
- is_matrix[var] <- TRUE
- } else if (is.array(value)) {
- is_array[var] <- TRUE
- } else if (inherits(value, "POSIXt")) {
+ if (inherits(value, "POSIXt")) {
output[[var]] <- as.POSIXct(rep(NA, nrows))
attr(output[[var]], "tzone") <- attr(value, "tzone")
- } else if (is.list(value)) {
- output[[var]] <- vector("list", nrows)
} else {
- output[[var]] <- rep(NA, nrows)
- class(output[[var]]) <- class(value)
- attributes(output[[var]]) <- attributes(value)
+ if (is.factor(value)) {
+ is_factor[[var]] <- TRUE
+ }
+ a <- attributes(value)
+ if(length(dim(value)) >= 1) {
+ newdim <- c(nrows, dim(value)[-1])
+ value <- vector(typeof(value), prod(newdim))
+ a$dim <- newdim
+ a$dimnames <- NULL
+ is_array[[var]] <- TRUE
+ } else {
+ value <- vector(typeof(value), nrows)
+ }
+ if (is.recursive(value)) {
+ value[] <- list(NULL)
+ } else {
+ value[] <- NA
+ }
+ attributes(value) <- a
+ output[[var]] <- value
}
}
@@ -117,28 +130,17 @@ output_template <- function(dfs, nrows) {
# Set up factors
for(var in vars[is_factor]) {
all <- unique(lapply(dfs, function(df) levels(df[[var]])))
- output[[var]] <- factor(output[[var]], levels = unique(unlist(all)),
- exclude = NULL)
- }
-
- # Set up matrices
- for(var in vars[is_matrix]) {
- width <- unique(unlist(lapply(dfs, function(df) ncol(df[[var]]))))
- if (length(width) > 1)
- stop("Matrix variable ", var, " has inconsistent widths")
-
- vec <- rep(NA, nrows * width)
- output[[var]] <- array(vec, c(nrows, width))
+ levels(output[[var]]) <- unique(unlist(all))
}
# Set up arrays
- for (var in vars[is_array]) {
- dims <- unique(unlist(lapply(dfs, function(df) dims(df[[var]]))))
- if (any(dims) > 1) {
- stop("rbind.fill can only work with 1d arrays")
- }
-
- output[[var]] <- rep(NA, nrows)
+ for(var in vars[is_array]) {
+ df_has <- vapply(dfs, function(df) var %in% names(df), FALSE)
+ dims <- unique(lapply(dfs[df_has], function(df) dim(df[[var]])[-1]))
+ if (length(dims) > 1)
+ stop("Array variable ", var, " has inconsistent dims")
+ if (length(dims[[1]]) == 0) #is dropping dims necessary for 1d arrays?
+ output[[var]] <- as.vector(output[[var]])
}
output
@@ -55,6 +55,28 @@ test_that("matrices are preserved", {
expect_that(ab1, equals(ab2))
})
+test_that("character or factor or list-matrices are preserved", {
+ d1 <- data.frame(a=1:2,
+ x=I(matrix(c('a', 'b', 'c', 'd'), nrow=2)))
+ d2 <- data.frame(b=1:2,
+ x=I(`dim<-`(factor(c('a', 'b', 'c', 'd')), c(2,2))))
+ d3 <- data.frame(b=1:2,
+ x=I(array(as.list(1:4), c(2,2))))
+
+ b1 <- rbind.fill(d1, d1)
+ b2 <- rbind.fill(d2, d2)
+ b3 <- rbind.fill(d3, d3)
+
+ expect_equal(dim(b1$x), c(4,2))
+ expect_equal(typeof(b1$x), "character")
+
+ expect_equal(dim(b2$x), c(4,2))
+ expect_is(b2$x, "factor")
+
+ expect_equal(dim(b3$x), c(4,2))
+ expect_equal(typeof(b3$x), "list")
+})
+
test_that("missing levels in factors preserved", {
f <- addNA(factor(c("a", "b", NA)))
df1 <- data.frame(a = f)
@@ -79,12 +101,15 @@ test_that("time zones are preserved", {
})
-test_that("arrays are ok", {
+test_that("arrays are ok", { #is this necessary?
df <- data.frame(x = 1)
df$x <- array(1, 1)
df2 <- rbind.fill(df, df)
+ #this asserts that dim is stripped off 1d arrays. Necessary?
expect_that(df2$x, is_equivalent_to(rbind(df, df)$x))
+ #this would be more consistent
+ #expect_that(df2$x, is_equivalent_to(rbind(array(1,1), array(1,1))))
})
test_that("attributes are preserved", {
@@ -104,18 +129,35 @@ test_that("attributes are preserved", {
})
-test_that("characters override factors", {
+test_that("characters override and convert factors", {
d1a <- data.frame(x=c('a','b'), y=1:2)
- d2a <- data.frame(x=c('b','d'), z=1:2, stringsAsFactors=F)
+ d2a <- data.frame(x=c('c','d'), z=1:2, stringsAsFactors=F)
d1b <- data.frame(x=c('a','b'), y=1:2, stringsAsFactors=F)
- d2b <- data.frame(x=c('b','d'), z=1:2)
+ d2b <- data.frame(x=c('c','d'), z=1:2)
d3a <- rbind.fill(d1a,d2a)
d3b <- rbind.fill(d1b,d2b)
- expect_that(d3a$x, is_a("character"))
- expect_that(d3b$x, is_a("character"))
+ expect_equal(d3a$x, c("a", "b", "c", "d"))
+ expect_equal(d3b$x, c("a", "b", "c", "d"))
+})
+
+test_that("factor to character conversion preserves attributes", {
+ d1 <- data.frame(a = letters[1:10], b = factor(letters[11:20]),
+ stringsAsFactors=FALSE)
+ d2 <- data.frame(a = factor(letters[11:20]), b = letters[11:20],
+ stringsAsFactors=FALSE)
+
+ attr(d1$a, "foo") <- "one"
+ attr(d1$b, "foo") <- "two"
+ attr(d2$a, "foo") <- "bar"
+ attr(d2$b, "foo") <- "baz"
+
+ d12 <- rbind.fill(d1, d2)
+
+ expect_equal(attr(d12$a, "foo"), "one")
+ expect_equal(attr(d12$b, "foo"), "two")
})
test_that("zero row data frames ok", {

0 comments on commit 33aaf66

Please sign in to comment.