/
cast.r
194 lines (171 loc) · 7.48 KB
/
cast.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#' Cast functions
#' Cast a molten data frame into an array or data frame.
#'
#' Use \code{acast} or \code{dcast} depending on whether you want
#' vector/matrix/array output or data frame output. Data frames can have at
#' most two dimensions.
#'
#' The cast formula has the following format:
#' \code{x_variable + x_2 ~ y_variable + y_2 ~ z_variable ~ ... }
#' The order of the variables makes a difference. The first varies slowest,
#' and the last fastest. There are a couple of special variables: "..."
#' represents all other variables not used in the formula and "." represents
#' no variable, so you can do \code{formula = var1 ~ .}.
#'
#' Alternatively, you can supply a list of quoted expressions, in the form
#' \code{list(.(x_variable, x_2), .(y_variable, y_2), .(z))}. The advantage
#' of this form is that you can cast based on transformations of the
#' variables: \code{list(.(a + b), (c = round(c)))}. See the documentation
#' for \code{\link[plyr]{.}} for more details and alternative formats.
#'
#' If the combination of variables you supply does not uniquely identify one
#' row in the original data set, you will need to supply an aggregating
#' function, \code{fun.aggregate}. This function should take a vector of
#' numbers and return a single summary statistic.
#'
#' @usage acast(data, formula, fun.aggregate = NULL, ..., margins = NULL,
#' subset = NULL, fill=NULL, drop = TRUE, value.var = guess_value(data))
#' @usage dcast(data, formula, fun.aggregate = NULL, ..., margins = NULL,
#' subset = NULL, fill=NULL, drop = TRUE, value.var = guess_value(data))
#' @keywords manip
#' @param data molten data frame, see \code{\link{melt}}.
#' @param formula casting formula, see details for specifics.
#' @param fun.aggregate aggregation function needed if variables do not
#' identify a single observation for each output cell. Defaults to length
#' (with a message) if needed but not specified.
#' @param ... further arguments are passed to aggregating function
#' @param margins vector of variable names (can include "grand\_col" and
#' "grand\_row") to compute margins for, or TRUE to compute all margins .
#' Any variables that can not be margined over will be silently dropped.
#' @param subset quoted expression used to subset data prior to reshaping,
#' e.g. \code{subset = .(variable=="length")}.
#' @param fill value with which to fill in structural missings, defaults to
#' value from applying \code{fun.aggregate} to 0 length vector
#' @param drop should missing combinations dropped or kept?
#' @param value.var name of column which stores values, see
#' \code{\link{guess_value}} for default strategies to figure this out.
#' @seealso \code{\link{melt}}, \url{http://had.co.nz/reshape/}
#' @name cast
#' @aliases cast dcast acast
#' @export dcast acast
#' @import plyr
#' @import stringr
#' @examples
#' #Air quality example
#' names(airquality) <- tolower(names(airquality))
#' aqm <- melt(airquality, id=c("month", "day"), na.rm=TRUE)
#'
#' acast(aqm, day ~ month ~ variable)
#' acast(aqm, month ~ variable, mean)
#' acast(aqm, month ~ variable, mean, margins = TRUE)
#' dcast(aqm, month ~ variable, mean, margins = c("month", "variable"))
#'
#' library(plyr) # needed to access . function
#' acast(aqm, variable ~ month, mean, subset = .(variable == "ozone"))
#' acast(aqm, variable ~ month, mean, subset = .(month == 5))
#'
#' #Chick weight example
#' names(ChickWeight) <- tolower(names(ChickWeight))
#' chick_m <- melt(ChickWeight, id=2:4, na.rm=TRUE)
#'
#' dcast(chick_m, time ~ variable, mean) # average effect of time
#' dcast(chick_m, diet ~ variable, mean) # average effect of diet
#' acast(chick_m, diet ~ time, mean) # average effect of diet & time
#'
#' # How many chicks at each time? - checking for balance
#' acast(chick_m, time ~ diet, length)
#' acast(chick_m, chick ~ time, mean)
#' acast(chick_m, chick ~ time, mean, subset = .(time < 10 & chick < 20))
#'
#' acast(chick_m, time ~ diet, length)
#'
#' dcast(chick_m, diet + chick ~ time)
#' acast(chick_m, diet + chick ~ time)
#' acast(chick_m, chick ~ time ~ diet)
#' acast(chick_m, diet + chick ~ time, length, margins="diet")
#' acast(chick_m, diet + chick ~ time, length, drop = FALSE)
#'
#' #Tips example
#' dcast(melt(tips), sex ~ smoker, mean, subset = .(variable == "total_bill"))
#'
#' ff_d <- melt(french_fries, id=1:4, na.rm=TRUE)
#' acast(ff_d, subject ~ time, length)
#' acast(ff_d, subject ~ time, length, fill=0)
#' dcast(ff_d, treatment ~ variable, mean, margins = TRUE)
#' dcast(ff_d, treatment + subject ~ variable, mean, margins="treatment")
#' lattice::xyplot(`1` ~ `2` | variable, dcast(ff_d, ... ~ rep), aspect="iso")
NULL
cast <- function(data, formula, fun.aggregate = NULL, ..., subset = NULL, fill = NULL, drop = TRUE, value.var = guess_value(data)) {
if (!is.null(subset)) {
include <- data.frame(eval.quoted(subset, data))
data <- data[rowSums(include) == ncol(include), ]
}
formula <- parse_formula(formula, names(data), value.var)
value <- data[[value.var]]
# Need to branch here depending on whether or not we have strings or
# expressions - strings should avoid making copies of the data
vars <- lapply(formula, eval.quoted, envir = data, enclos = parent.frame(2))
# Compute labels and id values
ids <- lapply(vars, id, drop = drop)
labels <- mapply(split_labels, vars, ids, MoreArgs = list(drop = drop),
SIMPLIFY = FALSE, USE.NAMES = FALSE)
overall <- id(rev(ids), drop = FALSE)
ns <- vapply(ids, attr, 0, "n")
# Replace zeros (empty inputs) with 1 for dimensions of output
ns[ns == 0] <- 1
n <- attr(overall, "n")
# Aggregate duplicates
if (any(duplicated(overall)) || !is.null(fun.aggregate)) {
if (is.null(fun.aggregate)) {
message("Aggregation function missing: defaulting to length")
fun.aggregate <- length
}
ordered <- vaggregate(.value = value, .group = overall,
.fun = fun.aggregate, ..., .default = fill, .n = n)
overall <- seq_len(n)
} else {
# Add in missing values, if necessary
if (length(overall) < n) {
overall <- match(seq_len(n), overall, nomatch = NA)
} else {
overall <- order(overall)
}
ordered <- value[overall]
if (!is.null(fill)) {
ordered[is.na(ordered)] <- fill
}
}
list(
data = structure(ordered, dim = ns),
labels = labels
)
}
dcast <- function(data, formula, fun.aggregate = NULL, ..., margins = NULL, subset = NULL, fill=NULL, drop = TRUE, value.var = guess_value(data)) {
formula <- parse_formula(formula, names(data), value.var)
if (length(formula) > 2) {
stop("Dataframes have at most two output dimensions")
}
if (!is.null(margins)) {
data <- add_margins(data, lapply(formula, names), margins)
}
res <- cast(data, formula, fun.aggregate, ...,
subset = subset, fill = fill, drop = drop,
value.var = value.var)
data <- as.data.frame.matrix(res$data, stringsAsFactors = FALSE)
names(data) <- array_names(res$labels[[2]])
stopifnot(nrow(res$labels[[1]]) == nrow(data))
cbind(res$labels[[1]], data)
}
acast <- function(data, formula, fun.aggregate = NULL, ..., margins = NULL, subset = NULL, fill=NULL, drop = TRUE, value.var = guess_value(data)) {
formula <- parse_formula(formula, names(data), value.var)
if (!is.null(margins)) {
data <- add_margins(data, lapply(formula, names), margins)
}
res <- cast(data, formula, fun.aggregate, ...,
subset = subset, fill = fill, drop = drop, value.var = value.var)
dimnames(res$data) <- lapply(res$labels, array_names)
res$data
}
array_names <- function(df) {
do.call(paste, c(df, list(sep = "_")))
}