-
Notifications
You must be signed in to change notification settings - Fork 7
/
cj.R
116 lines (106 loc) · 5.1 KB
/
cj.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#' @rdname cregg
#' @importFrom utils hasName
#' @export
cj <-
function(
data,
formula,
id = ~ 0,
weights = NULL,
estimate = c("amce", "frequencies", "mm", "amce_differences", "mm_differences"),
feature_order = NULL,
feature_labels = NULL,
level_order = c("ascending", "descending"),
by = NULL,
...
) {
estimate <- match.arg(estimate)
if (!is.null(by)) {
# get RHS variables, variable labels, and factor levels
RHS <- all.vars(stats::update(formula, 0 ~ . ))
# get RHS variables, variable labels, and factor levels
by_vars <- all.vars(stats::update(by, 0 ~ . ))
## check `by` variables
for (b in by_vars) {
if (!utils::hasName(data, b)) {
stop(sprintf("Variable '%s' in `by` not found in 'data'", b))
}
# check that all variables in `by` are factors
if (utils::hasName(data, b) && !inherits(data[[b]], "factor")) {
stop(sprintf("Variable '%s' in `by` must be a factor", b))
}
# check for empty string values in by vars; error if present
if ("" %in% unique(data[[b]])) {
stop(sprintf("'by' variables cannot contain \"\" values (see '%s')", b))
}
}
# process feature_order argument
feature_order <- check_feature_order(feature_order, RHS)
# set level_order (within features) to ascending or descending
level_order <- match.arg(level_order)
# function to produce "fancy" feature labels
## we need to handle this here, otherwise `split()` will drop feature labels from variable attributes
feature_labels <- clean_feature_labels(data = data, RHS = RHS, feature_labels = feature_labels)
# convert feature labels and levels to data frame
term_labels_df <- make_term_labels_df(data, feature_order, level_order = level_order)
# amce_diffs handles `by` internally
if (estimate == "mm_differences") {
return(mm_diffs(data = data, formula = formula, by = by, id = id, weights = weights,
feature_order = feature_order, feature_labels = feature_labels, level_order = level_order, ...))
} else if (estimate == "amce_differences") {
return(amce_diffs(data = data, formula = formula, by = by, id = id, weights = weights,
feature_order = feature_order, feature_labels = feature_labels, level_order = level_order, ...))
}
# split
split_df <- split(data, model.frame(by, data = data, na.action = NULL), sep = "***")
# get results for subsets
BY <- list()
for(i in seq_along(split_df)) {
# execute function on subset of data
BY[[i]] <- switch(estimate, amce = amce, freq = cj_freqs, mm = mm)(
data = split_df[[i]],
formula = formula,
id = id,
weights = weights,
feature_order = feature_order,
feature_labels = feature_labels,
level_order = level_order,
...)
BY[[i]][["BY"]] <- i
}
## get names of subsets
by_vals <- stats::setNames(do.call("rbind.data.frame", strsplit(names(split_df), "***", fixed = TRUE)), by_vars)
by_vals[["BY"]] <- factor(seq_len(nrow(by_vals)))
# combine back into data frame
out <- do.call("rbind", BY)
out <- structure(
merge(out, by_vals, by = "BY"),
class = c(paste0("cj_", estimate), "data.frame"),
BY = TRUE
)
out[["BY"]] <- factor(names(split_df)[out$BY])
out[["statistic"]] <- estimate
# label features and levels
out <- out[c("BY", "outcome", "statistic", "feature", "level", "estimate", "std.error", "z", "p", "lower", "upper", by_vars)]
# make sure factor levels match input
for (b in by_vars) {
out[[b]] <- factor(out[[b]], levels(data[[b]]))
}
rownames(out) <- seq_len(nrow(out))
} else {
if (estimate %in% c("mm_differences", "amce_differences")) {
stop("Argument 'by' is required when estimate %in% c('mm_differences', 'amce_differences')")
}
by_vars <- NULL
out <- switch(estimate,
amce = amce(data = data, formula = formula, id = id, weights = weights,
feature_order = feature_order, feature_labels = feature_labels, level_order = level_order, ...),
freqs = cj_freqs(data = data, formula = formula, id = id, weights = weights,
feature_order = feature_order, feature_labels = feature_labels, level_order = level_order, ...),
mm = mm(data = data, formula = formula, id = id, weights = weights,
feature_order = feature_order, feature_labels = feature_labels, level_order = level_order, ...)
)
}
# return value
return(structure(out, by = by_vars))
}