-
Notifications
You must be signed in to change notification settings - Fork 7
/
amce.R
318 lines (281 loc) · 16.3 KB
/
amce.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
#' @rdname amce
#' @title Tidy estimation of AMCEs
#' @description Estimate AMCEs for a conjoint analysis and return a tidy data frame of results
#' @param data A data frame containing variables specified in \code{formula}. All RHS variables should be factors; the base level for each will be used in estimation and its reported AMCE will be NA (for printing). Optionally, this can instead be an object of class \dQuote{survey.design} returned by \code{\link[survey]{svydesign}}.
#' @param formula A formula specifying an AMCE model to be estimated. All variables should be factors; all levels across features should be unique. Two-way constraints can be specified with an asterisk (*) between RHS features. The specific constrained level pairs within these features are then detected automatically. Higher-order constraints are not allowed.
#' @param variable An RHS formula containing a single factor variable from \code{formula}. This will be used by \code{amce_by_reference} to estimate AMCEs relative to each possible factor level as a reference category. If more than one RHS variables are specified, the first will be used.
#' @template id
#' @template weights
#' @template feature_order
#' @template feature_labels
#' @template level_order
#' @template alpha
#' @param \dots For \code{amce}: additional arguments to \code{\link[stats]{glm}} or \code{\link[survey]{svyglm}}, the latter being used if \code{weights} is non-NULL. For \code{amce_by_reference}: additional arguments passed to \code{amce}.
#' @return A data frame of class \dQuote{cj_amce}
#' @details \code{amce} provides estimates of AMCEs (or rather, average marginal effects for each feature level). Two-way constraints can be specified with an asterisk (*) between features. The specific constrained level pairs within these features are then detected automatically. The function can also be used for calculating average component interaction effects when combined with \code{interaction}, and for balance testing by specifying a covariate rather outcome on the left-hand side of \code{formula}. See examples.
#'
#' \code{amce_by_reference} provides a tool for quick sensitivity analysis. AMCEs are defined relative to an arbitrary reference category (i.e., feature level). This function will loop over all feature levels (for a specified feature) to show how interpretation will be affected by choice of reference category. The resulting data frame will be a stacked result from \code{amce}, containing an additional \code{REFERENCE} column specifying which level of \code{variable} was used as the reference category. In unconstrained conjoint designs, only AMCEs for \code{variable} will vary by reference category; in constrained designs, AMCEs for any factor constrained by \code{variable} may also vary.
#'
#' Users may desire to specify a \code{family} argument via \code{\dots}, which should be a \dQuote{family} object such as \code{gaussian}. Sensible alternatives are \code{binomial} (for binary outcomes) and quasibinomial (for weighted survey data). See \code{\link[stats]{family}} for details. In such cases, effects are always reported on the link (not outcome) scale.
#'
#' @examples
#' data("taxes")
#' # estimating AMCEs
#' amce(taxes, chose_plan ~ taxrate1 + taxrate2 + taxrate3 +
#' taxrate4 + taxrate5 + taxrate6 + taxrev, id = ~ ID)
#'
#' \donttest{
#' data("immigration")
#' # estimating AMCEs with constraints
#' amce(immigration, ChosenImmigrant ~ Gender + ReasonForApplication * CountryOfOrigin,
#' id = ~CaseID)
#'
#'# estimating average component interaction effects (AMCEs of feature combinations)
#' immigration$language_entry <- interaction(immigration$LanguageSkills,
#' immigration$PriorEntry, sep = "_")
#' amce(immigration,ChosenImmigrant ~ language_entry, id = ~CaseID)
#'
#' # balance testing example
#' plot(amce(immigration[!is.na(immigration$ethnocentrism),],
#' ethnocentrism ~ Gender + Education + LanguageSkills, id = ~ CaseID))
#'
#' # reference category sensitivity
#' x <- amce_by_reference(immigration, ChosenImmigrant ~ LanguageSkills + Education,
#' variable = ~ LanguageSkills, id = ~ CaseID)
#' # plot
#' plot(x)
#' }
#' @seealso \code{\link{amce_diffs}} \code{\link{mm}} \code{\link{plot.cj_amce}}
#' @import stats
#' @import survey
#' @importFrom sandwich vcovCL
#' @export
amce <-
function(
data,
formula,
id = ~ 0,
weights = NULL,
feature_order = NULL,
feature_labels = NULL,
level_order = c("ascending", "descending"),
alpha = 0.05,
...
) {
# get outcome variable
outcome <- all.vars(stats::update(formula, . ~ 0))
if (!length(outcome) || outcome == ".") {
stop("'formula' is missing a left-hand outcome variable")
}
# get RHS variables, variable labels, and factor levels
RHS <- all.vars(stats::update(formula, 0 ~ . ))
# detect constraints from 'formula'
formula_terms <- terms(formula)
if (any(attr(formula_terms, "order") >= 2)) {
# constraints are present
constrained_terms <- attr(formula_terms, "factors")[ , attr(formula_terms, "order") == 2, drop = FALSE]
constrained_vars <- colnames(constrained_terms)
constraints <- lapply(constrained_vars, function(tmp) {
as.formula(paste0("~", tmp))
})
# vector of constrained variables
constrained_vars <- unlist(lapply(constraints, all.vars))
## check that constraints do not include duplicated terms
if (any(duplicated(constrained_vars))) {
stop("All variables in constraints must be unique and constraints may only be two-way")
}
constrained_vars <- unique(constrained_vars)
# vector of unconstrained variables
unconstrained_vars <- RHS[!RHS %in% constrained_vars]
} else {
constraints <- NULL
constrained_vars <- NULL
unconstrained_vars <- RHS
}
# process feature_order argument
feature_order <- check_feature_order(feature_order, RHS)
# set level_order (within features) to ascending or descending
level_order <- match.arg(level_order)
# function to produce "fancy" feature labels
feature_labels <- clean_feature_labels(data = data, RHS = RHS, feature_labels = feature_labels)
# convert feature labels and levels to data frame
term_labels_df <- make_term_labels_df(data, feature_order, level_order = level_order)
# estimate model
if (inherits(data, "data.frame") && is.null(weights)) {
data <- cj_df(data)
svydesign <- survey::svydesign(ids = id, weights = ~ 1, data = data)
mod <- survey::svyglm(formula, design = svydesign, ...)
} else if (inherits(data, "data.frame")) {
data <- cj_df(data)
svydesign <- survey::svydesign(ids = id, weights = weights, data = data)
mod <- survey::svyglm(formula, design = svydesign, ...)
} else if (inherits(data, "survey.design")) {
svydesign <- data
data <- cj_df(data[["variables"]])
mod <- survey::svyglm(formula, design = svydesign, ...)
} else {
stop("'data' is not a 'data.frame' or 'survey.design' object")
}
# empty AMCEs (used in two places below)
empty_amces <- data.frame(outcome = character(),
statistic = character(),
feature = character(),
level = character(),
estimate = numeric(),
std.error = numeric(),
z = numeric(),
p = numeric(),
lower = numeric(),
upper = numeric(),
check.names = FALSE,
stringsAsFactors = FALSE)
# get coefficients as data frame (correct, if needed, for clustering)
coef_summary <- get_coef_summary(mod = mod, data = data, id = NULL, alpha = alpha)
# first estimate unconstrained terms
if (length(unconstrained_vars)) {
# extract 'unconstrained_vars'
out <- coef_summary
names(out) <- c("estimate", "std.error", "z", "p", "lower", "upper", "factor")
# cleanup term names
out <- cbind(clean_term_names(out[["factor"]], RHS), out[names(out) != "factor"])
# fill in missing base levels with 0s
out <- merge(out[!out$feature %in% c("", "(Intercept)"),],
term_labels_df,
by = c("feature", "level"), all = TRUE)
out[["estimate"]][is.na(out[["estimate"]])] <- 0
out[["outcome"]] <- outcome
out[["statistic"]] <- "amce"
# subset to variables in 'unconstrained_vars'
unconstrained_amces <- out[out[["feature"]] %in% unconstrained_vars, , drop = FALSE]
} else {
# if no 'unconstrained_vars', return empty data frame
unconstrained_amces <- empty_amces
}
# then calculate constrained terms
if (!is.null(constraints)) {
## for each constraint, we need to:
## > figure out what estimated coefficients go with each of the two terms (using `get_coef_metadata()` to identify terms and `get_coef_summary()` to get a clean model summary)
## > average effects of first constrained variable across levels of second variable
## > average effects of second constrained variable across levels of first variable
## > if we were to generalize this to higher-order constraints, the code would simply have to be made more complex
## > `rbind()` the two sets of results back together
constrained_amces <- lapply(constraints, function(one) {
# identify variables
variables_in_this_constraint <- setdiff(unlist(lapply(one, all.vars)), "~")
var1 <- variables_in_this_constraint[1L]
var2 <- variables_in_this_constraint[2L]
# function to calculate AMCEs by averaging coefficients
calculate_amce <- function(to_average, feature) {
# subset to estimable coefficients
to_average <- to_average[!is.na(to_average[["Estimate"]]), , drop = FALSE]
# make sure estimates are sorted in correct order
to_average <- to_average[order(to_average[["_order"]]),]
# var-cov matrix of these estimates
this_varcov <- vcov(mod)[to_average[["_coef"]], to_average[["_coef"]]]
# calculate AMCE, giving uniform weight to features
## calculate weights (base term is weighted 1; other alters equally)
wts <- c(1L, rep(1L/nrow(to_average), nrow(to_average) - 1L))
est <- sum(to_average[["Estimate"]] * wts)
## variance is weights %*% vcov %*% weights
variance <- (wts %*% this_varcov %*% wts)[1L,1L,drop = TRUE]
# populate output
averaged <- data.frame(outcome = outcome,
statistic = "amce",
feature = feature,
level = to_average[to_average[["_order"]] == 1L, "_base_level", drop = TRUE],
estimate = est,
std.error = sqrt(variance),
z = est/sqrt(variance),
p = 2L*(stats::pnorm(-abs(est/sqrt(variance)))),
check.names = FALSE,
stringsAsFactors = FALSE
)
averaged[["lower"]] <- est - (stats::qnorm(1-alpha) * sqrt(variance))
averaged[["upper"]] <- est + (stats::qnorm(1-alpha) * sqrt(variance))
return(averaged)
}
terms_df <- get_coef_metadata(mod)
# calculate MEs for first variable, constraining second
## subset terms_df to terms with 'var1' and/or 'var2'
terms_df1 <- terms_df[terms_df[[var1]] | terms_df[[var2]], , drop = FALSE]
# add term levels to 'term_df1'
terms_df1 <- identify_term_levels(terms_df1, data, base_var = var1, by_var = var2)
## merge terms_df with `coef_summary`
terms_df1 <- merge(coef_summary, terms_df1, by = "_coef")
## split merged object by 'var1' levels, excluding base level, calculating AMCE
terms_df1 <- terms_df1[terms_df1[["_base_level"]] != levels(data[[var1]])[1L], , drop = FALSE]
one_out1_list <- lapply(split(terms_df1, terms_df1[["_base_level"]], drop = TRUE), calculate_amce, feature = var1)
one_out1 <- do.call("rbind", one_out1_list)
# calculate MEs for second variable, constraining first
## subset terms_df to terms with 'var1' and/or 'var2'
terms_df2 <- terms_df[terms_df[[var1]] | terms_df[[var2]], , drop = FALSE]
# add term levels to 'term_df1'
terms_df2 <- identify_term_levels(terms_df2, data, base_var = var2, by_var = var1)
## merge terms_df with `coef_summary`
terms_df2 <- merge(coef_summary, terms_df2, by = "_coef")
## split merged object by 'var1' levels, excluding base level, calculating AMCE
terms_df2 <- terms_df2[terms_df2[["_base_level"]] != levels(data[[var2]])[1L], , drop = FALSE]
one_out2_list <- lapply(split(terms_df2, terms_df2[["_base_level"]], drop = TRUE), calculate_amce, feature = var2)
one_out2 <- do.call("rbind", one_out2_list)
# NOTE: IF WE HAD HIGHER-ORDER CONSTRAINTS, WE WOULD NEED ADDITIONAL STEPS HERE
# combine both sets of AMCEs for this constraint
one_out <- rbind(one_out1, one_out2)
one_out[["BY"]] <- NULL
# restore base levels of both variables
one_out <- rbind(one_out,
data.frame(statistic = rep("amce", 2L),
outcome = rep(outcome, 2L),
feature = c(var1,var2),
level = c(levels(data[[var1]])[1L], levels(data[[var2]])[1L]),
estimate = c(0L, 0L),
std.error = NA_real_,
z = NA_real_,
p = NA_real_,
lower = NA_real_,
upper = NA_real_))
rownames(one_out) <- seq_len(nrow(one_out))
return(one_out)
})
# rbind all constraint pairs together
constrained_amces <- do.call("rbind", constrained_amces)
} else {
# if no 'constrained_vars', return empty data frame
constrained_amces <- empty_amces
}
# update out with constrained AMCEs
out <- rbind(unconstrained_amces, constrained_amces)
# label features and levels
out[["level"]] <- factor(out[["level"]], levels = term_labels_df[["level"]])
out[["feature"]] <- factor(out[["feature"]],
levels = feature_order,
labels = feature_labels[feature_order])
out <- out[c("outcome", "statistic", "feature", "level", "estimate", "std.error", "z", "p", "lower", "upper")]
out <- out[order(out$level),]
rownames(out) <- seq_len(nrow(out))
# return
return(structure(out, class = c("cj_amce", "data.frame")))
}
#' @rdname amce
#' @export
amce_by_reference <- function(data, formula, variable, ...) {
# get outcome variable
variable <- all.vars(stats::update(variable, 0 ~ .))[[1L]]
# get levels
if (!inherits(data[[variable]], "factor")) {
stop("'variable' must be a factor variable")
}
levs <- levels(data[[variable]])
# loop over levels
out <- list()
for (i in seq_along(levs)) {
data[[variable]] <- relevel(data[[variable]], levs[i])
out[[i]] <- amce(data, formula, ...)
out[[i]][["REFERENCE"]] <- levs[i]
}
# return value
## stack
out <- do.call("rbind", out)
## add reference category column
out[["REFERENCE"]] <- factor(out[["REFERENCE"]], levels = levs)
return(structure(out, class = c("cj_amce", "data.frame"), by = "REFERENCE"))
}