/
power.R
319 lines (315 loc) · 11 KB
/
power.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
#' Estimate the same model on different size subsets of data
#'
#' This function estimates the same model multiple times using different size
#' subsets of a set of choice data and then returns a data frame of the
#' estimated model coefficients and standard errors for each sample size. This
#' is useful for determining the required sample size for obtaining a desired
#' level of statistical power on each coefficient. The number of models to
#' estimate is set by the `nbreaks` argument, which breaks up the data into
#' groups of increasing sample sizes. All models are estimated models using
#' the 'logitr' package. For more details see the JSS article on the 'logitr'
#' package (Helveston, 2023).
#' @keywords logitr mnl mxl mixed logit sample size power
#'
#' @param data The data, formatted as a `data.frame` object.
#' @param outcome The name of the column that identifies the outcome variable,
#' which should be coded with a `1` for `TRUE` and `0` for `FALSE`.
#' @param obsID The name of the column that identifies each observation.
#' @param pars The names of the parameters to be estimated in the model.
#' Must be the same as the column names in the `data` argument.
#' @param randPars A named vector whose names are the random parameters and
#' values the distribution: `'n'` for normal or `'ln'` for log-normal.
#' Defaults to `NULL`.
#' @param nbreaks The number of different sample size groups.
#' @param n_q Number of questions per respondent. Defaults to `1` if not
#' specified.
#' @param return_models If `TRUE`, a list of all estimated models is returned.
#' This can be useful if you want to extract other outputs from each model,
#' such as the variance-covariance matrix, etc. Defaults to `FALSE`.
#' @param panelID The name of the column that identifies the individual (for
#' panel data where multiple observations are recorded for each individual).
#' Defaults to `NULL`.
#' @param clusterID The name of the column that identifies the cluster
#' groups to be used in model estimation. Defaults to `NULL`.
#' @param robust Determines whether or not a robust covariance matrix is
#' estimated. Defaults to `FALSE`. Specification of a `clusterID` will override
#' the user setting and set this to `TRUE' (a warning will be displayed in this
#' case). Replicates the functionality of Stata's cmcmmixlogit.
#' @param predict If `TRUE`, predicted probabilities, fitted values, and
#' residuals are also included in the returned model objects. Defaults to
#' `FALSE`.
#' @param n_cores The number of cores to use for parallel processing.
#' Set to `1` to run serially Defaults to `NULL`, in which case the number of
#' cores is set to `parallel::detectCores() - 1`. Max cores allowed is capped
#' at `parallel::detectCores()`.
#' @param ... Other arguments that are passed to `logitr::logitr()` for model
#' estimation. See the {logitr} documentation for details about other
#' available arguments.
#' @references
#' Helveston, J. P. (2023). logitr: Fast Estimation of Multinomial and Mixed Logit Models with Preference Space and Willingness-to-Pay Space Utility Parameterizations. Journal of Statistical Software, 105(10), 1–37,
#' \doi{10.18637/jss.v105.i10}
#' @return Returns a data frame of estimated model coefficients and standard
#' errors for the same model estimated on subsets of the `data` with increasing
#' sample sizes.
#' @export
#' @examples
#' library(cbcTools)
#'
#' # A simple conjoint experiment about apples
#'
#' # Generate all possible profiles
#' profiles <- cbc_profiles(
#' price = c(1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5),
#' type = c("Fuji", "Gala", "Honeycrisp"),
#' freshness = c('Poor', 'Average', 'Excellent')
#' )
#'
#' # Make a survey design from all possible profiles
#' # (This is the default setting where method = 'full' for "full factorial")
#' design <- cbc_design(
#' profiles = profiles,
#' n_resp = 300, # Number of respondents
#' n_alts = 3, # Number of alternatives per question
#' n_q = 6 # Number of questions per respondent
#' )
#'
#' # Simulate random choices
#' data <- cbc_choices(
#' design = design,
#' obsID = "obsID"
#' )
#'
#' # Conduct a power analysis
#' power <- cbc_power(
#' data = data,
#' pars = c("price", "type", "freshness"),
#' outcome = "choice",
#' obsID = "obsID",
#' nbreaks = 10,
#' n_q = 6,
#' n_cores = 2
#' )
cbc_power <- function(
data,
outcome,
obsID,
pars,
randPars = NULL,
nbreaks = 10,
n_q = 1,
return_models = FALSE,
panelID = NULL,
clusterID = NULL,
robust = FALSE,
predict = FALSE,
n_cores = NULL,
...
) {
n_cores <- set_num_cores(n_cores)
dataList <- make_data_list(data, obsID, nbreaks, n_q)
if (n_cores == 1) {
message("Estimating models...")
suppressMessages(suppressWarnings(
models <- lapply(
dataList,
logitr::logitr,
outcome = outcome,
obsID = obsID,
pars = pars,
randPars = randPars,
panelID = panelID,
clusterID = clusterID,
robust = robust,
predict = predict,
... = ...
)
))
} else {
message("Estimating models using ", n_cores, " cores...")
if (Sys.info()[['sysname']] == 'Windows') {
cl <- parallel::makeCluster(n_cores, "PSOCK")
suppressMessages(suppressWarnings(
models <- parallel::parLapply(
cl = cl,
dataList,
logitr::logitr,
outcome = outcome,
obsID = obsID,
pars = pars,
randPars = randPars,
panelID = panelID,
clusterID = clusterID,
robust = robust,
predict = predict,
... = ...
)
))
parallel::stopCluster(cl)
} else {
suppressMessages(suppressWarnings(
models <- parallel::mclapply(
dataList,
logitr::logitr,
outcome = outcome,
obsID = obsID,
pars = pars,
randPars = randPars,
panelID = panelID,
clusterID = clusterID,
robust = robust,
predict = predict,
... = ...,
mc.cores = n_cores
)
))
}
}
message("done!")
# Add sample size to each model
sizes <- names(models)
for (i in seq_len(length(models))) {
models[[i]]$sampleSize <- as.numeric(sizes[i])
}
if (return_models) {
class(models) <- c("cbc_models", "list")
return(models)
}
errors <- extract_errors(models)
class(errors) <- c("cbc_errors", "data.frame")
return(errors)
}
set_num_cores <- function(n_cores) {
cores_available <- parallel::detectCores()
max_cores <- cores_available - 1
# CRAN checks limits you to 2 cores, see this SO issue:
# https://stackoverflow.com/questions/50571325/r-cran-check-fail-when-using-parallel-functions
chk <- tolower(Sys.getenv("_R_CHECK_LIMIT_CORES_", ""))
if (nzchar(chk) && (chk != "false")) {
# use 2 cores in CRAN/Travis/AppVeyor
return(2L)
}
if (is.null(n_cores)) {
return(max_cores)
} else if (!is.numeric(n_cores)) {
warning(
"Non-numeric value provided for n_cores...setting n_cores to ",
max_cores
)
return(max_cores)
} else if (n_cores > cores_available) {
warning(
"Cannot use ", n_cores, " cores because your machine only has ",
cores_available, " available...setting n_cores to ", max_cores
)
return(max_cores)
}
return(n_cores)
}
make_data_list <- function(data, obsID, nbreaks, n_q) {
maxObs <- max(data[obsID])
nObs <- ceiling(seq(ceiling(maxObs/nbreaks), maxObs, length.out = nbreaks))
dataList <- list()
for (i in 1:nbreaks) {
temp <- data[which(data[,obsID] %in% seq(nObs[i])),]
temp$sampleSize <- round(nObs[i] / n_q)
dataList[[i]] <- temp
}
sampleSizes <- unlist(lapply(dataList, function(x) unique(x$sampleSize)))
names(dataList) <- sampleSizes
return(dataList)
}
extract_errors <- function(models) {
sampleSize <- unlist(lapply(models, function(x) x$sampleSize))
est <- lapply(models, function(x) stats::coef(x))
se <- lapply(models, function(x) logitr::se(x))
names <- lapply(est, names)
results <- data.frame(
sampleSize = rep(sampleSize, each = length(est[[1]])),
coef = do.call(c, names),
est = do.call(c, est),
se = do.call(c, se)
)
row.names(results) <- NULL
return(results)
}
#' Plot a comparison of different design powers
#'
#' This function creates a ggplot2 object comparing the power curves of
#' different designs. Each design is color coded and each facet (sub plot)
#' is a model coefficient.
#' @param ... Any number of data frame containing power results obtained from
#' the `cbc_power()` function, separated by commas.
#' @return A plot comparing the power curves of different designs.
#' @importFrom ggplot2 ggplot aes geom_hline geom_point expand_limits theme_bw
#' theme element_blank labs facet_wrap
#' @importFrom rlang .data
#' @export
#' @examples
#' \dontrun{
#' library(cbcTools)
#'
#' # Generate all possible profiles
#' profiles <- cbc_profiles(
#' price = c(1, 1.5, 2, 2.5, 3),
#' type = c("Fuji", "Gala", "Honeycrisp"),
#' freshness = c('Poor', 'Average', 'Excellent')
#' )
#'
#' # Make designs to compare: full factorial vs bayesian d-efficient
#' design_random <- cbc_design(
#' profiles = profiles,
#' n_resp = 100, n_alts = 3, n_q = 6
#' )
#' # Same priors will be used in bayesian design and simulated choices
#' priors <- list(
#' price = -0.1,
#' type = c(0.1, 0.2),
#' freshness = c(0.1, 0.2)
#' )
#' design_bayesian <- cbc_design(
#' profiles = profiles,
#' n_resp = 100, n_alts = 3, n_q = 6, n_start = 1, method = "CEA",
#' priors = priors, parallel = FALSE
#' )
#'
#' # Obtain power for each design by simulating choices
#' power_random <- design_random |>
#' cbc_choices(obsID = "obsID", priors = priors) |>
#' cbc_power(
#' pars = c("price", "type", "freshness"),
#' outcome = "choice", obsID = "obsID", nbreaks = 5, n_q = 6, n_cores = 2
#' )
#' power_bayesian <- design_bayesian |>
#' cbc_choices(obsID = "obsID", priors = priors) |>
#' cbc_power(
#' pars = c("price", "type", "freshness"),
#' outcome = "choice", obsID = "obsID", nbreaks = 5, n_q = 6, n_cores = 2
#' )
#'
#' # Compare power of each design
#' plot_compare_power(power_bayesian, power_random)
#' }
plot_compare_power <- function(...) {
power <- list(...)
design_names <- unlist(lapply(as.list(match.call())[-1], deparse))
names(power) <- design_names
for (i in 1:length(power)) {
power[[i]]$design <- names(power)[i]
}
power <- do.call(rbind, power)
ggplot2::ggplot(power) +
geom_hline(yintercept = 0.05, color = "red", linetype = 2) +
geom_point(
aes(x = .data$sampleSize, y = .data$se, color = .data$design),
size = 1.8
) +
facet_wrap(~.data$coef) +
expand_limits(y = 0) +
theme_bw(base_size = 14) +
theme(panel.grid.minor = element_blank()) +
labs(
color = "Design",
x = "Sample size",
y = "Standard error"
)
}