-
Notifications
You must be signed in to change notification settings - Fork 21
/
panel_data.R
442 lines (368 loc) · 13.3 KB
/
panel_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
#' @title Create panel data frames
#' @description Format your data for use with \pkg{panelr}.
#' @param data A data frame.
#' @param id The name of the column (unquoted) that identifies
#' participants/entities. A new column will be created called `id`,
#' overwriting any column that already has that name.
#' @param wave The name of the column (unquoted) that identifies
#' waves or periods. A new column will be created called `wave`,
#' overwriting any column that already has that name.
#' @param ... Attributes for adding onto this method. See
#' [tibble::new_tibble()] for a run-through of the logic.
#' @return A `panel_data` object.
#' @examples
#' data("WageData")
#' wages <- panel_data(WageData, id = id, wave = t)
#'
#' @rdname panel_data
#' @import dplyr
#' @export
panel_data <- function(data, id = id, wave = wave, ...) {
id <- as_name(enexpr(id))
wave <- as_name(enexpr(wave))
if (id %nin% names(data)) {
stop(id, "was not found in the data.")
} else if (wave %nin% names(data)) {
stop(wave, "was not found in the data.")
}
# Let's make sure ID var doesn't get confused for numeric
if (!is.factor(data[[id]])) {data[[id]] <- factor(data[[id]])}
# Group by case ID
if (id %nin% group_vars(data)) {data <- group_by(data, !!sym(id), .add = TRUE)}
# Warn about multi-grouped DFs
if (length(group_vars(data)) > 1) {
msg_wrap("Detected additional grouping variables. Be aware this may
cause unexpected behavior or incorrect results.")
}
# Make sure wave variable is in format I can understand
if (is.factor(data[[wave]]) & !is.ordered(data[[wave]])) {
data[[wave]] <- factor(data[[wave]], ordered = TRUE)
msg_wrap("Unordered factor wave variable was converted to ordered.
You should check that the order is correct.")
periods <- levels(data[[wave]])
} else if (!valid_wave(data[[wave]])) {
stop("The wave variable must be numeric or an ordered factor.")
} else {
periods <- sort(unique(data[[wave]]))
}
# if (!is.ordered(wave) && 0 %in% data[[wave]]) {
# message("There cannot be a wave 0. Adding 1 to each wave.\n")
# data[[wave]] <- data[[wave]] + 1
# periods <- periods + 1
# }
# Ordering by wave and then group ensures lag functions work right
data <- arrange(data, !!sym(wave), .by_group = TRUE)
# Order the columns to put id and wave first (must do this *before*
# creating the panel_data object or else there are endless loops in
# reconstruct functions)
data <- data[c(id, wave, names(data) %not% c(id, wave))]
# Inherit from df, tibble, and grouped_df (last one is critical)
data <- tibble::new_tibble(data, ...,
id = id,
wave = wave,
class = c("panel_data", "grouped_df"),
nrow = nrow(data),
periods = periods)
return(data)
}
#' @title Check if object is panel_data
#' @description This is a convenience function that checks whether an object
#' is a `panel_data` object.
#' @param x Any object.
#' @examples
#' data("WageData")
#' is_panel(WageData) # FALSE
#' wages <- panel_data(WageData, id = id, wave = t)
#' is_panel(wages) # TRUE
#' @export
is_panel <- function(x) {
"panel_data" %in% class(x)
}
#' @title Filter out entities with too few observations
#' @description This function allows you to define a minimum number of
#' waves/periods and exclude all individuals with fewer observations than
#' that.
#' @param data A [panel_data()] frame.
#' @param ... Optionally, unquoted variable names/expressions separated by
#' commas to be passed to [dplyr::select()]. Otherwise, all columns are
#' included if `formula` and `vars` are also NULL.
#' @param formula A formula, like the one you'll be using to specify your model.
#' @param vars As an alternative to formula, a vector of variable names.
#' @param min.waves What is the minimum number of observations to be kept?
#' Default is `"all"`, but it can be any number.
#' @return A `panel_data` frame.
#' @details
#'
#' If `...` (that is, unquoted variable name(s)) are included, then `formula`
#' and `vars` are ignored. Likewise, `formula` takes precedence over `vars`.
#' These are just different methods for selecting variables and you can choose
#' whichever you prefer/are comfortable with. `...` corresponds with the
#' "tidyverse" way, `formula` is useful for programming or working with
#' model formulas, and `vars` is a "standard" evaluation method for when you
#' are working with strings.
#'
#' @examples
#'
#' data("WageData")
#' wages <- panel_data(WageData, id = id, wave = t)
#' complete_data(wages, wks, lwage, min.waves = 3)
#'
#' @rdname complete_data
#' @export
#' @importFrom stats complete.cases as.formula
complete_data <- function(data, ..., formula = NULL, vars = NULL,
min.waves = "all") {
# OG data frame for reconstruct()
old <- data
id <- get_id(data)
wave <- get_wave(data)
# Handling case of no selected vars --- I want to assume a selection
# of none means selection of all rather than default select behavior
# (which is to return nothing)
cols <- enexprs(...)
if (length(cols) == 0 & is.null(formula) & is.null(vars)) {
cols <- names(data)
cols <- sapply(cols, backtick_name) # Avoid parsing non-syntactic names
cols <- lapply(cols, parse_expr)
# cols <- c(sym(id), sym(wave), cols)
d <- select(data, !!! cols)
} else if (length(cols) > 0) {
# cols <- lapply(cols, parse_expr)
# cols <- c(sym(id), sym(wave), cols)
d <- select(data, !!! cols)
} else {
if (!is.null(formula)) {
d <- data[all.vars(formula)]
} else if (!is.null(vars)) {
d <- data[vars]
} else {
d <- data
}
}
# Keep only complete cases
d <- d[complete.cases(d),]
# Using the table to count up how many obs. of each person
t <- table(d[[id]])
if (min.waves == "all") {
min.waves <- max(t) # Whoever has the most observations has all the waves
}
# Keep only people who were observed minimum number of times
keeps <- which(t >= min.waves)
keeps <- names(t)[keeps]
data <- dplyr::filter(data, !! sym(id) %in% keeps)
data <- reconstruct(data, old)
return(data)
}
#' @importFrom tibble deframe
#' @import dplyr
#' @import rlang
is_varying <- function(data, variable) {
variable <- ensym(variable)
out <- data %>%
# For each group, does the variable vary?
transmute(variable := n_distinct(!! variable, na.rm = TRUE) %in% c(0L,1L)) %>%
ungroup()
out <- out[["variable"]]
# Asking if all groups had zero changes within the groups
out <- all(out, na.rm = TRUE)
# Because the above operation basically produces the answer to is_constant
# I now need to return the opposite of out
return(!out)
}
#' @title Check if variables are constant or variable over time.
#' @description This function is designed for use with [panel_data()] objects.
#' @param data A data frame, typically of [panel_data()] class.
#' @param ... Variable names. If none are given, all variables are checked.
#' @param type Check for variance over time or across individuals? Default
#' is `"time"`. `"individual"` considers variables like age to be non-varying
#' because everyone ages at the same speed.
#' @return A named logical vector. If TRUE, the variable is varying.
#' @examples
#'
#' wages <- panel_data(WageData, id = id, wave = t)
#' wages %>% are_varying(occ, ind, fem, blk)
#'
#' @rdname are_varying
#' @import rlang
#' @importFrom purrr map_lgl
#' @importFrom stringr str_detect
#' @export
are_varying <- function(data, ..., type = "time") {
wave <- get_wave(data)
id <- get_id(data)
class(data) <- class(data)[class(data) %nin% "panel_data"]
dots <- quos(...)
if (length(dots) == 0) {
dnames <- names(data) %not% c(wave, id)
dots <- syms(as.list(dnames))
} else {
# This gives an unsurprising "adding grouping variable" message
suppressMessages(data <- dplyr::select(data, ...))
dots <- as.character(enexprs(...))
is_wave <- if (wave %in% dots) NULL else wave
dots <- syms(
as.list(names(data) %not% c(id, is_wave))
)
}
# Get time variation
if ("time" %in% type) {
outt <- map_lgl(dots, function(x, d) {
is_varying(!! x, data = select(d, !! id, !! x))
}, d = data)
}
# Get individual variation
if ("individual" %in% type) {
outi <- map_lgl(dots, function(x, d) {
is_varying_individual(!! x, data = select(d, !! id, !! x))
}, d = data)
# If both, rbind them into a d.f.
if (exists("outt")) {
out <- as.data.frame(rbind(outt, outi))
rownames(out) <- c("time", "individual")
} else {out <- outi}
}
# If not both, make time the out object
if (!exists("out", inherits = FALSE)) {
out <- outt
}
names(out) <- as.character(unlist(dots))
out
}
#' @importFrom tibble deframe
#' @import dplyr
#' @import rlang
#' @importFrom stats var
is_varying_individual <- function(data, variable) {
variable <- enquo(variable)
# Need to deal with non-numeric data
if (!is.numeric(data[[as_name(variable)]])) {
# If ordered, pretend it's numeric
if (is.ordered(data[[as_name(variable)]])) {
data[[as_name(variable)]] <- as.numeric(data[[as_name(variable)]])
} else {
# Otherwise just check if it varies at all
return(is_varying(data, !! variable))
}
}
out <- data %>%
# make new variable with the within-subject variance
mutate(variable = var(!! variable, na.rm = TRUE)) %>%
# ungroup
ungroup() %>%
# select only our new value
select(variable) %>%
# change to a vector
deframe() %>%
# see how many distinct values there are
n_distinct(na.rm = TRUE) %nin% c(0L, 1L)
}
## Using these to get around limitations with constants that are measured
## after Wave 1 in labeled wide data. e.g., the wide data has var_W2, but only
## measured in W2.
set_constants <- function(data, vars) {
constants <- lapply(syms(vars), set_constant, data = data)
data[vars] <- constants
return(data)
}
set_constant <- function(data, var) {
var <- enquo(var)
var_name <- quo_name(var)
transmute(data, !! var_name := uniq_nomiss(!! var)) %>%
unpanel() %>%
select(!! var_name) %>%
deframe()
}
## This is my way of grabbing the lone non-NA value from the group
## ...unless, of course, they are all NA in which case I need to give back NA
uniq_nomiss <- function(x) {
un <- unique(x)
if (!all(is.na(un))) {
un <- un[!is.na(un)]
}
return(un)
}
#' @export
#' @importFrom tibble tbl_sum
#'
tbl_sum.panel_data <- function(x, ...) {
default_header <- NextMethod()
names(default_header)[1] <- "Panel data"
names(default_header)[2] <- "Entities"
periods <- get_periods(x)
if (length(periods) > 3) {
periods <- paste0(paste(periods[1:3], collapse = ", "), ", ... (",
n_distinct(periods), " waves)")
} else {
periods <- paste0(paste(periods, collapse = ", "), " (",
n_distinct(periods), " waves)")
}
the_periods <- paste0(get_wave(x), " [", periods, "]")
c(default_header, "Wave variable" = the_periods)
}
#' @title Convert panel_data to regular data frame
#' @description This convenience function removes the special features of
#' `panel_data`.
#' @param panel A `panel_data` object.
#' @return An ungrouped `tibble`.
#' @examples
#' data("WageData")
#' wages <- panel_data(WageData, id = id, wave = t)
#' wages_non_panel <- unpanel(wages)
#' @export
unpanel <- function(panel) {
class(panel) <- class(panel) %not% c("panel_data", "grouped_df")
attributes(panel) <- attributes(panel)[names(attributes(panel)) %not% "groups"]
return(panel)
}
#' @title Retrieve panel_data metadata
#' @description `get_id()`, `get_wave()`, and `get_periods()` are extractor
#' functions that can be used to retrieve the names of the id and wave
#' variables or time periods of a `panel_data` frame.
#' @param data A `panel_data` frame
#' @return A `panel_data` frame
#' @examples
#'
#' data("WageData")
#' wages <- panel_data(WageData, id = id, wave = t)
#' get_wave(wages)
#' get_id(wages)
#' get_periods(wages)
#'
#' @rdname get_wave
#' @export
get_wave <- function(data) {
attr(data, "wave")
}
#' @export
#' @rdname get_wave
get_id <- function(data) {
attr(data, "id")
}
#' @export
#' @rdname get_wave
get_periods <- function(data) {
attr(data, "periods")
}
#' @importFrom methods is
valid_wave <- function(x) {
is.numeric(x) | is.ordered(x) | is(x, "Date") | inherits(x, "POSIXct") |
inherits(x, "POSIXlt") | inherits(x, "POSIXt") | inherits(x, "difftime")
}
##### internal panel_data tools #############################################
complete_cases <- function(data, min.waves = "all") {
id <- get_id(data)
wave <- get_wave(data)
# Keep only complete cases
data <- dplyr::filter(unpanel(data), complete.cases(data))
data <- panel_data(data, id = !! sym(id), wave = !! sym(wave))
# Using the table to count up how many obs. of each person
t <- table(data[[id]])
if (min.waves == "all") {
min.waves <- max(t) # Whoever has the most observations has all the waves
}
# Keep only people who were observed minimum number of times
keeps <- which(t >= min.waves)
keeps <- names(t)[keeps]
data <- dplyr::filter(data, !! sym(id) %in% keeps)
return(data)
}