-
Notifications
You must be signed in to change notification settings - Fork 0
/
mode-df.R
196 lines (192 loc) · 8.45 KB
/
mode-df.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#' Tabulate mode estimates with the certainty about them
#'
#' @description `mode_df()` takes a data frame (or another list) of numeric
#' vectors and computes the mode or modes of each element. Where the true mode
#' is unknown due to missing values, more and more `NA`s are ignored until an
#' estimate for the mode is found.
#'
#' Estimates are presented along with information about whether they are known
#' to be the true mode, how many `NA`s had to be ignored during estimation,
#' the rate of ignored `NA`s, etc.
#'
#' @param x List of vectors. Each vector needs to be numeric or similar. Note
#' that data frames are lists, so `x` can be a data frame.
#' @param method String. How to determine the mode(s)? Options are:
#' - `"first"` for [`mode_first()`], the default.
#' - `"all"` for [`mode_all()`]. This may return multiple values per estimate.
#' - `"single"` for [`mode_single()`]. The only option that can return `NA`
#' estimates.
#' @param na.rm.from String. Only relevant to the default `method = "first"`.
#' Where to start when removing `NA`s from `x`? Options are `"start"`,
#' `"end"`, and `"random"`. Default is `"start"`.
#' @param accept Passed on to [`mode_first()`] and [`mode_single()`]. Default is
#' `FALSE`.
#' @param multiple Passed on to [`mode_single()`]. Default is `"NA"`.
#'
#' @details The function deals with missing values (`NA`s) by first checking
#' whether they make the true mode unknown. If they do, it removes one `NA`,
#' then checks again; and so on until an estimate is found.
#'
#' This strategy is based on the `na.rm.amount` argument of [`mode_first()`],
#' [`mode_all()`], and [`mode_single()`]. It represents a middle way between
#' simply ignoring all `NA`s and not even trying to compute an estimate.
#' Instead, it only removes the minimum number of `NA`s necessary, because
#' some distributions have a known mode (or set of modes) even if some of
#' their values are missing. By keeping track of the removed `NA`s,
#' `mode_df()` quantifies the uncertainty about its estimates.
#'
#' @return Tibble (data frame) with these columns:
#' - `term`: the names of `x` elements. Only present if any are named.
#' - `estimate`: the modes of `x` elements, ignoring as many `NA`s as
#' necessary. List-column if `method = "all"`.
#' - `certainty`: `TRUE` if the corresponding estimate is certain to be the
#' true mode, and `FALSE` if this is unclear due to missing values.
#' - `na_ignored`: the number of missing values that had to be ignored to
#' arrive at the estimate.
#' - `na_total`: the total number of missing values.
#' - `rate_ignored_na`: the proportion of missing values that had to be
#' ignored from among all missing values.
#' - `sum_total`: the total number of values, missing or not.
#' - `rate_ignored_sum`: the proportion of missing values that had to be
#' ignored from among all values, missing or not.
#'
#' @export
#'
#' @include mode-proper.R
#'
#' @examples
#' # Use a list of numeric vectors:
#' my_list <- list(
#' a = 1:15,
#' b = c(1, 1, NA),
#' c = c(4, 4, NA, NA, NA, NA),
#' d = c(96, 24, 3, NA)
#' )
#'
#' mode_df(my_list)
#'
#' # Data frames are allowed:
#' mode_df(iris[1:4])
# # Test with:
# x <- list(
# a = 1:15,
# b = c(1, 1, NA),
# c = c(4, 4, NA, NA, NA, NA),
# d = c(96, 24, 3, NA)
# )
# method <- "first"
# na.rm.amount <- 0
# na.rm.from <- c("first", "last", "random")
# accept <- FALSE
# multiple <- "NA"
mode_df <- function(x, method = c("first", "all", "single"),
na.rm.from = c("first", "last", "random"), accept = FALSE,
multiple = c("NA", "min", "max", "mean", "median",
"first", "last", "random")) {
# Check that `x` is a list because the point of this function is to find
# estimates for the mode of each element of `x`:
if (!is.list(x)) {
stop("`x` must be a list.")
}
# Check specifications of the strategy arguments:
method <- match.arg(method)
na.rm.from <- match.arg(na.rm.from)
multiple <- match.arg(multiple)
# Initialize the two most important output vectors. They will be columns of
# the output data frame. The `estimate` vector is a list if the method is
# `"all"` because `mode_all()` can return length > 1 vectors. Otherwise,
# `estimate` is integer for the corner case that all values that will be
# assigned to its elements are integers (i.e., none is a double; this would
# coerce any integer elements).
length_x <- length(x)
na_ignored <- integer(length_x)
estimate <- if (method == "all") {
vector("list", length_x)
} else {
integer(length_x)
}
# The `mode_*()` function called below is determined by the `method` argument:
name_fn <- paste0("mode_", method)
# Loop through the `x` elements, attempting to find a mode estimate for each
# by removing one `NA` at a time:
for (i in seq_len(length_x)) {
# Vectors where all elements are missing have an unknown mode, so the
# estimate should be `NA`, as well (of the same type as `x[[i]]`). This is
# implemented via a shortcut:
if (all(is.na(x[[i]]))) {
estimate[[i]] <- x[[i]][NA_integer_]
next
}
# Initialize the number of `NA`s that need to be ignored at zero. This value
# will be incremented once for every failed attempt to get a non-`NA`
# estimate. At the start, however, the true mode of a given element of `x`
# may yet be known.
na_ignored_current <- 0L
# Keep looking for a modal estimate for `x[[i]]`. This is implemented via an
# infinite loop. In other words, `repeat` is equivalent to `while (TRUE)`:
repeat {
# Call the mode function chosen via the `method` argument, then call it
# with its specific list of arguments:
estimate_current <- do.call(
what = name_fn,
args = switch(
method,
"first" = list(
x = x[[i]], na.rm = FALSE, na.rm.amount = na_ignored_current,
na.rm.from = na.rm.from, accept = accept
),
"all" = list(
x = x[[i]], na.rm = FALSE, na.rm.amount = na_ignored_current
),
"single" = list(
x = x[[i]], na.rm = FALSE, na.rm.amount = na_ignored_current,
accept = accept, multiple = multiple
)
)
)
# Check whether `estimate_current` is `NA` and, if `mode_single()` was
# called with its `multiple == "NA"` default, whether some `NA`s in `x`
# have not yet been ignored. This additional check is necessary to prevent
# an infinite loop in certain cases because `mode_single()` returns `NA`
# by default if there are multiple modes.
estimate_is_na <- length(estimate_current) == 1L &&
is.na(estimate_current)
if (estimate_is_na && method == "single" && multiple == "NA") {
estimate_is_na <- na_ignored_current < length(x[[i]][is.na(x[[i]])])
}
# -- If the estimate is `NA`, there will be (at least) one more iteration
# of the repeat loop, and it will ignore one more `NA`.
# -- If a non-`NA` estimate has been found, enter it into the `estimate`
# vector, record the number of `NA`s that needed to be ignored for this
# result, and break out of the repeat loop.
if (estimate_is_na) {
na_ignored_current <- na_ignored_current + 1L
} else {
na_ignored[i] <- na_ignored_current
estimate[[i]] <- estimate_current
break
}
} # End of repeat loop
} # End of for loop
# As a purely mechanical consequence of the `na_ignored` integer vector's
# values being zero or greater, `certainty` marks those cases where no `NA`s
# need to be ignored to compute an estimate; and thus, where there is a known
# and determinate mode:
certainty <- na_ignored == 0L
# Count the missing values in each element of `x`, then compute the rate of
# missing values that had to be ignored (to find a mode estimate) as a share
# of the total number of missing values:
na_total <- vapply(
x, function(y) length(y[[1L]][is.na(y)]), integer(1L), USE.NAMES = FALSE
)
# Compute the rates of ignored `NA`s from all `NA`s and from all values:
sum_total <- vapply(x, length, integer(1L), USE.NAMES = FALSE)
rate_ignored_na <- na_ignored / na_total
rate_ignored_sum <- na_ignored / sum_total
# Collect the length-`x` vectors in a data frame, adding a `term` column in
# case any elements of `x` are named:
tibble::tibble(
term = names(x), estimate, certainty, na_ignored, na_total, rate_ignored_na,
sum_total, rate_ignored_sum
)
}