/
duplicate-detect.R
297 lines (266 loc) · 10.1 KB
/
duplicate-detect.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
#' New duplicate analysis functions
#'
#' @description `function_duplicate_cols()` is an internal (non-exported)
#' function factory. It creates new functions that take a data frame and affix
#' one new column to the right of each existing column.
#'
#' These functions were made by `function_duplicate_cols()`:
#' - [`duplicate_tally()`]
#' - [`duplicate_detect()`]
#'
#' @param code_new_cols Expression which the factory-made function will evaluate
#' at runtime. It computes the vector that will be split into the new columns
#' -- those that contain the test results. Therefore, the vector must have the
#' same length as the `x` object on which it operates, and which contains all
#' values from the factory-made function's first argument.
#' @param default_end String. In the factory-made function, this will be the
#' default for the `colname_end` argument, i.e., the ending of each of the
#' newly created columns.
#' @param name_class String. Name of the class which the factory-made function
#' will add to the tibble that it returns.
#' @param include_numeric_only_arg Logical. Should the factory-made function
#' have the deprecated `numeric_only = TRUE` argument? Only for compatibility
#' with older versions of scrutiny (i.e., before 0.3.0). Default is `FALSE`.
#'
#' @include utils.R
#'
#' @return Function such as [`duplicate_tally()`] or [`duplicate_detect()`].
#'
#' @noRd
function_duplicate_cols <- function(code_new_cols, default_end, name_class,
include_numeric_only_arg = FALSE) {
# Function factory: returns a new function with the arguments `x`, `ignore`,
# and `colname_end` (plus `numeric_only` if requested). The manufactured
# function flattens `x` into a single vector of strings, evaluates
# `code_new_cols` on that vector, and returns a tibble in which each input
# column is followed by a column holding the corresponding results.
#
# Capture the caller's expression unevaluated so that it can be injected into
# the body of the manufactured function and evaluated there at runtime:
code_new_cols <- rlang::enexpr(code_new_cols)
# Optionally prepare the deprecated `numeric_only` argument together with the
# code that warns if the user supplies it. Both are `NULL` otherwise, so that
# no extra argument is spliced in and the injected check is a no-op:
if (include_numeric_only_arg) {
numeric_only_arg <- list(numeric_only = deprecated())
# This expression is not run here; it is only built so that it can be
# injected at the top of the manufactured function's body:
numeric_only_check <- rlang::expr({
if (lifecycle::is_present(numeric_only)) {
lifecycle::deprecate_warn(
when = "0.3.0",
what = paste0(name_caller_call(), "(numeric_only)"),
details = "It no longer has any effect because all input \
values are now coerced to character strings."
)
}
})
} else {
numeric_only_arg <- NULL
numeric_only_check <- NULL
}
# --- Start of the manufactured function ---
rlang::new_function(
# `x` has no default, and `colname_end` defaults to the value of the
# factory's `default_end` argument:
args = rlang::pairlist2(
x =, ignore = NULL, colname_end = default_end, !!!numeric_only_arg
),
body = rlang::expr({
# Deprecation check for `numeric_only`, or just `NULL` if the factory was
# called without `include_numeric_only_arg = TRUE`:
`!!`(numeric_only_check)
# Type checking:
if (!rlang::is_vector(x)) {
cli::cli_abort(c(
"`x` must be a data frame or other type of vector.",
"x" = paste0("It is ", an_a_type(x), ".")
))
}
# Convert `x` to a data frame if needed. Named input that is not a data
# frame keeps its names. Unnamed input is called "value" if it is atomic or
# has length 1, and "col1", "col2", etc. otherwise. Named data frames that
# are not tibbles are simply converted to tibbles:
x_was_named <- rlang::is_named(x)
if (!x_was_named || !is.data.frame(x)) {
x <- tibble::as_tibble(
x, .name_repair = if (x_was_named) {
function(x) x
} else if (is.atomic(x) || length(x) == 1L) {
function(x) "value"
} else {
function(x) paste0("col", seq_along(x))
}
)
} else if (!tibble::is_tibble(x)) {
x <- tibble::as_tibble(x)
}
# Save the column names before the transformations that will occur below:
colnames_original <- names(x)
nrow_original <- nrow(x)
# Create a reference vector with all values from `x` so that they can be
# tested against. To make all values fit together, they are coerced to
# character strings. With `cols_vary = "slowest"`, the values of each
# column stay together, i.e., `x` is ordered column by column. The
# `split()` call further below relies on this order:
x <- x %>%
tidyr::pivot_longer(
cols = everything(),
values_transform = as.character,
cols_vary = "slowest"
) %>%
dplyr::pull(.data$value)
# Evaluate the expression that was passed to the factory as `code_new_cols`
# on the flattened vector `x`:
new_cols <- `!!`(code_new_cols)
# With missing values from the input, it's not known whether they have
# duplicates or not. `NA` should also be substituted if the user chose to
# ignore that value:
if (is.null(ignore)) {
new_cols[is.na(x)] <- NA
} else {
new_cols[is.na(x) | x %in% ignore] <- NA
}
# Gather both vectors in a tibble, so that each test value is joined by its
# result from `code_new_cols`. Split the two-column tibble into chunks of
# `nrow_original` rows each (i.e., one chunk per original column) and bind
# the chunks side by side. This leads to a tibble of the same shape as the
# original data frame, but with every test value accompanied by its
# corresponding result to the right. The name repair function interleaves
# the original column names with the new ones, which are the original names
# followed by "_" and `colname_end`. Finally, add the class that was passed
# to the factory as `name_class`, such as "scr_dup_detect":
x %>%
tibble::tibble(new_cols) %>%
split(ceiling(seq_along(x) / nrow_original)) %>%
dplyr::bind_cols(.name_repair = function(x) {
colnames_test <- paste0(colnames_original, "_", colname_end)
as.vector(rbind(colnames_original, colnames_test))
}) %>%
add_class(`!!`(name_class))
})
)
# --- End of the manufactured function ---
}
# duplicate_detect() ------------------------------------------------------
#' Detect duplicate values
#'
#' @description `r lifecycle::badge('superseded')`
#'
#' `duplicate_detect()` is superseded because it's less informative than
#' [`duplicate_tally()`] and [`duplicate_count()`]. Use these functions
#' instead.
#'
#' For every value in a vector or data frame, `duplicate_detect()` tests
#' whether there is at least one identical value. Test results are presented
#' next to every value.
#'
#' This function is a blunt tool designed for initial data checking. Don't put
#' too much weight on its results.
#'
#' For summary statistics, call [`audit()`] on the results.
#'
#' @details This function is not very informative with many input values that
#' only have a few characters each. Many of them may have duplicates just by
#' chance. For example, in R's built-in `iris` data set, 99% of values have
#' duplicates.
#'
#' In general, the fewer values and the more characters per value, the more
#' significant the results.
#'
#' @param x Vector or data frame.
#' @param ignore Optionally, a vector of values that should not be checked. In
#' the test result columns, they will be marked `NA`.
#' @param colname_end String. Name ending of the logical test result columns.
#' Default is `"dup"`.
#' @param numeric_only `r lifecycle::badge('deprecated')` No longer used: All
#' values are coerced to character.
#' @return A tibble (data frame). It has all the columns from `x`, and to each
#' of these columns' right, the corresponding test result column.
#'
#' The tibble has the `scr_dup_detect` class, which is recognized by the
#' `audit()` generic.
#' @section Summaries with [`audit()`]: There is an S3 method for the
#' [`audit()`] generic, so you can call [`audit()`] following
#' `duplicate_detect()`. It returns a tibble with these columns ---
#' - `term`: The original data frame's variables.
#' - `dup_count`: Number of "duplicated" values of that `term` variable: those
#' which have at least one duplicate anywhere in the data frame.
#' - `total`: Number of all non-`NA` values of that `term` variable.
#' - `dup_rate`: Rate of "duplicated" values of that `term` variable.
#'
#' The final row, `.total`, summarizes across all other rows: It adds up the
#' `dup_count` and `total` columns, and calculates the mean of the
#' `dup_rate` column.
#'
#' @seealso
#' - [`duplicate_tally()`] to count instances of a value instead of just
#' stating whether it is duplicated.
#' - [`duplicate_count()`] for a frequency table.
#' - [`duplicate_count_colpair()`] to check each combination of columns for
#' duplicates.
#' - [`janitor::get_dupes()`] to search for duplicate rows.
#'
#' @include utils.R
#'
#' @export
#'
#' @examples
#' # Find duplicate values in a data frame...
#' duplicate_detect(x = pigs4)
#'
#' # ...or in a single vector:
#' duplicate_detect(x = pigs4$snout)
#'
#' # Summary statistics with `audit()`:
#' pigs4 %>%
#' duplicate_detect() %>%
#' audit()
#'
#' # Any values can be ignored:
#' pigs4 %>%
#' duplicate_detect(ignore = c(8.131, 7.574))
duplicate_detect <- function_duplicate_cols(
  # Flag every input value that occurs more than once: `x[duplicated(x)]`
  # holds the values that have at least one earlier twin, and `%in%` then
  # marks all occurrences of these values, including the first one. Missing
  # values are matched to each other here, but the manufactured function
  # replaces their results by `NA` afterwards.
  code_new_cols = x %in% x[duplicated(x)],
  name_class = "scr_dup_detect",
  default_end = "dup",
  include_numeric_only_arg = TRUE
)
# duplicate_tally() -------------------------------------------------------
#' Count duplicates at each observation
#'
#' @description For every value in a vector or data frame, `duplicate_tally()`
#' counts how often it appears in total. Tallies are presented next to each
#' value.
#'
#' For summary statistics, call [`audit()`] on the results.
#'
#' @param colname_end String. Name ending of the tally columns. Default is
#' `"n"`.
#'
#' @inheritParams duplicate_detect
#' @inherit duplicate_detect details
#'
#' @return A tibble (data frame). It has all the columns from `x`, and to each
#' of these columns' right, the corresponding tally column.
#'
#' The tibble has the `scr_dup_tally` class, which is recognized by the
#' `audit()` generic.
#'
#' @section Summaries with [`audit()`]: There is an S3 method for the [`audit()`]
#' generic, so you can call [`audit()`] following `duplicate_tally()`. It
#' returns a tibble with summary statistics.
#'
#' @seealso
#' - [`duplicate_count()`] for a frequency table.
#' - [`duplicate_count_colpair()`] to check each combination of columns for
#' duplicates.
#' - [`janitor::get_dupes()`] to search for duplicate rows.
#'
#' @include utils.R
#' @export
#'
#' @examples
#' # Tally duplicate values in a data frame...
#' duplicate_tally(x = pigs4)
#'
#' # ...or in a single vector:
#' duplicate_tally(x = pigs4$snout)
#'
#' # Summary statistics with `audit()`:
#' pigs4 %>%
#' duplicate_tally() %>%
#' audit()
#'
#' # Any values can be ignored:
#' pigs4 %>%
#' duplicate_tally(ignore = c(8.131, 7.574))
duplicate_tally <- function_duplicate_cols(
  # For each input value, count how often that value occurs among all input
  # values, including the value itself. The expression is evaluated inside
  # the manufactured function, where `x` is the vector of all input values.
  code_new_cols = {
    # `match(x, x)` maps every value to the position of its first occurrence,
    # so equal values share one index. `tabulate()` counts how often each
    # index occurs, and indexing by `first_pos` hands each value its count.
    #
    # Unlike subsetting by `x == x[i]`, this does not let missing values leak
    # into the tallies of other values: `==` returns `NA` when compared to a
    # missing value, and subsetting by `NA` yields extra `NA` elements that
    # would inflate each count by the number of missing values in `x`.
    # Missing values themselves are only matched to each other here, and the
    # manufactured function overwrites their tallies with `NA` afterwards.
    first_pos <- match(x, x)
    tabulate(first_pos)[first_pos]
  },
  default_end = "n",
  name_class = "scr_dup_tally"
)