/
duplicate-count.R
157 lines (140 loc) · 5.35 KB
/
duplicate-count.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#' Count duplicate values
#'
#' @description `duplicate_count()` returns a frequency table. When searching a
#' data frame, it includes values from all columns for each frequency count.
#'
#' This function is a blunt tool designed for initial data checking. It is not
#' too informative if many values have few characters each.
#'
#' For summary statistics, call [`audit()`] on the results.
#'
#' @param x Vector or data frame.
#' @param ignore Optionally, a vector of values that should not be counted.
#' @param locations_type String. One of `"character"` or `"list"`. With
#' `"list"`, each `locations` value is a vector of column names, which is
#' better for further programming. By default (`"character"`), the column
#' names are pasted into a string, which is more readable.
#' @param numeric_only `r lifecycle::badge("deprecated")` No longer used: All
#' values are coerced to character.
#' @return If `x` is a data frame or another named vector, a tibble with four
#' columns. If `x` isn't named, only the first two columns appear:
#'
#' - `value`: Each distinct, non-missing value from `x`, as a string.
#' - `frequency`: Absolute frequency of each value in `x`, in descending order.
#' - `locations`: Names of all columns from `x` in which `value` appears.
#' - `locations_n`: Number of columns named in `locations`.
#'
#' The tibble has the `scr_dup_count` class, which is recognized by the
#' [`audit()`] generic.
#' @details Don't use `numeric_only`. It no longer has any effect and will be
#' removed in the future. The only reason for this argument was the risk of
#' errors introduced by coercing values to numeric. This is no longer an issue
#' because all values are now coerced to character, which is more appropriate
#' for checking reported statistics.
#' @section Summaries with [`audit()`]: There is an S3 method for the
#' [`audit()`] generic, so you can call [`audit()`] following
#' `duplicate_count()`. It returns a tibble with summary statistics for the
#' two numeric columns, `frequency` and `locations_n` (or, if `x` isn't named,
#' only for `frequency`).
#'
#' @seealso
#' - [`duplicate_count_colpair()`] to check each combination of columns for
#' duplicates.
#' - [`duplicate_tally()`] to show instances of a value next to each instance.
#' - [`janitor::get_dupes()`] to search for duplicate rows.
#'
#' @include utils.R
#'
#' @export
#'
#' @examples
#' # Count duplicate values...
#' iris %>%
#' duplicate_count()
#'
#' # ...and compute summaries:
#' iris %>%
#' duplicate_count() %>%
#' audit()
#'
#' # Any values can be ignored:
#' iris %>%
#' duplicate_count(ignore = c("setosa", "versicolor", "virginica"))
duplicate_count <- function(x, ignore = NULL,
                            locations_type = c("character", "list"),
                            numeric_only = deprecated()) {

  locations_type <- rlang::arg_match(locations_type)

  # `numeric_only` is kept only for backward compatibility. Supplying it has no
  # effect other than triggering this deprecation warning. The message is built
  # with `paste()` so that it is a single clean line; a backslash-newline inside
  # a string literal would embed a line break and the source indentation.
  if (lifecycle::is_present(numeric_only)) {
    lifecycle::deprecate_warn(
      when = "0.3.0",
      what = "duplicate_count(numeric_only)",
      details = paste(
        "It no longer has any effect because all input",
        "values are now coerced to character strings."
      )
    )
  }

  # Convert `x` to a tibble if needed. Unnamed input gets placeholder column
  # names ("col1", "col2", ...). `x_was_named` is checked again further below
  # to decide whether the location columns are added to the output.
  x_was_named <- rlang::is_named(x)

  if (!x_was_named || !is.data.frame(x)) {
    x <- tibble::as_tibble(
      x, .name_repair = if (x_was_named) {
        function(x) x
      } else {
        function(x) paste0("col", seq_along(x))
      }
    )
  } else if (!tibble::is_tibble(x)) {
    x <- tibble::as_tibble(x)
  }

  names_orig <- colnames(x)

  # Reshape to long format with one row per cell: `name` is the column the cell
  # came from, `value` is its content. All columns are converted to factor so
  # that they share one type when stacked. Missing and ignored values are then
  # removed; `%in% NULL` is `FALSE` throughout, so this single filter also
  # covers the default `ignore = NULL`.
  x <- x %>%
    dplyr::mutate(dplyr::across(everything(), as.factor)) %>%
    tidyr::pivot_longer(
      cols = everything(),
      names_to = "name",
      values_to = "value"
    ) %>%
    dplyr::filter(!is.na(.data$value) & !.data$value %in% ignore)

  # `table()` counts every factor level, including levels with no remaining
  # observations: ignored values that were just filtered out, as well as unused
  # levels of factor columns in the input. Such values do not occur in the data,
  # so rows with a zero count are dropped.
  out <- x$value %>%
    table() %>%
    tibble::as_tibble(.name_repair = function(x) c("value", "frequency")) %>%
    dplyr::filter(.data$frequency > 0L) %>%
    dplyr::arrange(dplyr::desc(.data$frequency)) %>%
    add_class("scr_dup_count")

  # All code below is about the `locations` and `locations_n` columns, but they
  # are only meant for data frames and other named vectors. If the `x` input was
  # not named, there is nothing more to do:
  if (!x_was_named) {
    return(out)
  }

  # Group the column names by value in a single pass, then line the groups up
  # with the rows of `out`. `match()` is used instead of indexing by name
  # because an empty string is a legitimate value but never matches as a name.
  names_by_value <- split(x$name, x$value, drop = TRUE)
  names_by_value <- unname(
    names_by_value[match(out$value, names(names_by_value))]
  )

  # For each value, keep every column name once, sorted by the columns' order
  # of appearance in the input:
  locations <- lapply(names_by_value, function(cols) {
    cols <- unique(cols)
    cols[order(match(cols, names_orig))]
  })
  locations_n <- lengths(locations)

  # By default (`locations_type == "character"`), collapse each list element --
  # i.e., each vector of location names -- into a string. With
  # `locations_type == "list"`, the list is returned as is.
  if (locations_type == "character") {
    locations <- vapply(
      locations, function(cols) paste(cols, collapse = ", "),
      character(1L), USE.NAMES = FALSE
    )
  }

  dplyr::mutate(out, locations = locations, locations_n = locations_n)
}