-
Notifications
You must be signed in to change notification settings - Fork 0
/
find_xref.R
193 lines (170 loc) · 8.25 KB
/
find_xref.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
#' Find an xref of a record given a set of search terms
#'
#' @details
#' This is a helper function to identify the xref of a record given information such
#' as a name or reference number. You provide a named `search_patterns` vector of namespaced tag-pattern pairs,
#' such as:
#'
#' c(INDI.NAME = "Homer", INDI.SEX = "M", INDI.BIRT.DATE = "JAN 1974")
#'
#' If you're not sure what namespace to use, use the `mutate_tag_namespace` function.
#'
#' The search patterns will be treated as regular expressions, so they will match a value if it contains
#' the pattern provided. You can anchor your search pattern if you want an exact match, e.g. "^JAN 1974$".
#' If you're not familiar with regular expressions, you may need to escape certain characters such as
#' brackets and a full-stop/period (i.e. `\\.`).
#'
#' @param gedcom A tidyged object.
#' @param search_patterns A named vector of terms to search for (see Details).
#' @param mode Whether to only return an xref if all patterns are matched ("strict"). A value of
#' "best" will return the xref with the most matches. If either of these still result in more than
#' one xref it will return an error unless `multiple` is TRUE.
#' @param multiple If more than one xref is found (according to `mode`), whether to return all xrefs
#' or throw an error.
#' @param ignore_case Should case differences be ignored in the match?
#'
#' @return A vector of one or more xrefs.
#' @examples
#' find_xref(sample555, c(INDI.BURI.PLAC = "Spring Hill"), multiple = FALSE)
#' find_xref(sample555, c(INDI.SEX = "M"), multiple = TRUE)
#' find_xref(sample555, c(FAM.MARR.DATE = "1859"), multiple = FALSE)
#' find_xref(sample555, c(REPO.ADDR.CITY = "Salt Lake"), multiple = TRUE)
#' find_xref(sample555, c(INDI.NAME.SURN = "Williams", INDI.ADOP.DATE = "Never"),
#' mode = "best", multiple = TRUE)
#' @export
#' @tests
#' expect_error(find_xref(sample555, character()))
#' expect_error(find_xref(sample555, c(INDI.NAME = "test"), mode = "foo"))
#' expect_error(find_xref(sample555, letters[1:5]))
#' expect_error(find_xref(sample555, c(a = "das", "sd", b = "r42")))
#' expect_error(find_xref(sample555, c(INDI.SEX = "L")))
#' expect_error(find_xref(sample555, c(INDI.NAME.SURN = "Williams")))
#' expect_equal(find_xref(sample555, c(FAM.MARR.DATE = "1859")), "@F1@")
#' expect_equal(find_xref(sample555, c(INDI.SEX = "M", INDI.NAME.SURN = "Williams", INDI.BIRT.PLAC = "Falls")), "@I3@")
find_xref <- function(gedcom, search_patterns, mode = "strict", multiple = FALSE, ignore_case = FALSE) {
mode <- tolower(mode)
if(length(search_patterns) == 0) stop("At least one search pattern must be provided")
if(!mode %in% c("strict","best")) stop("The mode must be either 'strict' or 'best'")
if(is.null(names(search_patterns))) stop("The search_patterns vector must be named")
if(any(names(search_patterns)=="")) stop("Each search pattern must be named with a namespace value")
tags_ns <- toupper(names(search_patterns))
search_patterns_val <- as.character(search_patterns) #unname
gedcom_ns <- mutate_tag_namespace(gedcom) |>
temporarily_remove_name_slashes()
reg_case <- purrr::partial(stringr::regex, ignore_case = ignore_case)
matches <- purrr::map2(tags_ns, search_patterns_val,
~unique(dplyr::filter(gedcom_ns,
tag_ns == .x,
stringr::str_detect(value, reg_case(.y)))$record))
if(mode == "strict") {
xref <- Reduce(intersect, matches)
if(length(xref) == 0) stop("No records found that match all patterns.")
if(length(xref) > 1 & !multiple)
stop("No unique records found that match all patterns. Try being more specific.")
} else { #best
xref <- tibble::tibble(matches = unlist(matches)) |>
dplyr::count(matches) |>
dplyr::filter(n==max(n)) |>
dplyr::pull(matches)
if(length(xref) == 0) stop("No records found that match any patterns.")
if(length(xref) > 1 & !multiple) stop("No unique records found that match any patterns. Try being more specific.")
}
xref
}
#' Helper functions to locate record xrefs
#'
#' These functions act as wrappers to the `find_xref` function to find one or more record xrefs.
#'
#' @details If you have your own specific use cases to identify records, it's easy to write your own wrapper.
#' It's best to name your function `find_recordtype_*` and end it with `_all` if it can return multiple xrefs.
#' If you provide more than one search pattern, you should also include the mode argument.
#'
#' @param gedcom A tidyged object.
#' @param pattern The search pattern to use (regular expression).
#' @param ignore_case Should case differences be ignored in the match?
#'
#' @return A character vector of xref(s).
#' @examples
#' find_indi_name(sample555, "Mary")
#' find_indi_name_all(sample555, "Williams")
#' find_repo_name(sample555, "library", ignore_case = TRUE)
#' find_sour_titl(sample555, "Madison.+Records")
#' @export
find_indi_refn <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(INDI.REFN = pattern), multiple = FALSE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_indi_name <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(INDI.NAME = pattern), multiple = FALSE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_indi_name_all <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(INDI.NAME = pattern), multiple = TRUE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_repo_refn <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(REPO.REFN = pattern), multiple = FALSE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_repo_name <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(REPO.NAME = pattern), multiple = FALSE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_repo_name_all <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(REPO.NAME = pattern), multiple = TRUE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_note_refn <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(NOTE.REFN = pattern), multiple = FALSE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_note_text <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(NOTE = pattern), multiple = FALSE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_note_text_all <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(NOTE = pattern), multiple = TRUE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_media_refn <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(OBJE.REFN = pattern), multiple = FALSE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_media_fileref <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(OBJE.FILE = pattern), multiple = FALSE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_media_fileref_all <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(OBJE.FILE = pattern), multiple = TRUE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_sour_refn <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(SOUR.REFN = pattern), multiple = FALSE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_sour_titl <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(SOUR.TITL = pattern), multiple = FALSE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_sour_titl_all <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(SOUR.TITL = pattern), multiple = TRUE, ignore_case = ignore_case)
}
#' @rdname find_indi_refn
#' @export
find_famg_refn <- function(gedcom, pattern, ignore_case = FALSE) {
find_xref(gedcom, search_patterns = c(FAM.REFN = pattern), multiple = FALSE, ignore_case = ignore_case)
}