-
Notifications
You must be signed in to change notification settings - Fork 2
/
manualOutlierFindeR.R
218 lines (197 loc) · 9.12 KB
/
manualOutlierFindeR.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
# This function was written on the 25th of February to find manually identified outliers in
# bee data
# For questions, contact James B Dorey at jbdorey[at]me.com
#' Finds outliers, and their duplicates, as determined by experts
#'
#' Uses expert-identified outliers with source spreadsheets that may be edited by users. The function
#' will also use the duplicates file made using [BeeBDC::dupeSummary()] to identify duplicates of the
#' expert-identified outliers and flag those as well.
#' The function will add a flagging column called `.expertOutlier` where records that are FALSE are
#' the expert outliers.
#'
#' @param data A data frame or tibble. Occurrence records as input.
#' @param DataPath A character path to the directory that contains the outlier spreadsheets.
#' @param PaigeOutliersName A character patch. Should lead to outlier spreadsheet from Paige Chesshire (csv file).
#' @param newOutliersName A character path. Should lead to appropriate outlier spreadsheet (xlsx file).
#' @param ColombiaOutliers_all A character path. Should lead to spreadsheet of bee outliers from Colombia (csv file).
#' @param duplicates A data frame or tibble. The duplicate file produced by [BeeBDC::dupeSummary()].
#' @param NearTRUE Optional. A character file name to the csv file. If you want to remove expert
#' outliers that are too close to TRUE points, use the name of the NearTRUE.csv.
#' Note: This implementation is only basic for now unless there is a greater need in the future.
#' @param NearTRUE_threshold Numeric. The threshold (in km) for the distance to TRUE points to
#' keep expert outliers.
#'
#' @return Returns the data with a new column, `.expertOutlier` where records that are FALSE are
#' the expert outliers.
#' @export
#'
#' @importFrom dplyr %>%
#'
#' @examples
#' \dontrun{
#' # Read example data
#' data(beesFlagged)
#' # Read in the most-recent duplicates file as well
#' if(!exists("duplicates")){
#' duplicates <- fileFinder(path = DataPath,
#' fileName = "duplicateRun_") %>%
#' readr::read_csv()}
#' # identify the outliers and get a list of their database_ids
#' beesFlagged_out <- manualOutlierFindeR(
#' data = beesFlagged,
#' DataPath = DataPath,
#' PaigeOutliersName = "removedBecauseDeterminedOutlier.csv",
#' newOutliersName = "^All_outliers_ANB_14March.xlsx",
#' ColombiaOutliers_all = "All_Colombian_OutlierIDs.csv",
#' duplicates = duplicates)
#' }
#'
manualOutlierFindeR <- function(
data = NULL,
DataPath = NULL,
PaigeOutliersName = "removedBecauseDeterminedOutlier.csv",
newOutliersName = "All_outliers_ANB.xlsx",
ColombiaOutliers_all = "All_Colombian_OutlierIDs.csv",
duplicates = NULL,
NearTRUE = NULL,
NearTRUE_threshold = 5
){
# locally bind variables to the function
OutPath_Report <- eventDate <- near_truepoints_KM <- database_id <- occurrenceID <- NULL
institutionCode <- database_id_keep <- catalogNumber <- .expertOutlier <- . <- NULL
#### 0.0 Prep ####
##### 0.1 Errors ####
###### a. FATAL errors ####
if(is.null(data)){
stop(paste0(" - No data was given. Please specify the occurrence data."))
}
if(is.null(DataPath)){
stop(paste0(" - No DataPath was given. Please specify the directory that contains the outliers."))
}
if(is.null(PaigeOutliersName)){
stop(paste0(" - No PaigeOutliersName was given. Please specify the outliers' file name."))
}
if(is.null(newOutliersName)){
stop(paste0(" - No newOutliersName was given. Please specify the outliers' file name."))
}
if(is.null(duplicates)){
stop(paste0(" - No duplicates was given. Please provide the duplicates dataset as generated by jbd_dupeSummary."))
}
###
#### 1.0 Data prep ####
##### 1.1 Find data ####
writeLines(" - Looking for the datasets...")
###### a. Paige outliers ####
# Find the outliers from chesshire et al. 2023
PaigeOutliers <- fileFinder(path = DataPath,
fileName = PaigeOutliersName) %>%
readr::read_csv( col_types = readr::cols(.default = "c")) %>%
suppressWarnings()
###### b. new outliers ####
# Find the new outliers from the three sheets concatenates by Angela
outliersAll <- fileFinder(path = DataPath,
fileName = newOutliersName) %>%
openxlsx::read.xlsx("Outliers_FromCanadaToPanama_ANB") %>%
dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character)) %>%
dplyr::bind_rows(fileFinder(path = DataPath,
fileName = newOutliersName) %>%
openxlsx::read.xlsx("Tracys_outliers") %>%
dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character))
) %>%
dplyr::bind_rows(fileFinder(path = DataPath,
fileName = newOutliersName) %>%
openxlsx::read.xlsx("Colombian_outliers") %>%
dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character))
) %>%
dplyr::bind_rows(fileFinder(path = DataPath,
fileName = newOutliersName) %>%
openxlsx::read.xlsx("Outliers_SppInStatus3") %>%
dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character))
) %>%
readr::write_excel_csv(paste(tempdir(), "newOutliers.csv", sep = "/"))
# Read back in with the correct column classes
outliersAll <- fileFinder(path = tempdir(),
fileName = "newOutliers.csv") %>%
readr::read_csv(col_types = readr::cols(.default = "c"), lazy = FALSE) %>%
dplyr::mutate(eventDate = eventDate %>%
lubridate::ymd_hms(truncated = 5)) %>%
suppressWarnings()
###### c. Colombia ####
ColombiaOutliers <- fileFinder(path = DataPath,
fileName = ColombiaOutliers_all) %>%
readr::read_csv( col_types = readr::cols(.default = "c")) %>%
suppressWarnings()
###### d. remove NearTRUE ####
# If user provies a NearTRUE input
if(!is.null(NearTRUE)){
# Find and read the csv
NearTRUE_data <- fileFinder(path = DataPath,
fileName = NearTRUE) %>%
readr::read_csv() %>%
dplyr::filter(near_truepoints_KM >= NearTRUE_threshold)
# Remove those below a threshold in from NearTRUE in outliersAll
outliersAll <- outliersAll %>%
dplyr::filter(!database_id %in% NearTRUE_data$database_id)
# Remove those below a threshold in from NearTRUE in ColombiaOutliers
ColombiaOutliers <- ColombiaOutliers %>%
dplyr::filter(!database_id %in% NearTRUE_data$database_id)
}
###### e. eventDate ####
# format data eventDate
data <- data %>%
dplyr::mutate(eventDate = eventDate %>%
lubridate::ymd_hms(truncated = 5))
##### 1.2 Process Paige ####
writeLines(" - Processing the Paige outliers...")
# Find PaigeOutliers in the occurrence data by occurrenceID and institutionCode
Outl_occID <- data %>%
tidyr::drop_na(occurrenceID) %>%
dplyr::filter(occurrenceID %in% PaigeOutliers$occurrenceID &
institutionCode %in% PaigeOutliers$institutionCode)
# Find PaigeOutliers by occurrenceID and institutionCode
Outliers_matched <- data %>%
# Remove matched IDs
dplyr::filter(!occurrenceID %in% Outl_occID$occurrenceID) %>%
tidyr::drop_na(catalogNumber, institutionCode) %>%
dplyr::filter(catalogNumber %in% PaigeOutliers$catalogNumber &
institutionCode %in% PaigeOutliers$institutionCode) %>%
# Re-bind the outlier matches
dplyr::bind_rows(Outl_occID)
# Combine the Paige and new outliers
outliersAll <- outliersAll %>%
# Convert to the correct column types
readr::type_convert(col_types = ColTypeR()) %>%
dplyr::bind_rows(Outliers_matched)
#### 2.0 Find outlier duplicates ####
##### 2.1 Find duplicates ####
writeLines(" - Looking for duplicates of the outliers...")
# Get a list of the outliers and their duplicates
outlierDuplicates <- duplicates %>%
dplyr::filter(database_id %in% outliersAll$database_id |
database_id_keep %in% outliersAll$database_id)
duplicateList <- c(outlierDuplicates %>% dplyr::pull(database_id),
outlierDuplicates %>% dplyr::pull(database_id_keep)) %>% unique()
##### 2.2 Combine duplicates ####
# Get a list of all outliers and their duplicates - database_id
outList <- c(outliersAll %>% dplyr::pull(database_id), duplicateList,
ColombiaOutliers %>% pull(database_id)) %>% unique()
#### 3.0 Flag records ####
# Find the occurrences that did not match
data <- data %>%
# Add the .expertOutlier columns as TRUE (not flagged)
dplyr::mutate(.expertOutlier = dplyr::if_else(
database_id %in% outList,
FALSE, TRUE))
# Return user output
message(
paste(
"\\manualOutlierFindeR:\n",
"Flagged",
format(sum(data$.expertOutlier == FALSE, na.rm = TRUE), big.mark = ","),
"expert-identified outliers:\n",
"The column '.expertOutlier' was added to the database.\n"
)
)
# Return data
return(data)
}