-
Notifications
You must be signed in to change notification settings - Fork 2
/
diagonAlley.R
308 lines (278 loc) · 13.2 KB
/
diagonAlley.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
# This function was written by James B Dorey to identify occurrence records with potential fill-down
# errors in the decimalLatitude and decimalLongitude columns. This function was written between
# the 27th and 28th of May 2022. Please contact James at jbdorey[at]me.com with questions if needed.
#' Find fill-down errors
#'
#' A simple function that looks for potential latitude and longitude fill-down errors by
#' identifying consecutive occurrences with coordinates at regular intervals. This is accomplished
#' by using a sliding window with the length determined by minRepeats.
#'
#' The sliding window (and hence fill-down errors) will only be examined
#' within the user-defined groupingColumns; if any of those
#' columns are empty, that record will be excluded.
#'
#' @param data A data frame or tibble. Occurrence records as input.
#' @param minRepeats Numeric. The minimum number of lat or lon repeats needed to flag a record
#' @param groupingColumns Character. The column(s) to group the analysis by and search for fill-down
#' errors within. Default = c("eventDate", "recordedBy", "datasetName").
#' @param ndec Numeric. The number of decimal places below which records will not be considered
#' in the diagonAlley function. This is fed into [BeeBDC::jbd_coordinates_precision()]. Default = 3.
#' @param stepSize Numeric. The number of occurrences to process in each chunk. Default = 1000000.
#' @param mc.cores Numeric. If > 1, the function will run in parallel
#' using mclapply using the number of cores specified. If = 1 then it will be run using a serial
#' loop. NOTE: Windows machines must use a value of 1 (see ?parallel::mclapply). Additionally,
#' be aware that each thread can use large chunks of memory.
#' Default = 1.
#'
#' @return The function returns the input data with a new column, .sequential, where FALSE =
#' records that have consecutive latitudes or longitudes greater than or equal to the user-defined
#' threshold.
#' @export
#'
#' @importFrom dplyr %>%
#' @importFrom stats complete.cases
#'
#' @examples
#' # Read in the example data
#' data(beesRaw)
#' # Run the function
#' beesRaw_out <- diagonAlley(
#' data = beesRaw,
#' # The minimum number of repeats needed to find a sequence in for flagging
#' minRepeats = 4,
#' groupingColumns = c("eventDate", "recordedBy", "datasetName"),
#' ndec = 3,
#' stepSize = 1000000,
#' mc.cores = 1)
#'
#'
diagonAlley <- function(
data = NULL,
minRepeats = NULL,
groupingColumns = c("eventDate", "recordedBy", "datasetName"),
ndec = 3,
stepSize = 1000000,
mc.cores = 1
){
# locally bind variables to the function
eventDate<-recordedBy<-decimalLatitude<-decimalLongitude<-database_id<-.data<-leadingLat<-
laggingLat<-diffLead_Lat<-diffLag_Lat<-diffLat<- . <- NULL
.rou <- leadingLon <- laggingLon <- diffLead_Lon <- diffLag_Lon <- diffLon <- NULL
#### 0.0 Warnings ####
if(is.null(data)){
stop("\n - Please provide an argument for data. I'm a program, not a magician.")
}
if(is.null(minRepeats)){
warning("\n - minRepeats not provided. Using default value of four")
minRepeats = 4
}
if(is.null(groupingColumns)){
warning("\n - groupingColumns not provided. Using the default of, eventDate, recordedBy, ",
"and datasetName.")
groupingColumns <- c("eventDate", "recordedBy", "datasetName")
}
#### 1.0 prepare data ####
startTime <- Sys.time()
##### 1.1 ndec ####
# If an ndec is provided, then filter to remove decimal places lower than ndec
if(!is.null(ndec)){
writeLines("Removing rounded coordinates with BeeBDC::jbd_coordinates_precision...")
runningData <- data %>%
BeeBDC::jbd_coordinates_precision(
data = .,
lon = "decimalLongitude",
lat = "decimalLatitude",
ndec = ndec,
quieter = TRUE) %>%
dplyr::filter(!.rou == FALSE) %>%
dplyr::select(!.rou)
}else{
runningData <- data
}
##### 1.2 Initial filtering and prep ####
runningData <- runningData %>%
# Select fewer columns to make it easier on the old computer
dplyr::select(database_id, decimalLatitude, decimalLongitude,
tidyselect::all_of(groupingColumns)) %>%
# Remove incomplete values
tidyr::drop_na( tidyselect::all_of(groupingColumns)) %>%
tidyr::drop_na(decimalLatitude, decimalLongitude) %>%
# Group the data by eventDate and recordedBy
dplyr::group_by( dplyr::across(tidyselect::all_of(groupingColumns))) %>%
# Arrange from biggest to lowest decimalLatitude and then decimalLongitude by grouping
dplyr::arrange(dplyr::desc(decimalLatitude), dplyr::desc(decimalLongitude),
.by_group = TRUE) %>%
# Remove duplicate lat and longs
#dplyr::distinct(decimalLatitude, decimalLongitude, .keep_all = TRUE) %>%
# Select those groups with four or more occurrences
dplyr::filter(dplyr::n() >= minRepeats)
#### 2.0 Identify sequences ####
if(nrow(runningData) > 0){
##### 2.1 Create function ####
# Set up the loop function
LatLonFun <- function(funData){
for(i in 1:length(funData)){
# Run the sliding window
for(j in 1:(nrow(funData) - minRepeats+1)){
# select an amount of rows based on minRepeats
windowj <- funData[j:(j+minRepeats-1),]
# If all differences are equal, then add to a running list of database_ids
if(all(windowj$diff == windowj$diff[1])){
flaggedRecords <- flaggedRecords %>%
dplyr::bind_rows(windowj %>%
dplyr::select(database_id) )} # END if statement
} # END J loop
# Keep distinct flaggedRecords
# Run distinct every 1000th iteration, or at the end
if(i %in% seq(0, length(funData), 1000) |
i == length(funData)){
flaggedRecords <- flaggedRecords %>%
dplyr::distinct(.keep_all = TRUE)}
} # END i loop
return(flaggedRecords)
}# End LatLonFun
##### 2.2 Lat sequences ####
writeLines(" - Starting the latitude sequence...")
# Find the groups where ALL of the differences between values is the same (is.sequential)
# Return their database_id
runningData_Lat <- runningData %>%
# Sort
dplyr::arrange(dplyr::desc(.data$decimalLatitude), .by_group = TRUE) %>%
dplyr::distinct(dplyr::across(c(decimalLongitude, decimalLatitude,
tidyselect::all_of(groupingColumns))), .keep_all = TRUE) %>%
# Add leading columns with the value of the next one
dplyr::mutate(leadingLat = dplyr::lag(decimalLatitude)) %>%
dplyr::mutate(laggingLat = dplyr::lead(decimalLatitude)) %>%
# Add new new columns with the difference
dplyr::mutate(diffLead_Lat = (decimalLatitude - leadingLat)) %>%
dplyr::mutate(diffLag_Lat = (decimalLatitude - laggingLat)) %>%
# COMBINE these columns so that they are all complete from lead AND lag (no NAs)
dplyr::mutate(diff = dplyr::if_else(is.na(diffLead_Lat),
-diffLag_Lat,
diffLead_Lat),
diff = diff %>% round(digits = 9)) %>%
# Remove extra columns
dplyr::select(!c(leadingLat, laggingLat, diffLead_Lat, diffLag_Lat)) %>%
# Remove groups below the threshold
# Group by lat and lon and the groupingColumns
dplyr::group_by(dplyr::across(tidyselect::all_of(groupingColumns))) %>%
dplyr::filter(!dplyr::n() < minRepeats)
# Re-join with the runningData and match up duplicate lat/lon within groups and assign the same
# diff values
runningData_Lat <- runningData_Lat %>%
dplyr::bind_rows(runningData %>%
dplyr::filter(!database_id %in% runningData_Lat$database_id)) %>%
# Group by lat and lon and the groupingColumns
dplyr::group_by(decimalLatitude, decimalLongitude,
dplyr::across(tidyselect::all_of(groupingColumns))) %>%
dplyr::arrange(decimalLatitude) %>%
# Assign matching occurrences to the same diff number so that they will also be flagged
dplyr::mutate(diff = diff[[1]]) %>%
tidyr::drop_na(diff) %>%
dplyr::filter(!diff == 0) %>%
dplyr::ungroup()
# Turn each of the groups into its own tibble within a list
runningData_LatGrp <- runningData_Lat %>%
# Re-group by the groupingColumns and then filter to those that pass the minimum repeats
dplyr::group_by( dplyr::across(tidyselect::all_of(groupingColumns)), .add = TRUE) %>%
dplyr::filter(dplyr::n() >= minRepeats) %>%
dplyr::mutate(diff = diff %>% as.character()) %>%
# Split groups into a list
dplyr::group_split()
# Remove the spent dataset
rm(runningData_Lat)
# Remove excess columns from list
runningData_LatGrp <- lapply(runningData_LatGrp, function(x) x[(names(x) %in% c("database_id", "diff"))])
# Set up a tibble for the flagged records
flaggedRecords <- dplyr::tibble()
# Run the loop function in parallel
flagRecords_Lat <- runningData_LatGrp %>%
parallel::mclapply(LatLonFun, mc.cores = mc.cores) %>%
# Re-bind the list elements
dplyr::bind_rows()
# Remove the spent dataset
rm(runningData_LatGrp)
##### 2.3 Lon sequences ####
writeLines(" - Starting the longitude sequence...")
# Find the groups where ALL of the differences between values is the same (is.sequential)
# Return their database_id
runningData_Lon <- runningData %>%
# Sort
dplyr::arrange(dplyr::desc(.data$decimalLongitude), .by_group = TRUE) %>%
dplyr::distinct(dplyr::across(c(decimalLongitude, decimalLatitude,
tidyselect::all_of(groupingColumns))), .keep_all = TRUE) %>%
# Add leading columns with the value of the next one
dplyr::mutate(leadingLon = dplyr::lag(decimalLongitude)) %>%
dplyr::mutate(laggingLon = dplyr::lead(decimalLongitude)) %>%
# Add new new columns with the difference
dplyr::mutate(diffLead_Lon = (decimalLongitude - leadingLon)) %>%
dplyr::mutate(diffLag_Lon = (decimalLongitude - laggingLon)) %>%
# COMBINE these columns so that they are all complete from lead AND lag (no NAs)
dplyr::mutate(diff = dplyr::if_else(is.na(diffLead_Lon),
-diffLag_Lon,
diffLead_Lon),
diff = diff %>% round(digits = 9)) %>%
# Remove extra columns
dplyr::select(!c(leadingLon, laggingLon, diffLead_Lon, diffLag_Lon)) %>%
# Remove groups below the threshold
# Group by lat and lon and the groupingColumns
dplyr::group_by(dplyr::across(tidyselect::all_of(groupingColumns))) %>%
dplyr::filter(!dplyr::n() < minRepeats)
# Re-join with the runningData and match up duplicate lat/lon within groups and assign the same
# diffLon values
runningData_Lon <- runningData_Lon %>%
dplyr::bind_rows(runningData %>%
dplyr::filter(!database_id %in% runningData_Lon$database_id)) %>%
# Group by lat and lon and the groupingColumns
dplyr::group_by(decimalLatitude, decimalLongitude,
dplyr::across(tidyselect::all_of(groupingColumns))) %>%
dplyr::arrange(decimalLongitude) %>%
# Assign matching occurrences to the same diffLon number so that they will also be flagged
dplyr::mutate(diff = diff[[1]]) %>%
tidyr::drop_na(diff) %>%
dplyr::filter(!diff == 0) %>%
dplyr::ungroup()
# Turn each of the groups into its own tibble within a list
runningData_LonGrp <- runningData_Lon %>%
# Re-group by the groupingColumns and then filter to those that pass the minimum repeats
dplyr::group_by( dplyr::across(tidyselect::all_of(groupingColumns)), .add = TRUE) %>%
dplyr::filter(dplyr::n() >= minRepeats) %>%
dplyr::mutate(diff = diff %>% as.character()) %>%
# Split groups into a list
dplyr::group_split()
# Remove the spent dataset
rm(runningData_Lon)
# Remove excess columns from list
runningData_LonGrp <- lapply(runningData_LonGrp, function(x) x[(names(x) %in% c("database_id", "diff"))])
# Run the loop function in parallel
flagRecords_Lon <- runningData_LonGrp %>%
parallel::mclapply(LatLonFun, mc.cores = mc.cores) %>%
# Re-bind the list elements
dplyr::bind_rows()
##### 2.4 Merge lat lon ####
# Merge the minor runs
flagRecords <- dplyr::bind_rows(flagRecords_Lat, flagRecords_Lon) %>%
dplyr::distinct()
}else{
flagRecords = dplyr::tibble(database_id = NA_character_)
} # END nrow(runningData) > 0
# Remove the spent dataset
rm(runningData_LonGrp)
#### 3.0 Merge ####
writeLines(" - Merging results and adding the .sequential column...")
# Add a new column called .sequential to flag sequential lats and longs as FALSE
data <- data %>%
dplyr::mutate(.sequential = !database_id %in% flagRecords$database_id)
# Use output
message("\ndiagonAlley:\nFlagged ",
format(sum(data$.sequential == FALSE, na.rm = TRUE), big.mark = ","),
" records\nThe .sequential column was added to the database.\n")
# Return runtime
endTime <- Sys.time()
message(paste(
" - Completed in ",
round(difftime(endTime, startTime), digits = 2 )," ",
units(round(endTime - startTime, digits = 2)),
sep = ""))
return(data)
}# END function