-
Notifications
You must be signed in to change notification settings - Fork 2
/
importOccurrences.R
143 lines (128 loc) · 5.72 KB
/
importOccurrences.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#### importOccurrences ####
#' Imports the most-recent repoMerge data
#'
#' Looks for and imports the most-recent version of the occurrence data created by the [BeeBDC::repoMerge()]
#' function.
#'
#' @param path A directory as a character. The directory to recursively look in for the above data.
#' @param fileName Character. A String of text to look for the most-recent dataset.
#' Default = "^BeeData_". Find faults by modifying [BeeBDC::fileFinder()]
#' and logic-checking the file that's found.
#'
#' @return A list with a data frame of merged occurrence records, "Data_WebDL", and a list of EML
#' files contained in "eml_files".
#' @export
#'
#' @importFrom dplyr %>%
#'
#' @examples
#' \dontrun{
#' DataImp <- importOccurrences(path = DataPath)
#' }
importOccurrences <- function(path = path,
fileName = "^BeeData_" #occurrence file name. If not provided, R will search to match "BeeData_"
){ #spatial reference system as epsg code
. <- NULL
# Load required packages
requireNamespace("dplyr")
requireNamespace("lubridate")
# if the fileName is not provided...
if(!exists("fileName")){
fileName = "^BeeData"
}
#### Find files ####
# Find all of the previously-produced data files
most_recent <- BeeBDC::fileFinder(path = path, fileName = fileName)
# Return information to user
writeLines(paste(" - Great, R has detected file(s), including... ", "\n",
paste(most_recent, collapse = "\n") ), sep = "")
#### Detect format ####
# Find the format of the most-recent files. This could potentially be .csv or .rds
# TRUE IF .rds are present (that are not attribute files):
(rdata_query <- any(most_recent[stringr::str_count(most_recent, pattern = "([aA]ttribute)|\\.rds") > 0] %>%
stringr::str_detect(., pattern = "([aA]ttribute)", negate = TRUE) == TRUE))
# TRUE IF .csv data are present:
(csv_query <- any(stringr::str_detect(most_recent, "([aA]ttributes)")) == TRUE &&
any(stringr::str_detect(most_recent, ".*\\.csv{1}")) == TRUE)
#### Both present ####
# IF their is a complete .rds file among the most-recent files AND a .csv version...
if(rdata_query == TRUE && csv_query == TRUE){
writeLines(paste("\n",
" - Oh boy, it looks like there are both .csv and .rds versions of your data!",
"\n", "R will preferentially use the .rds file.", "\n",
"NOTE: the .rds file can be very slow to load"))
# File to read:
fileLoc <- most_recent[intersect(grep(".*\\.rds{1}", most_recent),
grep("([aA]ttributes)", most_recent, invert = TRUE))]
# Read in the .rds file
writeLines(paste("Reading in ", fileLoc, "...", sep = ""))
# Find the index of the string that matches and select that to read in
occurDF <- fileLoc %>%
readRDS()
} #END IF both
#### RData present ####
# IF their is ONLY a complete .rds file among the most-recent files...
if(rdata_query == TRUE && csv_query == FALSE){
writeLines(paste(" - .rds export version found. Loading this file...", "\n",
"NOTE: the .rds file can be very slow to load"))
# File to read:
fileLoc <- most_recent[intersect(grep(".*\\.rds{1}", most_recent),
grep("([aA]ttributes)", most_recent, invert = TRUE))]
# Read in the .rds file
writeLines(paste("Reading in ",
fileLoc,
"...", sep = ""))
# Find the index of the string that matches and select that to read in
occurDF <- as.character(fileLoc) %>%
readRDS()
writeLines("Completed reading in .rds file")
} #END IF .rds
#### CSV present ####
# IF their is ONLY a complete .csv file among the most-recent files...
if(csv_query == TRUE && rdata_query == FALSE){
writeLines(paste(" - .csv exported version found. Loading this file..."))
ColTypes <- ColTypeR()
# Find the most-recent .csv occurrence file
occurDF <- most_recent[intersect(grep(".*\\.csv", most_recent),
grep("([aA]ttributes)", most_recent, invert = TRUE))] %>%
read_csv(col_types = ColTypes)
# Find the most-recent .rds attributes file
attr_loc <- most_recent[stringr::str_which(most_recent, "(.*[aA]ttribute)(.*\\.rds)")]
# Check to see if the attributes file exists or not
if(length(attr_loc) == 0){
writeLines("No attribute file found... Please make sure that one exists to include the EML data")
}else{
writeLines(
paste("Reading attribute file named ", attr_loc, "..." ))
attr_file <- readRDS(attr_loc)
}
# Add the attributes file to the occurrence data file
attributes(occurDF) <- attr_file
# Read in the EML file
# Find the folder that the attributes file is in.
EML_home <- stringr::str_replace(attr_loc, pattern = "\\/[a-zA-Z0-9-_]+\\.rds$", "")
# Find the .xml file in the same location as the attribute's folder
EML_loc <- BeeBDC::fileFinder(path = EML_home, fileName = "eml.*\\.rds")
# Read in the EML file
EML_file <- readRDS(EML_loc)
} #END IF .csv
# Extract and save the data and the metadata based on their class
for(i in 1:length(occurDF)){
# If eml
if(base::any(class(occurDF[[i]]) %in% c( "emld") )){
eml_files <- occurDF[i]
}
# If tibble
if(base::any(class(occurDF[[i]]) %in% c("tbl_df", "data.frame", "tbl"))){
Data_WebDL <- occurDF[i]
}
}
#### Return data ####
# Re-combine the data and EML data
Data_WebDL <- dplyr::lst(Data_WebDL,
eml_files)
# Return the Data_WebDL
return(Data_WebDL)
# Return end product and print completion note
writeLines(paste(" - Fin.", sep = "\n"))
} # END data_importer