/
subset_data.R
208 lines (189 loc) · 8.85 KB
/
subset_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#' Subset a `data.frame`, `inspect`, or `inspect.ft` object
#'
#' `subset_data` subsets a `data.frame`, `inspect`, or `inspect.ft` object based
#' on a given set of criteria. The function is ideal for passing only selected
#' regions of data to other functions such as [`calc_rate()`] and
#' [`auto_rate()`], either by saving the output as a new object or via the use
#' of pipes (`%>%` or `%>%`). It is also very useful in analysis of
#' intermittent-flow data, where in a loop each replicate can be extracted and
#' passed to an analytical function such as `calc_rate` or `auto_rate`. See
#' examples and vignettes.
#'
#' The function can subset data based on ranges of `"time"`, `"oxygen"`, or
#' `"row"`. For data frames, to subset by `"time"` or `"oxygen"` the time data
#' is assumed to be in the first column, and oxygen data in the second column.
#' For [`inspect()`] and [`inspect.ft()`] objects, the data will have been
#' coerced to this structure already. In these cases the `$dataframe` element in
#' the output is replaced by the subset, and in `inspect.ft` the `$data` element
#' is also subset and replaced. Note for `inspect.ft` objects, the oxygen data
#' in column 2 will be either `out.oxy` data or `delta.oxy` data depending on
#' what was inspected. The function can subset *any* data frame by `row`.
#'
#' When multiple columns are present, for example time in column 1, and multiple
#' columns of oxygen data, the subset object will include *all* columns. In the
#' case of subsetting `by = "oxygen"`, subsetting is based on the *first* column
#' of oxygen data only (i.e. column 2), and all subsequent columns are subset
#' between the same rows regardless of oxygen values.
#'
#' For all methods, if exact matching values of `from` and `to` are not present
#' in the data, the closest values are used. For `"time"` and `"row"`
#' subsetting, `from` and `to` should be in the correct order. No warning or
#' messages are given if the input values are outside those in the data frame.
#' For instance, if `to = 100` and there are only 50 rows in the data, the last
#' row (50) will be used instead. The same for `from` and `to` time values
#' outside those in the data frame.
#'
#' For `"oxygen"` subsetting, `from` and `to` are generally interchangeable, and
#' the function will subset data *between* the first and last occurrences (or
#' closest occurrences) of these values. It works best with generally increasing
#' or decreasing oxygen data, and results may vary with other data such as
#' intermittent flow data or those in `inspect.ft` objects.
#'
#' **Note for `inspect` and `inspect.ft` object inputs:** after subsetting the
#' locations of any data issues highlighted when the object was originally
#' inspected will no longer be accurate. If these are important, best practice
#' is to subset the original dataframe, and then process the subset through
#' `inspect` or `inspect.ft`.
#'
#' A summary of the subset can be printed to the console if the default `quiet =
#' FALSE` is changed to `TRUE`.
#'
#' ## More
#'
#' For additional help, documentation, vignettes, and more visit the `respR`
#' website at <https://januarharianto.github.io/respR/>
#'
#' @return **Output**: If the input is an `inspect`, or `inspect.ft` object, the
#' output is an object of the same class containing the subset data. For
#' `data.frame` inputs the output is a `data.table` of the subset.
#'
#' @param x `data.frame`, `inspect`, or `inspect.ft` object. The data from which
#' to produce a subset.
#' @param from numeric. The lower bounds of the subset based on the `by` input.
#' @param to numeric. The upper bounds of the subset based on the `by` input.
#' @param by string. `"time"`, `"row"`, or `"oxygen"`. Method by which to apply
#' the `from` and `to` inputs.
#' @param quiet logical. Controls if a summary of the output is printed to the
#' console. Default is `TRUE`.
#'
#' @export
#'
#' @examples
#' \donttest{
#' # Subset by time:
#' x <- subset_data(squid.rd, from = 2000, to = 4000, by = "time")
#'
#' # Subset by oxygen:
#' subset_data(sardine.rd, from = 94, to = 91, by = "oxygen")
#'
#' # Subset by row:
#' subset_data(flowthrough.rd, from = 10, to = 750, by = "row")
#'
#' # Subset multiple columns:
#' # In this case subsetting is based on the first two columns
#' subset_data(flowthrough.rd, from = 50, to = 600, by = "time")
#'
#' # Pass (via piping) only a subset of a dataset to inspect() and auto_rate()
#' subset_data(sardine.rd, from = 94, to = 91, by = "oxygen") %>%
#' inspect(time = 1, oxygen = 2) %>%
#' auto_rate()
#' }
subset_data <- function(x, from = NULL, to = NULL, by = "time", quiet = TRUE) {
# Check if object is from respR function(s)
if (any(class(x) %in% "inspect")) {
dt <- data.table(x$dataframe)
if (length(dt) > 2)
message("subset_data: Multi-column dataset detected in input! \nsubset_data is intended to subset datasets containing single time and oxygen columns. \nSubsetting will proceed anyway using columns 1 and 2 as time and oxygen respectively. All other columns will be ignored.")
} else if (any(class(x) %in% "inspect.ft")) {
dt <- data.table(x$dataframe)
} else {
dt <- data.table(x)
if (length(dt) > 2)
message("subset_data: Multi-column dataset detected in input! \nsubset_data is generally intended to subset data already passed through inspect(), or 2-column data frames where time and oxygen are in columns 1 and 2 respectively. \nSubsetting will proceed anyway based on this assumption, but please ensure you understand what you are doing.")
}
## verify by input
by <- by_val(by, msg = "subset_data")
## replace NULL inputs
if(is.null(from)){
if(by == "time") from <- min(nainf.omit(dt[[1]]))
if(by == "row") from <- 1
if(by == "oxygen") from <- dt[[2]][1] # first oxygen value
}
if(is.null(to)){
if(by == "time") to <- max(nainf.omit(dt[[1]]))
if(by == "row") to <- nrow(dt)
if(by == "oxygen") to <- dt[[2]][nrow(dt)] # last oxygen value
}
## if time, 'from' required, single val, not bigger than max value in time
if(by == "time") {
input.val(from, num = TRUE, int = FALSE, req = FALSE,
max = 1, min = 1, range = c(0, max(nainf.omit(dt[[1]]))),
msg = "subset_data: 'from' -")
## if time, 'to' required, single val, greater than from
input.val(to, num = TRUE, int = FALSE, req = FALSE,
max = 1, min = 1, range = c(from, Inf),
msg = "subset_data: 'to' -")
}
## if row, 'from' required, single val, integer
if(by == "row") {
input.val(from, num = TRUE, int = TRUE, req = FALSE,
max = 1, min = 1, range = c(1, nrow(dt)),
msg = "subset_data: 'from' -")
## if row, 'to' required, single val, integer, greater than from
input.val(to, num = TRUE, int = TRUE, req = FALSE,
max = 1, min = 1, range = c(from+1, Inf),
msg = "subset_data: 'to' -")}
## if oxygen, 'from' & 'to' required, single val, numeric
if(by == "oxygen") {
input.val(from, num = TRUE, int = FALSE, req = FALSE,
max = 1, min = 1, range = c(-Inf, Inf),
msg = "subset_data: 'from' -")
input.val(to, num = TRUE, int = FALSE, req = FALSE,
max = 1, min = 1, range = c(-Inf, Inf),
msg = "subset_data: 'to' -")}
# Perform subset
out <- truncate_data(dt, from, to, by)
# inspect.ft has an additional element that needs subset - $data
if (any(class(x) %in% "inspect.ft")) {
# get indices of start and end of subset
# Must be a better way of doing this...
# collapse list of lists to list of dfs
dt2 <- sapply(x$data, function(z) rbind.data.frame(z))
# collapse list of dfs to dt
dt2 <- as.data.table(dt2)
# for some stupid reason the names get changed
names(dt2) <- names(x$dataframe)
# start and end of rows which are identical
start <- min(which(duplicated(rbind(dt2, out), fromLast = TRUE)))
end <- max(which(duplicated(rbind(dt2, out), fromLast = TRUE)))
# if out was empty, start and end are now Inf & -Inf, which produces errors
if(start == Inf || start == -Inf) start <- 0
if(end == Inf || end == -Inf) end <- 0
# subset x$data elements
data_sub <- lapply(x$data, function(y) {
lapply(y, function(z){
z[start:end]
})
})
x$data <- data_sub
}
if(!quiet) {
cat("\n# subset_data # -------------------------\n")
cat("Original data:\n")
print(dt, topn = 2)
cat("\nSubset data:\n")
print(data.table(out), topn = 2)
cat("-----------------------------------------\n")
}
# if out is empty, warn.
# Still return in case this would break loops or whatever
if(nrow(out) == 0)
warning("subset_data: subsetting criteria result in empty dataset!")
if (any(class(x) %in% "inspect")) {
x$dataframe <- out
return(invisible(x))
} else if (any(class(x) %in% "inspect.ft")) {
x$dataframe <- out
return(invisible(x))
} else return(invisible(out))
}