/
LDA_helper_functions.R
413 lines (378 loc) · 15.7 KB
/
LDA_helper_functions.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
#' Summarize observations within bins per track segment
#'
#' Prepares the data that has already been segmented for clustering by Latent
#' Dirichlet Allocation. This function summarizes the counts observed per
#' movement variable bin within each track segment per animal ID.
#'
#' @param dat A data frame of \strong{only} the animal ID, track segment number,
#' and the discretized data for each movement variable. Animal ID and time
#' segment must be the first two columns of this data frame. This should be a
#' simplified form of the output from \code{\link{assign_tseg}}.
#' @param nbins numeric. A vector of the number of bins used to discretize each
#' movement variable. These must be in the same order as the columns within
#' \code{dat}.
#'
#' @return A new data frame that contains the animal ID, track segment number,
#' and the counts per bin for each movement variable. The names for each of
#' these bins are labeled according to the order in which the variables were
#' provided to \code{summarize_tsegs}.
#'
#'
#' @examples
#' #load data
#' data(tracks.seg)
#'
#' #select only id, tseg, SL, and TA columns
#' tracks.seg2<- tracks.seg[,c("id","tseg","SL","TA")]
#'
#' #run function
#' obs<- summarize_tsegs(dat = tracks.seg2, nbins = c(5,8))
#'
#'
#' @export
summarize_tsegs=function(dat, nbins){
#create list of input and to store output
dat.list<- df_to_list(dat = dat, ind = "id")
id<- unique(dat$id) %>%
as.character()
n<- length(id)
obs.list<- vector("list", n)
names(obs.list)<- id
#calculate # of obs in each bin (per move var) by tseg
for (i in 1:length(dat.list)) {
dat.ind=dat.list[[i]]
ntseg=max(dat.ind$tseg)
#generate counts of obs per var bin by tseg for all vars; 'id' and 'tseg'
#need to be 1st two cols
mat.list=list()
for (j in 1:length(nbins)) {
mat.list[[j]]<- matrix(0, ntseg, nbins[j])
colnames(mat.list[[j]])<- paste0("y",j,".",1:nbins[j])
for (k in 1:ntseg){
tmp<- dat.ind %>%
dplyr::filter(tseg == k) %>%
dplyr::select(j+2) %>%
table()
mat.list[[j]][k,as.numeric(names(tmp))]<- tmp
}
}
id<- rep(unique(dat.ind$id) %>%
as.character(), ntseg)
tseg<- 1:ntseg
behav.res<- do.call(cbind.data.frame, mat.list) %>%
data.frame()
behav.res<- cbind(id, tseg, behav.res)
behav.res$id<- as.character(behav.res$id)
obs.list[[i]]<- behav.res
}
obs<- dplyr::bind_rows(obs.list)
obs[is.na(obs)]<- 0 #replace NAs w/ zero
obs
}
#------------------------------------------------
#' Extract behavior proportion estimates for each track segment
#'
#' Calculates the mean of the posterior for the proportions of each behavior
#' within track segments. These results can be explored to determine the optimal
#' number of latent behavioral states.
#'
#' @param res A list of results returned by \code{\link{cluster_segments}}.
#' Element \code{theta} stores estimate for behavior proportions for all time
#' segments.
#' @param ngibbs numeric. The total number of iterations of the MCMC chain.
#' @param nburn numeric. The length of the burn-in phase.
#' @param nmaxclust numeric. A single number indicating the maximum number of
#' clusters to test.
#'
#' @return A matrix that stores the proportions of each state/cluster (columns)
#' per track segment (rows).
#'
#'
#' @examples
#'
#' \donttest{
#' #load data
#' data(tracks.seg)
#'
#' #select only id, tseg, SL, and TA columns
#' tracks.seg2<- tracks.seg[,c("id","tseg","SL","TA")]
#'
#' #summarize data by track segment
#' obs<- summarize_tsegs(dat = tracks.seg2, nbins = c(5,8))
#'
#' #cluster data with LDA
#' res<- cluster_segments(dat = obs, gamma1 = 0.1, alpha = 0.1, ngibbs = 1000,
#' nburn = 500, nmaxclust = 7, ndata.types = 2)
#'
#' #Extract proportions of behaviors per track segment
#' theta.estim<- extract_prop(res = res, ngibbs = 1000, nburn = 500, nmaxclust = 7)
#' }
#'
#'
#' @export
extract_prop=function(res, ngibbs, nburn, nmaxclust) {
#Extract and plot proportions of behaviors per track segment
theta.post<- res$theta[(nburn+1):ngibbs,] #extract samples from posterior
theta.estim<- colMeans(theta.post)
theta.estim<- matrix(data = theta.estim, ncol = nmaxclust) #calc mean of posterior
theta.estim
}
#------------------------------------------------
#' Extract bin estimates from Latent Dirichlet Allocation or mixture model
#'
#' Pulls model results for the estimates of bin proportions per movement
#' variable from the posterior distribution. This can be used for visualization
#' of movement variable distribution for each behavior estimated.
#'
#' @param dat The list object returned by the LDA model
#' (\code{\link{cluster_segments}}) or mixture model
#' (\code{\link{cluster_obs}}). Used for extracting the element \emph{phi}.
#' @param nburn numeric. The length of the burn-in phase.
#' @param ngibbs numeric. The total number of iterations of the MCMC chain.
#' @param nmaxclust numeric. The maximum number of clusters on which to
#' attribute behaviors.
#' @param var.names character. A vector of names used for each of the movement
#' variables. Must be in the same order as were listed within the data frame
#' returned by \code{\link{summarize_tsegs}} (if running LDA model).
#'
#' @return A data frame that contains columns for bin number, behavioral state,
#' proportion represented by a given bin, and movement variable name. This is
#' displayed in a long format, which is easier to visualize using
#' \code{ggplot2}.
#'
#'
#' @examples
#'
#' \donttest{
#' #load data
#' data(tracks.seg)
#'
#' #select only id, tseg, SL, and TA columns
#' tracks.seg2<- tracks.seg[,c("id","tseg","SL","TA")]
#'
#' #summarize data by track segment
#' obs<- summarize_tsegs(dat = tracks.seg2, nbins = c(5,8))
#'
#' #cluster data with LDA
#' res<- cluster_segments(dat = obs, gamma1 = 0.1, alpha = 0.1, ngibbs = 1000,
#' nburn = 500, nmaxclust = 7, ndata.types = 2)
#'
#' #Extract proportions of behaviors per track segment
#' theta.estim<- extract_prop(res = res, ngibbs = 1000, nburn = 500, nmaxclust = 7)
#'
#' #run function for clustered segments
#' behav.res<- get_behav_hist(dat = res, nburn = 500, ngibbs = 1000, nmaxclust = 7,
#' var.names = c("Step Length","Turning Angle"))
#' }
#'
#' @importFrom rlang .data
#' @export
get_behav_hist=function(dat, nburn, ngibbs, nmaxclust, var.names) {
#summarize cluster results by frequency and proportion
behav.list<- list()
for (i in 1:length(dat$phi)) {
tmp<- matrix(dat$phi[[i]][(nburn+1):ngibbs,], length((nburn+1):ngibbs),
ncol(dat$phi[[i]]))
tmp1<- matrix(colMeans(tmp), ncol(tmp) / nmaxclust, nmaxclust, byrow = T)
behav.list[[i]]<- data.frame(bin = 1:nrow(tmp1), tmp1) %>%
dplyr::rename_at(dplyr::vars(tidyr::starts_with('X')), ~as.character(1:ncol(tmp1))) %>%
tidyr::pivot_longer(-"bin", names_to = "behav", values_to = "prop") %>%
dplyr::arrange("behav") %>%
dplyr::mutate(var = var.names[i])
}
#combine params
behav.res<- dplyr::bind_rows(behav.list)
behav.res
}
#------------------------------------------------
#' Expand behavior estimates from track segments to observations
#'
#' @param dat A data frame of the animal ID, track segment labels, and all other
#' data per observation. Animal ID, date, track segment, and observation
#' number columns must be labeled \emph{id}, \emph{date}, \emph{tseg}, and
#' \emph{time1}, respectively.
#' @param theta.estim A matrix (returned by \code{\link{extract_prop}})
#' containing the proportions of each behavioral state as separate columns for
#' each track segment (rows).
#' @param obs A data frame summarizing the number of observations within each
#' bin per movement variable that is returned by
#' \code{\link{summarize_tsegs}}.
#' @param nbehav numeric. The number of behavioral states that will be retained
#' in 1 to nmaxclust.
#' @param behav.names character. A vector of names to label each state (in
#' order).
#' @param behav.order numeric. A vector that identifies the order in which the
#' user would like to rearrange the behavioral states. If satisfied with order
#' returned by the LDA model, this still must be specified.
#'
#' @return A new data frame that expands behavior proportions for each
#' observation within all track segments, including the columns labeled
#' \emph{time1} and \emph{date} from the original \code{dat} data frame.
#'
#'
#' @examples
#'
#' \donttest{
#' #load data
#' data(tracks.seg)
#'
#' #select only id, tseg, SL, and TA columns
#' tracks.seg2<- tracks.seg[,c("id","tseg","SL","TA")]
#'
#' #summarize data by track segment
#' obs<- summarize_tsegs(dat = tracks.seg2, nbins = c(5,8))
#'
#' #cluster data with LDA
#' res<- cluster_segments(dat = obs, gamma1 = 0.1, alpha = 0.1, ngibbs = 1000,
#' nburn = 500, nmaxclust = 7, ndata.types = 2)
#'
#' #Extract proportions of behaviors per track segment
#' theta.estim<- extract_prop(res = res, ngibbs = 1000, nburn = 500, nmaxclust = 7)
#'
#' #Create augmented matrix by replicating rows (tsegs) according to obs per tseg
#' theta.estim.long<- expand_behavior(dat = tracks.seg, theta.estim = theta.estim, obs = obs,
#' nbehav = 3, behav.names = c("Encamped","ARS","Transit"),
#' behav.order = c(1,2,3))
#' }
#'
#'
#' @importFrom rlang .data
#' @export
expand_behavior=function(dat, theta.estim, obs, nbehav, behav.names, behav.order) {
#Assign behaviors (via theta) to each track segment
theta.estim1<- apply(theta.estim[,1:nbehav], 1, function(x) x/sum(x)) %>%
t() #normalize probs for only first 3 behaviors being used
theta.estim1<- data.frame(id = obs$id, tseg = obs$tseg, theta.estim1)
theta.estim1$id<- as.character(theta.estim1$id)
names(theta.estim1)<- c("id", "tseg", behav.names) #define behaviors
nobs<- data.frame(id = obs$id, tseg = obs$tseg)
nobs <- dplyr::left_join(x = nobs,
y = dat %>%
dplyr::mutate(id = as.character(id)) %>%
dplyr::group_by(.data$id, .data$tseg) %>%
dplyr::tally() %>%
dplyr::ungroup(),
by = c('id', 'tseg'))
for (i in 1:nrow(theta.estim1)) {
ind<- which(dat$id == theta.estim1$id[i] & dat$tseg == theta.estim1$tseg[i])
if (i == 1) {
theta.estim2<- rep(theta.estim1[i,], nobs$n[i]) %>%
unlist() %>%
matrix(nrow = nobs$n[i], ncol = ncol(theta.estim1), byrow = TRUE)
} else {
tmp<- rep(theta.estim1[i,], nobs$n[i]) %>%
unlist() %>%
matrix(nrow = nobs$n[i], ncol = ncol(theta.estim1), byrow = TRUE)
theta.estim2<- rbind(theta.estim2, tmp)
}
}
colnames(theta.estim2)<- names(theta.estim1)
theta.estim2<- data.frame(theta.estim2, time1 = dat$time1, date = dat$date,
stringsAsFactors = FALSE)
#Change col classes to numeric besides ID
ind1<- which(names(theta.estim2) != "id")
theta.estim2<- theta.estim2 %>%
dplyr::mutate_at(names(theta.estim2)[ind1], as.numeric) %>%
dplyr::select("id", "tseg", "time1", "date", dplyr::everything())
#Change into long format
theta.estim.long<- tidyr::pivot_longer(theta.estim2, cols = -c(1:4),
names_to = "behavior", values_to = "prop")
theta.estim.long$behavior<- factor(theta.estim.long$behavior,
levels = behav.names[behav.order])
theta.estim.long<- theta.estim.long %>%
dplyr::arrange("behavior") %>%
dplyr::mutate_at("date", lubridate::as_datetime)
theta.estim.long
}
#------------------------------------------------
#' Assign behavior estimates to observations
#'
#' @param dat.orig A data frame that contains all of the original data for all
#' animal IDs. Must be same as was used to originally segment the tracks. Must
#' have columns \code{obs} and \code{time1} generated by
#' \code{\link{filter_time}}.
#' @param dat.seg.list A list of data associated with each animal ID where names
#' of list elements are the ID names and tracks have already been segmented.
#' Must have columns \code{obs} and \code{time1} generated by
#' \code{\link{filter_time}}.
#' @param theta.estim.long A data frame in long format where each observation
#' (\emph{time1}) of each track segment (\emph{tseg}) of each animal ID
#' (\emph{id}) has separate rows for behavior proportion estimates per state.
#' Columns for behavior and proportion estimates should be labeled
#' \emph{behavior} and \emph{prop}, respectively. Date (in POSIXct format)
#' should also be included as a column labeled \emph{date}.
#' @param behav.names deprecated. Now taken from the \code{theta.estim.long} object.
#'
#' @return A data frame of all animal IDs where columns (with names from
#' \code{behav.names}) include proportions of each behavioral state per
#' observation, as well as a column that stores the dominant behavior within a
#' given track segment for which the observation belongs (\code{behav}). This
#' is merged with the original data frame \code{dat.orig}, so any observations
#' that were excluded (not at primary time interval) will show \code{NA} for
#' behavior estimates.
#'
#'
#' @examples
#'
#' \donttest{
#' #load original and segmented data
#' data(tracks)
#' data(tracks.seg)
#'
#' #convert segmented dataset into list
#' tracks.list<- df_to_list(dat = tracks.seg, ind = "id")
#'
#' #select only id, tseg, SL, and TA columns
#' tracks.seg2<- tracks.seg[,c("id","tseg","SL","TA")]
#'
#' #summarize data by track segment
#' obs<- summarize_tsegs(dat = tracks.seg2, nbins = c(5,8))
#'
#' #cluster data with LDA
#' res<- cluster_segments(dat = obs, gamma1 = 0.1, alpha = 0.1, ngibbs = 1000,
#' nburn = 500, nmaxclust = 7, ndata.types = 2)
#'
#' #Extract proportions of behaviors per track segment
#' theta.estim<- extract_prop(res = res, ngibbs = 1000, nburn = 500, nmaxclust = 7)
#'
#' #Create augmented matrix by replicating rows (tsegs) according to obs per tseg
#' theta.estim.long<- expand_behavior(dat = tracks.seg, theta.estim = theta.estim, obs = obs,
#' nbehav = 3, behav.names = c("Encamped","ARS","Transit"),
#' behav.order = c(1,2,3))
#'
#' #Run function
#' dat.out<- assign_behavior(dat.orig = tracks, dat.seg.list = tracks.list,
#' theta.estim.long = theta.estim.long)
#' }
#'
#'
#' @export
assign_behavior=function(dat.orig, dat.seg.list, theta.estim.long, behav.names = NULL) {
if (!is.null(behav.names)) {
stop("The 'behav.names' argument has been deprecated. Don't specify this argument")
}
for (i in 1:length(dat.seg.list)) {
sub<- theta.estim.long[theta.estim.long$id == unique(dat.seg.list[[i]]$id),]
sub<- sub %>%
dplyr::arrange("tseg", "date", "behavior") %>%
tidyr::pivot_wider(names_from = "behavior", values_from = "prop")
behav.names1 <- unique(theta.estim.long$behavior)
sub<- sub %>%
dplyr::mutate(behav = behav.names1[apply(sub[,5:ncol(sub)], 1, which.max)])
dat.seg.list[[i]]<- cbind(dat.seg.list[[i]], sub[,5:ncol(sub)])
}
#Convert original data to list and filter for only IDs analyzed
dat.orig.list<- df_to_list(dat = dat.orig, ind = "id")
ind<- which(names(dat.orig.list) %in% names(dat.seg.list))
dat.orig.list<- dat.orig.list[ind]
#Add obs col to dat.orig.list
dat.orig.list1<- purrr::map(dat.orig.list, ~dplyr::mutate(.x, obs = 1:nrow(.x)))
#Merge behav estimates w/ original data
dat1<- purrr::map2(dat.orig.list1, dat.seg.list,
~dplyr::left_join(.x, .y[,c("obs","tseg", as.character(behav.names1), "behav")],
by = "obs")
) %>%
dplyr::bind_rows()
dat1$behav<- factor(dat1$behav, levels = levels(behav.names1))
dat1
}