-
Notifications
You must be signed in to change notification settings - Fork 23
/
amp_merge_ampvis2.R
234 lines (217 loc) · 7.14 KB
/
amp_merge_ampvis2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
#' Merge ampvis2 object(s)
#'
#' @description Merge any number of ampvis2 objects into a single object.
#'
#' @param ... (required) Any number of ampvis2-class objects to merge
#' @param by_refseq (recommended) Merge by exact matches between DNA reference sequences. The full DNA sequences will then be used as the new names in the output. (\emph{default:} \code{TRUE})
#' @param refseq_names Path to a FASTA file or a \code{DNAbin} class object with sequences whose names will be used as OTU names by exact matches (i.e. same length, 100\% sequence identity). (\emph{default:} \code{NULL})
#' @param rename_unmatched Whether to rename any unmatched sequences or not when \code{refseq_names} is provided. (\emph{default:} \code{TRUE})
#' @param unmatched_prefix Prefix used to name any unmatched sequences when \code{refseq_names} is provided. An integer counting from 1 will be appended to this prefix, so for example the 123th unmatched sequence will be named \code{unmatched123}, and so on. (\emph{default:} \code{"unmatched"})
#'
#' @return An ampvis2-class object
#' @importFrom data.table rbindlist
#' @importFrom purrr reduce
#' @importFrom dplyr full_join
#' @importFrom ape read.FASTA
#'
#' @details
#' It's important to ensure that the taxonomy for all OTU's across data sets is generated in the exact same way with the same database.
#' When \code{by_refseq = FALSE} it's likewise important to ensure that OTU ID's are not arbitrary between data sets and that they are corresponding to the same sequences across data sets (objects).
#' When \code{by_refseq = TRUE} the full DNA sequences will be used as the new OTU ID's unless \code{refseq_names} is provided.
#'
#' Currently, phylogenetic trees are not merged. Feel free to contribute.
#' @export
#'
#' @examples
#' data("MiDAS")
#'
#' # summary of samples from 2010-2012
#' amp_filter_samples(
#' MiDAS,
#' Year %in% c("2010", "2011", "2012")
#' )
#'
#' # now merge individual objects and verify summary is the same
#' d_2010 <- amp_filter_samples(
#' MiDAS,
#' Year %in% "2010"
#' )
#' d_2011 <- amp_filter_samples(
#' MiDAS,
#' Year %in% "2011"
#' )
#' d_2012 <- amp_filter_samples(
#' MiDAS,
#' Year %in% "2012"
#' )
#'
#' amp_merge_ampvis2(
#' d_2010,
#' d_2011,
#' d_2012
#' )
amp_merge_ampvis2 <- function(
...,
by_refseq = TRUE,
refseq_names = NULL,
rename_unmatched = TRUE,
unmatched_prefix = "unmatched"
) {
obj_list <- list(...)
# all objects must be ampvis2-class objects
if (!all(sapply(obj_list, inherits, "ampvis2"))) {
stop("One or more objects is not an ampvis2-class object", call. = FALSE)
}
# which ones have DNA seqs loaded
has_refseq <- sapply(
obj_list,
function(obj) {
inherits(obj$refseq, c("DNAbin", "AAbin"))
}
)
# ensure all objects have refseqs loaded when merging by refseq
if (isTRUE(by_refseq)) {
if (!all(has_refseq)) {
stop("All objects must have DNA sequences loaded to be able to merge by DNA sequence (recommended). Otherwise merge by OTU name by setting by_refseq = FALSE if you're sure it makes sense for your data.", call. = FALSE) # nolint
}
# all objects must be either normalised or not, cant have both
normalised <- obj_list %>%
lapply(
attr,
which = "normalised",
exact = TRUE
) %>%
unlist()
if (sum(normalised) > 0L & sum(normalised) != length(obj_list)) {
stop("All objects must be either normalised or not, not mixed", call. = FALSE) # nolint
}
# no duplicate samples between objects are allowed (check abund)
obj_list %>%
lapply(
function(x) {
colnames(x[["abund"]])
}
) %>%
unlist() %>%
duplicated() %>%
any() %>%
if (.) {
stop("One or more samples occurs more than once between the objects (according to abundance table)", call. = FALSE) # nolint
}
# no duplicate samples between objects are allowed (check sample metadata)
obj_list %>%
lapply(
function(x) {
x[["metadata"]][[1]]
}
) %>%
unlist() %>%
duplicated() %>%
any() %>%
if (.) {
stop("One or more samples occurs more than once between the objects (according to sample metadata)", call. = FALSE) # nolint
}
# for each object replace OTU names and rownames with the full DNA sequences
# in both abund and tax, also making sure the order is identical
obj_list <- obj_list %>%
lapply(
function(obj) {
obj$abund$OTU <- rownames(obj$abund)
refseq_chr <- obj$refseq %>%
as.character() %>%
lapply(paste, collapse = "") %>%
unlist(use.names = TRUE)
obj$abund <- obj$abund[names(refseq_chr), ]
obj$abund$OTU <- refseq_chr -> rownames(obj$abund)
obj$tax <- obj$tax[names(refseq_chr), ]
obj$tax$OTU <- refseq_chr -> rownames(obj$tax)
obj
}
)
}
# merge abundance tables
abund <- obj_list %>%
lapply(
function(obj) {
obj$abund$OTU <- rownames(obj$abund)
obj$abund
}
) %>%
reduce(
full_join,
by = "OTU"
)
# merge metadata
# ensure the first sample ID column are named the same, just use that of the first obj # nolint
idcolname <- colnames(obj_list[[1]][["metadata"]])[1]
metadata <- obj_list %>%
lapply(
function(obj) {
colnames(obj[["metadata"]])[1] <- idcolname
obj[["metadata"]]
}
) %>%
rbindlist(
fill = TRUE
)
# merge taxonomy
taxonomy <- obj_list %>%
lapply(
`[[`,
"tax"
) %>%
rbindlist(
fill = TRUE
) %>%
unique()
# check for contradictions in taxonomy. If an OTU occurs more than once
# after running unique() in the code above there must be conflicts where
# where taxonomy is different
dups <- duplicated(taxonomy[["OTU"]])
if (any(dups)) {
warning(
paste(
sum(dups),
"OTU(s) had conflicting taxonomy between the objects.",
"Discarding taxonomy further down the list only keeping the first observation for each OTU.",
"Discarded taxonomy is available in the output object as obj$discarded_tax for inspection."
),
call. = FALSE
)
}
# merge refseq and rename sequences if chosen
if (isTRUE(by_refseq)) {
seqs <- strsplit(abund$OTU, "")
names(seqs) <- abund$OTU
fasta <- ape::as.DNAbin(seqs)
} else {
# all or none
if (sum(has_refseq) > 0L & sum(has_refseq) != length(obj_list)) {
fasta <- NULL
warning("All objects must have sequences loaded to be able to merge them. Returning NULL in the output object") # nolint
} else {
fasta <- obj_list %>%
lapply(`[[`, "refseq") %>%
reduce(c)
}
}
#combine
out_obj <- amp_load(
otutable = abund,
taxonomy = taxonomy[!dups],
metadata = metadata,
fasta = fasta,
tree = NULL,
pruneSingletons = FALSE
)
out_obj$discarded_tax <- taxonomy[dups]
if (!is.null(refseq_names)) {
out_obj <- matchOTUs(
out_obj,
fasta = refseq_names,
unmatched_prefix = unmatched_prefix,
rename_unmatched = TRUE
)
}
return(out_obj)
}