-
Notifications
You must be signed in to change notification settings - Fork 23
/
amp_frequency.R
177 lines (165 loc) · 5.57 KB
/
amp_frequency.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#' Frequency plot
#'
#' Generates a barplot with frequency vs read abundance.
#'
#' @param data (\emph{required}) Data list as loaded with \code{\link{amp_load}}.
#' @param group_by Group the samples by a variable in the metadata. (\emph{default:} \code{"Sample"})
#' @param tax_aggregate The taxonomic level to aggregate the OTUs. (\emph{default:} \code{"OTU"})
#' @param tax_empty How to show OTUs without taxonomic information. One of the following:
#' \itemize{
#' \item \code{"remove"}: Remove OTUs without taxonomic information.
#' \item \code{"best"}: (\emph{default}) Use the best classification possible.
#' \item \code{"OTU"}: Display the OTU name.
#' }
#' @param tax_class Converts a specific phylum to class level instead, e.g. \code{"p__Proteobacteria"}.
#' @param weight (\emph{logical}) Weight the frequency by abundance. (\emph{default:} \code{TRUE})
#' @param normalise (\emph{logical}) Transform the OTU read counts to be in percent per sample. (\emph{default:} \code{TRUE})
#' @param detailed_output (\emph{logical}) Return additional details or not. If \code{TRUE}, it is recommended to save to an object and then access the additional data by \code{View(object$data)}. (\emph{default:} \code{FALSE})
#'
#' @return A ggplot2 object. If \code{detailed_output = TRUE} a list with a ggplot2 object and additional data.
#'
#' @import ggplot2
#' @importFrom dplyr filter group_by mutate summarise
#' @importFrom tidyr gather
#' @importFrom data.table as.data.table data.table setkey
#'
#' @export
#'
#' @section Preserving relative abundances in a subset of larger data:
#' See \code{?\link{amp_filter_samples}} or the \href{https://kasperskytte/ampvis2/articles/faq.html#preserving-relative-abundances-in-a-subset-of-larger-data}{ampvis2 FAQ}.
#'
#' @references
#' Saunders, Aaron M; Albertsen, Mads; Vollertsen, Jes; Nielsen, Per H (2016): The activated sludge ecosystem contains a core community of abundant organisms, ISME Journal 10, 11-20. \doi{10.1038/ismej.2015.117}
#'
#' @seealso
#' \code{\link{amp_load}}, \code{\link{amp_core}}, \code{\link{amp_venn}}
#'
#' @examples
#' # Load example data
#' data("AalborgWWTPs")
#'
#' # Frequency plot
#' amp_frequency(AalborgWWTPs)
#' @author Kasper Skytte Andersen \email{ksa@@bio.aau.dk}
#' @author Mads Albertsen \email{MadsAlbertsen85@@gmail.com}
amp_frequency <- function(data,
group_by = "Sample",
tax_class = NULL,
tax_empty = "best",
tax_aggregate = "OTU",
weight = TRUE,
normalise = TRUE,
detailed_output = FALSE) {
### Data must be in ampvis2 format
is_ampvis2(data)
# Aggregate only at the lowest level. No tax_add here
if(length(tax_aggregate) > 1) {
tax_aggregate <- getLowestTaxLvl(
tax = data$tax,
tax_aggregate = tax_aggregate,
tax_add = NULL
)
warning(
"tax_aggregate has length > 1, detecting and using only the lowest taxonomic level: ",
tax_aggregate
)
}
## Clean up the taxonomy
data <- amp_rename(
data = data,
tax_class = tax_class,
tax_empty = tax_empty,
tax_level = tax_aggregate
)
# normalise counts
if (isTRUE(normalise)) {
data <- normaliseTo100(data)
}
# Aggregate to a specific taxonomic level
abund1 <- aggregate_abund(
abund = data$abund,
tax = data$tax,
tax_aggregate = tax_aggregate,
tax_add = NULL,
calcSums = TRUE,
format = "long"
) %>%
as.data.frame()
## Add group information
suppressWarnings(
if (group_by != "Sample") {
grp <- data.frame(
Sample = rownames(data$metadata),
.Group = apply(data$metadata[, group_by], 1, paste, collapse = " ")
)
abund1$.Group <- grp$.Group[match(abund1$Sample, grp$Sample)]
abund2 <- abund1
} else {
abund2 <- data.frame(
abund1,
.Group = abund1$Sample
)
}
)
## Take the average to group level
abund3 <- data.table(abund2)[
,
Abundance := mean(Sum),
by = list(Display, .Group)
] %>%
setkey(Display, .Group) %>%
unique() %>%
filter(Abundance > 0) %>%
mutate(freq = 1)
## Make a nice frequency plot
temp3 <- group_by(abund3, Display) %>%
summarise(Frequency = sum(freq), Total = sum(Sum)) %>%
mutate(Percent = Total / length(unique(abund3$.Group))) %>%
as.data.frame()
if (weight == T) {
p <- ggplot(data = temp3, aes(x = Frequency, weight = Percent)) +
ylab("Read abundance (%)") +
xlab(paste("Frequency (Observed in N ", group_by, "s)", sep = ""))
if (max(temp3$Frequency) > 30) {
p <- p + geom_bar()
}
if (max(temp3$Frequency) <= 30) {
p <- p + geom_bar(binwidth = 1)
}
}
if (weight == F) {
p <- ggplot(data = temp3, aes(x = Frequency)) +
ylab("Read counts") +
xlab(paste("Frequency (Observed in N ", group_by, "s)", sep = ""))
if (max(temp3$Frequency) > 30) {
p <- p + geom_bar()
}
if (max(temp3$Frequency) <= 30) {
p <- p + geom_bar(binwidth = 1)
}
}
p <- p +
theme_classic() +
theme(
panel.grid.major.x = element_line(color = "grey90"),
panel.grid.major.y = element_line(color = "grey90")
)
if (tax_aggregate == "OTU") {
colnames(temp3)[1] <- "OTU"
core <- merge(x = temp3, y = data$tax, by = "OTU")
temp3 <- core
}
if (detailed_output) {
invisible(
list(
data = temp3,
plot = p,
abund = data$abund,
tax = data$tax,
metadata = data$metadata
)
)
} else if (!detailed_output) {
return(p)
}
}