-
Notifications
You must be signed in to change notification settings - Fork 5
/
mmload.R
310 lines (295 loc) · 13.6 KB
/
mmload.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
#' @title Load and combine data for use with other mmgenome2 functions
#'
#' @description Loads, validates and combines multiple aspects of metagenome data into one dataframe for use with all mmgenome2 functions, including scaffold assembly sequences, scaffold coverage, essential genes, taxonomy, and more.
#'
#' @param assembly (\emph{required}) A character string with the path to the assembly FASTA file, or the assembly as already loaded with \code{\link{readDNAStringSet}}.
#' @param coverage (\emph{required}) A path to a folder to scan for coverage files, or otherwise a named \code{vector}, \code{data.frame}, or a \code{list} hereof containing coverage of each scaffold. The prefix \code{"cov_"} will be appended to all coverage column names in the output so that \code{\link{mmstats}} and \code{\link{mmplot_cov_profiles}} know which columns are coverage columns.
#' \describe{
#' \item{\code{vector}}{If provided as a vector, the elements of the vector must be named by the scaffold names exactly matching those of the assembly.}
#' \item{\code{data.frame}}{If provided as a dataframe, the first column must contain the scaffold names exactly matching those of the assembly, and any additional column(s) contain coverage of each scaffold.}
#' \item{\code{list}}{If provided as a list, it must contain any number of \code{vector}'s or \code{data.frame}'s as described above. If names are assigned to the objects in the list, then they will be used as column names in the output (does not apply to any dataframes that may have more than 2 columns, however).}
#' \item{\code{path}}{If a path to a folder is provided, then all files with filenames ending with \code{"_cov"} will be loaded (by the \code{\link[data.table]{fread}} function) into a list of \code{data.frame}'s and treated as if a \code{list} of \code{data.frame}'s were provided. The filenames (stripped from extension and \code{"_cov"}) will then be used as column names in the output. \strong{Note: only the first 2 columns will be used in the loaded files!}}
#' }
#' @param essential_genes Either a path to a CSV file (comma-delimited ",") containing the essential genes, or a 2-column dataframe with scaffold names in the first column and gene ID's in the second. Can contain duplicates. (\emph{Default: } \code{NULL})
#' @param taxonomy A dataframe containing taxonomy assigned to the scaffolds. The first column must contain the scaffold names. (\emph{Default: } \code{NULL})
#' @param additional A dataframe containing any additional data. The first column must contain the scaffold names. (\emph{Default: } \code{NULL})
#' @param kmer_pca (\emph{Logical}) Perform Principal Components Analysis of kmer nucleotide frequencies (kmer size defined by \code{kmer_size}) of each scaffold and merge the scores of the 3 most significant axes. (\emph{Default: } \code{FALSE})
#' @param kmer_BH_tSNE (\emph{Logical}) Calculate Barnes-Hut t-Distributed Stochastic Neighbor Embedding (B-H t-SNE) representations of kmer nucleotide frequencies (kmer size defined by \code{kmer_size}) using \code{\link[Rtsne]{Rtsne}} and merge the result. Additional arguments may be required for success (passed on through \code{...}), refer to the documentation of \code{\link[Rtsne]{Rtsne}}. This is done in parallel, thus setting the \code{num_threads} to the number of available cores may greatly increase the calculation time of large data. (\emph{Default: } \code{FALSE})
#' @param kmer_size The kmer frequency size (k) used when \code{kmer_pca = TRUE} or \code{kmer_BH_tSNE = TRUE}. The default is tetramers (\code{k = 4}). (\emph{Default: } \code{4})
#' @param verbose (\emph{Logical}) Whether to print status messages during the loading process. (\emph{Default: } \code{TRUE})
#' @param ... Additional arguments are passed on to \code{\link[Rtsne]{Rtsne}}.
#'
#' @export
#'
#' @return A dataframe (tibble) compatible with other mmgenome2 functions.
#'
#' @importFrom tibble add_column as.tibble tibble
#' @importFrom magrittr %>%
#' @importFrom Biostrings width readDNAStringSet letterFrequency oligonucleotideFrequency reverseComplement
#' @importFrom dplyr mutate_all funs group_by left_join summarise_all
#' @importFrom stringr str_replace_all str_remove
#' @importFrom data.table fread
#' @importFrom tools file_path_sans_ext
#'
#' @examples
#' \dontrun{
#' library(mmgenome2)
#' mm <- mmload(
#' assembly = "path/to/assembly.fa",
#' coverage = list(
#' nameofcoverage1 = read.csv("path/to/coveragetable1.csv", col.names = TRUE),
#' nameofcoverage2 = read.csv("path/to/coveragetable2.csv", col.names = TRUE)
#' ),
#' essential_genes = "path/to/ess_genes.txt",
#' verbose = TRUE
#' )
#' mm
#' }
#'
#' @author Kasper Skytte Andersen \email{ksa@@bio.aau.dk}
mmload <- function(assembly,
coverage = NULL,
essential_genes = NULL,
taxonomy = NULL,
additional = NULL,
kmer_pca = FALSE,
kmer_BH_tSNE = FALSE,
kmer_size = 4L,
verbose = TRUE,
...) {
##### Assembly #####
# Load assembly sequences from the provided file path or object
if (isTRUE(verbose)) {
message("\nLoading assembly...")
}
if (is.character(assembly)) {
assembly <- Biostrings::readDNAStringSet(assembly, format = "fasta")
} else if (!class(assembly) == "DNAStringSet") {
stop("The assembly must either be an object of class \"DNAStringSet\" loaded with readDNAStringSet() from the Biostrings package, or a file path to the assembly FASTA file to be loaded.", call. = FALSE)
}
# check if a different assembly already exists in global environment
if (!exists("assembly", where = .GlobalEnv, envir = .GlobalEnv)) {
assign("assembly", assembly, envir = .GlobalEnv)
} else if (exists("assembly", where = .GlobalEnv, envir = .GlobalEnv)) {
checkReqPkgs("digest")
if (!identical(digest::digest(assembly), digest::digest(get("assembly", envir = .GlobalEnv)))) {
userChoice <- readline(prompt = "A different object named \"assembly\" already exists in the global environment. Do you want to overwrite it? (y/n or ENTER/ESC): ")
if (any(tolower(userChoice) %in% c("y", "yes", ""))) {
assign("assembly", assembly, envir = .GlobalEnv)
} else {
stop("Aborted by user.", call. = FALSE)
}
}
}
# duplicate sequence names are not allowed
if (any(duplicated(names(assembly)))) {
stop("The assembly contains duplicate sequence names", call. = FALSE)
}
##### Create base mm object #####
if (isTRUE(verbose)) {
message("Calculating GC content...")
}
mm <- tibble::tibble(
scaffold = as.character(names(assembly)),
length = as.integer(Biostrings::width(assembly)),
gc = round(as.numeric(Biostrings::letterFrequency(assembly, letters = c("CG"), as.prob = T)) * 100, digits = 2)
)
##### Coverage #####
if (!is.null(coverage)) {
if (isTRUE(verbose)) {
message("Loading coverage data...")
}
if (is.character(coverage)) {
filepaths <- list.files(
path = coverage,
full.names = TRUE,
all.files = FALSE,
recursive = FALSE,
ignore.case = TRUE
)
filepaths <- filepaths[grepl("*._cov$", tools::file_path_sans_ext(filepaths))]
path <- coverage
if (length(filepaths) > 0) {
filenames <- basename(filepaths)
coverage <- list()
for (i in 1:length(filenames)) {
coverage[[stringr::str_remove(tools::file_path_sans_ext(filenames)[i], "_cov$")]] <- data.table::fread(filepaths[i], data.table = FALSE)[, 1:2]
}
if (isTRUE(verbose)) {
message(paste0(" Found the following ", length(filenames), " coverage files in the folder \"", path, "\":\n ", paste0(filenames, collapse = "\n ")))
}
} else {
stop("No files with a filename ending with \"_cov\" were found in the folder \"", coverage, "\"", call. = FALSE)
}
}
beforeMerge <- ncol(mm)
mm <- mmmerge(
x = mm,
y = coverage,
type = "coverage"
)
colnames(mm)[c((beforeMerge + 1):ncol(mm))] <- paste0("cov_", colnames(mm)[c((beforeMerge + 1):ncol(mm))])
} else {
warning("No coverage data loaded, this may cause trouble with other mmgenome2 functions", call. = FALSE)
}
##### Essential genes #####
if (!is.null(essential_genes)) {
if (isTRUE(verbose)) {
message("Loading essential genes...")
}
if (is.character(essential_genes)) {
if (length(essential_genes) == 1) {
essential_genes <- read.csv(essential_genes,
comment.char = "#",
header = TRUE,
colClasses = "character"
)
if (any(tolower(colnames(essential_genes)) == "orf")) {
essential_genes <- essential_genes[, -which(colnames(essential_genes) == "orf")]
}
}
}
if (is.data.frame(essential_genes) & ncol(essential_genes) == 2) {
essential_genes[[1]] <- as.character(essential_genes[[1]])
essential_genes[[2]] <- as.character(essential_genes[[2]])
# replace all values to only contain alpha-numerics and dots "."
essential_genes[, -1] <- dplyr::mutate_all(essential_genes[, -1, drop = FALSE], dplyr::funs(stringr::str_replace_all(., "[^[:alnum:].]", "")))
colnames(essential_genes) <- c("scaffold", "geneID")
essential_genes <- essential_genes %>%
dplyr::group_by(scaffold) %>%
dplyr::summarise_all(dplyr::funs(paste(., collapse = ",")))
mm <- dplyr::left_join(mm,
essential_genes,
by = "scaffold"
)
} else {
stop("Essential genes must be a 2 column table, where the first column contains the sequence names exactly matching those of the assembly, and the second column the gene names/IDs.", call. = FALSE)
}
}
##### calculate kmer nucleotide frequencies #####
if (isTRUE(kmer_pca) || isTRUE(kmer_BH_tSNE)) {
if (is.numeric(kmer_size)) {
if (isTRUE(verbose)) {
message(paste0(
"Calculating kmer (k=",
as.integer(kmer_size),
") nucleotide frequencies in the assembly sequences..."
))
}
kmer_fwd <- Biostrings::oligonucleotideFrequency(assembly,
width = as.integer(kmer_size),
as.prob = TRUE,
with.labels = TRUE
)
kmer_revC <- Biostrings::oligonucleotideFrequency(Biostrings::reverseComplement(assembly),
width = as.integer(kmer_size),
as.prob = TRUE
)
kmer <- (kmer_fwd + kmer_revC) / 2 * 100
} else {
stop("kmer_size must be a positive integer larger than 0", call. = FALSE)
}
}
##### PCA of tetranucleotides #####
if (isTRUE(kmer_pca)) {
checkReqPkg("vegan")
if (isTRUE(verbose)) {
message(paste0(
"Calculating principal components of kmer (k=",
as.integer(kmer_size),
") nucleotide frequencies..."
))
}
PCA_res <- kmer %>%
vegan::rda() %>%
vegan::scores(choices = 1:3, display = "sites") %>%
tibble::as.tibble()
mm <- tibble::add_column(mm,
PC1 = PCA_res[[1]],
PC2 = PCA_res[[2]],
PC3 = PCA_res[[3]]
)
}
##### BH tSNE of tetranucleotides #####
if (isTRUE(kmer_BH_tSNE)) {
checkReqPkg("Rtsne", "To install with support for multithreading run:\n remotes::install_github('kasperskytte/Rtsne@openmp')\notherwise just install from CRAN.")
if (isTRUE(verbose)) {
message(paste0(
"Calculating Barnes-Hut t-Distributed Stochastic Neighbor Embedding representations of kmer (k=",
as.integer(kmer_size),
") nucleotide frequencies..."
))
}
set.seed(42) # Sets seed for reproducibility
tSNE_res <- Rtsne::Rtsne(kmer,
verbose = verbose,
check_duplicates = F,
...
)[["Y"]] %>%
tibble::as.tibble()
mm <- tibble::add_column(mm,
tSNE1 = tSNE_res[[1]],
tSNE2 = tSNE_res[[2]]
)
}
##### Taxonomy #####
if (!is.null(taxonomy)) {
if (isTRUE(verbose)) {
message("Loading taxonomy...")
}
if (is.character(taxonomy)) {
if (length(taxonomy) == 1) {
taxonomy <- read.csv(taxonomy,
comment.char = "#",
header = TRUE,
colClasses = "character"
)
}
}
if (any(class(taxonomy) %in% c("data.frame", "tbl", "tbl_df")) | is.atomic(taxonomy)) {
its_a_vector <- if (is.atomic(taxonomy)) TRUE else FALSE
replacements <- c(
" <phylum>" = "",
"unclassified Bacteria" = "Unclassified Bacteria",
"Fibrobacteres/Acidobacteria group" = "Acidobacteria",
"Bacteroidetes/Chlorobi group" = "Bacteroidetes",
"delta/epsilon subdivisions" = "Deltaproteobacteria",
"Chlamydiae/Verrucomicrobia group" = "Verrucomicrobia"
)
for (i in 1:length(replacements)) {
taxonomy <- lapply(taxonomy, gsub, pattern = paste(names(replacements)[i]), replacement = replacements[i])
}
if (!isTRUE(its_a_vector)) {
taxonomy <- tibble::as.tibble(taxonomy)
} else if (isTRUE(its_a_vector)) {
taxonomy <- unlist(taxonomy)
}
}
mm <- mmmerge(
x = mm,
y = taxonomy,
type = "taxonomy"
)
}
##### Additional data #####
if (!is.null(additional)) {
if (isTRUE(verbose)) {
message("Loading additional data...")
}
mm <- mmmerge(
x = mm,
y = additional,
type = "additional"
)
}
##### Fix colnames and return #####
newColnames <- stringr::str_replace_all(colnames(mm), "[^[:alnum:]_.]", "") # only alpha-numerics, dots and underscores are allowed
if (any(duplicated(newColnames))) {
stop("Only alpha-numeric characters, dots and underscores are allowed in column names", call. = FALSE)
}
colnames(mm) <- newColnames
if (isTRUE(verbose)) {
message("Done!")
}
return(mm)
}