-
Notifications
You must be signed in to change notification settings - Fork 15
/
fullCoverage.R
151 lines (144 loc) · 5.78 KB
/
fullCoverage.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#' Load the unfiltered coverage information from a group of BAM files and a
#' list of chromosomes
#'
#' For a group of samples this function reads the coverage information for
#' several chromosomes directly from the BAM files. Per chromosome, it merges
#' the unfiltered coverage by sample into a DataFrame. The end result is a list
#' with one such DataFrame objects per chromosome.
#'
#' @param files A character vector with the full path to the sample BAM files
#' (or BigWig files).
#' The names are used for the column names of the DataFrame. Check
#' [rawFiles] for constructing `files`. `files` can also be a
#' `BamFileList` object created with [BamFileList][Rsamtools::BamFile-class] or a
#' `BigWigFileList` object created with [BigWigFileList][rtracklayer::BigWigFile].
#' @param chrs The chromosome of the files to read. The format has to match the
#' one used in the input files.
#' @param bai The full path to the BAM index files. If `NULL` it is
#' assumed that the BAM index files are in the same location as the BAM files
#' and that they have the .bai extension. Ignored if `files` is a
#' `BamFileList` object.
#' @param chrlens The chromosome lengths in base pairs. If it's `NULL`,
#' the chromosome length is extracted from the BAM files. Otherwise, it should
#' have the same length as `chrs`.
#' @param outputs This argument is passed to the `output` argument of
#' [loadCoverage]. If `NULL` or `'auto'` it is then recycled.
#' @inheritParams loadCoverage
#' @param ... Arguments passed to other methods and/or advanced arguments.
#' Advanced arguments:
#' \describe{
#' \item{verbose }{ If `TRUE` basic status updates will be printed along
#' the way.}
#' \item{mc.cores }{ How many cores to use for reading the chromosome
#' information. There's no benefit of using a number greater than the number
#' of chromosomes. Also note that your harddisk speed will limit how much
#' you get from using a higher `mc.cores` value.}
#' \item{mc.cores.load }{ Controls the number of cores to be used per chr for
#' loading the data which is only useful in the scenario that you are loading
#' in genome tiles. If not supplied, it uses `mc.cores` for
#' [loadCoverage]. Default: 1. If you are using genome tiles, the total
#' number of cores you'll use will be `mc.cores` times
#' `mc.cores.load`.}
#' }
#' Passed to [loadCoverage], [define_cluster] and
#' [extendedMapSeqlevels].
#' Note that [filterData] is used internally
#' by [loadCoverage] (and hence [fullCoverage]) and has the important
#' arguments `totalMapped` and `targetSize` which can be used to
#' normalize the coverage by library size. See [getTotalMapped] for
#' calculating `totalMapped`.
#'
#' @return A list with one element per chromosome.
#' \describe{ Each element is a DataFrame with the coverage information
#' produced by [loadCoverage].
#' }
#'
#' @seealso [loadCoverage], [filterData], [getTotalMapped]
#'
#' @author Leonardo Collado-Torres
#' @export
#' @importFrom BiocParallel bplapply
#' @importFrom GenomeInfoDb mapSeqlevels
#'
#' @examples
#' datadir <- system.file("extdata", "genomeData", package = "derfinder")
#' files <- rawFiles(
#' datadir = datadir, samplepatt = "*accepted_hits.bam$",
#' fileterm = NULL
#' )
#' ## Shorten the column names
#' names(files) <- gsub("_accepted_hits.bam", "", names(files))
#'
#' ## Read and filter the data, only for 1 file
#' fullCov <- fullCoverage(files = files[1], chrs = c("21", "22"))
#' fullCov
#' \dontrun{
#' ## You can then use filterData() to filter the data if you want to.
#' ## Use bplapply() if you want to do so with multiple cores as shown below.
#' library("BiocParallel")
#' p <- SnowParam(2L)
#' bplapply(fullCov, function(x) {
#' library("derfinder")
#' filterData(x, cutoff = 0)
#' }, BPPARAM = p)
#' }
#'
fullCoverage <- function(
files, chrs, bai = NULL, chrlens = NULL,
outputs = NULL, cutoff = NULL, ...) {
stopifnot(length(files) > 0)
stopifnot(length(chrlens) == length(chrs) | is.null(chrlens))
stopifnot(is.character(chrs))
if (!is.null(outputs)) {
stopifnot(length(outputs) == length(chrs) | outputs == "auto")
if (outputs == "auto") {
outputs <- rep("auto", length(chrs))
}
}
## Advanged argumentsa
# @param verbose If \code{TRUE} basic status updates will be printed along the
# way.
verbose <- .advanced_argument("verbose", TRUE, ...)
# @param mc.cores.load Controls the number of cores to be used per chr for
# loading the data.
mc.cores.load <- .advanced_argument(
"mc.cores.load",
.advanced_argument("mc.cores", getOption("mc.cores", 1L), ...), ...
)
## Define cluster
BPPARAM <- define_cluster(...)
## Subsetting function that runs loadCoverage
loadChr <- function(
idx, files, chrs, bai, chrlens, outputs, cutoff,
mc.load, ...) {
if (verbose) {
message(paste(
Sys.time(), "fullCoverage: processing chromosome",
chrs[idx]
))
}
if (is.null(cutoff)) {
res <- loadCoverage(
files = files, chr = chrs[idx], cutoff = NULL,
bai = bai, chrlen = chrlens[idx], output = outputs[idx],
mc.cores = mc.load, ...
)$coverage
} else {
res <- loadCoverage(
files = files, chr = chrs[idx],
cutoff = cutoff, bai = bai, chrlen = chrlens[idx],
output = outputs[idx], mc.cores = mc.load, ...
)
}
return(res)
}
result <- bplapply(seq_len(length(chrs)), loadChr,
files = files,
chrs = chrs, bai = bai, chrlens = chrlens, outputs = outputs,
cutoff = cutoff, mc.load = mc.cores.load,
..., BPPARAM = BPPARAM
)
names(result) <- extendedMapSeqlevels(chrs, ...)
## Done
return(result)
}