/
bamutils.R
170 lines (166 loc) · 5.66 KB
/
bamutils.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
#' A function to import paired end bam files as GRanges
#'
#' This function takes bam file paths and read them into GRanges
#' objects.
#' Note: Can be quite lengthy for .bam files with 5+ millions fragments.
#'
#' @param files character vector, each element of the vector is the path
#' of an individual .bam file.
#' @param genome character, genome ID (e.g. "sacCer3", "ce11", "dm6",
#' "mm10" or "hg38").
#' @param where GRanges, only import the fragments mapping to the
#' input GRanges (can fasten the import process a lot).
#' @param max_insert_size Integer, filter out fragments larger
#' than this size.
#' @param shift_ATAC_fragments Boolean, if the fragments come
#' from ATAC-seq, one might want to shift the extremities by +5 / -4 bp.
#' @param cores Integer, number of cores to use when indexing bam files
#' @param verbose Boolean
#' @return A GRanges object containing fragments from the input .bam file.
#'
#' @import parallel
#' @import Rsamtools
#' @import GenomicRanges
#' @import GenomicAlignments
#' @import IRanges
#' @import GenomeInfoDb
#' @export
#'
#' @examples
#' bamfile <- system.file("extdata", "ex1.bam", package = "Rsamtools")
#' fragments <- importPEBamFiles(
#' bamfile,
#' shift_ATAC_fragments = TRUE
#' )
#' fragments
importPEBamFiles <- function(
files,
genome = NULL,
where = NULL,
max_insert_size = 1000,
shift_ATAC_fragments = FALSE,
cores = 10,
verbose = TRUE
)
{
if (any(!file.exists(files))) stop('
(Some) files are missing. Aborting.'
)
if (any(!file.exists(paste0(files, '.bai')))) {
message('
Bam index not found.
Proceeding to create one with samtools index.
This will fail if samtools is not installed.'
)
toIndex <- files[!file.exists(paste0(files, '.bai'))]
lapply(toIndex, function(file) {
system(sprintf('samtools index -@ %i %s', cores, file))
})
}
list.bam <- parallel::mclapply(files, function(FILE) {
# Import reads (inspired from plot2DO function)
bam <- Rsamtools::BamFile(
FILE, yieldSize = 500000000, asMates = TRUE
)
if (is.null(where)) {
where <- Rsamtools::scanBamHeader(
bam, what = c("targets")
)$targets
where <- GenomicRanges::GRanges(
seqnames = names(where), IRanges::IRanges(1, where)
)
}
else {
where <- GenomicRanges::reduce(
IRanges::resize(
where,
width = IRanges::width(where) + 1000,
fix = 'center'
)
)
}
params <- Rsamtools::ScanBamParam(
which = where,
what = c("qname", "mapq", "isize", "flag"),
flag = Rsamtools::scanBamFlag(
isPaired = TRUE,
isProperPair = TRUE,
isSecondaryAlignment = FALSE,
isUnmappedQuery = FALSE,
isNotPassingQualityControls = FALSE,
isSupplementaryAlignment = FALSE
)
)
if (verbose) message('> Importing ', FILE, ' ...')
a <- GenomicAlignments::readGAlignmentPairs(bam, param = params)
if (verbose) message('> Filtering ', FILE, ' ...')
g <- GenomicRanges::GRanges(a)
# Filter by insert size
if (max_insert_size > 0 & !is.null(max_insert_size))
g <- g[IRanges::width(g) <= max_insert_size]
# Shift ATAC fragments
if (shift_ATAC_fragments) {
if (verbose) message('> Shifting ', FILE, ' ...')
g <- shiftATACGranges(g)
}
# Add genome infos
if (!is.null(genome)) {
x <- GenomeInfoDb::keepStandardChromosomes(
GenomeInfoDb::Seqinfo(genome = genome),
pruning.mode = 'coarse'
)
g2 <- GenomeInfoDb::keepStandardChromosomes(
g, pruning.mode = 'coarse'
)
g2 <- sort(g2)
GenomeInfoDb::seqlevels(g2) <- GenomeInfoDb::seqlevels(x)
GenomeInfoDb::seqinfo(g2) <- x
g <- sort(g2)
}
# Return GRanges
if (verbose) message('> ', FILE, ' import completed.')
return(g)
}, mc.cores = length(files))
if (length(files) == 1) list.bam <- list.bam[[1]]
return(list.bam)
}
#' A function to shift GRanges fragments by 5/-4. This is useful
#' when dealing with fragments coming from ATAC-seq.
#'
#' @param g GRanges of ATAC-seq fragments
#' @param pos_shift Integer. How many bases should fragments on
#' direct strand be shifted by?
#' @param neg_shift Integer. How many bases should fragments on
#' negative strand be shifted by?
#' @return A GRanges object containing fragments from the input
#' .bam file.
#'
#' @import parallel
#' @import Rsamtools
#' @import GenomicRanges
#' @import IRanges
#' @export
#'
#' @examples
#' data(bam_test)
#' shiftATACGranges(bam_test)
shiftATACGranges <- function(g, pos_shift = 4, neg_shift = 5) {
if (any(GenomicRanges::strand(g) == '*')) stop(
'Error: the input GRanges has some unstranded fragments.
Please read bam files using
importPEBamFiles with shift_ATAC_fragments = TRUE.'
)
w <- GenomicRanges::width(g)
g_shifted <- GenomicRanges::GRanges(
GenomicRanges::seqnames(g),
IRanges::IRanges(
GenomicRanges::start(g) + pos_shift,
width = w - (pos_shift + neg_shift)
),
strand = GenomicRanges::strand(g)
)
g_shifted <- GenomicRanges::shift(
g_shifted, ifelse(GenomicRanges::strand(g_shifted) == '+', 1, 0)
)
return(g_shifted)
}