forked from lpryszcz/bin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bam2cov.py
executable file
·288 lines (268 loc) · 10.8 KB
/
bam2cov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/usr/bin/env python
desc="""Report coverage from BAM file.
Support spliced alignments and mapping quality filtering.
By default ignores secondary alignments, duplicates and quality failed reads.
bam2cov_pool.py is nearly 2x faster on large BAM files than bam2cov.py.
TDB:
- define minimum overlap
"""
epilog="""Author: l.p.pryszcz@gmail.com
Mizerow, 30/03/2015
"""
import os, sys, pysam, re, subprocess
from datetime import datetime
import numpy as np
cigarPat = re.compile('\d+\w')
def load_intervals(fn, verbose):
"""Return chr2intervals and number of entries"""
chr2intervals = {}
for i, rec in enumerate(open(fn)):
if rec.startswith('#') or not rec.strip():
continue
rdata = rec.split('\t')
score, strand = 1, "+"
# GTF / GFF
if fn.endswith(('gtf','gff')):
chrom, source, ftype, s, e, score, strand = rdata[:7]
s, e = int(s)-1, int(e)
# BED
else:
# unstranded intervals
if len(rdata)<6:
chrom, s, e = rdata[:3]
else:
chrom, s, e, name, score, strand = rdata[:6]
s, e = int(s), int(e)
if strand=="+":
strand = 0
else:
strand = 1
# add chromosome
if chrom not in chr2intervals:
chr2intervals[chrom] = []
# store interval
data = (s, e, strand, i)
chr2intervals[chrom].append(data)
# define numpy datatype
dtype = np.dtype({'names': ['start', 'end', 'strand', 'entry_id'], \
'formats': ['uint32', 'uint32', 'bool_', 'uint32']})
for chrom, data in chr2intervals.iteritems():
chr2intervals[chrom] = np.array(data, dtype=dtype)
return chr2intervals, i+1
def _filter(a, mapq=0):
"""Return True if poor quality alignment"""
if a.mapq<mapq or a.is_secondary or a.is_duplicate or a.is_qcfail:
return True
def buffer_intervals(c2i, ivals, rname, pos, aend, maxp, pref, bufferSize):
"""Return invervals buffer for faster selection"""
if rname != pref:
maxp = 0
if aend > maxp:
# get ref chrom
c = rname
s, e = pos, aend+bufferSize
# update intervals
if c in c2i:
# select intervals that either start, end or encompass current window/buffer
ivals = c2i[c][np.any([np.all([ c2i[c]['start']>=s, c2i[c]['start']<=e ], axis=0),
np.all([ c2i[c]['end'] >=s, c2i[c]['end'] <=e ], axis=0),
np.all([ c2i[c]['start']< s, c2i[c]['end'] > e ], axis=0)], axis=0)]
else:
ivals = []
#sys.stderr.write(" new buffer with %s intervals: %s:%s-%s\n"%(len(ivals),c,s,e))
# store current reference and max position
pref = rname
maxp = e
return ivals, maxp, pref
def count_overlapping_intervals(blocks, strands, ivals, counts, verbose=0):
"""Count overlapping intervals with given read alignment.
The algorithm support spliced alignments. """
# skip if not ivals
if not len(ivals):
return counts
## get intervals overlapping with given alignment blocks
# start overlapping with interval
d = [np.all([ (s+e)/2.>=ivals['start'], (s+e)/2.<=ivals['end'] ], axis=0) for s, e in blocks]
# end overlapping with interval
#d += [np.all([ e>=ivals['start'], e<=ivals['end'] ], axis=0) for s, e in blocks]
# interval inside read
#d += [np.all([ s< ivals['start'], e> ivals['end'] ], axis=0) for s, e in blocks]
# select intervals fulfilling any of above
selected = ivals[np.any(d, axis=0)]
# check if any matches, as sometimes empty cause problems
if selected.size:
# count -/+ reads
cminus = strands.count(True)
cplus = len(strands)-cminus
#cplus = strands.count(False)
# store info
for s, e, strand, ival in selected:
# - transcript on reverse
if strand:
counts[0][ival] += cminus
counts[1][ival] += cplus
# + transcript on forward
else:
counts[0][ival] += cplus
counts[1][ival] += cminus
return counts
def alignment_iterator(bam, mapq, verbose):
"""Iterate alignments from BAM"""
# open BAM
sam = pysam.AlignmentFile(bam)
# count alg quality ok
qok = 0
# keep info about previous read
pa, strands = 0, []
for i, a in enumerate(sam, 1):
#if i>1e5: break
#if i<84*1e5: continue
if verbose and not i%1e5:
sys.stderr.write(' %i algs; %i ok \r'%(i, qok))
# filter poor quality
if _filter(a, mapq):
continue
qok += 1
if not pa:
pa = a
continue
# check if similar to previous
if pa.pos==a.pos and pa.cigarstring==a.cigarstring:
strands.append(a.is_reverse)
else:
yield pa.blocks, strands, pa.pos, pa.aend, sam.references[pa.rname]
# store current entry
pa = a
strands = [a.is_reverse]
# info
if verbose:
sys.stderr.write(' %i alignments processed.\n'%i)
yield pa.blocks, strands, pa.pos, pa.aend, sam.references[pa.rname]
def cigar2blocks(pos, cigar):
"""Return alignment blocks and aend
Note, here cigar2block doesn't break block upon insertion (I), while pysam does."""
blocks = [[pos, pos]]
for c in cigarPat.findall(cigar):
bases, s = int(c[:-1]), c[-1]
'''if s in ('S','H','I'):
continue
# match'''
if s == 'M':
blocks[-1][-1] += bases
# intron
elif s in ("N","D"):
pos = blocks[-1][-1] + bases
blocks.append([pos, pos])
'''# insertion - skipped earlier
elif s =="I":
pos = blocks[-1][-1] # + bases
blocks.append([pos, pos])'''
#for i in range(len(blocks)): blocks[i] = tuple(blocks[i])
return blocks
def is_reverse(flag):
"""Return true if read aligned to reverse strand"""
if int(flag) & 16:
return True
def alignment_iterator_samtools(bam, mapq, verbose):
"""Iterate aligments from BAM using samtools view subprocess"""
# start samtools view subprocess
cmd0 = ["samtools", "view", "-q%s"%mapq, "-F3840", bam]
proc0 = subprocess.Popen(cmd0, bufsize=-1, stdout=subprocess.PIPE)
out0 = proc0.stdout
# start cut subprocess
cmd1 = ["cut", "-f2-4,6"]
proc1 = subprocess.Popen(cmd1, bufsize=-1, stdout=subprocess.PIPE, stdin=out0)
out1 = proc1.stdout
if verbose:
sys.stderr.write(" "+" | ".join([" ".join(cmd0), " ".join(cmd1)+"\n"]))
# keep info about previous read
strands = []
# iterate only ok alignments
for i, line in enumerate(out1, 1):
#if i>1e5: break
if verbose and not i%1e5:
sys.stderr.write(' %i \r'%i)
flag, rname, pos, cigar = line[:-1].split('\t')
pos = int(pos)
# get blocks, aend & strand
blocks = cigar2blocks(pos, cigar)
aend = blocks[-1][-1]
reverse = is_reverse(flag)
if not strands:
ppos, pcigar, paend, pblocks, prname = pos, cigar, aend, blocks, rname
strands = [reverse]
continue
# check if similar to previous
if ppos==pos and pcigar==cigar:
strands.append(reverse)
else:
yield pblocks, strands, ppos, paend, prname
# store current entry
ppos, pcigar, paend, pblocks, prname = pos, cigar, aend, blocks, rname
strands = [reverse]
# info
if verbose:
sys.stderr.write(' %i alignments processed.\n'%i)
yield pblocks, strands, ppos, paend, prname
def parse_bam(bam, mapq, c2i, entries, bufferSize, verbose):
"""Parse BAM and return counts for sense/antisense of each interval"""
#counts = (np.zeros(entries, dtype='uint32'), np.zeros(entries, dtype='uint32'))
counts = ([0]*entries, [0]*entries)
# keep info about intervals, max position and current reference
ivals, maxp, pref = [], 0, 0
for blocks, strands, pos, aend, rname in alignment_iterator_samtools(bam, mapq, verbose):
# update ivals
ivals, maxp, pref = buffer_intervals(c2i, ivals, rname, pos, aend, maxp, pref, bufferSize)
# add alignments
counts = count_overlapping_intervals(blocks, strands, ivals, counts, verbose)
return counts
def bam2cov(bam, bed, out=sys.stdout, mapq=0, bufferSize=1000000, verbose=1):
"""Calculate coverage for genome intervals."""
# load intervals
if verbose:
sys.stderr.write("Loading intervals...\n")
c2i, entries = load_intervals(bed, verbose)
if verbose:
sys.stderr.write(" %s intervals from %s chromosomes loaded!\n"%(entries, len(c2i)) )
# parse alignments & count interval overlaps
if verbose:
sys.stderr.write("Parsing alignments...\n")
counts = parse_bam(bam, mapq, c2i, entries, bufferSize, verbose)
if verbose:
sys.stderr.write(" sense / antisense alignments: %s / %s\n" % (sum(counts[0]), sum(counts[1])) )
# report
for sense, antisense, line in zip(counts[0], counts[1], open(bed)):
out.write("%s\t%s\t%s\n"%(line[:-1], sense, antisense))
def main():
import argparse
usage = "%(prog)s -v" #usage=usage,
parser = argparse.ArgumentParser(description=desc, epilog=epilog, \
formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('--version', action='version', version='1.0b')
parser.add_argument("-v", "--verbose", default=False, action="store_true",
help="verbose")
parser.add_argument("-i", "--bam", required=True,
help="BAM file")
parser.add_argument("-b", "--bed", required=True,
help="BED/GTF/GFF interval file")
parser.add_argument("-o", "--output", default=sys.stdout, type=argparse.FileType('w'),
help="output stream [stdout]")
parser.add_argument("-q", "--mapq", default=10, type=int,
help="min mapping quality for variants [%(default)s]")
parser.add_argument("--bufferSize", default=100000, type=int,
help="buffer size for intervals [%(default)s]")
o = parser.parse_args()
if o.verbose:
sys.stderr.write("Options: %s\n"%str(o))
# calculate coverage from bam for given intervals
bam2cov(o.bam, o.bed, o.output, o.mapq, o.bufferSize, o.verbose)
if __name__=='__main__':
t0 = datetime.now()
try:
main()
except KeyboardInterrupt:
sys.stderr.write("\nCtrl-C pressed! \n")
except IOError as e:
sys.stderr.write("I/O error({0}): {1}\n".format(e.errno, e.strerror))
dt = datetime.now()-t0
sys.stderr.write("#Time elapsed: %s\n"%dt)