forked from lpryszcz/bin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bam2vcf.py
executable file
·169 lines (148 loc) · 6.27 KB
/
bam2vcf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/env python
"""
Identify SNP sites in mpileup out from BAM alignments.
In addition calculates overall and each contig coverage.
At first run: /users/tg/lpryszcz/cluster/assembly/mpileup.sh (s=MCO456; samtools mpileup -f gem/$s.fa gem/${s}.bam > gem/${s}.mpileup)
Execute:
python ~/workspace/assembly/src/heterozygosity.py -i gem/CBS6318.mpileup [-f minFreq -d minDepth -o out_base_name]
or use with piping:
s=CBS1954; samtools mpileup -f gem/$s.fa gem/$s.bam | python ~/workspace/assembly/src/heterozygosity.py -o gem/$s -f 0.2 -d 10
Note, it skips Ns from ref sequence.
"""
import os, sys
from optparse import OptionParser
from datetime import datetime
import subprocess
def _remove_indels( alts ):
"""
Remove indels from mpileup.
.$....,,,,....,.,,..,,.,.,,,,,,,....,.,...,.,.,....,,,........,.A.,...,,......^0.^+.^$.^0.^8.^F.^].^],
........,.-25ATCTGGTGGTTGGGATGTTGCCGCT..
"""
#remove indels info
for symbol in ('-','+'):
baseNo = 0
while symbol in alts:
i=alts.index(symbol)
j = 1
digits=[]
while alts[i+j].isdigit():
digits.append( alts[i+j] )
j += 1
if digits:
baseNo=int( ''.join(digits) )
alts=alts[:i]+alts[i+baseNo+len(digits)+1:] #......+1A..,
return alts
def get_alt_allele( base_ref,cov,alg,minFreq,alphabet,reference,bothStrands ):
"""Return alternative allele only if different than ref and freq >= minFreq.
"""
#remove deletions
alts = alg
dels = alts.count('*')
#remove insertions
alts = _remove_indels( alts )
#get base counts
baseCounts = [ ( alts.upper().count(base),base ) for base in alphabet ]
#get base frequencies
for base_count,base in sorted( baseCounts ):
freq = base_count*1.0/cov
if base!=base_ref and freq >= minFreq:
#check if alt base in both strands
if bothStrands:
if not base.upper() in alts or not base.lower() in alts:
return
return (base,freq) # base!=base_ref and
def parse_mpileup( fnames,fastaFn,minDepth,minFreq,indels,reference,bothStrands,verbose,alphabet='ACGT' ):
"""
"""
# open out files and write header
#header = '#coordinate\treference coverage\tref base\tref freq\talt coverage\talt base\talt freq\n'
header = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tref_cov:freq\alt_cov:freq\n'
i = 0
outFiles=[]
for fn in fnames:
i += 1
if i==1 and reference:
continue
out = open( "%s.snps.cov_%s.freq_%s.bothStrands_%s.vcf" % ( fn,minDepth,minFreq,bothStrands ),"w" )
outFiles.append( out )
out.write( header )
#process mpileup
contigs=[]
totCov={}; totLen={}; pContig=pPos=0
#open subprocess
args = [ 'samtools','mpileup' ] #; print args
if fastaFn:
args += [ "-f", fastaFn ]
proc = subprocess.Popen( args+ fnames,stdout=subprocess.PIPE,bufsize=2048 )
for line in proc.stdout:
line = line.strip()
lineTuple = line.split('\t')
#get coordinate
refCov = refFreq = ''
contig,pos,baseRef = lineTuple[:3]
if baseRef=="N":
continue
samplesData = lineTuple[3:]
#laod ref data
if reference:
refCov,refAlgs,refQuals = lineTuple[3:6]
refCov = int(refCov)
samplesData = lineTuple[6:]
if refCov<minDepth:
continue
alt_allele = get_alt_allele( '',refCov,refAlgs,minFreq,alphabet,reference,bothStrands )
if not alt_allele:
continue
baseRef,refFreq = alt_allele
#,cov,alg,quals= #; contig,pos,base,cov,alg,quals
for out,cov,alg,quals in zip( outFiles,samplesData[0::3],samplesData[1::3],samplesData[2::3] ):
cov=int(cov)
if cov<minDepth:
continue
# check for SNP
alt_allele = get_alt_allele( baseRef,cov,alg,minFreq,alphabet,reference,bothStrands )
if not alt_allele:
continue
# get base and freq
base,freq = alt_allele
#lineOut='%s:%s\t%s\t%s\t%1.4f\t%s\t%s\t%1.4f\n' % ( contig,pos,refCov,baseRef,refFreq,cov,base,freq )
lineOut='%s\t%s\t.\t%s\t%s\t255\t.\t%s:%s\t%s:%s\n' % (contig,pos,baseRef,base,refCov,refFreq,cov,freq)
out.write( lineOut )
if verbose:
print lineOut,
for out in outFiles: #close outfile, if opened
out.close()
def main():
usage = "usage: %prog [options] ref.bam *.bam"
parser = OptionParser( usage=usage,version="%prog 1.0" ) #allow_interspersed_args=True
parser.add_option("-i", dest="fasta", help="fasta [required only if no reference bam]")
parser.add_option("-d", dest="minDepth", default=5, type=int,
help="minimal depth [%default]")
parser.add_option("-f", dest="minFreq", default=0.8, type=float,
help="min frequency of alternative base [%default]")
parser.add_option("-n", dest="indels", default=False, action="store_true",
help="report indels [%default]")
parser.add_option("-b", dest="bothStrands", default=False, action="store_true",
help="only SNPs confirmed by both strand algs [%default]")
parser.add_option("-r", dest="reference", default=True, action="store_false",
help="first bam is reference algs [%default]")
parser.add_option("-v", dest="verbose", default=False, action="store_true" )
( o, fnames ) = parser.parse_args()
if o.verbose:
print "Options: %s\nFiles: %s" % ( o,fnames )
if not fnames:
parser.error( "Provide at least one bam file!" )
for fn in fnames:
if not os.path.isfile( fn ):
parser.error( "No such file: %s" % fn )
if o.fasta:
if not os.path.isfile( o.fasta ):
parser.error( "No such file: %s" % o.fasta )
#parse pileup
parse_mpileup( fnames,o.fasta,o.minDepth,o.minFreq,o.indels,o.reference,o.bothStrands,o.verbose )
if __name__=='__main__':
t0=datetime.now()
main()
dt=datetime.now()-t0
sys.stderr.write( "#Time elapsed: %s\n" % dt )