/
bamSeqOverlap
executable file
·105 lines (100 loc) · 2.99 KB
/
bamSeqOverlap
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env ruby
require 'rubygems'
require 'optimist'
require 'csv'
ARGV.push("--help") if ARGV.empty?
opts = Optimist::options do
banner File.basename($0)
opt :files, "bam file(s)", :type=>:strings, :required=>true
opt :beds, "bed file(s)", :type=>:strings, :required=>true
end
ranges = Hash.new
opts.beds.each do |bed|
bname = File.basename(bed, ".bed")
ranges[bname] = Hash.new
File.new(bed).each do |line|
chrom, start, stop, name, score, strand = line.chomp.split("\t")
ranges[bname][chrom] = [] if !ranges[chrom]
ranges[bname][chrom].push([start.to_i, stop.to_i, strand, name])
end
end
outputs = Hash.new
counts = Hash.new
opts.files.each do |bam|
counts[bam] = Hash.new
hits = Hash.new
ranges.keys.each do |bed|
outputs[bed] = File.new(File.basename(bam, ".bam") + "_" + bed + ".fa", "w")
counts[bam][bed] = 0
hits[bed] = Hash.new
end
outputs["unclassified"] = File.new(File.basename(bam, ".bam") + "_unclassified.fa", "w")
counts[bam]["unclassified"] = 0
hits["unclassified"] = Hash.new
STDERR << "Processing " << bam << "...\n"
`samtools view #{bam}`.split("\n").each do |line|
name, flag, chrom, pos, mapq, cigar, rnext, pnext, tlen, seq, qual = line.chomp.split("\t")
printed = false
if flag.to_i & 16 == 0
mstrand = "+"
else
mstrand = "-"
end
pos = pos.to_i
ranges.keys.each do |bed|
if ranges[bed][chrom]
ranges[bed][chrom].each do |feature|
start, stop, strand, fname = feature
if pos >= start && pos <= stop
if strand == mstrand
matched = "forward"
else
matched = "reverse"
end
if !printed
loc = chrom + " " + pos.to_s + " " + matched + " " + fname
hits[bed][loc] = 0 if !hits[bed][loc]
hits[bed][loc] += 1
outputs[bed].printf(">%s %s\n%s\n", name, loc, seq)
counts[bam][bed] += 1
printed = true
end
end
end
end
end
if !printed
bed = "unclassified"
loc = chrom + " " + pos.to_s + " " + mstrand
hits[bed][loc] = 0 if !hits[bed][loc]
hits[bed][loc] += 1
outputs[bed].printf(">%s %s\n%s\n", name, loc, seq)
counts[bam][bed] += 1
printed = true
end
end
(ranges.keys + ["unclassified"]).each do |bed|
outputs[bed].close
out = File.new(File.basename(bam, ".bam") + "_hits_" + bed + ".txt", "w")
hits[bed].keys.each do |name|
out.printf("%s %d\n", name, hits[bed][name])
end
out.close
end
end
csv = File.new("counts.csv", "w")
csv.print ([""]+counts.keys.collect{|x|File.basename(x,".bam")}).to_csv
(ranges.keys + ["unclassified"]).each do |key|
row = [key]
counts.keys.each do |bam|
sum = counts[bam].values.reduce(:+)
row.push((100000*counts[bam][key]/sum)/1000.0)
end
csv.print row.to_csv
end
row = ["reads"]
counts.keys.each do |bam|
row.push(counts[bam].values.reduce(:+))
end
csv.print row.to_csv
csv.close