schtats outputs spanning coverage
james committed Apr 29, 2014
1 parent 082d1dc commit 52d2155
Showing 6 changed files with 135 additions and 11 deletions.
23 changes: 23 additions & 0 deletions getSplitReads.py
@@ -0,0 +1,23 @@
#!/usr/bin/env python


##
#Attempts to print split reads from m4 file
##



import sys
from m4io import getAlignments, longestNonOverlapping
from nucio import recordToString

if not len(sys.argv) == 2:
sys.exit("getSplitReads.py in.m4")


inm4 = sys.argv[1]

for read,alignments in getAlignments(inm4, longestNonOverlapping):
    if len(alignments) > 1:
        astrings = map(recordToString, alignments)
        print "\n".join(astrings)
5 changes: 3 additions & 2 deletions graphCellStats.py
@@ -86,18 +86,19 @@ def getCountsFromLineArr(arr):
pp = PdfPages(p_arg_map["out"])

colors = cycle("bgrcmyk")
markers = "ooooooooxxxxxxxx++++++++"
markers = "ooooooooxxxxxxxx++++++++********ssssssssvvvvvvvv"

cellset = sorted(list(set(cellnames)))
cmap = dict(zip(cellset, zip(colors,markers)))


h = []
for cellgroup in cellset:
    groupdata = filter(lambda x : x[2] == cellgroup, data)
    (alld, dgreater, cells) = zip(*groupdata)
    h.append(plt.scatter(alld, dgreater, marker=cmap[cellgroup][1], c=cmap[cellgroup][0]))

plt.legend(h,cellset, loc='upper left', fontsize=8, scatterpoints=1)
plt.legend(h,cellset, loc='upper left', fontsize=4, scatterpoints=1)

if p_arg_map["counts"]:
plt.xlabel("Log (Total Number of Reads)")
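The longer marker string matters because each cell group's style comes from zip(colors, markers): colors is an endless cycle, so the number of styles available is capped by the length of the markers string. A quick illustrative check (not part of the commit):

from itertools import cycle

colors = cycle("bgrcmyk")
markers = "ooooooooxxxxxxxx++++++++********ssssssssvvvvvvvv"
pairs = zip(colors, markers)   # Python 2 zip stops at the shorter input: one pair per marker character
print len(pairs)               # 48, up from 24, so more cell groups can be plotted before styles run out
print pairs[:3]                # [('b', 'o'), ('g', 'o'), ('r', 'o')]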
12 changes: 10 additions & 2 deletions schtats
@@ -15,7 +15,7 @@ from nucio import fileIterator, lineItemIterator, FileOrStream
from nucio import openerFromExtension
from misc import iterApply
from stats import getBasicStats, extendedStatsToString, ExtendedStats
from stats import NStar, LBig, Hist
from stats import NStar, LBig, Hist, SpanningCoverage
from args import parseArgs, getHelpStr, argrange, argflag, CLArgument
from log import logger
from strutil import strAppend
@@ -33,6 +33,10 @@ argument_list = (["genome", "gnm", int, None,"Genome Size for calculating n50 (i
["nstar", "nstar", argrange, [10,25,50,75,90],("Calculate N* (ie N50,N75,N90 etc. "
"Can be a range start:end:increment or comma "
"separated list. Default: [10,25,50,75,90]")],
["spancov", "spancov", argrange, [], ("Calculate Spanning coverage. "
"For given increments retrieve the coverage of "
"reads that will span a region of that size. "
"Can be comma separated list or range start:end:increment.")],
["stdin", "stdin", int, -1, ("Read lengths from stdin rather "
"than from files, argument is the column from which to read "
"the lengths. 0-based. Ex. -stdin 1, for 2nd column of input")],
@@ -113,13 +117,17 @@ def calcStats(lengths):
bigs = map(lambda x: x(lengths),
LBig(p_arg_map["big"], genome_size))

#get spanning coverage
spanning_covs = map(lambda x : x(lengths),
SpanningCoverage(p_arg_map["spancov"], genome_size))

#get hist
bins = p_arg_map["hist"]
binsize = bins[1] - bins[0] if len(bins) > 1 else 0
hist = map(lambda x : x(lengths),
Hist(p_arg_map["hist"], binsize))

return ExtendedStats(basic,nstar,bigs,hist,genome_size)
return ExtendedStats(basic,nstar,bigs,hist,spanning_covs,genome_size)

all_file_lengths = []
for seqfile in seqfiles:
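Given how the spancov entry is registered above, and judging from the "-stdin 1" example in the existing help text, the new option would be invoked roughly as follows; the flag spelling and the input filename are inferred for illustration, not documented in this commit:

# spanning coverage at 1 kb, 5 kb and 10 kb; -gnm supplies the genome size used for the coverage column
schtats -gnm 3000000000 -spancov 1000,5000,10000 reads.fasta
# increments can also be given as a range start:end:increment
schtats -gnm 3000000000 -spancov 1000:10000:1000 reads.fasta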
43 changes: 39 additions & 4 deletions stats.py
@@ -1,7 +1,8 @@

from itertools import imap
from collections import namedtuple
from operator import add, attrgetter
from operator import add, attrgetter, lt
from functools import partial
from math import sqrt, ceil
from string import ljust
from strutil import strAppend
@@ -18,6 +19,11 @@
'bases',
'coverage'])

SpanCov = namedtuple('SpanCov', ['increment',
'count',
'bases',
'coverage'])

HistBin = namedtuple('HistBin', ['bin',
'count'])

@@ -26,8 +32,24 @@
'nstar',
'bigs',
'hist',
'spancovs',
'genome_size'])

def SpanningCoverage(increments, genome_size=None):
    '''Calculates the coverage of reads
    that can cover an increment'''

    def _SC(inc):
        def _N(lens):
            cnt = 0
            bases_greater = 0
            for l in lens:
                if l > inc:
                    bases_greater += l-inc
                    cnt += 1
            cov = bases_greater / float(genome_size) if genome_size else None
            return SpanCov(inc, cnt, bases_greater, cov)
        return _N
    return map(_SC, increments)

def NStar(increments, genome_size):
    '''Returns a list of functions that will
@@ -86,6 +108,10 @@ def _H(lens):
return map(_Hist, increments)






def getBasicStats(lengths, genome_size = None):
'''get stats from a list of lengths
NOTE: lengths must be sorted in reverse
@@ -125,7 +151,16 @@ def nstarsToString(nstars):
return "\n".join(map(lambda x: s.format(**dict(x._asdict())),
nstars))

def spancovsToString(spancovs):

    def spancovformat(spancov):
        s = "#>{increment}={count} extra_bases>{increment}={bases} {cov}"
        covstr = "cov={0:.2f}".format(spancov.coverage) if spancov.coverage else ""
        d = dict(spancov._asdict().items() + [('cov',covstr)])
        return s.format(**d)

    return "Spanning Coverage:\n" + "\n".join(map(spancovformat, spancovs))

def bigsToString(bigs):
'''List of bigs to make into a string'''

@@ -165,8 +200,8 @@ def extendedStatsToString(stats):
fmt_strs += ["Assumed Genome Size: %d " % stats.genome_size]

fmt_strs += map( lambda func, data : func(data),
[basicStatsToString, nstarsToString, histToVertString, bigsToString],
[stats.basic, stats.nstar, stats.hist, stats.bigs])
[basicStatsToString, nstarsToString, histToVertString, bigsToString, spancovsToString],
[stats.basic, stats.nstar, stats.hist, stats.bigs, stats.spancovs])

#remove any empty ones
fmt_strs = filter(bool, fmt_strs)
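To make the new statistic concrete: for an increment I, SpanningCoverage counts the reads longer than I and sums the bases by which they exceed I, then divides those bases by the genome size. A small worked example with made-up numbers (not from the commit):

lengths = [500, 1500, 2500, 4000]    # hypothetical read lengths
genome_size = 10000                  # hypothetical genome size
inc = 1000
spanning = [l - inc for l in lengths if l > inc]   # [500, 1500, 3000]
count = len(spanning)                              # 3 reads are long enough to span a 1 kb region
bases = sum(spanning)                              # 5000 bases of spanning sequence
coverage = bases / float(genome_size)              # 0.5x spanning coverage for that increment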
48 changes: 48 additions & 0 deletions zmwProductivityHeatmap.py
@@ -0,0 +1,48 @@
#!/usr/bin/env python

import sys

from operator import itemgetter,attrgetter
from itertools import imap, starmap, repeat,izip
from pbcore.io import BasH5Reader
from collections import Counter

import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages


if not len(sys.argv) == 2:
sys.exit("zmwProductivityHeatmap.py input.bas.h5\n")

infile = sys.argv[1]
cell = BasH5Reader(infile)

get_prod = lambda o : getattr(o, "zmwMetric")("Productivity")

zmwgetters = map(itemgetter, cell.allSequencingZmws)
all_seq_zmws = list(starmap(apply,zip(zmwgetters, repeat([cell]))))
zmw_prods = map(get_prod, all_seq_zmws)
print Counter(zmw_prods)

print filter( lambda (prod,l): prod == 2 , zip( zmw_prods, map(lambda z: len(z.read()), all_seq_zmws)))

#print zmw_prods[:20]
xy = map(attrgetter("holeXY"), all_seq_zmws)
xyl = map(list,xy)
print "length:" + str(len(xyl))
colors="rgy"

(x,y) = zip(*xyl)

pp = PdfPages(infile.split(".")[0] + ".pdf")
colormap = map(lambda c: colors[c], zmw_prods)
plt.scatter(x, y, marker='o', s=3,lw=0, c=colormap, edgecolor=colormap)
#plt.legend(["0","1","2"])

plt.savefig(pp, format="pdf")

pp.close()


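The heatmap script takes a single bas.h5 file and writes the PDF next to it, naming it from the part of the filename before the first dot. A usage sketch (the filename is a placeholder):

./zmwProductivityHeatmap.py mycell.bas.h5    # writes mycell.pdf, one point per sequencing ZMW, colored r/g/y for productivity 0/1/2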
15 changes: 12 additions & 3 deletions zmwstats.py
@@ -3,8 +3,8 @@
import sys

from functools import partial
from itertools import imap,starmap,repeat,izip,chain
from operator import itemgetter
from itertools import imap,starmap,repeat,izip,chain, ifilter
from operator import itemgetter, attrgetter, lt
from collections import Counter

from pbcore.io import BasH5Reader
@@ -26,14 +26,22 @@
print "\t".join(["movie_name","sequencing_zmws","all_sequencing_zmws",
"prod_empty", "prod_productive", "prod_other",
"Empty", "FullHqRead0", "FullHqRead1", "PartialHqRead0",
"PartialHqRead1", "PartialHqRead2", "Multiload", "Indeterminate"])
"PartialHqRead1", "PartialHqRead2", "Multiload", "Indeterminate", "Total_Bases",
"Bases_>10k"])
for cell in readers:
    movieName = cell.movieName
    good_zmws_cnt = len(cell.sequencingZmws)
    all_seq_zmws_cnt = len(cell.allSequencingZmws)

    zmwgetters = imap(itemgetter,cell.allSequencingZmws)
    allSeqZmws = list(starmap(apply, izip(zmwgetters, repeat([cell]))))

    #all subreads
    subreads = ifilter(bool,imap(attrgetter("subreads"), allSeqZmws))
    subread_lens = map(lambda r: r.readEnd - r.readStart, chain.from_iterable(subreads))
    total_bases = sum(subread_lens)
    bases_g10k = sum(ifilter(partial(lt,10000), subread_lens))

    raw_prods = imap(get_prod, allSeqZmws)

    prod_counts = Counter(raw_prods)
@@ -46,6 +54,7 @@
    outdata = [movieName, good_zmws_cnt ,all_seq_zmws_cnt]
    outdata += list(prod_summary)
    outdata += list(read_type_summary)
    outdata += [total_bases, bases_g10k]
    print "\t".join(map(str, outdata))
    cell.close()

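The new Bases_>10k column depends on partial(lt, 10000) building a predicate equivalent to lambda x: 10000 < x, so only subread lengths above 10 kb survive the ifilter and get summed. A quick illustration with made-up lengths:

from operator import lt
from functools import partial
from itertools import ifilter   # Python 2, as in the script

subread_lens = [2000, 12000, 8000, 15000]                     # hypothetical subread lengths
total_bases = sum(subread_lens)                               # 37000 -> Total_Bases
bases_g10k = sum(ifilter(partial(lt, 10000), subread_lens))   # 12000 + 15000 = 27000 -> Bases_>10k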
