Permalink
Browse files

Add cython code #25

  • Loading branch information...
1 parent 2118748 commit 618988f083fa5badf097a447b25156b7969df886 James Casbon committed Jun 27, 2012
Showing with 97 additions and 4 deletions.
  1. +6 −1 setup.py
  2. +75 −0 vcf/cparse.pyx
  3. +14 −1 vcf/parser.py
  4. +2 −2 vcf/test/prof.py
View
@@ -1,4 +1,7 @@
from setuptools import setup
+from distutils.core import setup
+from distutils.extension import Extension
+from Cython.Distutils import build_ext
requires = []
@@ -66,5 +69,7 @@
include_package_data=True,
package_data = {
'': ['*.vcf', '*.gz', '*.tbi'],
- }
+ },
+ cmdclass = {'build_ext': build_ext},
+ ext_modules = [Extension("vcf.cparse", ["vcf/cparse.pyx"])]
)
View
@@ -0,0 +1,75 @@
+from model import _Call
+
cdef _map(func, iterable, bad='.'):
    '''Convert each item with ``func``, turning ``bad`` items into None.

    Works like ``map`` but always returns a list. An item that compares
    equal to ``bad`` (the '.' placeholder by default) is never passed to
    ``func``; None is stored in its position instead.
    '''
    converted = []
    for item in iterable:
        if item != bad:
            converted.append(func(item))
        else:
            converted.append(None)
    return converted
+
# Type names that parse_samples compares against the entries of
# ``samp_fmt_types``.  They are held as Python string objects rather than
# ``char *``: Cython compares two ``char *`` values by pointer address, not
# by content, so C-string constants would never compare equal to a type name
# that was read out of a Python list.
INTEGER = 'Integer'
FLOAT = 'Float'
NUMERIC = 'Numeric'
+
def parse_samples(
        list names, list samples, list samp_fmt,
        list samp_fmt_types, list samp_fmt_nums, site):
    '''Parse the sample columns of one record into a list of ``_Call``s.

    This is the Cython counterpart of the parser's pure-Python
    ``_parse_samples`` method; the two must be kept equivalent.

    Arguments:
        names -- sample names, indexed in parallel with ``samples``
        samples -- raw sample strings; fields are separated by ':'
        samp_fmt -- field keys, one per ':'-separated position
        samp_fmt_types -- type name per field ('Integer', 'Float',
            'Numeric'; anything else is kept as a string)
        samp_fmt_nums -- declared number of values per field
        site -- passed through unchanged as the first ``_Call`` argument

    Returns a list with one ``_Call(site, name, data)`` per sample, where
    ``data`` maps every key in ``samp_fmt`` to its parsed value.  Fields
    that are '.', './.', or missing from the end of the sample string map
    to None.

    ``int()`` / ``float()`` raise ValueError on values that do not match
    their declared type.
    '''
    # name, fmt, entry_type and sample are deliberately left as Python
    # objects instead of ``char *``.  Cython compares two ``char *`` values
    # by address, so a ``char *`` entry_type would never equal the
    # INTEGER/FLOAT/NUMERIC constants; with a Python object on the left the
    # comparison is by value.
    cdef Py_ssize_t i, j, n_samples, n_formats, n_fields
    cdef list samp_data = []
    cdef dict sampdict
    cdef list sampvals

    n_samples = len(samples)
    n_formats = len(samp_fmt)

    for i in range(n_samples):
        name = names[i]
        sample = samples[i]

        # Start with every key set to None so that trailing fields omitted
        # from the sample string are still present in the result.
        sampdict = dict.fromkeys(samp_fmt)

        sampvals = sample.split(':')

        # A sample may carry fewer fields than the format declares.
        n_fields = min(n_formats, len(sampvals))

        for j in range(n_fields):
            fmt = samp_fmt[j]
            vals = sampvals[j]
            entry_type = samp_fmt_types[j]
            # TODO: entry_num is None for unbounded lists
            entry_num = samp_fmt_nums[j]

            # short circuit the most common
            if vals == '.' or vals == './.':
                sampdict[fmt] = None
                continue

            # we don't need to split single entries
            if entry_num == 1 or ',' not in vals:
                if entry_type == INTEGER:
                    value = int(vals)
                elif entry_type == FLOAT or entry_type == NUMERIC:
                    value = float(vals)
                else:
                    value = vals

                # NOTE: the previous code re-assigned ``(sampdict[fmt])``
                # when entry_num != 1.  Parentheses without a trailing
                # comma do not build a tuple, so that was a no-op; the
                # value is stored as a scalar here so results are unchanged.
                sampdict[fmt] = value
                continue

            parts = vals.split(',')

            if entry_type == INTEGER:
                sampdict[fmt] = _map(int, parts)
            elif entry_type == FLOAT or entry_type == NUMERIC:
                sampdict[fmt] = _map(float, parts)
            else:
                sampdict[fmt] = parts

        # create a call object
        samp_data.append(_Call(site, name, sampdict))

    return samp_data
View
@@ -16,6 +16,11 @@
except ImportError:
pysam = None
+try:
+ import cparse
+except ImportError:
+ cparse = None
+
# Metadata parsers/constants
RESERVED_INFO = {
@@ -846,7 +851,11 @@ def _parse_sample_format(self, samp_fmt):
def _parse_samples(self, samples, samp_fmt, site):
'''Parse a sample entry according to the format specified in the FORMAT
- column.'''
+ column.
+
+ NOTE: this method has a cython equivalent and care must be taken
+ to keep the two methods equivalent
+ '''
# check whether we already know how to parse this format
if samp_fmt in self._format_cache:
@@ -857,6 +866,10 @@ def _parse_samples(self, samples, samp_fmt, site):
self._format_cache[samp_fmt] = (sf, samp_fmt_types, samp_fmt_nums)
samp_fmt = sf
+ if cparse:
+ return cparse.parse_samples(
+ self.samples, samples, samp_fmt, samp_fmt_types, samp_fmt_nums, site)
+
samp_data = []
_map = self._map
View
@@ -1,4 +1,4 @@
-import vcf
+import vcf as vcf
import cProfile
import timeit
import pstats
@@ -17,7 +17,7 @@ def parse_1kg():
p.strip_dirs().sort_stats('time').print_stats()
elif sys.argv[1] == 'time':
- n = 5
+ n = 1
t = timeit.timeit('parse_1kg()', "from __main__ import parse_1kg", number=n)
print t/n
else:

0 comments on commit 618988f

Please sign in to comment.