Skip to content
Browse files

Add strict whitespace option to allow for well-formed VCFs with spaces in sample names.
  • Loading branch information...
1 parent fb835a2 commit b6c085b74ce5c2acd6a785452e6f0f9062b1789d @casbon casbon committed
Showing with 14 additions and 2 deletions.
  1. +14 −2 vcf/parser.py
View
16 vcf/parser.py
@@ -172,12 +172,19 @@ def read_meta(self, meta_string):
class Reader(object):
""" Reader for a VCF v 4.0 file, an iterator returning ``_Record objects`` """
- def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=False):
+ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=False,
+ strict_whitespace=False):
""" Create a new Reader for a VCF file.
You must specify either fsock (stream) or filename. Gzipped streams
or files are attempted to be recognized by the file extension, or gzipped
can be forced with ``compressed=True``
+
+ 'prepend_chr=True' will put 'chr' before all the CHROM values, useful
+ for different sources.
+
+ 'strict_whitespace=True' will split records on tabs only (as required by
+ the VCF spec), which allows you to parse files with spaces in the sample names.
"""
super(Reader, self).__init__()
@@ -218,6 +225,11 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals
self._parse_metainfo()
self._format_cache = {}
+ if strict_whitespace:
+ self._separator = '\t'
+ else:
+ self._separator = '\t| +'
+
def __iter__(self):
return self
@@ -437,7 +449,7 @@ def _parse_alt(self, str):
def next(self):
'''Return the next record in the file.'''
line = self.reader.next()
- row = re.split('\t+', line)
+ row = re.split(self._separator, line)
chrom = row[0]
if self._prepend_chr:
chrom = 'chr' + chrom

0 comments on commit b6c085b

Please sign in to comment.
Something went wrong with that request. Please try again.