Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add preserve_order option to toggle between using an OrderedDict or normal dict #127

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,7 +1,8 @@
PyVCF.egg-info
build
dist
*.pyc
*.sw?
*.py?
docs/_build
.ropeproject
1kg.prof
Expand Down
24 changes: 16 additions & 8 deletions vcf/parser.py
Expand Up @@ -72,7 +72,8 @@

class _vcf_metadata_parser(object):
'''Parse the metadata in the header of a VCF file.'''
def __init__(self):
def __init__(self, dict_type):
self.dict_type = dict_type
super(_vcf_metadata_parser, self).__init__()
self.info_pattern = re.compile(r'''\#\#INFO=<
ID=(?P<id>[^,]+),
Expand Down Expand Up @@ -159,7 +160,7 @@ def read_format(self, format_string):
match.group('type'), match.group('desc'))

return (match.group('id'), form)

def read_contig(self, contig_string):
'''Read a meta-information contig line.'''
match = self.contig_pattern.match(contig_string)
Expand All @@ -179,7 +180,7 @@ def read_meta_hash(self, meta_string):
# Removing initial hash marks and final equal sign
key = items[0][2:-1]
# N.B., items can have quoted values, so cannot just split on comma
val = OrderedDict()
val = self.dict_type()
state = 0
k = ''
v = ''
Expand Down Expand Up @@ -223,7 +224,7 @@ class Reader(object):
""" Reader for a VCF v 4.0 file, an iterator returning ``_Record objects`` """

def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=False,
strict_whitespace=False):
strict_whitespace=False, preserve_order=True):
""" Create a new Reader for a VCF file.

You must specify either fsock (stream) or filename. Gzipped streams
Expand All @@ -235,9 +236,16 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals

'strict_whitespace=True' will split records on tabs only (as with VCF
spec) which allows you to parse files with spaces in the sample names.

'preserve_order=True' will use an OrderedDict instead of a regular
dict to preserve the order of the record's fields and INFO data.
Note, at large sizes there are performance implications to
preserving the order.
"""
super(Reader, self).__init__()

self.dict_type = OrderedDict if preserve_order else dict

if not (fsock or filename):
raise Exception('You must provide at least fsock or filename')

Expand Down Expand Up @@ -292,9 +300,9 @@ def _parse_metainfo(self):
The end user shouldn't have to use this. She can access the metainfo
directly with ``self.metadata``.'''
for attr in ('metadata', 'infos', 'filters', 'alts', 'contigs', 'formats'):
setattr(self, attr, OrderedDict())
setattr(self, attr, self.dict_type())

parser = _vcf_metadata_parser()
parser = _vcf_metadata_parser(self.dict_type)

line = self.reader.next()
while line.startswith('##'):
Expand All @@ -315,7 +323,7 @@ def _parse_metainfo(self):
elif line.startswith('##FORMAT'):
key, val = parser.read_format(line)
self.formats[key] = val

elif line.startswith('##contig'):
key, val = parser.read_contig(line)
self.contigs[key] = val
Expand Down Expand Up @@ -350,7 +358,7 @@ def _parse_info(self, info_str):
return {}

entries = info_str.split(';')
retdict = OrderedDict()
retdict = self.dict_type()

for entry in entries:
entry = entry.split('=')
Expand Down