jamescasbon · bruth · Mar 6, 2013 · Nov 14, 2013
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,8 @@
 PyVCF.egg-info
 build
 dist
-*.pyc
+*.sw?
+*.py?
 docs/_build
 .ropeproject
 1kg.prof

diff --git a/vcf/parser.py b/vcf/parser.py
@@ -72,7 +72,8 @@
 
 class _vcf_metadata_parser(object):
     '''Parse the metadat in the header of a VCF file.'''
-    def __init__(self):
+    def __init__(self, dict_type):
+        self.dict_type = dict_type
         super(_vcf_metadata_parser, self).__init__()
         self.info_pattern = re.compile(r'''\#\#INFO=<
             ID=(?P<id>[^,]+),
@@ -159,7 +160,7 @@ def read_format(self, format_string):
                        match.group('type'), match.group('desc'))
 
         return (match.group('id'), form)
-    
+
     def read_contig(self, contig_string):
         '''Read a meta-contigrmation INFO line.'''
         match = self.contig_pattern.match(contig_string)
@@ -179,7 +180,7 @@ def read_meta_hash(self, meta_string):
         # Removing initial hash marks and final equal sign
         key = items[0][2:-1]
         # N.B., items can have quoted values, so cannot just split on comma
-        val = OrderedDict()
+        val = self.dict_type()
         state = 0
         k = ''
         v = ''
@@ -223,7 +224,7 @@ class Reader(object):
     """ Reader for a VCF v 4.0 file, an iterator returning ``_Record objects`` """
 
     def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=False,
-                 strict_whitespace=False):
+                 strict_whitespace=False, preserve_order=True):
         """ Create a new Reader for a VCF file.
 
             You must specify either fsock (stream) or filename.  Gzipped streams
@@ -235,9 +236,16 @@ def __init__(self, fsock=None, filename=None, compressed=False, prepend_chr=Fals
 
             'strict_whitespace=True' will split records on tabs only (as with VCF
             spec) which allows you to parse files with spaces in the sample names.
+
+            'preserve_order=True' will use an OrderedDict instead of a regular
+            dict to preserve the order of the record's fields and INFO data.
+            Note that an OrderedDict can be noticeably slower than a regular
+            dict on large files; pass 'preserve_order=False' if order is not needed.
         """
         super(Reader, self).__init__()
 
+        self.dict_type = OrderedDict if preserve_order else dict
+
         if not (fsock or filename):
             raise Exception('You must provide at least fsock or filename')
 
@@ -292,9 +300,9 @@ def _parse_metainfo(self):
         The end user shouldn't have to use this.  She can access the metainfo
         directly with ``self.metadata``.'''
         for attr in ('metadata', 'infos', 'filters', 'alts', 'contigs', 'formats'):
-            setattr(self, attr, OrderedDict())
+            setattr(self, attr, self.dict_type())
 
-        parser = _vcf_metadata_parser()
+        parser = _vcf_metadata_parser(self.dict_type)
 
         line = self.reader.next()
         while line.startswith('##'):
@@ -315,7 +323,7 @@ def _parse_metainfo(self):
             elif line.startswith('##FORMAT'):
                 key, val = parser.read_format(line)
                 self.formats[key] = val
-            
+
             elif line.startswith('##contig'):
                 key, val = parser.read_contig(line)
                 self.contigs[key] = val
@@ -350,7 +358,7 @@ def _parse_info(self, info_str):
             return {}
 
         entries = info_str.split(';')
-        retdict = OrderedDict()
+        retdict = self.dict_type()
 
         for entry in entries:
             entry = entry.split('=')