Fix writing of Number=A and G INFO/FORMAT fields #53

merged 4 commits into from

2 participants


Currently, the Reader stores field Numbers of A and G as negative integers, and the Writer doesn't convert them back to letters.

To make it easier to reverse the cast to integer, I changed the conversion method. Instead of converting with a hard-coded if/elif chain, I've added a module-level dictionary that serves both the conversion and the reverse lookup. I'm using if/else because I've read that it's faster than try/except when the check is expected to fail more than half the time, and I put the `if not` branch first because it's the more likely case.

For writing the header fields, I switched to str.format() (instead of C-style % formatting) and added a method that uses the conversion dictionary to switch the integers back to the correct letters.

In the third commit I removed the `'.' if None` `str()` conversion that was `_map`ped over each header line. The dictionary handles None to '.' for the Number field. As far as I can tell from the spec, only Number is allowed to have a value of '.' -- if this is incorrect, this commit can be skipped.


Added another commit to write a None alt as '.'

The current logic is `str(x) or '.'`, but `str(None)` evaluates to `'None'`, which is truthy, so the `'.'` fallback is never used.

@jamescasbon jamescasbon merged commit 880ce55 into jamescasbon:master
46 vcf/
@@ -43,6 +43,13 @@
+# Conversion between value in file and Python value
+field_counts = {
+ '.': None, # Unknown number of values
+ 'A': -1, # Equal to the number of alleles in a given record
+ 'G': -2, # Equal to the number of genotypes in a given record
_Info = collections.namedtuple('Info', ['id', 'num', 'type', 'desc'])
_Filter = collections.namedtuple('Filter', ['id', 'desc'])
@@ -162,18 +169,12 @@ def __init__(self):
self.meta_pattern = re.compile(r'''##(?P<key>.+?)=(?P<val>.+)''')
def vcf_field_count(self, num_str):
- if num_str == '.':
- # Unknown number of values
- return None
- elif num_str == 'A':
- # Equal to the number of alleles in a given record
- return -1
- elif num_str == 'G':
- # Equal to the number of genotypes in a given record
- return -2
- else:
+ """Cast vcf header numbers to integer or None"""
+ if num_str not in field_counts:
# Fixed, specified number
return int(num_str)
+ else:
+ return field_counts[num_str]
def read_info(self, info_string):
'''Read a meta-information INFO line.'''
@@ -975,20 +976,26 @@ class Writer(object):
+ # Reverse keys and values in header field count dictionary
+ counts = dict((v,k) for k,v in field_counts.iteritems())
def __init__(self, stream, template):
self.writer = csv.writer(stream, delimiter="\t")
self.template = template
+ two = '##{key}=<ID={0},Description="{1}">\n'
+ four = '##{key}=<ID={0},Number={num},Type={2},Description="{3}">\n'
+ _num = self._fix_field_count
for line in template.metadata.iteritems():
- stream.write('##%s=%s\n' % line)
+ stream.write('##{0}={1}\n'.format(*line))
for line in template.infos.itervalues():
- stream.write('##INFO=<ID=%s,Number=%s,Type=%s,Description="%s">\n' % tuple(self._map(str, line)))
+ stream.write(four.format(key="INFO", *line, num=_num(line.num)))
for line in template.formats.itervalues():
- stream.write('##FORMAT=<ID=%s,Number=%s,Type=%s,Description="%s">\n' % tuple(self._map(str, line)))
+ stream.write(four.format(key="FORMAT", *line, num=_num(line.num)))
for line in template.filters.itervalues():
- stream.write('##FILTER=<ID=%s,Description="%s">\n' % tuple(self._map(str, line)))
+ stream.write(two.format(key="FILTER", *line))
for line in template.alts.itervalues():
- stream.write('##ALT=<ID=%s,Description="%s">\n' % tuple(self._map(str, line)))
+ stream.write(two.format(key="ALT", *line))
@@ -1006,8 +1013,15 @@ def write_record(self, record):
for sample in record.samples]
self.writer.writerow(ffs + samples)
+ def _fix_field_count(self, num_str):
+ """Restore header number to original state"""
+ if num_str not in self.counts:
+ return num_str
+ else:
+ return self.counts[num_str]
def _format_alt(self, alt):
- return ','.join([str(x) or '.' for x in alt])
+ return ','.join(self._map(str, alt))
def _format_info(self, info):
if not info:
