Merge ca92473 into 3b15e55

openvax · Aug 1, 2017 · 4c9a98b · 4c9a98b
2 parents 3b15e55 + ca92473
commit 4c9a98b
Showing 1 changed file with 221 additions and 1 deletion.
diff --git a/varcode/vcf.py b/varcode/vcf.py
@@ -19,6 +19,7 @@
 import zlib
 import logging
 from collections import OrderedDict
+from collections import defaultdict
 from warnings import warn
 
 from six.moves import urllib
@@ -30,7 +31,6 @@
 from .variant import Variant, variant_ascending_position_sort_key
 from .variant_collection import VariantCollection
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -181,6 +181,226 @@ def sample_info_parser(unparsed_sample_info_strings, format_string):
             'sort_key': sort_key,
             'distinct': distinct})
 
+def variant_collection_to_vcf(variants):
+    """Output a VCF file whose records correspond to those in the given `VariantCollection`."""
+
+    # TODO: The variant metadata dictionary (in the `VariantCollection`)
+    # contains different data depending on the original input file format (VCF,
+    # MAF, CSV).  It's easy to output variants originally in VCF format, but we
+    # might want to consider how fields from MAF (for example) map to those in
+    # VCF.
+
+    # TODO: The VCF file we output doesn't contain any VCF metadata headers, as
+    # the original headers were thrown away when the VCF file was parsed. We
+    # may want to keep some of that information and/or infer some of the
+    # headers based on the variants themselves. The former is difficult because
+    # merge conflicts will inevitably occur; the latter is difficult because
+    # the variants themselves don't contain all the information required for
+    # these metadata headers (e.g., descriptions).
+
+    # TODO: If we end up needing more functions to "build" VCF record fields
+    # (see functions below), we may want to abstract away the individual
+    # functions and create a mapping from field to format function.
+
+    # TODO: Currently fails when there is > 1 source for the variant collection.
+    # Fix that. Once duplicates are merged, all is well, but we need to access
+    # metadata fields in order to merge them, which requires knowing the source.
+
+    def get_metadata_field(key, variant, default='.'):
+        """Retrieve field from variant metadata dictionary."""
+        val = variants.metadata[variant].get(key)
+        if val is None:
+            return default
+        return val
+
+    def build_filter_field(variant):
+        """Build the filter field from the given variant.
+
+        The `filter` field, as stored in the variants metadata dictionary,
+        comes in 3 flavors:
+
+            - empty list: 1+ filters were run and none failed
+            - non-empty list: 1+ filters were run and 1+ failed
+            - `None`: no filters were run
+
+        This function maps each of these internal representations to their
+        corresponding VCF representations.
+        """
+        filter_metadata = get_metadata_field('filter', variant)
+        if type(filter_metadata) == list:
+            return 'PASS' if filter_metadata == [] else ';'.join(filter_metadata)
+        else:
+            # TODO: Can the filter field ever be something other than the 3
+            # possibilities described in this function's doc comment?
+            bad_value_msg = (
+                'Filter metadata field took on unexpected value `{}`. Update '
+                'code to account for this value.').format(str(filter_metadata))
+            assert filter_metadata == '.', bad_value_msg
+            return filter_metadata
+
+    def build_info_field(variant):
+        """Build the info field from the given variant.
+
+        Format is `<key>=<val>,...;<key>=<val>,...;<key>=<val>,...`.
+        """
+
+        def build_info_pair(key, val):
+            """Build key/val pair for info field."""
+            # Note: Different from `val == True`, which returns True when `val == 1`.
+            if val is True:
+                return key
+
+            if type(val) == list:
+                val = ','.join(map(str, val))
+            else:
+                val = str(val)
+            return '{}={}'.format(key, val)
+
+        info_dict = get_metadata_field('info', variant, default={})
+        if not info_dict:
+            return '.'
+
+        return ';'.join(build_info_pair(k, v) for k,v in info_dict.iteritems())
+
+    def build_format_field(variant):
+        """Build the sample format string from the given variant.
+
+        Each sample column follows this format for the specified variant.
+        """
+        sample_info = get_metadata_field('sample_info', variant, default={})
+        return ':'.join(sample_info.values()[0].keys()) if sample_info else '.'
+
+    def build_sample_fields(variant):
+        """Build the sample fields for the given variant."""
+
+        def build_sample_field(sample):
+            """Build a specific sample's field (i.e., one sample column)."""
+            sample_vals = sample_info[sample].values()
+            return ':'.join(build_sample_val(val) for val in sample_vals)
+
+        def build_sample_val(sample_val):
+            """Build a specific value for a sample (i.e., one value for one column)."""
+            if type(sample_val) is list:
+                return ','.join(map(str, sample_val))
+            elif sample_val is not None:
+                return str(sample_val)
+            else:
+                return '.'
+
+        sample_info = get_metadata_field('sample_info', variant, default={})
+        return list(build_sample_field(sample) for sample in sample_info)
+
+    def build_vcf_record(variant, add_sample_info):
+        """Return a list of all the variant's VCF record fields."""
+        record = [
+            str(variant.original_contig),
+            str(variant.original_start),
+            get_metadata_field('id', variant),
+            variant.original_ref,
+            variant.original_alt,
+            str(get_metadata_field('qual', variant)),
+            build_filter_field(variant),
+            build_info_field(variant),
+        ]
+        if add_sample_info:
+            record.append(build_format_field(variant))
+            record.extend(build_sample_fields(variant))
+        return record
+
+    def merge_duplicate_variants(variants):
+        """Merge duplicate variants (according to ID) and return *list* of merged, sorted variants.
+
+        Multiple `Variant`s can have the same VCF id (e.g., those variants which which
+        originally had > 1 alternate base), but we can't output a VCF file with multiple
+        records having the same id. This function merges those duplicate variants.
+        """
+
+        def construct_id2variants():
+            """Construct dictionary which maps variant IDs to `Variant`s."""
+            id2variants = defaultdict(list)
+            for variant in variants:
+                # Note: For variants without IDs (e.g., those that came from
+                # MAF files), we assign the missing value.
+                variant_id = get_metadata_field('id', variant)
+                id2variants[variant_id].append(variant)
+            return id2variants
+
+        def merge_variant_list(duplicates):
+            """Merge duplicate variant list into one."""
+
+            # TODO: Currently assumes that only alternate bases differ, but we may
+            # want or need to merge variants that have the same ID but different
+            # contigs, positions, or reference bases.
+
+            assert len(duplicates) > 0
+            assert all(v.original_contig == duplicates[0].original_contig and
+                       v.original_start == duplicates[0].original_start and
+                       v.original_ref == duplicates[0].original_ref for v in duplicates)
+            import copy
+            merged = copy.copy(duplicates[0])
+            for variant in duplicates[1:]:
+                merged.original_alt += ',' + variant.original_alt
+            return merged
+
+        id2variants = construct_id2variants()
+        variants_no_id = id2variants.pop('.', [])  # don't want to merge variants w/ no id
+        merged_variants = map(merge_variant_list, id2variants.values()) + variants_no_id
+
+        # groups variants by contig; relative ordering of contigs doesn't matter
+        return sorted(merged_variants, key=lambda v: (v.original_contig, v.original_start))
+
+
+    def get_sample_names():
+        """Return the sample names for all variants."""
+
+        # TODO: For now, ensures that every variant has the same samples. If they didn't,
+        # we'd have to use the missing value for VCF records with no data on a given
+        # sample.  In and of itself, that isn't a problem until the format field contains
+        # GT (genotype), which is required for all samples if present in the format field.
+        #
+        # A couple of ways to handle this:
+        #
+        #   1) Ignore this requirement of VCF files. I'd assume this would still be
+        #   compatible with varcode, but may not be with other VCF parsers.
+        #   2) Remove the GT field from those records where this rule would be violated.
+        #   This is sad because we lose information.
+        #
+        # It's also important to note that if/when we combine all samples, we'll have to
+        # add a `.` (missing value) for those samples in which a variant has no data.
+        # Check the VCF spec to make sure this is valid; if not, we may have to write
+        # `.:.:.:.` -- one `.` for each field in the format string.
+        #
+        # Moreover, we'll want to note whether we care about maintaining the relative
+        # ordering of the sample names in the original VCF files. This probably isn't
+        # important, but the code below does not do this (it effectively alphabetizes the
+        # sample names because of the use of `set`).
+
+        sample_names = set()
+        for variant in variants:
+            sample_info = get_metadata_field('sample_info', variant, default={})
+            addl_sample_names = set(sample_info.keys())
+
+            # Ensures all variants have the same samples.
+            if sample_names and sample_names != addl_sample_names:
+                return []
+
+            sample_names.update(addl_sample_names)
+
+        return list(sample_names)
+
+    headers = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
+    sample_names = get_sample_names()
+    if sample_names:
+        headers += ['FORMAT'] + sample_names
+
+    unique_variants_list = merge_duplicate_variants(variants)
+
+    print('##fileformat=VCFv4.2')
+    print('\t'.join(headers))
+    for variant in unique_variants_list:
+        record = build_vcf_record(variant, add_sample_info=bool(sample_names))
+        print('\t'.join(record))
+
 def load_vcf_fast(*args, **kwargs):
     """
     Same as load_vcf, keeping this name for backwards compatibility.