Merge pull request #189 from hammerlab/minor_updates
Add a median VAF filter function and a filter_fn watermark
tavinathanson committed Mar 10, 2017
2 parents 616cb5f + 44938b7 commit b47b8ef
Showing 8 changed files with 83 additions and 21 deletions.
8 changes: 7 additions & 1 deletion cohorts/cohort.py
@@ -103,6 +103,8 @@ class Cohort(Collection):
Verify that the cached provenance is equal to the current environment.
print_provenance : bool
Print a summary of cache file provenance.
print_filter : bool
Print the name of the default `filter_fn` on initialization.
polyphen_dump_path : str
Path to a Polyphen database dump.
pageant_coverage_path : str
@@ -128,6 +130,7 @@ def __init__(self,
responder_pfs_equals_os=False,
check_provenance=False,
print_provenance=True,
print_filter=True,
polyphen_dump_path=None,
pageant_coverage_path=None,
benefit_plot_name="Benefit",
@@ -177,6 +180,9 @@ def __init__(self,
"polyphen": "cached-polyphen-annotations",
"isovar": "cached-isovar-output"}

if print_filter:
    print("Applying %s filter by default" % (
        self.filter_fn.__name__ if self.filter_fn is not None else "None"))
if print_provenance:
pprint.pprint(self.summarize_data_sources())

@@ -506,7 +512,7 @@ def load_variants(self, patients=None, filter_fn=None, **kwargs):
if variants is not None:
patient_variants[patient.id] = variants
return patient_variants

def _hash_filter_fn(self, filter_fn, **kwargs):
""" Construct string representing state of filter_fn
Used to cache filtered variants or effects uniquely depending on filter fn values
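For illustration, a minimal sketch of the new filter_fn watermark (not part of this commit): with print_filter=True, now the default, the Cohort prints which filter_fn it will apply as soon as it is constructed. Arguments other than filter_fn, print_filter, and print_provenance are assumptions about the usual Cohort constructor; the patient list and cache path are placeholders.

from cohorts import Cohort
from cohorts.variant_filters import no_filter

cohort = Cohort(
    patients=patients,               # list of Patient objects, built elsewhere
    cache_dir="/tmp/cohort-cache",   # hypothetical cache location
    filter_fn=no_filter,
    print_provenance=False,          # suppress the data-source summary
    print_filter=True)
# Expected to print: "Applying no_filter filter by default"
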
35 changes: 34 additions & 1 deletion cohorts/functions.py
@@ -15,11 +15,14 @@
from __future__ import print_function

from .variant_filters import no_filter, effect_expressed_filter
from .varcode_utils import FilterableVariant
from .utils import first_not_none_param
from .variant_stats import variant_stats_from_variant

from functools import wraps
import numpy as np
from varcode.effects import Substitution
import pandas as pd
from varcode.effects import Substitution, FrameShift
from varcode.common import memoize
from varcode.effects.effect_classes import Exonic
import inspect
@@ -207,6 +210,20 @@ def count_filter_fn(filterable_effect, **kwargs):
isinstance(filterable_effect.effect, Exonic) and
filterable_effect.variant.is_insertion))

frameshift_count = count_effects_function_builder(
"frameshift_count",
only_nonsynonymous=False, # Should not matter, because FrameShift extends NonsilentCodingMutation
filterable_effect_function=lambda filterable_effect: (
isinstance(filterable_effect.effect, FrameShift)))

missense_snv_and_nonsynonymous_indel_count = count_effects_function_builder(
"missense_snv_and_nonsynonymous_indel_count",
only_nonsynonymous=True,
filterable_effect_function=lambda filterable_effect: (
(filterable_effect.variant.is_indel) or
(type(filterable_effect.effect) == Substitution and
filterable_effect.variant.is_snv)))

@count_function
def neoantigen_count(row, cohort, filter_fn, normalized_per_mb, **kwargs):
patient = cohort.patient_from_id(row["patient_id"])
@@ -242,3 +259,19 @@ def expressed_neoantigen_count(row, cohort, filter_fn, normalized_per_mb, **kwargs):
normalized_per_mb=normalized_per_mb,
only_expressed=True,
**kwargs)

def median_vaf_purity(row, cohort):
"""
Estimate purity based on 2 * median VAF.
Even if the Cohort has a default filter_fn, ignore it: we want to use all variants for
this estimate.
"""
patient_id = row["patient_id"]
patient = cohort.patient_from_id(patient_id)
variants = cohort.load_variants(patients=[patient], filter_fn=no_filter)[patient_id]
def grab_vaf(variant):
filterable_variant = FilterableVariant(variant, variants, patient)
return variant_stats_from_variant(variant, filterable_variant.variant_metadata).tumor_stats.variant_allele_frequency
vafs = [grab_vaf(variant) for variant in variants]
return 2 * pd.Series(vafs).median()
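A brief usage sketch for the new median_vaf_purity helper (illustrative, not part of this commit): it takes one row of the cohort's clinical dataframe plus the cohort itself, loads all of that patient's variants with no_filter regardless of the cohort's default filter_fn, and returns 2 * the median tumor VAF as a rough purity estimate. The iteration pattern below is an assumption about typical usage.

from cohorts.functions import median_vaf_purity

df = cohort.as_dataframe()
purity_by_patient = {
    row["patient_id"]: median_vaf_purity(row, cohort)
    for _, row in df.iterrows()}
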
6 changes: 5 additions & 1 deletion cohorts/patient.py
@@ -14,7 +14,7 @@

from __future__ import print_function

from .utils import require_id_str
from .utils import require_id_str, set_attributes

class Patient(object):
"""
@@ -47,6 +47,7 @@ class Patient(object):
A list of this patient's HLA class I alleles.
additional_data : dict
A dictionary of additional data: name of datum mapping to value.
Each entry is also set as an attribute on the Patient object.
"""
def __init__(self,
id,
@@ -76,6 +77,9 @@ def __init__(self,
self.hla_alleles = hla_alleles
self.additional_data = additional_data

if self.additional_data is not None:
set_attributes(self, self.additional_data)

# TODO: This can be removed once all patient-specific functions are
# removed from Cohort.
self.cohort = cohort
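To illustrate the additional_data change above (a sketch, not part of this commit): entries in additional_data now become attributes on the Patient, with names normalized by _strip_column_name. The normalized name shown for "Smoking Status" is an assumption based on that helper's docstring.

from cohorts import Patient

patient = Patient(
    id="patient-1",
    os=365,                      # illustrative survival values
    pfs=180,
    deceased=True,
    progressed_or_deceased=True,
    additional_data={"age": 62, "Smoking Status": "former"})
print(patient.age)              # 62
print(patient.smoking_status)   # assumed normalized form of "Smoking Status"
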
12 changes: 11 additions & 1 deletion cohorts/sample.py
@@ -14,6 +14,8 @@

from __future__ import print_function

from .utils import set_attributes

class Sample(object):
"""
Represents a single tumor or normal sample. It can point to DNA and/or
@@ -29,15 +31,23 @@ class Sample(object):
Path to the RNA BAM file.
cufflinks_path : str
Path to the Cufflinks output file.
additional_data : dict
A dictionary of additional data: name of datum mapping to value.
Each entry is also set as an attribute on the Sample object.
"""
def __init__(self,
is_tumor,
bam_path_dna=None,
bam_path_rna=None,
cufflinks_path=None,
kallisto_path=None):
kallisto_path=None,
additional_data=None):
self.is_tumor = is_tumor
self.bam_path_dna = bam_path_dna
self.bam_path_rna = bam_path_rna
self.cufflinks_path = cufflinks_path
self.kallisto_path = kallisto_path
self.additional_data = additional_data

if self.additional_data is not None:
set_attributes(self, self.additional_data)
11 changes: 11 additions & 0 deletions cohorts/utils.py
@@ -185,3 +185,14 @@ def get_logger(name, level=logging.INFO):
logger.handlers = []
logger.setLevel(level)
return logger

def set_attributes(obj, additional_data):
    """
    Given an object and a dictionary, give the object new attributes from that dictionary.
    Uses _strip_column_name to get rid of whitespace/uppercase/special characters.
    """
    for key, value in additional_data.items():
        # Check the normalized name, since that is the attribute that will be set.
        stripped_key = _strip_column_name(key)
        if hasattr(obj, stripped_key):
            raise ValueError("Key %s in additional_data already exists in this object" % key)
        setattr(obj, stripped_key, value)
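A minimal sketch of set_attributes on its own (assuming _strip_column_name lowercases and strips whitespace/special characters, per its docstring): keys are normalized into attribute names, and a key that would duplicate an existing attribute raises ValueError rather than silently overwriting it, which is presumably why the test fixtures later in this diff pop "id", "OS", etc. from the row before passing it as additional_data.

from cohorts.utils import set_attributes

class Record(object):
    pass

record = Record()
set_attributes(record, {"Smoking Status": "former"})
print(record.smoking_status)   # assumed normalized attribute name

# Re-using a name that already exists on the object raises:
# set_attributes(record, {"smoking_status": "never"})  # -> ValueError
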
3 changes: 1 addition & 2 deletions cohorts/variant_filters.py
@@ -31,9 +31,8 @@ def variant_qc_filter(filterable_variant,
min_tumor_vaf,
max_normal_vaf,
min_tumor_alt_depth):

logger.debug('Applying variant_qc_filter with params: min_tumor_depth={}, min_normal_depth={}, min_tumor_vaf={}, max_normal_vaf={}, min_tumor_alt_depth={}'.format(min_tumor_depth, min_normal_depth, min_tumor_vaf, max_normal_vaf, min_tumor_alt_depth))

somatic_stats = variant_stats_from_variant(filterable_variant.variant,
filterable_variant.variant_metadata)

15 changes: 7 additions & 8 deletions test/test_basic.py
@@ -40,13 +40,13 @@ def make_simple_cohort(merge_type="union",
clinical_dataframe = make_simple_clinical_dataframe(**kwargs)
patients = []
for i, row in clinical_dataframe.iterrows():
patient = Patient(id=row["id"],
os=row["OS"],
pfs=row["PFS"],
deceased=row["deceased"],
progressed_or_deceased=row["progressed_or_deceased"],
additional_data=row
)
row = dict(row)
patient = Patient(id=row.pop("id"),
os=row.pop("OS"),
pfs=row.pop("PFS"),
deceased=row.pop("deceased"),
progressed_or_deceased=row.pop("progressed_or_deceased"),
additional_data=row)
patients.append(patient)

Cohort.normalized_per_mb = False
@@ -74,7 +74,6 @@ def test_simple_cohort():
eq_(len(cohort.as_dataframe()), 3)

columns = set(cohort.as_dataframe().columns)
ok_("id" in columns)
ok_("patient_id" in columns)
ok_("age" in columns)
ok_("pfs" in columns)
14 changes: 7 additions & 7 deletions test/test_utils.py
@@ -38,13 +38,13 @@ def make_alt_simple_cohort(merge_type="union", **kwargs):
clinical_dataframe = make_alt_simple_clinical_dataframe(**kwargs)
patients = []
for i, row in clinical_dataframe.iterrows():
patient = Patient(id=row["id"],
os=row["os"],
pfs=row["pfs"],
deceased=row["deceased"],
progressed_or_deceased=row["progressed_or_deceased"],
additional_data=row
)
row = dict(row)
patient = Patient(id=row.pop("id"),
os=row.pop("os"),
pfs=row.pop("pfs"),
deceased=row.pop("deceased"),
progressed_or_deceased=row.pop("progressed_or_deceased"),
additional_data=row)
patients.append(patient)

return Cohort(
