Merge pull request #189 from hammerlab/minor_updates
Add a median VAF filter function and a filter_fn watermark
tavinathanson committed Mar 10, 2017
2 parents 616cb5f + 44938b7 commit b47b8ef
Showing 8 changed files with 83 additions and 21 deletions.
8 changes: 7 additions & 1 deletion cohorts/cohort.py
@@ -103,6 +103,8 @@ class Cohort(Collection):
Verify that the cached provenance is equal to the current environment.
print_provenance : bool
Print a summary of cache file provenance.
print_filter : bool
Print the name of the default `filter_fn` on initialization.
polyphen_dump_path : str
Path to a Polyphen database dump.
pageant_coverage_path : str
@@ -128,6 +130,7 @@ def __init__(self,
responder_pfs_equals_os=False,
check_provenance=False,
print_provenance=True,
print_filter=True,
polyphen_dump_path=None,
pageant_coverage_path=None,
benefit_plot_name="Benefit",
@@ -177,6 +180,9 @@ def __init__(self,
"polyphen": "cached-polyphen-annotations",
"isovar": "cached-isovar-output"}

if print_filter:
    print("Applying %s filter by default" % (
        self.filter_fn.__name__ if self.filter_fn is not None else "None"))
if print_provenance:
pprint.pprint(self.summarize_data_sources())

@@ -506,7 +512,7 @@ def load_variants(self, patients=None, filter_fn=None, **kwargs):
if variants is not None:
patient_variants[patient.id] = variants
return patient_variants

def _hash_filter_fn(self, filter_fn, **kwargs):
""" Construct string representing state of filter_fn
Used to cache filtered variants or effects uniquely depending on filter fn values
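For illustration, a minimal sketch of the new filter_fn watermark (not part of this commit): with print_filter=True, now the default, the Cohort prints which filter_fn it will apply as soon as it is constructed. Arguments other than filter_fn, print_filter, and print_provenance are assumptions about the usual Cohort constructor; the patient list and cache path are placeholders.

from cohorts import Cohort
from cohorts.variant_filters import no_filter

cohort = Cohort(
    patients=patients,               # list of Patient objects, built elsewhere
    cache_dir="/tmp/cohort-cache",   # hypothetical cache location
    filter_fn=no_filter,
    print_provenance=False,          # suppress the data-source summary
    print_filter=True)
# Expected to print: "Applying no_filter filter by default"
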
35 changes: 34 additions & 1 deletion cohorts/functions.py
@@ -15,11 +15,14 @@
from __future__ import print_function

from .variant_filters import no_filter, effect_expressed_filter
from .varcode_utils import FilterableVariant
from .utils import first_not_none_param
from .variant_stats import variant_stats_from_variant

from functools import wraps
import numpy as np
from varcode.effects import Substitution
import pandas as pd
from varcode.effects import Substitution, FrameShift
from varcode.common import memoize
from varcode.effects.effect_classes import Exonic
import inspect
@@ -207,6 +210,20 @@ def count_filter_fn(filterable_effect, **kwargs):
isinstance(filterable_effect.effect, Exonic) and
filterable_effect.variant.is_insertion))

frameshift_count = count_effects_function_builder(
"frameshift_count",
only_nonsynonymous=False, # Should not matter, because FrameShift extends NonsilentCodingMutation
filterable_effect_function=lambda filterable_effect: (
isinstance(filterable_effect.effect, FrameShift)))

missense_snv_and_nonsynonymous_indel_count = count_effects_function_builder(
"missense_snv_and_nonsynonymous_indel_count",
only_nonsynonymous=True,
filterable_effect_function=lambda filterable_effect: (
(filterable_effect.variant.is_indel) or
(type(filterable_effect.effect) == Substitution and
filterable_effect.variant.is_snv)))

@count_function
def neoantigen_count(row, cohort, filter_fn, normalized_per_mb, **kwargs):
patient = cohort.patient_from_id(row["patient_id"])
@@ -242,3 +259,19 @@ def expressed_neoantigen_count(row, cohort, filter_fn, normalized_per_mb, **kwargs):
normalized_per_mb=normalized_per_mb,
only_expressed=True,
**kwargs)

def median_vaf_purity(row, cohort):
"""
Estimate purity based on 2 * median VAF.
Even if the Cohort has a default filter_fn, ignore it: we want to use all variants for
this estimate.
"""
patient_id = row["patient_id"]
patient = cohort.patient_from_id(patient_id)
variants = cohort.load_variants(patients=[patient], filter_fn=no_filter)[patient_id]
def grab_vaf(variant):
filterable_variant = FilterableVariant(variant, variants, patient)
return variant_stats_from_variant(variant, filterable_variant.variant_metadata).tumor_stats.variant_allele_frequency
vafs = [grab_vaf(variant) for variant in variants]
return 2 * pd.Series(vafs).median()
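A brief usage sketch for the new median_vaf_purity helper (illustrative, not part of this commit): it takes one row of the cohort's clinical dataframe plus the cohort itself, loads all of that patient's variants with no_filter regardless of the cohort's default filter_fn, and returns 2 * the median tumor VAF as a rough purity estimate. The iteration pattern below is an assumption about typical usage.

from cohorts.functions import median_vaf_purity

df = cohort.as_dataframe()
purity_by_patient = {
    row["patient_id"]: median_vaf_purity(row, cohort)
    for _, row in df.iterrows()}
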
6 changes: 5 additions & 1 deletion cohorts/patient.py
@@ -14,7 +14,7 @@

from __future__ import print_function

from .utils import require_id_str
from .utils import require_id_str, set_attributes

class Patient(object):
"""
@@ -47,6 +47,7 @@ class Patient(object):
A list of this patient's HLA class I alleles.
additional_data : dict
A dictionary of additional data: name of datum mapping to value.
Each entry is also set as an attribute on the Patient object.
"""
def __init__(self,
id,
@@ -76,6 +77,9 @@ def __init__(self,
self.hla_alleles = hla_alleles
self.additional_data = additional_data

if self.additional_data is not None:
set_attributes(self, self.additional_data)

# TODO: This can be removed once all patient-specific functions are
# removed from Cohort.
self.cohort = cohort
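To illustrate the additional_data change above (a sketch, not part of this commit): entries in additional_data now become attributes on the Patient, with names normalized by _strip_column_name. The normalized name shown for "Smoking Status" is an assumption based on that helper's docstring.

from cohorts import Patient

patient = Patient(
    id="patient-1",
    os=365,                      # illustrative survival values
    pfs=180,
    deceased=True,
    progressed_or_deceased=True,
    additional_data={"age": 62, "Smoking Status": "former"})
print(patient.age)              # 62
print(patient.smoking_status)   # assumed normalized form of "Smoking Status"
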
12 changes: 11 additions & 1 deletion cohorts/sample.py
@@ -14,6 +14,8 @@

from __future__ import print_function

from .utils import set_attributes

class Sample(object):
"""
Represents a single tumor or normal sample. It can point to DNA and/or
@@ -29,15 +31,23 @@ class Sample(object):
Path to the RNA BAM file.
cufflinks_path : str
Path to the Cufflinks output file.
additional_data : dict
A dictionary of additional data: name of datum mapping to value.
Each entry is also set as an attribute on the Sample object.
"""
def __init__(self,
is_tumor,
bam_path_dna=None,
bam_path_rna=None,
cufflinks_path=None,
kallisto_path=None):
kallisto_path=None,
additional_data=None):
self.is_tumor = is_tumor
self.bam_path_dna = bam_path_dna
self.bam_path_rna = bam_path_rna
self.cufflinks_path = cufflinks_path
self.kallisto_path = kallisto_path
self.additional_data = additional_data

if self.additional_data is not None:
set_attributes(self, self.additional_data)
11 changes: 11 additions & 0 deletions cohorts/utils.py
@@ -185,3 +185,14 @@ def get_logger(name, level=logging.INFO):
logger.handlers = []
logger.setLevel(level)
return logger

def set_attributes(obj, additional_data):
    """
    Given an object and a dictionary, give the object new attributes from that dictionary.
    Uses _strip_column_name to get rid of whitespace/uppercase/special characters.
    """
    for key, value in additional_data.items():
        # Check the normalized name, since that is the attribute that will be set.
        stripped_key = _strip_column_name(key)
        if hasattr(obj, stripped_key):
            raise ValueError("Key %s in additional_data already exists in this object" % key)
        setattr(obj, stripped_key, value)
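A minimal sketch of set_attributes on its own (assuming _strip_column_name lowercases and strips whitespace/special characters, per its docstring): keys are normalized into attribute names, and a key that would duplicate an existing attribute raises ValueError rather than silently overwriting it, which is presumably why the test fixtures later in this diff pop "id", "OS", etc. from the row before passing it as additional_data.

from cohorts.utils import set_attributes

class Record(object):
    pass

record = Record()
set_attributes(record, {"Smoking Status": "former"})
print(record.smoking_status)   # assumed normalized attribute name

# Re-using a name that already exists on the object raises:
# set_attributes(record, {"smoking_status": "never"})  # -> ValueError
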
3 changes: 1 addition & 2 deletions cohorts/variant_filters.py
@@ -31,9 +31,8 @@ def variant_qc_filter(filterable_variant,
min_tumor_vaf,
max_normal_vaf,
min_tumor_alt_depth):

logger.debug('Applying variant_qc_filter with params: min_tumor_depth={}, min_normal_depth={}, min_tumor_vaf={}, max_normal_vaf={}, min_tumor_alt_depth={}'.format(min_tumor_depth, min_normal_depth, min_tumor_vaf, max_normal_vaf, min_tumor_alt_depth))

somatic_stats = variant_stats_from_variant(filterable_variant.variant,
filterable_variant.variant_metadata)

15 changes: 7 additions & 8 deletions test/test_basic.py
@@ -40,13 +40,13 @@ def make_simple_cohort(merge_type="union",
clinical_dataframe = make_simple_clinical_dataframe(**kwargs)
patients = []
for i, row in clinical_dataframe.iterrows():
patient = Patient(id=row["id"],
os=row["OS"],
pfs=row["PFS"],
deceased=row["deceased"],
progressed_or_deceased=row["progressed_or_deceased"],
additional_data=row
)
row = dict(row)
patient = Patient(id=row.pop("id"),
os=row.pop("OS"),
pfs=row.pop("PFS"),
deceased=row.pop("deceased"),
progressed_or_deceased=row.pop("progressed_or_deceased"),
additional_data=row)
patients.append(patient)

Cohort.normalized_per_mb = False
@@ -74,7 +74,6 @@ def test_simple_cohort():
eq_(len(cohort.as_dataframe()), 3)

columns = set(cohort.as_dataframe().columns)
ok_("id" in columns)
ok_("patient_id" in columns)
ok_("age" in columns)
ok_("pfs" in columns)
14 changes: 7 additions & 7 deletions test/test_utils.py
@@ -38,13 +38,13 @@ def make_alt_simple_cohort(merge_type="union", **kwargs):
clinical_dataframe = make_alt_simple_clinical_dataframe(**kwargs)
patients = []
for i, row in clinical_dataframe.iterrows():
patient = Patient(id=row["id"],
os=row["os"],
pfs=row["pfs"],
deceased=row["deceased"],
progressed_or_deceased=row["progressed_or_deceased"],
additional_data=row
)
row = dict(row)
patient = Patient(id=row.pop("id"),
os=row.pop("os"),
pfs=row.pop("pfs"),
deceased=row.pop("deceased"),
progressed_or_deceased=row.pop("progressed_or_deceased"),
additional_data=row)
patients.append(patient)

return Cohort(
