Skip to content

Commit

Permalink
Add error bias filter
Browse files Browse the repository at this point in the history
  • Loading branch information
James Casbon committed Jun 25, 2012
1 parent 880ce55 commit 8a79f57
Show file tree
Hide file tree
Showing 3 changed files with 98 additions and 1 deletion.
12 changes: 12 additions & 0 deletions docs/FILTERS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,18 @@ operation that PyVCF offers an extensible script. ``vcf_filter.py`` does
the work of reading input, updating the metadata and filtering the records.


Existing Filters
----------------

.. autoclass:: vcf.filters.SiteQuality

.. autoclass:: vcf.filters.VariantGenotypeQuality

.. autoclass:: vcf.filters.ErrorBiasFilter




Adding a filter
---------------

Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@
'vcf.filters': [
'site_quality = vcf.filters:SiteQuality',
'vgq = vcf.filters:VariantGenotypeQuality',
'eb = vcf.filters:ErrorBiasFilter',
]
},
url='https://github.com/jamescasbon/PyVCF',
Expand Down
86 changes: 85 additions & 1 deletion vcf/filters.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
try:
from rpy2 import robjects
except:
robjects = None


class Base(object):
Expand All @@ -22,13 +26,13 @@ def __call__(self):
""" filter a site, return not None if the site should be filtered """
raise NotImplementedError('Filters must implement this method')


def filter_name(self):
""" return the name to put in the VCF header, default is ``name`` + ``threshold`` """
return '%s%s' % (self.name, self.threshold)


class SiteQuality(Base):
""" Filter low quailty sites """

description = 'Filter sites by quality'
name = 'sq'
Expand All @@ -47,6 +51,10 @@ def __call__(self, record):


class VariantGenotypeQuality(Base):
""" Filters sites with low quality variants. i.e. it is possible
to have a high site quality with many low quality calls. This
filter demands at least one call be above a threshold quality.
"""

description = 'Demand a minimum quality associated with a non reference call'
name = 'mgq'
Expand All @@ -66,3 +74,79 @@ def __call__(self, record):
return vgq


class ErrorBiasFilter(Base):
""" Some sequencing technologies, notably pyrosequencing, produce mutation
hotspots where there is a constant level of noise, producing some reference
and some heterozygote calls.
This filter computes a Bayes Factor for each site by comparing
the binomial likelihood of the observed allelic depths under:
* A model with constant error equal to the MAF
* A model where each sample is the ploidy reported by the caller
The test value is the log of the bayes factor. Higher values
are more likely to be errors.
Note: this filter requires rpy2
"""

description = 'Filter sites with a constant level of mutation across all samples'
name = 'eb'

@classmethod
def customize_parser(self, parser):
parser.add_argument('--errlr', type=int, default=-10,
help='Filter sites above this error log odds ratio')

def __init__(self, args):
self.threshold = args.errlr
if robjects is None:
raise Exception('Please install rpy2')
self.ll_test = robjects.r('''
function(ra, aa, gt, diag=F) {
ra_sum = sum(ra)
aa_sum = sum(aa)
ab = aa_sum / (ra_sum + aa_sum)
gtp = 0.5 + (0.48*(gt-1))
error_likelihood = log(dbinom(aa, ra+aa, ab))
gt_likelihood = log(dbinom(aa, ra+aa, gtp))
if (diag) {
print(ra)
print(aa)
print(gtp)
print(ab)
print(error_likelihood)
print(gt_likelihood)
}
error_likelihood = sum(error_likelihood)
gt_likelihood = sum(gt_likelihood)
c(error_likelihood - gt_likelihood, ab)
}
''')

def __call__(self, record):
if record.is_monomorphic:
return None
passed, tv, ab = self.bias_test(record.samples)
if tv > self.threshold:
return tv

def bias_test(self, calls):
calls = [x for x in calls if x.called]
#TODO: single genotype assumption
try:
# freebayes
ra = robjects.IntVector([x['RO'][0] for x in calls])
aa = robjects.IntVector([x['AO'][0] for x in calls])
except KeyError:
# GATK
ra = robjects.IntVector([x['AD'][0] for x in calls])
aa = robjects.IntVector([x['AD'][1] for x in calls])

gt = robjects.IntVector([x.gt_type for x in calls])
test_val, ab = self.ll_test(ra, aa, gt)

return test_val < 0, test_val, ab

0 comments on commit 8a79f57

Please sign in to comment.