Skip to content

Commit

Permalink
Fix impossible stats & regression test
Browse files Browse the repository at this point in the history
fixes #588 fixes #516
  • Loading branch information
ihodes committed Jun 5, 2015
1 parent 2b133b9 commit b05a55c
Show file tree
Hide file tree
Showing 3 changed files with 136 additions and 8 deletions.
21 changes: 13 additions & 8 deletions cycledash/genotypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import flask.ext.restful as restful
from flask.ext.restful import Resource
from sqlalchemy import (select, func, types, cast, join, outerjoin, asc, desc,
and_, Integer, Float, String)
and_, Integer, Float, String, distinct)
from sqlalchemy.sql import text
from sqlalchemy.sql.expression import label, column, case, literal
from sqlalchemy.sql.functions import coalesce
Expand Down Expand Up @@ -52,9 +52,12 @@ def samples(vcf_id):
query = """SELECT sample_name FROM genotypes WHERE vcf_id = %s
GROUP BY sample_name ORDER BY sample_name
"""
with db.engine.connect() as connection:
samples = connection.execute(query, (vcf_id,)).fetchall()
return [sample for (sample,) in samples]
with tables(db.engine, 'genotypes') as (con, genotypes):
samples = (select([func.count(distinct(genotypes.c.sample_name))])
.where(genotypes.c.vcf_id == vcf_id))
samples = [sample_name for (sample_name,)
in samples.execute().fetchall()]
return samples


@forever.memoize
Expand Down Expand Up @@ -162,7 +165,8 @@ def genotype_statistics(query, vcf_id, truth_vcf_id, count, total_count):
joined_q = join(g, gt, and_(g.c.contig == gt.c.contig,
g.c.position == gt.c.position,
g.c.reference == gt.c.reference,
g.c.alternates == gt.c.alternates))
g.c.alternates == gt.c.alternates,
g.c.sample_name == gt.c.sample_name))
true_pos_q = select([func.count()]).select_from(joined_q).where(
and_(g.c.vcf_id == vcf_id, gt.c.vcf_id == truth_vcf_id))
true_pos_q = _add_filters(true_pos_q, g, query.get('filters'))
Expand All @@ -180,7 +184,7 @@ def genotype_statistics(query, vcf_id, truth_vcf_id, count, total_count):
total_truth_q = _add_filters(total_truth_q, g, query.get('filters'))
total_truth_q = _add_range(total_truth_q, g, query.get('range'))
(total_truth,) = con.execute(total_truth_q).fetchone()
total_truth *= len(samples(vcf_id))
total_truth *= len(samples(truth_vcf_id))

stats.update(_true_false_pos_neg(true_positives, total_truth, count))
return stats
Expand Down Expand Up @@ -448,10 +452,11 @@ def _annotate_query_with_types(query, vcf_spec):
return query


def _whitelist_query_filters(query, ok_fields=['reference', 'alternates']):
def _whitelist_query_filters(
query, ok_fields=['reference', 'alternates', 'sample_name']):
"""Returns query with only validation-appropriate filters.
These are, by default: ['reference', 'alternates']"""
These are, by default: ['reference', 'alternates', 'sample_name']"""
query = copy.deepcopy(query)
query['filters'] = [f for f in query['filters']
if f.get('columnName') in ok_fields]
Expand Down
16 changes: 16 additions & 0 deletions tests/data/snv.truth.vcf
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
##fileformat=VCFv4.1
##source=BespokeByIsaac
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)">
##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)">
##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency">
##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL TUMOR
20 61795 . G T . FOUND . GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:44:22:22:50%:16,6,9,13 0/1:.:37:18:19:51.35%:10,8,10,9
20 52731 . C A . NOTFOUND . GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:32:17:15:46.88%:9,8,9,6 0/1:.:36:21:15:41.67%:8,13,8,7
20 65288 . G T . FOUND . GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:21:13:8:38.1%:4,9,0,8 0/1:.:14:10:4:28.57%:2,8,0,4
20 65900 . G A . FOUND . GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:26:0:26:100%:0,0,12,14 1/1:.:27:0:27:100%:0,0,15,12
20 68090 . G C . NOTFOUND . GT:GQ:DP:RD:AD:FREQ:DP4 1/1:.:23:0:23:100%:0,0,7,16 1/1:.:41:0:41:100%:0,0,21,20
20 75254 . C A . FOUND . GT:GQ:DP:RD:AD:FREQ:DP4 0/1:.:34:22:11:33.33%:13,9,5,6 0/1:.:40:20:20:50%:5,15,14,6
107 changes: 107 additions & 0 deletions tests/python/test_genotypes_stats_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import mock
import nose
import unittest
import nose.tools as asserts
import json

from cycledash import db
from cycledash.genotypes import get
from common.helpers import tables, pick
from workers.genotype_extractor import _extract

from test_projects_api import create_project_with_name
from test_runs_api import create_run_with_uri
from helpers import delete_tables


class TestGenotypesAPI(object):

@classmethod
def setUpClass(cls):
cls.project = create_project_with_name('my project')
cls.run = create_run_with_uri(cls.project['id'], '/tests/data/snv.vcf')
_extract(cls.run['id'])
cls.truth_run = create_run_with_uri(cls.project['id'], '/tests/data/snv.truth.vcf')
_extract(cls.truth_run['id'])

@classmethod
def tearDownClass(cls):
delete_tables(db.engine, 'genotypes', 'vcfs', 'bams', 'projects')

def test_stats_with_entire_truth_vcf(self):
query = {
'range': {},
'filters': [],
'sortBy': [],
'compareToVcfId': self.truth_run['id']
}
stats = get(self.run['id'], query, with_stats=True)['stats']

asserts.eq_(stats['totalTruthRecords'], 12)

asserts.eq_(stats['truePositives'], 8)
asserts.eq_(stats['falsePositives'], 12)
asserts.eq_(stats['falseNegatives'], 4)

asserts.assert_almost_equals(stats['recall'], 2/3.0)
asserts.assert_almost_equals(stats['precision'], 1/2.5)
asserts.assert_almost_equals(stats['f1score'], 1/2.0)

def test_stats_with_self(self):
query = {
'range': {},
'filters': [],
'sortBy': [],
'compareToVcfId': self.run['id']
}
stats = get(self.run['id'], query, with_stats=True)['stats']

asserts.eq_(stats['totalTruthRecords'], 20)

asserts.eq_(stats['truePositives'], 20)
asserts.eq_(stats['falsePositives'], 0)
asserts.eq_(stats['falseNegatives'], 0)

asserts.assert_almost_equals(stats['recall'], 1.0)
asserts.assert_almost_equals(stats['precision'], 1.0)
asserts.assert_almost_equals(stats['f1score'], 1.0)

def test_stats_with_truth_and_range(self):
query = {
'range': {'start': 0, 'end': 66000, 'contig': '20'},
'filters': [],
'sortBy': [],
'compareToVcfId': self.truth_run['id']
}
stats = get(self.run['id'], query, with_stats=True)['stats']

asserts.eq_(stats['totalTruthRecords'], 8)

asserts.eq_(stats['truePositives'], 6)
asserts.eq_(stats['falsePositives'], 4)
asserts.eq_(stats['falseNegatives'], 2)

asserts.assert_almost_equals(stats['recall'], 3/4.0)
asserts.assert_almost_equals(stats['precision'], 3/5.0)
asserts.assert_almost_equals(stats['f1score'], 2/3.0)

def test_stats_with_truth_and_filters(self):
query = {
'range': {},
'filters': [{'columnName': 'sample_name',
'filterValue': 'TUMOR',
'type': '='}],
'sortBy': [],
'compareToVcfId': self.truth_run['id']
}
stats = get(self.run['id'], query, with_stats=True)['stats']

asserts.eq_(stats['totalTruthRecords'], 6)

asserts.eq_(stats['truePositives'], 4)
asserts.eq_(stats['falsePositives'], 6)
asserts.eq_(stats['falseNegatives'], 2)

asserts.assert_almost_equals(stats['recall'], 2/3.0)
asserts.assert_almost_equals(stats['precision'], 1/2.5)
asserts.assert_almost_equals(stats['f1score'], 1/2.0)

0 comments on commit b05a55c

Please sign in to comment.