Fix impossible stats & regression test

fixes #588 fixes #516
hammerlab · Jun 5, 2015 · b05a55c · b05a55c
1 parent 2b133b9
commit b05a55c
Show file tree

Hide file tree

Showing 3 changed files with 136 additions and 8 deletions.
diff --git a/cycledash/genotypes.py b/cycledash/genotypes.py
@@ -7,7 +7,7 @@
 import flask.ext.restful as restful
 from flask.ext.restful import Resource
 from sqlalchemy import (select, func, types, cast, join, outerjoin, asc, desc,
-                        and_, Integer, Float, String)
+                        and_, Integer, Float, String, distinct)
 from sqlalchemy.sql import text
 from sqlalchemy.sql.expression import label, column, case, literal
 from sqlalchemy.sql.functions import coalesce
@@ -52,9 +52,12 @@ def samples(vcf_id):
     query = """SELECT sample_name FROM genotypes WHERE vcf_id = %s
     GROUP BY sample_name ORDER BY sample_name
     """
-    with db.engine.connect() as connection:
-        samples = connection.execute(query, (vcf_id,)).fetchall()
-    return [sample for (sample,) in samples]
+    with tables(db.engine, 'genotypes') as (con, genotypes):
+        samples = (select([func.count(distinct(genotypes.c.sample_name))])
+                   .where(genotypes.c.vcf_id == vcf_id))
+        samples = [sample_name for (sample_name,)
+                   in samples.execute().fetchall()]
+    return samples
 
 
 @forever.memoize
@@ -162,7 +165,8 @@ def genotype_statistics(query, vcf_id, truth_vcf_id, count, total_count):
         joined_q = join(g, gt, and_(g.c.contig == gt.c.contig,
                                     g.c.position == gt.c.position,
                                     g.c.reference == gt.c.reference,
-                                    g.c.alternates == gt.c.alternates))
+                                    g.c.alternates == gt.c.alternates,
+                                    g.c.sample_name == gt.c.sample_name))
         true_pos_q = select([func.count()]).select_from(joined_q).where(
             and_(g.c.vcf_id == vcf_id, gt.c.vcf_id == truth_vcf_id))
         true_pos_q = _add_filters(true_pos_q, g, query.get('filters'))
@@ -180,7 +184,7 @@ def genotype_statistics(query, vcf_id, truth_vcf_id, count, total_count):
         total_truth_q = _add_filters(total_truth_q, g, query.get('filters'))
         total_truth_q = _add_range(total_truth_q, g, query.get('range'))
         (total_truth,) = con.execute(total_truth_q).fetchone()
-        total_truth *= len(samples(vcf_id))
+        total_truth *= len(samples(truth_vcf_id))
 
     stats.update(_true_false_pos_neg(true_positives, total_truth, count))
     return stats
@@ -448,10 +452,11 @@ def _annotate_query_with_types(query, vcf_spec):
     return query
 
 
-def _whitelist_query_filters(query, ok_fields=['reference', 'alternates']):
+def _whitelist_query_filters(
+        query, ok_fields=['reference', 'alternates', 'sample_name']):
     """Returns query with only validation-appropriate filters.
 
-    These are, by default: ['reference', 'alternates']"""
+    These are, by default: ['reference', 'alternates', 'sample_name']"""
     query = copy.deepcopy(query)
     query['filters'] = [f for f in query['filters']
                         if f.get('columnName') in ok_fields]

diff --git a/tests/data/snv.truth.vcf b/tests/data/snv.truth.vcf
@@ -0,0 +1,16 @@
+##fileformat=VCFv4.1
+##source=BespokeByIsaac
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
+##FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)">
+##FORMAT=<ID=AD,Number=1,Type=Integer,Description="Depth of variant-supporting bases (reads2)">
+##FORMAT=<ID=FREQ,Number=1,Type=String,Description="Variant allele frequency">
+##FORMAT=<ID=DP4,Number=4,Type=Integer,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	NORMAL	TUMOR
+20	61795	.	G	T	.	FOUND	.	GT:GQ:DP:RD:AD:FREQ:DP4	0/1:.:44:22:22:50%:16,6,9,13	0/1:.:37:18:19:51.35%:10,8,10,9
+20	52731	.	C	A	.	NOTFOUND	.	GT:GQ:DP:RD:AD:FREQ:DP4	0/1:.:32:17:15:46.88%:9,8,9,6	0/1:.:36:21:15:41.67%:8,13,8,7
+20	65288	.	G	T	.	FOUND	.	GT:GQ:DP:RD:AD:FREQ:DP4	0/1:.:21:13:8:38.1%:4,9,0,8	0/1:.:14:10:4:28.57%:2,8,0,4
+20	65900	.	G	A	.	FOUND	.	GT:GQ:DP:RD:AD:FREQ:DP4	1/1:.:26:0:26:100%:0,0,12,14	1/1:.:27:0:27:100%:0,0,15,12
+20	68090	.	G	C	.	NOTFOUND	.	GT:GQ:DP:RD:AD:FREQ:DP4	1/1:.:23:0:23:100%:0,0,7,16	1/1:.:41:0:41:100%:0,0,21,20
+20	75254	.	C	A	.	FOUND	.	GT:GQ:DP:RD:AD:FREQ:DP4	0/1:.:34:22:11:33.33%:13,9,5,6	0/1:.:40:20:20:50%:5,15,14,6
diff --git a/tests/python/test_genotypes_stats_api.py b/tests/python/test_genotypes_stats_api.py
@@ -0,0 +1,107 @@
+import mock
+import nose
+import unittest
+import nose.tools as asserts
+import json
+
+from cycledash import db
+from cycledash.genotypes import get
+from common.helpers import tables, pick
+from workers.genotype_extractor import _extract
+
+from test_projects_api import create_project_with_name
+from test_runs_api import create_run_with_uri
+from helpers import delete_tables
+
+
+class TestGenotypesAPI(object):
+
+    @classmethod
+    def setUpClass(cls):
+        cls.project = create_project_with_name('my project')
+        cls.run = create_run_with_uri(cls.project['id'], '/tests/data/snv.vcf')
+        _extract(cls.run['id'])
+        cls.truth_run = create_run_with_uri(cls.project['id'], '/tests/data/snv.truth.vcf')
+        _extract(cls.truth_run['id'])
+
+    @classmethod
+    def tearDownClass(cls):
+        delete_tables(db.engine, 'genotypes', 'vcfs', 'bams', 'projects')
+
+    def test_stats_with_entire_truth_vcf(self):
+        query = {
+            'range': {},
+            'filters': [],
+            'sortBy': [],
+            'compareToVcfId': self.truth_run['id']
+        }
+        stats = get(self.run['id'], query, with_stats=True)['stats']
+
+        asserts.eq_(stats['totalTruthRecords'], 12)
+
+        asserts.eq_(stats['truePositives'], 8)
+        asserts.eq_(stats['falsePositives'], 12)
+        asserts.eq_(stats['falseNegatives'], 4)
+
+        asserts.assert_almost_equals(stats['recall'], 2/3.0)
+        asserts.assert_almost_equals(stats['precision'], 1/2.5)
+        asserts.assert_almost_equals(stats['f1score'], 1/2.0)
+
+    def test_stats_with_self(self):
+        query = {
+            'range': {},
+            'filters': [],
+            'sortBy': [],
+            'compareToVcfId': self.run['id']
+        }
+        stats = get(self.run['id'], query, with_stats=True)['stats']
+
+        asserts.eq_(stats['totalTruthRecords'], 20)
+
+        asserts.eq_(stats['truePositives'], 20)
+        asserts.eq_(stats['falsePositives'], 0)
+        asserts.eq_(stats['falseNegatives'], 0)
+
+        asserts.assert_almost_equals(stats['recall'], 1.0)
+        asserts.assert_almost_equals(stats['precision'], 1.0)
+        asserts.assert_almost_equals(stats['f1score'], 1.0)
+
+    def test_stats_with_truth_and_range(self):
+        query = {
+            'range': {'start': 0, 'end': 66000, 'contig': '20'},
+            'filters': [],
+            'sortBy': [],
+            'compareToVcfId': self.truth_run['id']
+        }
+        stats = get(self.run['id'], query, with_stats=True)['stats']
+
+        asserts.eq_(stats['totalTruthRecords'], 8)
+
+        asserts.eq_(stats['truePositives'], 6)
+        asserts.eq_(stats['falsePositives'], 4)
+        asserts.eq_(stats['falseNegatives'], 2)
+
+        asserts.assert_almost_equals(stats['recall'], 3/4.0)
+        asserts.assert_almost_equals(stats['precision'], 3/5.0)
+        asserts.assert_almost_equals(stats['f1score'], 2/3.0)
+
+    def test_stats_with_truth_and_filters(self):
+        query = {
+            'range': {},
+            'filters': [{'columnName': 'sample_name',
+                         'filterValue': 'TUMOR',
+                         'type': '='}],
+            'sortBy': [],
+            'compareToVcfId': self.truth_run['id']
+        }
+        stats = get(self.run['id'], query, with_stats=True)['stats']
+
+        asserts.eq_(stats['totalTruthRecords'], 6)
+
+        asserts.eq_(stats['truePositives'], 4)
+        asserts.eq_(stats['falsePositives'], 6)
+        asserts.eq_(stats['falseNegatives'], 2)
+
+        asserts.assert_almost_equals(stats['recall'], 2/3.0)
+        asserts.assert_almost_equals(stats['precision'], 1/2.5)
+        asserts.assert_almost_equals(stats['f1score'], 1/2.0)