fix #816

hammerlab · Aug 28, 2015 · 07f12c1 · 07f12c1
1 parent 5f3e8b5
commit 07f12c1
Show file tree

Hide file tree

Showing 5 changed files with 39 additions and 13 deletions.
diff --git a/cycledash/api/runs.py b/cycledash/api/runs.py
@@ -36,7 +36,8 @@
     'notes': unicode,
     'dataset': unicode,
     'project_name': unicode,
-    'vcf_header': unicode
+    'vcf_header': unicode,
+    'vcf_release': Coerce(int)
 })
 
 UpdateRun = Schema({
@@ -49,6 +50,7 @@
 
     'notes': unicode,
     'vcf_header': unicode,
+    'vcf_release': Coerce(int),
 
     'true_positive': Coerce(int),
     'false_positive': Coerce(int),
@@ -75,6 +77,8 @@
         Any(basestring, None),
     Doc('vcf_header', 'The raw VCF text of the header.'):
         Any(basestring, None),
+    Doc('vcf_release', 'ENSEMBL Release for the reference'):
+        Any(long, None),
     Doc('project_id', 'The internal ID of the Project this Run belongs to.'):
         long,
     Doc('normal_bam_id',

diff --git a/schema.py b/schema.py
@@ -39,6 +39,7 @@
     Column('notes', String()),  # Any notes, params, etc the user might add. Ideally in JSON format.
     Column('uri', String()),  # URI of source file, if any
     Column('vcf_header', String()),  # Plaintext header of the VCF
+    Column('vcf_release', Integer),  # ENSEMBL compatible release for reference
     Column('extant_columns', String()),  # JSON list of non-null columns in the VCF
     Column('genotype_count', BigInteger),   # number of variants in this VCF
     UniqueConstraint('project_id', 'uri', name='vcfs_project_id_uri_key')

diff --git a/workers/genotype_extractor.py b/workers/genotype_extractor.py
@@ -40,12 +40,12 @@ def _extract(vcf_id):
         vcf = vcfs_table.select().where(vcfs_table.c.id == vcf_id).execute().fetchone()
 
         # Validate the contents of the VCF before modifying the database.
-        reader, header_text = load_vcf(vcf['uri'])
+        reader, header_text, release = load_vcf(vcf['uri'])
 
         # Fill in VCF header text, which is now available.
         (vcfs_table.update()
          .where(vcfs_table.c.id == vcf_id)
-         .values(vcf_header=header_text)
+         .values(vcf_header=header_text, vcf_release=release)
          .execute())
 
         insert_genotypes_with_copy(reader, engine,

diff --git a/workers/shared.py b/workers/shared.py
@@ -10,6 +10,7 @@
 from sqlalchemy import select, func, create_engine, MetaData, bindparam, not_, or_
 import pywebhdfs.webhdfs
 import pywebhdfs.errors
+import varcode
 
 import config
 from common.helpers import tables
@@ -86,20 +87,32 @@ def hdfs_to_local_path(hdfs_path):
 
     return filename
 
+def guess_ensembl_release(filepath):
+    try:
+        release = varcode.load_vcf(filepath)[0].ensembl.release
+    except ValueError:  # no guesses from varcode, return default
+        release = config.ENSEMBL_RELEASE
+    finally:
+        return release
 
 def load_vcf(vcf_path):
     """Return a vcf.Reader, header text for the given VCF."""
     if config.ALLOW_LOCAL_VCFS and vcf_path.startswith('/tests/'):
-        text = open(vcf_path[1:]).read()
+        filepath = vcf_path[1:];
+        text = open(filepath).read()
+        release = guess_ensembl_release(filepath)
     elif vcf_path.startswith('file://'):
-        text = open(vcf_path[6:]).read()
+        filepath = vcf_path[6:];
+        text = open(filepath).read()
+        release = guess_ensembl_release(filepath)
     elif vcf_path.startswith('hdfs://'):
         return load_vcf(vcf_path[6:])
     else:
         text = get_contents_from_hdfs(vcf_path)
+        release = guess_ensembl_release(hdfs_to_local_path(vcf_path))
     header = '\n'.join(l for l in text.split('\n') if l.startswith('#'))
 
-    return pyvcf.Reader(l for l in text.split('\n')), header
+    return pyvcf.Reader(l for l in text.split('\n')), header, release
 
 
 def initialize_database(database_uri):

diff --git a/workers/varcode_annotator.py b/workers/varcode_annotator.py
@@ -42,16 +42,24 @@ def _annotate(vcf_id):
     if vcf_id == False:
         return  # An error must have occurred earlier.
 
+    engine = sqlalchemy.create_engine(DATABASE_URI)
+    # See if we have any guesses for a release version
+    with tables(engine, 'vcfs') as (con, vcfs):
+        metadata = sqlalchemy.MetaData(bind=con)
+        metadata.reflect()
+        vcf = vcfs.select().where(vcfs.c.id == vcf_id).execute().fetchone()
+        release = vcf['vcf_release']
+
+    if not release:
+        release = config.ENSEMBL_RELEASE
+
     # Only runs the first time for this release.
-    EnsemblRelease(config.ENSEMBL_RELEASE).install()
+    EnsemblRelease(release).install()
 
-    engine = sqlalchemy.create_engine(DATABASE_URI)
     with tables(engine, 'genotypes') as (con, genotypes):
         metadata = sqlalchemy.MetaData(bind=con)
         metadata.reflect()
-        annotations = get_varcode_annotations(genotypes,
-                                              vcf_id,
-                                              config.ENSEMBL_RELEASE)
+        annotations = get_varcode_annotations(genotypes, vcf_id, release)
 
         tmp_table = Table('gene_annotations',
                           metadata,
@@ -101,7 +109,7 @@ def write_to_table_via_csv(table, rows, connection):
 
 
 def get_varcode_annotations(genotypes, vcf_id, ensembl_release_num):
-    """Get contig, position, ref and alt data from the genotypes table, 
+    """Get contig, position, ref and alt data from the genotypes table,
     and get the best effect from Varcode library. Return a list of the form:
     [[contig, position, "NAME,NAME,..."], [contig...], ...]
     """
@@ -139,4 +147,4 @@ def get_varcode_annotations(genotypes, vcf_id, ensembl_release_num):
                                     notation,
                                     effect_type])
 
-    return varcode_annotations
+    return varcode_annotations