Skip to content

Commit

Permalink
fix #816
Browse files Browse the repository at this point in the history
  • Loading branch information
armish committed Aug 28, 2015
1 parent 5f3e8b5 commit 07f12c1
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 13 deletions.
6 changes: 5 additions & 1 deletion cycledash/api/runs.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
'notes': unicode,
'dataset': unicode,
'project_name': unicode,
'vcf_header': unicode
'vcf_header': unicode,
'vcf_release': Coerce(int)
})

UpdateRun = Schema({
Expand All @@ -49,6 +50,7 @@

'notes': unicode,
'vcf_header': unicode,
'vcf_release': Coerce(int),

'true_positive': Coerce(int),
'false_positive': Coerce(int),
Expand All @@ -75,6 +77,8 @@
Any(basestring, None),
Doc('vcf_header', 'The raw VCF text of the header.'):
Any(basestring, None),
Doc('vcf_release', 'ENSEMBL Release for the reference'):
Any(long, None),
Doc('project_id', 'The internal ID of the Project this Run belongs to.'):
long,
Doc('normal_bam_id',
Expand Down
1 change: 1 addition & 0 deletions schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
Column('notes', String()), # Any notes, params, etc the user might add. Ideally in JSON format.
Column('uri', String()), # URI of source file, if any
Column('vcf_header', String()), # Plaintext header of the VCF
Column('vcf_release', Integer), # ENSEMBL compatible release for reference
Column('extant_columns', String()), # JSON list of non-null columns in the VCF
Column('genotype_count', BigInteger), # number of variants in this VCF
UniqueConstraint('project_id', 'uri', name='vcfs_project_id_uri_key')
Expand Down
4 changes: 2 additions & 2 deletions workers/genotype_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ def _extract(vcf_id):
vcf = vcfs_table.select().where(vcfs_table.c.id == vcf_id).execute().fetchone()

# Validate the contents of the VCF before modifying the database.
reader, header_text = load_vcf(vcf['uri'])
reader, header_text, release = load_vcf(vcf['uri'])

# Fill in VCF header text, which is now available.
(vcfs_table.update()
.where(vcfs_table.c.id == vcf_id)
.values(vcf_header=header_text)
.values(vcf_header=header_text, vcf_release=release)
.execute())

insert_genotypes_with_copy(reader, engine,
Expand Down
19 changes: 16 additions & 3 deletions workers/shared.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from sqlalchemy import select, func, create_engine, MetaData, bindparam, not_, or_
import pywebhdfs.webhdfs
import pywebhdfs.errors
import varcode

import config
from common.helpers import tables
Expand Down Expand Up @@ -86,20 +87,32 @@ def hdfs_to_local_path(hdfs_path):

return filename

def guess_ensembl_release(filepath):
try:
release = varcode.load_vcf(filepath)[0].ensembl.release
except ValueError: # no guesses from varcode, return default
release = config.ENSEMBL_RELEASE
finally:
return release

def load_vcf(vcf_path):
"""Return a vcf.Reader, header text for the given VCF."""
if config.ALLOW_LOCAL_VCFS and vcf_path.startswith('/tests/'):
text = open(vcf_path[1:]).read()
filepath = vcf_path[1:];
text = open(filepath).read()
release = guess_ensembl_release(filepath)
elif vcf_path.startswith('file://'):
text = open(vcf_path[6:]).read()
filepath = vcf_path[6:];
text = open(filepath).read()
release = guess_ensembl_release(filepath)
elif vcf_path.startswith('hdfs://'):
return load_vcf(vcf_path[6:])
else:
text = get_contents_from_hdfs(vcf_path)
release = guess_ensembl_release(hdfs_to_local_path(vcf_path))
header = '\n'.join(l for l in text.split('\n') if l.startswith('#'))

return pyvcf.Reader(l for l in text.split('\n')), header
return pyvcf.Reader(l for l in text.split('\n')), header, release


def initialize_database(database_uri):
Expand Down
22 changes: 15 additions & 7 deletions workers/varcode_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,24 @@ def _annotate(vcf_id):
if vcf_id == False:
return # An error must have occurred earlier.

engine = sqlalchemy.create_engine(DATABASE_URI)
# See if we have any guesses for a release version
with tables(engine, 'vcfs') as (con, vcfs):
metadata = sqlalchemy.MetaData(bind=con)
metadata.reflect()
vcf = vcfs.select().where(vcfs.c.id == vcf_id).execute().fetchone()
release = vcf['vcf_release']

if not release:
release = config.ENSEMBL_RELEASE

# Only runs the first time for this release.
EnsemblRelease(config.ENSEMBL_RELEASE).install()
EnsemblRelease(release).install()

engine = sqlalchemy.create_engine(DATABASE_URI)
with tables(engine, 'genotypes') as (con, genotypes):
metadata = sqlalchemy.MetaData(bind=con)
metadata.reflect()
annotations = get_varcode_annotations(genotypes,
vcf_id,
config.ENSEMBL_RELEASE)
annotations = get_varcode_annotations(genotypes, vcf_id, release)

tmp_table = Table('gene_annotations',
metadata,
Expand Down Expand Up @@ -101,7 +109,7 @@ def write_to_table_via_csv(table, rows, connection):


def get_varcode_annotations(genotypes, vcf_id, ensembl_release_num):
"""Get contig, position, ref and alt data from the genotypes table,
"""Get contig, position, ref and alt data from the genotypes table,
and get the best effect from Varcode library. Return a list of the form:
[[contig, position, "NAME,NAME,..."], [contig...], ...]
"""
Expand Down Expand Up @@ -139,4 +147,4 @@ def get_varcode_annotations(genotypes, vcf_id, ensembl_release_num):
notation,
effect_type])

return varcode_annotations
return varcode_annotations

0 comments on commit 07f12c1

Please sign in to comment.