diff --git a/ENVTEMPLATE.sh b/ENVTEMPLATE.sh index 48cdf4a..1278be2 100644 --- a/ENVTEMPLATE.sh +++ b/ENVTEMPLATE.sh @@ -5,6 +5,7 @@ export CELERY_BROKER='amqp://localhost' export WEBHDFS_USER=username export WEBHDFS_URL=http://example.com:5000 export IGV_HTTPFS_URL=http://example.com:9876 +export ALLOW_VCF_OVERWRITES=False # True for automatic reloading & debugging JS insertion. export USE_RELOADER=False diff --git a/config.py b/config.py index 2988712..df73df4 100644 --- a/config.py +++ b/config.py @@ -1,17 +1,20 @@ import os -# ensure that false in config isn't interpreted as True -use_reloader = os.environ.get('USE_RELOADER', False) -if use_reloader and use_reloader.lower() == 'false': - use_reloader = False -USE_RELOADER = use_reloader +def handle_false(value): + if value and value.lower() == 'false': + value = False + return value +# ensure that false in config isn't interpreted as True +USE_RELOADER = handle_false(os.environ.get('USE_RELOADER', False)) SQLALCHEMY_DATABASE_URI = os.environ['DATABASE_URI'] PORT = int(os.environ.get('PORT', 5000)) WEBHDFS_USER = os.environ['WEBHDFS_USER'] WEBHDFS_URL = os.environ['WEBHDFS_URL'] IGV_HTTPFS_URL = os.environ['IGV_HTTPFS_URL'] ALLOW_LOCAL_VCFS = os.environ.get('ALLOW_LOCAL_VCFS', USE_RELOADER) +ALLOW_VCF_OVERWRITES = handle_false( + os.environ.get('ALLOW_VCF_OVERWRITES', False)) TYPEKIT_URL = os.environ.get('TYPEKIT_URL', None) diff --git a/schema.sql b/schema.sql index 3bee9a2..0145027 100644 --- a/schema.sql +++ b/schema.sql @@ -17,7 +17,7 @@ CREATE TABLE vcfs ( ); CREATE TABLE vcf_annotations ( - vcf_id BIGINT REFERENCES vcfs NOT NULL, + vcf_id BIGINT REFERENCES vcfs ON DELETE CASCADE NOT NULL, annotation TEXT NOT NULL, type TEXT NOT NULL, "contig" TEXT, @@ -35,7 +35,7 @@ CREATE TABLE data_annotations ( ); CREATE TABLE genotypes ( - vcf_id BIGINT REFERENCES vcfs NOT NULL, + vcf_id BIGINT REFERENCES vcfs ON DELETE CASCADE NOT NULL, sample_name TEXT, contig TEXT, position INTEGER, diff --git a/workers/genotype_extractor.py b/workers/genotype_extractor.py index d642f8b..9240f3b 100644 --- a/workers/genotype_extractor.py +++ b/workers/genotype_extractor.py @@ -6,6 +6,7 @@ adding them to the genotypes table. Finally, determines which columns in the vcf actually contain values, and stores a list of them in the vcf table. """ +import config import json from sqlalchemy import create_engine, MetaData @@ -23,8 +24,13 @@ def extractor(run): engine, connection, metadata = initialize_database(DATABASE_URI) if vcf_exists(connection, run): - print 'VCF already exists with URI {}'.format(run['vcf_path']) - return False + if config.ALLOW_VCF_OVERWRITES: + was_deleted = delete_vcf(metadata, connection, run['vcf_path']) + assert was_deleted, ("Rows should have been deleted if we are " + "deleting a VCF that exists") + else: + print 'VCF already exists with URI {}'.format(run['vcf_path']) + return False reader, header = load_vcf_from_hdfs(run['vcf_path']) insert_vcf_metadata(metadata, run, header) @@ -85,6 +91,12 @@ def get_vcf_id(con, run): return con.execute(query).first().id +def delete_vcf(metadata, connection, uri): + """Delete VCFs with this URI, and return True if rows were deleted.""" + vcfs = metadata.tables.get('vcfs') + result = vcfs.delete().where(vcfs.c.uri == uri).execute() + return result.rowcount > 0 + def vcf_exists(connection, run): """Return True if the VCF exists in the vcfs table, else return False.""" query = "SELECT * FROM vcfs WHERE uri = '" + run['vcf_path'] + "'"