# Annotation task submission notebook

## Init libraries

In [1]:
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import pdb
import concurrent.futures
from requests import request
# Build the API client from the 'turbo' profile handed to sbg.Config
# (presumably an entry in the local Seven Bridges credentials file - TODO confirm).
config = sbg.Config(profile='turbo')
# The two imported handlers are registered as error handlers on the client; judging by
# their names they wait out rate-limit / maintenance responses - verify against the SDK docs.
api = sbg.Api(config=config, error_handlers=[rate_limit_sleeper, maintenance_sleeper])
# Project ID read by the cells below when creating, querying and running tasks.
project = 'd3b-bixu/dev-wgsa'

## Draft task def

In [2]:
def draft_task(task_name, input_dict, app_name, project):
    """Create a draft (not yet running) task and name its outputs after the task ID.

    Args:
        task_name: Display name for the new task.
        input_dict: Mapping of app input IDs to the values for this task.
        app_name: ID of the app to run, e.g. ``project + '/kfdrc-annovar'``.
        project: ID of the project the task is created in.

    Returns:
        The saved draft task object, so callers can log or inspect its ID.
        Callers that ignore the return value behave exactly as before.
    """
    # run=False keeps the task as a draft so it can be reviewed before it is started.
    task = api.tasks.create(name=task_name, project=project, app=app_name, inputs=input_dict, run=False)
    # The task ID is only known after creation, hence the second step + save().
    task.inputs['output_basename'] = task.id
    task.save()
    return task


## Run task def

In [18]:
def run_tasks(project, prefix):
    """Start the DRAFT tasks of ``project`` whose names match ``prefix``.

    ``prefix`` is handed to ``re.search``, so it is treated as a regular
    expression and may match anywhere in the task name. The special value
    'ALL' starts every draft task. One line is printed per task, saying
    whether it was started or skipped.
    """
    # Collect the complete draft listing before starting anything.
    pending = list(api.tasks.query(project=project, status='DRAFT').all())
    for candidate in pending:
        selected = prefix == 'ALL' or re.search(prefix, candidate.name)
        if not selected:
            print('Task ' + candidate.id + ' ' + candidate.name + ' skipped, prefix ' + prefix + ' did not match')
            continue
        candidate.run()
        print('Running task ' + candidate.id + ' ' + candidate.name)


### execute run tasks

In [81]:
# Regular expression selecting which draft tasks get started.
prefix = 'WGSA SNP INDEL ALL RPT'
print("You sure you want to run all tasks with prefix: " + prefix + "? Type \"YASS\" if so")
check = input()
# Anything but the exact confirmation word aborts without starting a task.
if check != "YASS":
    sys.stderr.write("User did not type YASS, skipping\n")
else:
    run_tasks(project, prefix)

You sure you want to run all tasks with prefix: WGSA SNP INDEL ALL RPT? Type "YASS" if so
YASS
Task c08f6294-92c9-4c1d-982d-7605ebcfd12d kfdrc-snp-eff-wgsa run - 03-04-20 16:44:34 skipped, prefix WGSA SNP INDEL ALL RPT did not match
Task 57e2f78d-f9d6-41c4-8a54-41fa928ce245 kfdrc-annovar run - 03-16-20 15:04:13 skipped, prefix WGSA SNP INDEL ALL RPT did not match
Task 2bc272c5-e052-4047-9838-5806ee28a71b kfdrc-annovar run - 03-16-20 15:33:49 skipped, prefix WGSA SNP INDEL ALL RPT did not match
Task 40510325-3665-4b7d-9d7b-e280fdc02b64 kfdrc-vep99-wgsa run - 03-19-20 14:40:04 skipped, prefix WGSA SNP INDEL ALL RPT did not match
Task facbcdec-3b86-4888-b7d7-f1939fa9563f snpsift-annotate run - 03-24-20 18:36:45 skipped, prefix WGSA SNP INDEL ALL RPT did not match
Task 4bb11c87-88f1-4169-8a12-33ce3ffcaea5 snpsift-annotate run - 03-24-20 18:39:45 skipped, prefix WGSA SNP INDEL ALL RPT did not match
Task 1af51098-05a7-4dbc-b2c8-8a5cd903d120 snpsift-annotate run - 03-24-20 19:58:50 skipped, p

## Remove old annotation task set up

In [8]:
# Draft one bcftools-strip-info task per VCF listed in the trio manifest.
app = project + '/bcftools-strip-info'
strip_info = 'INFO/ANN'
tool_name = 'gatk.denovo.trio'
# The context manager closes the manifest even when drafting a task fails part-way.
with open('/Users/brownm28/Documents/2020-Mar-4_WGSA/TASK_RUN/original_trio_vcf_test-manifest.csv') as manifest:
    head = next(manifest, None)  # header row; None when the manifest is empty
    for line in manifest:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline at EOF
        # The first column is used as the file ID; strip() keeps the line ending out of it.
        info = line.strip().split(',')
        in_dict = {
            'input_vcf': api.files.get(info[0]),
            'tool_name': tool_name,
            'strip_info': strip_info,
        }
        task_name = "BCFTOOLS STRIP ANNO: " + in_dict['input_vcf'].metadata['Kids First Family ID']
        draft_task(task_name, in_dict, app, project)
    

## Copy metadata to outputs

In [8]:
def add_metadata_to_outputs(task, phrase, in_key):
    """Copy the metadata of one task input onto every output file of the task.

    Nothing happens unless ``phrase`` (a regular expression, applied with
    ``re.search``) matches ``task.name``.

    Args:
        task: Task whose outputs are updated.
        phrase: Regular expression the task name must match.
        in_key: Key in ``task.inputs`` of the file whose metadata is copied.

    Output values may be a single file, a list, or nested lists. ``None``
    entries (outputs that were not produced) are skipped, so they no longer
    abort the remaining files of the same output key. Any other error while
    handling one output key is printed and that key is skipped.
    """
    if not re.search(phrase, task.name):
        return

    def _iter_files(value):
        # Flatten arbitrarily nested output lists, dropping None placeholders.
        if value is None:
            return
        if isinstance(value, list):
            for element in value:
                yield from _iter_files(element)
        else:
            yield value

    sys.stderr.write('Valid task found ' + task.name + '\n')
    metadata = task.inputs[in_key].metadata
    for out_key in task.outputs:
        try:
            for out_file in _iter_files(task.outputs[out_key]):
                # Re-fetch the file by ID so the update is made on a full file object.
                file_obj = api.files.get(out_file.id)
                for key in metadata:
                    file_obj.metadata[key] = metadata[key]
                file_obj.save()
        except Exception as e:
            # Best effort: report and carry on with the next output key.
            print(e)
            print("Skipping " + out_key + " for " + task.name + " due to error")

#### Add metadata to the output files of normal (non-batch) tasks

In [9]:
# Task-name pattern and the input whose metadata is copied to the outputs.
prefix = 'MARAZITA CALLER'
key = 'input_vcf'
print("You sure tag outputs with task prefix: " + prefix + "? Type \"YASS\" if so")
check = input()
# Only the exact confirmation word lets the update proceed.
if check != "YASS":
    sys.stderr.write("User did not type YASS, skipping\n")
else:
    tasks = api.tasks.query(project=project, status="COMPLETED").all()
    for task in tasks:
        add_metadata_to_outputs(task, prefix, key)

You sure tag outputs with task prefix: MARAZITA CALLER? Type "YASS" if so
YASS


Valid task found MARAZITA CALLER W DB TEST: FM_DFW18WG8
Valid task found MARAZITA CALLER W DB TEST: FM_0ADXQA3J
Valid task found MARAZITA CALLER W DB TEST: FM_0DB3211J
Valid task found MARAZITA CALLER W DB TEST: FM_ZFMCW3G3
Valid task found MARAZITA CALLER W DB TEST: FM_SHJNRP8S
Valid task found MARAZITA CALLER W DB TEST: FM_4Y21X6PP
Valid task found MARAZITA CALLER W DB TEST: FM_0FJQBB97
Valid task found MARAZITA CALLER W DB TEST: FM_RT6VGNSJ
Valid task found MARAZITA CALLER W DB TEST: FM_1PCY4Y30
Valid task found MARAZITA CALLER W DB TEST: FM_V9TXBYS6
Valid task found MARAZITA CALLER W DB TEST: FM_11ASN8XN
Valid task found MARAZITA CALLER W DB TEST: FM_DKX82ZD7
Valid task found MARAZITA CALLER W DB TEST: FM_3FWV6NG5
Valid task found MARAZITA CALLER W DB TEST: FM_ZETA86QZ
Valid task found MARAZITA CALLER W DB TEST: FM_PR07SNT9
Valid task found MARAZITA CALLER W DB TEST: FM_2C8A094S
Valid task found MARAZITA CALLER W DB TEST: FM_0RWXQH9X
Valid task found MARAZITA CALLER W DB TEST: FM_N

### Add file tag to task outputs

In [11]:
def tag_file_outputs(task, phrase, tags):
    """Set ``tags`` on every output file of a task whose name matches ``phrase``.

    Nothing happens unless ``phrase`` (a regular expression, applied with
    ``re.search``) matches ``task.name``.

    Args:
        task: Task whose outputs are tagged.
        phrase: Regular expression the task name must match.
        tags: Value assigned to each output file's ``tags`` attribute
            (replaces whatever tags the file had).

    Output values may be a single file, a list, or nested lists. ``None``
    entries (outputs that were not produced) are skipped, so they no longer
    abort the remaining files of the same output key. Any other error while
    handling one output key is printed and that key is skipped.
    """
    if not re.search(phrase, task.name):
        return

    def _iter_files(value):
        # Flatten arbitrarily nested output lists, dropping None placeholders.
        if value is None:
            return
        if isinstance(value, list):
            for element in value:
                yield from _iter_files(element)
        else:
            yield value

    sys.stderr.write('Valid task found ' + task.name + '\n')
    for out_key in task.outputs:
        try:
            for out_file in _iter_files(task.outputs[out_key]):
                # Re-fetch the file by ID so the update is made on a full file object.
                file_obj = api.files.get(out_file.id)
                file_obj.tags = tags
                file_obj.save()
        except Exception as e:
            # Best effort: report and carry on with the next output key.
            print(e)
            print("Skipping " + out_key + " for " + task.name + " due to error")

In [16]:
# Task-name pattern and the tags written onto every matching task's output files.
prefix = 'MARAZITA CALLER ONLY TEST'
tags = ['ANNOTATED', 'CALLER_ONLY_ALL_VAR']
print("You sure tag outputs with task prefix: " + prefix + "? Type \"YASS\" if so")
check = input()
# Only the exact confirmation word lets the tagging proceed.
if check != "YASS":
    sys.stderr.write("User did not type YASS, skipping\n")
else:
    tasks = api.tasks.query(project=project, status="COMPLETED").all()
    for task in tasks:
        tag_file_outputs(task, prefix, tags)

You sure tag outputs with task prefix: MARAZITA CALLER ONLY TEST? Type "YASS" if so
YASS


Valid task found MARAZITA CALLER ONLY TEST: FM_DFW18WG8
Valid task found MARAZITA CALLER ONLY TEST: FM_0ADXQA3J
Valid task found MARAZITA CALLER ONLY TEST: FM_0DB3211J
Valid task found MARAZITA CALLER ONLY TEST: FM_ZFMCW3G3
Valid task found MARAZITA CALLER ONLY TEST: FM_SHJNRP8S
Valid task found MARAZITA CALLER ONLY TEST: FM_4Y21X6PP
Valid task found MARAZITA CALLER ONLY TEST: FM_0FJQBB97
Valid task found MARAZITA CALLER ONLY TEST: FM_RT6VGNSJ
Valid task found MARAZITA CALLER ONLY TEST: FM_1PCY4Y30
Valid task found MARAZITA CALLER ONLY TEST: FM_V9TXBYS6
Valid task found MARAZITA CALLER ONLY TEST: FM_11ASN8XN
Valid task found MARAZITA CALLER ONLY TEST: FM_DKX82ZD7
Valid task found MARAZITA CALLER ONLY TEST: FM_3FWV6NG5
Valid task found MARAZITA CALLER ONLY TEST: FM_ZETA86QZ
Valid task found MARAZITA CALLER ONLY TEST: FM_PR07SNT9
Valid task found MARAZITA CALLER ONLY TEST: FM_2C8A094S
Valid task found MARAZITA CALLER ONLY TEST: FM_0RWXQH9X
Valid task found MARAZITA CALLER ONLY TEST: FM_N

#### Tag batch tasks

In [32]:
# Copy input metadata onto the outputs of every child of the matching batch tasks.
prefix = 'bcftools-filter-vcf run - RM SNP'
key = 'input_vcf'
print("You sure tag outputs with task prefix: " + prefix + "? Type \"YASS\" if so")
check = input()
if check == "YASS":
    tasks = api.tasks.query(project=project, status="COMPLETED").all()
    for task in tasks:
        if re.search(prefix, task.name):
            for child in task.get_batch_children():
                # This cell used to call tag_outputs(), which is not defined anywhere in the
                # notebook (NameError on a fresh kernel). add_metadata_to_outputs takes the
                # same (task, phrase, in_key) arguments that are passed here.
                add_metadata_to_outputs(child, prefix, key)
else:
    sys.stderr.write("User did not type YASS, skipping\n")

You sure tag outputs with task prefix: bcftools-filter-vcf run - RM SNP? Type "YASS" if so
YASS


Valid task found bcftools-filter-vcf run - RM SNP: file: 63529c06-97ae-4d2b-b328-e773ff99fdc4.gatk.denovo.trio.INFO_stripped.vcf.gz
Valid task found bcftools-filter-vcf run - RM SNP: file: 28d7410a-55b7-482b-83b1-598efb639046.gatk.denovo.trio.INFO_stripped.vcf.gz
Valid task found bcftools-filter-vcf run - RM SNP: file: 10319470-535e-4426-a720-f583ae3924fb.gatk.denovo.trio.INFO_stripped.vcf.gz
Valid task found bcftools-filter-vcf run - RM SNP: file: 6ba066c9-ac41-46d4-a6c4-0518eb5d37cd.gatk.denovo.trio.INFO_stripped.vcf.gz
Valid task found bcftools-filter-vcf run - RM SNP: file: c42499db-13ff-4e32-9645-b279502c9604.gatk.denovo.trio.INFO_stripped.vcf.gz


#### Delete task outputs

In [77]:
# Delete every output file of the completed tasks whose name matches `prefix`.
# NOTE(review): prefix is a regular expression used with re.search, so
# 'WGSA SNP INDEL ALL' also matches 'WGSA SNP INDEL ALL RPT: ...' tasks - confirm that is intended.
prefix = 'WGSA SNP INDEL ALL'
print("You sure you want to delete outputs from tasks with prefix: " + prefix + "? Type \"YASS\" if so")
check = input()
if check == "YASS":
    tasks = api.tasks.query(project=project, status="COMPLETED").all()
    for task in tasks:
        try:
            if re.search(prefix, task.name):
                for key in task.outputs:
                    # Walk single files, lists and nested lists alike; None marks an
                    # output that was not produced and has nothing to delete.
                    pending = [task.outputs[key]]
                    while pending:
                        f_obj = pending.pop()
                        if f_obj is None:
                            continue
                        if isinstance(f_obj, list):
                            pending.extend(f_obj)
                        else:
                            f_obj.delete()
        except Exception as e:
            # Best effort per task: report (e.g. file already deleted) and move on.
            sys.stderr.write(str(e) + "\nError processing " + task.name + " task id: " + task.id + ". Review error if ok\n")
else:
    sys.stderr.write("User did not type YASS, skipping\n")


You sure you want to delete outputs from tasks with prefix: WGSA SNP INDEL ALL? Type "YASS" if so
YASS


Requested file does not exist.
Error processing WGSA SNP INDEL ALL: FM_DFW18WG8task id: bd56fdf5-ee42-4664-9125-d6dae66e6ed1. Review error if ok


## Annotation runs

In [5]:
# Manifest CSVs read by the task-drafting cells below. Those cells skip the first
# (header) row and use the first column of each remaining row as a file ID.
# NOTE(review): these are absolute paths on one workstation - adjust before running elsewhere.
stripped_vcf_manifest = '/Users/brownm28/Documents/2020-Mar-4_WGSA/TASK_RUN/stripped_vcf-manifest.csv'
snp_rm_vcf_manifest = '/Users/brownm28/Documents/2020-Mar-4_WGSA/TASK_RUN/snp_rm-manifest.csv'
subset_test = '/Users/brownm28/Documents/2020-Mar-4_WGSA/TASK_RUN/subset_test-manifest.csv'

### Set up snpEff run

In [34]:
def get_snpEff_refs(reference_name, tool_name):
    """Build the inputs shared by every snpEff task.

    Args:
        reference_name: Value passed through as the 'reference_name' input.
        tool_name: Value passed through as the 'tool_name' input.

    Returns:
        dict with 'reference_name', 'ref_tar_gz' (first file named
        'snpeff_hg38_grch38.tgz' found in the notebook-level ``project``)
        and 'tool_name'.
    """
    # Optional database inputs (db_vcfs, gwas_catalog_txt, dbnsfp_txt) were sketched
    # in this cell at some point but are not populated here.
    snpeff_bundle = api.files.query(project=project, names=['snpeff_hg38_grch38.tgz'])[0]
    return {
        'reference_name': reference_name,
        'ref_tar_gz': snpeff_bundle,
        'tool_name': tool_name,
    }


In [39]:
# check all vars
app = project + "/snpeff-annotate"
task_prefix = 'snpEff NO DB NO SNP refGene: '
# hg38 or GRCh38.86
ref_gene_model = "hg38"
# The gwas / dbnsfp switches are disabled; get_snpEff_refs takes no such options.
# gwas_bool = False
# dbnsfp_txt_bool = False
tool_name = "gatk.denovo.trio.stripped"

ref_obj = get_snpEff_refs(ref_gene_model, tool_name)
# The context manager closes the manifest even when drafting a task fails part-way.
with open(snp_rm_vcf_manifest) as manifest:
    head = next(manifest, None)  # header row; None when the manifest is empty
    for line in manifest:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline at EOF
        # The first column is used as the file ID; strip() keeps the line ending out of it.
        info = line.strip().split(',')
        in_vcf = api.files.get(info[0])
        # Shallow copy so each task gets its own dict on top of the shared references.
        in_dict = dict(ref_obj)
        in_dict['input_vcf'] = in_vcf
        task_name = task_prefix + in_vcf.metadata['Kids First Family ID']
        draft_task(task_name, in_dict, app, project)


### Set up annovar run

In [44]:
def get_ANNOVAR_refs(db_list, db_run_bool, protocol_name):
    """Build the inputs shared by every ANNOVAR task.

    Args:
        db_list: File names of optional databases; when non-empty they are
            looked up and passed as 'additional_dbs'.
        db_run_bool: Value passed through as the 'run_dbs' input.
        protocol_name: Value passed through as the 'protocol_name' input.

    Returns:
        dict of task inputs; files are the first query hit by name in the
        notebook-level ``project``.
    """
    def _lookup(file_name):
        return api.files.query(project=project, names=[file_name])[0]

    ref_dict = {
        'run_dbs': db_run_bool,
        'cache': _lookup('annovar_2019Oct24.tgz'),
        'protocol_name': protocol_name,
    }
    # The 'additional_dbs' key is only present when at least one database is requested.
    if len(db_list) > 0:
        ref_dict['additional_dbs'] = [_lookup(db_name) for db_name in db_list]
    return ref_dict


In [50]:
# Draft one ANNOVAR task per VCF listed in the SNP-removed manifest.
app = project + "/kfdrc-annovar"
task_prefix = 'ANNOVAR NO DB NO SNP refGene: '
additional_dbs = ['esp6500siv2_all.tgz']
db_run_bool = False
# choices are refGene, ensGene, knownGene
protocol_name = 'refGene'


ref_obj = get_ANNOVAR_refs(additional_dbs, db_run_bool, protocol_name)
# The context manager closes the manifest even when drafting a task fails part-way.
with open(snp_rm_vcf_manifest) as manifest:
    head = next(manifest, None)  # header row; None when the manifest is empty
    for line in manifest:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline at EOF
        # The first column is used as the file ID; strip() keeps the line ending out of it.
        info = line.strip().split(',')
        in_vcf = api.files.get(info[0])
        # Shallow copy so each task gets its own dict on top of the shared references.
        in_dict = dict(ref_obj)
        in_dict['input_vcf'] = in_vcf
        task_name = task_prefix + in_vcf.metadata['Kids First Family ID']
        draft_task(task_name, in_dict, app, project)


### Set up VEP run

In [53]:
def get_VEP_refs(db_key_list, db_run_bool):
    """Build the inputs shared by every VEP task.

    Args:
        db_key_list: Keys of optional databases to attach; each must be one of
            'cadd_indels', 'cadd_snvs', 'dbnsfp', 'dbscsnv', 'phylop'
            (any other key raises KeyError).
        db_run_bool: Value passed through as the 'run_cache_dbs' input.

    Returns:
        dict of task inputs; files are the first query hit by name in the
        notebook-level ``project``.
    """
    optional_db_files = {
        'cadd_indels': 'CADDv1.5-38-InDels.tsv.gz',
        'cadd_snvs': 'CADDv1.5-38-whole_genome_SNVs.tsv.gz',
        'dbnsfp': 'dbNSFP4.0a.gz',
        'dbscsnv': 'dbscSNV1.1_GRCh38.txt.gz',
        'phylop': 'hg38.phyloP100way.bw',
    }

    def _lookup(file_name):
        return api.files.query(project=project, names=[file_name])[0]

    ref_dict = {
        'run_cache_dbs': db_run_bool,
        'reference': _lookup('Homo_sapiens.GRCh38.dna.toplevel.fa.gz'),
        'cache': _lookup('homo_sapiens_merged_vep_99_GRCh38.tar.gz'),
        'tool_name': 'VEP99',
    }
    # Each requested optional database becomes an input under its own key.
    for db_key in db_key_list:
        ref_dict[db_key] = _lookup(optional_db_files[db_key])
    return ref_dict
    

In [55]:
# Draft one VEP task per VCF listed in the SNP-removed manifest.
app = project + "/kfdrc-vep99-wgsa"
task_prefix = 'VEP NO DB NO SNP: '
additional_dbs = []
db_run_bool = False

ref_obj = get_VEP_refs(additional_dbs, db_run_bool)
# The context manager closes the manifest even when drafting a task fails part-way.
with open(snp_rm_vcf_manifest) as manifest:
    head = next(manifest, None)  # header row; None when the manifest is empty
    for line in manifest:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline at EOF
        # The first column is used as the file ID; strip() keeps the line ending out of it.
        info = line.strip().split(',')
        in_vcf = api.files.get(info[0])
        # Shallow copy so each task gets its own dict on top of the shared references.
        in_dict = dict(ref_obj)
        in_dict['input_vcf'] = in_vcf
        task_name = task_prefix + in_vcf.metadata['Kids First Family ID']
        draft_task(task_name, in_dict, app, project)


### Set up WGSA run

In [None]:
def get_WGSA_refs(settings, tool_name, db_list):
    """Build the inputs shared by every WGSA task.

    Args:
        settings: Name of the WGSA settings file to look up in the project.
        tool_name: Value passed through as the 'tool_name' input.
        db_list: File names of resource archives; when non-empty they are
            looked up and passed as 'resources'. Example:
            ['dbSNP.tgz', 'GWAS_catalog.tgz', 'wgsa_hg38_resource.tgz', '1000Gp3.tgz',
             'UK10K.tgz', 'ESP6500.tgz', 'ExACr0.3.tgz', 'dbNSFP.tgz', 'CADDv1.4.tgz',
             'clinvar.tgz', 'wgsa_hg19_resource.tgz', 'COSMIC_hg38.tgz', 'PhyloP_hg38.tgz',
             'gnomAD.tgz', 'crossmap.tgz']

    Returns:
        dict of task inputs; files are the first query hit by name in the
        notebook-level ``project``.
    """
    def _lookup(file_name):
        return api.files.query(project=project, names=[file_name])[0]

    ref_dict = {
        'tool_name': tool_name,
        'annovar_ref': _lookup('annovar_2019Oct24.tgz'),
        'snpeff_ref': _lookup('snpeff_hg38_grch38.tgz'),
        'VEP_cache': _lookup('homo_sapiens_merged_vep_99_GRCh38.tar.gz'),
        'reference': _lookup('Homo_sapiens.GRCh38.dna.toplevel.fa.gz'),
        'settings': _lookup(settings),
    }
    # The 'resources' key is only present when at least one archive is requested.
    if len(db_list) > 0:
        ref_dict['resources'] = [_lookup(db_name) for db_name in db_list]
    return ref_dict


In [80]:
# Draft one WGSA task per VCF listed in the stripped-VCF manifest.
app = project + "/kfdrc-wgsa-annotate"
task_prefix = 'WGSA SNP INDEL ALL RPT: '
tool_name = "gatk.denovo.trio.stripped.all_annot"
settings = "wgsa_all_desired_settings.txt"
# settings = "WGSA_indel_only_ALL.txt"
db_list = ['precomputed_hg38.tgz', 'dbSNP.tgz', 'GWAS_catalog.tgz', 'wgsa_hg38_resource.tgz', '1000Gp3.tgz', 'UK10K.tgz', 'ESP6500.tgz', 'ExACr0.3.tgz',
           'dbNSFP.tgz', 'CADDv1.4.tgz', 'clinvar.tgz', 'wgsa_hg19_resource.tgz', 'COSMIC_hg38.tgz', 'PhyloP_hg38.tgz', 'gnomAD.tgz',
           'crossmap.tgz']

ref_obj = get_WGSA_refs(settings, tool_name, db_list)
# The context manager closes the manifest even when drafting a task fails part-way.
with open(stripped_vcf_manifest) as manifest:
    head = next(manifest, None)  # header row; None when the manifest is empty
    for line in manifest:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline at EOF
        # The first column is used as the file ID; strip() keeps the line ending out of it.
        info = line.strip().split(',')
        in_vcf = api.files.get(info[0])
        # Shallow copy so each task gets its own dict on top of the shared references.
        in_dict = dict(ref_obj)
        in_dict['input_vcf'] = in_vcf
        task_name = task_prefix + in_vcf.metadata['Kids First Family ID']
        draft_task(task_name, in_dict, app, project)


### Set up all caller + db run

In [16]:
def get_caller_db_refs(tool_name, strip_info, filter_vcf):
    """Build the inputs shared by every "all callers + databases" workflow task.

    Args:
        tool_name: Value passed through as the 'tool_name' input.
        strip_info: Value passed through as the 'strip_info' input.
        filter_vcf: When truthy, passed as the 'include_expression' input;
            otherwise that key is left out.

    Returns:
        dict of task inputs; files are the first query hit by name in the
        notebook-level ``project``.
    """
    def _lookup(file_name):
        return api.files.query(project=project, names=[file_name])[0]

    ref_dict = {'tool_name': tool_name, 'strip_info': strip_info}
    if filter_vcf:
        ref_dict['include_expression'] = filter_vcf
    ref_dict.update({
        # ANNOVAR: cache plus every database, run against refGene only.
        'ANNOVAR_cache': _lookup('annovar_2019Oct24.tgz'),
        'ANNOVAR_cosmic_db': _lookup('cosmic90.tgz'),
        'ANNOVAR_dbscsnv_db': _lookup('dbscsnv11.tgz'),
        'ANNOVAR_kg_db': _lookup('1000g2015aug.tgz'),
        'ANNOVAR_esp_db': _lookup('esp6500siv2_all.tgz'),
        'ANNOVAR_gnomad_db': _lookup('gnomad30_genome.tgz'),
        'ANNOVAR_run_dbs_refGene': True,
        'ANNOVAR_run_dbs_ensGene': False,
        'ANNOVAR_run_dbs_knownGene': False,
        # snpEff / SnpSift
        'snpEff_ref_tar_gz': _lookup('snpeff_hg38_grch38.tgz'),
        'gwas_cat_db_file': _lookup('gwas_catalog_v1.0-associations_e98_r2020-03-08.tsv'),
        'SnpSift_vcf_db_name': "ClinVar",
        'SnpSift_vcf_fields': "AF_ESP,AF_EXAC,AF_TGP,ALLELEID,CLNDN,CLNDNINCL,CLNDISDB,CLNDISDBINCL,CLNHGVS,CLNREVSTAT,CLNSIG,CLNSIGCONF,CLNSIGINCL,CLNVC,CLNVCSO,CLNVI,DBVARID,GENEINFO,MC,ORIGIN,RS,SSR",
        'clinvar_vcf': _lookup('clinvar-2020-03-17.vcf.gz'),
        # VEP
        'VEP_cache': _lookup('homo_sapiens_merged_vep_99_GRCh38.tar.gz'),
        'reference': _lookup('Homo_sapiens.GRCh38.dna.toplevel.fa.gz'),
        'VEP_run_cache_existing': True,
        'VEP_run_cache_af': True,
        'VEP_cadd_indels': _lookup('CADDv1.5-38-InDels.tsv.gz'),
        'VEP_cadd_snvs': _lookup('CADDv1.5-38-whole_genome_SNVs.tsv.gz'),
        'VEP_dbnsfp': _lookup('dbNSFP4.0a.gz'),
    })
    return ref_dict

In [17]:
# Draft one caller + database workflow task per VCF listed in the subset-test manifest.
app = project + "/kf-caller-db-wf"
task_prefix = 'ALL CALLER W DB NO SNP TEST: '
tool_name = "gatk.denovo.trio"
strip_info = "INFO/ANN"
filter_vcf = "TYPE!=\"snp\""
# db_list = ['cosmic90.tgz', 'dbscsnv11.tgz','gnomad30_genome.tgz']
ref_obj = get_caller_db_refs(tool_name, strip_info, filter_vcf)
# The context manager closes the manifest even when drafting a task fails part-way.
with open(subset_test) as manifest:
    head = next(manifest, None)  # header row; None when the manifest is empty
    for line in manifest:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline at EOF
        # The first column is used as the file ID; strip() keeps the line ending out of it.
        info = line.strip().split(',')
        in_vcf = api.files.get(info[0])
        # Shallow copy so each task gets its own dict on top of the shared references.
        in_dict = dict(ref_obj)
        in_dict['input_vcf'] = in_vcf
        task_name = task_prefix + in_vcf.metadata['Kids First Family ID']
        draft_task(task_name, in_dict, app, project)


### Set up caller only run

In [18]:
def get_caller_only_refs(tool_name, strip_info, filter_vcf):
    """Build the inputs shared by every "callers only" workflow task.

    All ANNOVAR database runs and both VEP cache options are switched off.

    Args:
        tool_name: Value passed through as the 'tool_name' input.
        strip_info: Value passed through as the 'strip_info' input.
        filter_vcf: When truthy, passed as the 'include_expression' input;
            otherwise that key is left out.

    Returns:
        dict of task inputs; files are the first query hit by name in the
        notebook-level ``project``.
    """
    def _lookup(file_name):
        return api.files.query(project=project, names=[file_name])[0]

    ref_dict = {'tool_name': tool_name, 'strip_info': strip_info}
    if filter_vcf:
        ref_dict['include_expression'] = filter_vcf
    ref_dict.update({
        'ANNOVAR_cache': _lookup('annovar_2019Oct24.tgz'),
        'ANNOVAR_run_dbs_refGene': False,
        'ANNOVAR_run_dbs_ensGene': False,
        'ANNOVAR_run_dbs_knownGene': False,
        'snpEff_ref_tar_gz': _lookup('snpeff_hg38_grch38.tgz'),
        'VEP_cache': _lookup('homo_sapiens_merged_vep_99_GRCh38.tar.gz'),
        'reference': _lookup('Homo_sapiens.GRCh38.dna.toplevel.fa.gz'),
        'VEP_run_cache_existing': False,
        'VEP_run_cache_af': False,
    })
    return ref_dict

In [20]:
# Draft one callers-only workflow task per VCF listed in the subset-test manifest.
app = project + "/kf-caller-only-wf"
task_prefix = 'ALL CALLER ONLY NO SNP TEST: '
tool_name = "gatk.denovo.trio"
strip_info = "INFO/ANN"
filter_vcf = "TYPE!=\"snp\""
ref_obj = get_caller_only_refs(tool_name, strip_info, filter_vcf)
# The context manager closes the manifest even when drafting a task fails part-way.
with open(subset_test) as manifest:
    head = next(manifest, None)  # header row; None when the manifest is empty
    for line in manifest:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing newline at EOF
        # The first column is used as the file ID; strip() keeps the line ending out of it.
        info = line.strip().split(',')
        in_vcf = api.files.get(info[0])
        # Shallow copy so each task gets its own dict on top of the shared references.
        in_dict = dict(ref_obj)
        in_dict['input_vcf'] = in_vcf
        task_name = task_prefix + in_vcf.metadata['Kids First Family ID']
        draft_task(task_name, in_dict, app, project)

#### Simple utility to widen the IPython notebook view in the browser

In [1]:
# IPython.display is the public import location; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule that sets the notebook container to 75% of the browser width.
display(HTML("<style>.container { width:75% !important; }</style>"))

## Get Task Cost/Run analysis

In [32]:
def merge_and_sum_time_intervals(task):
    """Return the wall-clock run time of a task, in hours, from its completed jobs.

    The (start_time, end_time) intervals of all COMPLETED jobs are merged
    where they overlap, so time during which several jobs ran in parallel is
    counted once; the merged intervals are then summed.

    Args:
        task: Task object providing ``get_execution_details().jobs``, ``id``
            and ``name``.

    Returns:
        ``[task.id, task.name, hours]`` where ``hours`` is a string.
        A task without any usable job yields '0.0'.
    """
    intervals = []
    for job in task.get_execution_details().jobs:
        if job.status != "COMPLETED":
            sys.stderr.write("Skipping job likely killed due to spot instance kill for " + job.name + " from task " + task.id + "\n")
            continue
        start = getattr(job, 'start_time', None)
        end = getattr(job, 'end_time', None)
        if start is None or end is None:
            # Report and skip instead of dropping into the debugger, which
            # blocks an unattended notebook run.
            sys.stderr.write("Skipping job with missing start/end time: " + job.name + " from task " + task.id + "\n")
            continue
        intervals.append((start, end))
    intervals.sort(key=lambda pair: pair[0])

    # Merge overlapping intervals; relies on the sort by start time above.
    # Approach from https://stackoverflow.com/questions/34797525/how-to-correctly-merge-overlapping-datetime-ranges-in-python
    merged = []
    for start, end in intervals:
        if merged and merged[-1][1] >= start:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))

    # total_seconds() rather than .seconds: the latter drops whole days, which
    # under-reported any merged interval lasting 24 hours or more.
    total_seconds = sum((end - start).total_seconds() for start, end in merged)
    return [task.id, task.name, str(total_seconds / 3600)]


In [33]:
def get_cost(task):
    """Return ``[task.id, task.name, price, task.app]`` for a task.

    ``price`` is ``task.price.amount`` converted to a string. If any of the
    fields cannot be read, the error is written to stderr and ``sys.exit()``
    is called (raising SystemExit).
    """
    try:
        cost_row = [task.id, task.name, str(task.price.amount), task.app]
    except Exception as err:
        sys.stderr.write('Could not process task ' + task.id + ' ' + task.name + ' got error ' + str(err) + '\n')
        sys.exit()
    return cost_row

In [35]:
# Write one tab-separated row (run time + cost) per completed task whose name matches task_prefix.
task_prefix = "^WGSA"
# NOTE(review): the file name used to be 'WGSA_test,txt' (comma instead of the extension
# dot), which looks like a typo; revert if something reads the old name.
# The context manager flushes and closes the file even if a task lookup fails part-way.
with open('/Users/brownm28/Documents/2020-Mar-4_WGSA/TASK_RUN/WGSA_test.txt', 'w') as task_summary:
    task_summary.write("task_id\ttask_name\trun_hrs\tcost\tworkflow\n")
    for task in api.tasks.query(project=project, status="COMPLETED").all():
        if re.search(task_prefix, task.name):
            (rid, rname, duration) = merge_and_sum_time_intervals(task)
            # get_cost repeats the id and name; only cost and workflow are used from it.
            (cid, cname, cost, wf) = get_cost(task)
            task_summary.write("\t".join([rid, rname, duration, cost, wf]) + "\n")

Skipping job likely killed due to spot instance kill for kfdrc-wgsa-annotate from task 3d5e1f0c-8740-496d-844f-189aecb77f48
Skipping job likely killed due to spot instance kill for kfdrc-wgsa-annotate from task cba3817b-6e0f-4354-9941-1218690544b4
