# Annotation task submission notebook

## Reference file lookup functions

In [1]:
import sevenbridges as sbg
from sevenbridges.errors import SbgError
from sevenbridges.http.error_handlers import rate_limit_sleeper, maintenance_sleeper
import sys
import re
import pdb
import concurrent.futures
from requests import request
# Seven Bridges project that every query and task in this notebook targets.
project = 'd3b-bixu/dev-wgsa'

# Client configuration is read from the 'turbo' profile.
config = sbg.Config(profile='turbo')

# Shared API client used by all cells below.
# NOTE(review): the two handlers presumably pause and retry when the platform is
# rate limiting or under maintenance - confirm against the sevenbridges docs.
api = sbg.Api(
    config=config,
    error_handlers=[rate_limit_sleeper, maintenance_sleeper],
)

In [3]:
def get_VEP_refs(db_key_list, db_run_bool):
    """Assemble the reference inputs for a VEP99 task.

    Args:
        db_key_list: keys of the optional databases to attach. Each key must be
            one of 'cadd_indels', 'cadd_snvs', 'dbnsfp', 'dbscsnv' or 'phylop'.
        db_run_bool: value stored unchanged under 'run_cache_dbs'.

    Returns:
        dict with 'run_cache_dbs', 'reference', 'cache' and 'tool', plus one
        entry per requested database key. File entries hold the first result of
        a by-name query against the notebook-level ``project``.

    Raises:
        KeyError: if a key in ``db_key_list`` is not a known optional database.
    """
    def first_match(file_name):
        # First query hit for this exact file name in the project.
        return api.files.query(project=project, names=[file_name])[0]

    optional_db_files = {
        'cadd_indels': 'CADDv1.5-38-InDels.tsv.gz',
        'cadd_snvs': 'CADDv1.5-38-whole_genome_SNVs.tsv.gz',
        'dbnsfp': 'dbNSFP4.0a.gz',
        'dbscsnv': 'dbscSNV1.1_GRCh38.txt.gz',
        'phylop': 'hg38.phyloP100way.bw',
    }
    ref_dict = {
        'run_cache_dbs': db_run_bool,
        'reference': first_match('Homo_sapiens.GRCh38.dna.toplevel.fa.gz'),
        'cache': first_match('homo_sapiens_merged_vep_99_GRCh38.tar.gz'),
        'tool': 'VEP99',
    }
    # Attach only the optional databases the caller asked for.
    for db_key in db_key_list:
        ref_dict[db_key] = first_match(optional_db_files[db_key])
    return ref_dict
    

In [4]:
def get_ANNOVAR_refs(db_list, db_run_bool, protocol_name):
    """Assemble the reference inputs for an ANNOVAR task.

    Args:
        db_list: file names of optional databases to attach; may be empty.
        db_run_bool: value stored unchanged under 'run_dbs'.
        protocol_name: value stored unchanged under 'protocol_name'.

    Returns:
        dict with 'run_dbs', 'reference_name' (the first project file named
        'annovar_2019Oct24.tgz') and 'protocol_name'. 'additional_dbs' is added
        only when ``db_list`` is non-empty, as a list with the first query hit
        for each name, in the order given.
    """
    def first_match(file_name):
        # First query hit for this exact file name in the project.
        return api.files.query(project=project, names=[file_name])[0]

    ref_dict = {
        'run_dbs': db_run_bool,
        'reference_name': first_match('annovar_2019Oct24.tgz'),
        'protocol_name': protocol_name,
    }
    # The key is left out entirely when no optional databases were requested.
    if len(db_list) > 0:
        ref_dict['additional_dbs'] = [first_match(db_name) for db_name in db_list]
    return ref_dict


In [34]:
def get_snpEff_refs(reference_name, tool_name):
    """Assemble the reference inputs for a snpEff task.

    Args:
        reference_name: value stored unchanged under 'reference_name'.
        tool_name: value stored unchanged under 'tool_name'.

    Returns:
        dict with 'reference_name', 'ref_tar_gz' (the first project file named
        'snpeff_hg38_grch38.tgz') and 'tool_name'.
    """
    snpeff_bundle = api.files.query(project=project, names=['snpeff_hg38_grch38.tgz'])[0]
    ref_dict = {
        'reference_name': reference_name,
        'ref_tar_gz': snpeff_bundle,
        'tool_name': tool_name,
    }
    # Disabled optional-database support, kept for reference. It refers to
    # vcf_list, gwas_bool and dbnsfp_txt_bool, which this function does not take.
    # if len(vcf_list) > 0:
    #     ref_dict['db_vcfs'] = []
    #     for vcf in vcf_list:
    #         ref_dict['db_vcfs'].append(api.files.query(project=project, names=[vcf])[0])
    # if gwas_bool:
    #     ref_dict['gwas_catalog_txt'].append(api.files.query(project=project, names=['gwas_catalog_v1.0-associations_e98_r2020-03-08.tsv'])[0])
    # if dbnsfp_txt_bool:
    #     ref_dict['dbnsfp_txt'].append(api.files.query(project=project, names=['dbNSFP4.0a.gz'])[0])
    return ref_dict


## Draft task def

In [6]:
def draft_task(task_name, input_dict, app_name, project):
    """Create a task without running it and name its outputs after its ID.

    Args:
        task_name: name given to the new task.
        input_dict: inputs passed to the app.
        app_name: identifier of the app to run.
        project: project in which the task is created.

    The task is created with ``run=False``, its 'output_basename' input is set
    to the new task's ID, and the task is saved. Nothing is returned.
    """
    drafted = api.tasks.create(
        name=task_name,
        project=project,
        app=app_name,
        inputs=input_dict,
        run=False,
    )
    # The ID only exists once the task has been created, hence the second step.
    drafted.inputs['output_basename'] = drafted.id
    drafted.save()


## Run task def

In [18]:
def run_tasks(project, prefix):
    """Run the DRAFT tasks in ``project`` whose name matches ``prefix``.

    Args:
        project: project whose draft tasks are considered.
        prefix: 'ALL' to run every draft task. Any other value is used as a
            regular expression and searched anywhere in the task name.

    One line is printed per draft task, saying whether it was started or
    skipped.
    """
    # Fetch the full list of drafts before starting any of them.
    pending = list(api.tasks.query(project=project, status='DRAFT').all())
    for draft in pending:
        selected = prefix == 'ALL' or re.search(prefix, draft.name)
        if not selected:
            print('Task ' + draft.id + ' ' + draft.name + ' skipped, prefix ' + prefix + ' did not match')
            continue
        draft.run()
        print('Running task ' + draft.id + ' ' + draft.name)


### execute run tasks

In [40]:
# Pattern selecting which draft tasks to start (see run_tasks).
prefix = 'snpEff NO DB'
# Require an explicit confirmation before anything is launched.
print("You sure you want to run all tasks with prefix: " + prefix + "? Type \"YASS\" if so")
check = input()
if check != "YASS":
    sys.stderr.write("User did not type YASS, skipping\n")
else:
    run_tasks(project, prefix)

You sure you want to run all tasks with prefix: snpEff NO DB? Type "YASS" if so
YASS
Task c08f6294-92c9-4c1d-982d-7605ebcfd12d kfdrc-snp-eff-wgsa run - 03-04-20 16:44:34 skipped, prefix snpEff NO DB did not match
Task 57e2f78d-f9d6-41c4-8a54-41fa928ce245 kfdrc-annovar run - 03-16-20 15:04:13 skipped, prefix snpEff NO DB did not match
Task 2bc272c5-e052-4047-9838-5806ee28a71b kfdrc-annovar run - 03-16-20 15:33:49 skipped, prefix snpEff NO DB did not match
Task 40510325-3665-4b7d-9d7b-e280fdc02b64 kfdrc-vep99-wgsa run - 03-19-20 14:40:04 skipped, prefix snpEff NO DB did not match
Task facbcdec-3b86-4888-b7d7-f1939fa9563f snpsift-annotate run - 03-24-20 18:36:45 skipped, prefix snpEff NO DB did not match
Task 4bb11c87-88f1-4169-8a12-33ce3ffcaea5 snpsift-annotate run - 03-24-20 18:39:45 skipped, prefix snpEff NO DB did not match
Task 1af51098-05a7-4dbc-b2c8-8a5cd903d120 snpsift-annotate run - 03-24-20 19:58:50 skipped, prefix snpEff NO DB did not match
Task 14621595-dbae-4f96-9320-eebc1fd2

## Set up tasks to remove old annotations

In [8]:
# Draft one bcftools-strip-info task per row of the trio VCF manifest.
# The first manifest row is skipped, and the first column of every other row is
# passed to api.files.get to fetch the input VCF.
app = project + '/bcftools-strip-info'
strip_info = 'INFO/ANN'
tool_name = 'gatk.denovo.trio'
# NOTE(review): absolute local path - adjust before running on another machine.
# The context manager closes the manifest even if drafting a task fails.
with open('/Users/brownm28/Documents/2020-Mar-4_WGSA/TASK_RUN/original_trio_vcf_test-manifest.csv') as manifest:
    head = next(manifest)
    for line in manifest:
        # A blank line (e.g. at the end of the file) has no file ID to look up.
        if not line.strip():
            continue
        info = line.split(',')
        in_dict = {
            'input_vcf': api.files.get(info[0]),
            'tool_name': tool_name,
            'strip_info': strip_info,
        }
        # Name each task after the family ID recorded in the input VCF's metadata.
        task_name = "BCFTOOLS STRIP ANNO: " + in_dict['input_vcf'].metadata['Kids First Family ID']
        draft_task(task_name, in_dict, app, project)
    

## Copy metadata to outputs

In [26]:
def tag_outputs(task, phrase, in_key):
    """Copy the metadata of one task input onto every output file of the task.

    Args:
        task: task to process. Nothing happens unless ``phrase`` is found in
            its name.
        phrase: regular expression searched anywhere in ``task.name``.
        in_key: name of the task input whose metadata is copied.

    A matching task is announced on stderr. Each output, whether a single file
    or a list of files, is fetched again by ID, updated key by key and saved.
    An error while handling one output key is printed and the remaining output
    keys are still processed.
    """
    if not re.search(phrase, task.name):
        return
    sys.stderr.write('Valid task found ' + task.name + '\n')
    metadata = task.inputs[in_key].metadata

    def copy_metadata(file_id):
        # Fetch the file by ID, then overwrite or add each metadata key and save.
        file_obj = api.files.get(file_id)
        for meta_key in metadata:
            file_obj.metadata[meta_key] = metadata[meta_key]
        file_obj.save()

    for out_key in task.outputs:
        try:
            produced = task.outputs[out_key]
            if type(produced) is list:
                for output in produced:
                    copy_metadata(output.id)
            else:
                copy_metadata(produced.id)
        except Exception as e:
            # Best effort: report the problem and carry on with the next output key.
            print(e)
            print("Skipping " + task.name + " due to error")

#### Tag normal tasks

In [28]:
# Pattern selecting the completed tasks to tag, and the input whose metadata is copied.
prefix = 'BCFTOOLS STRIP ANNO'
key = 'input_vcf'
# Require an explicit confirmation before any file metadata is modified.
print("You sure tag outputs with task prefix: " + prefix + "? Type \"YASS\" if so")
check = input()
if check != "YASS":
    sys.stderr.write("User did not type YASS, skipping\n")
else:
    tasks = api.tasks.query(project=project, status="COMPLETED").all()
    for task in tasks:
        # tag_outputs ignores tasks whose name does not match the prefix.
        tag_outputs(task, prefix, key)

You sure tag outputs with task prefix: BCFTOOLS STRIP ANNO? Type "YASS" if so
YASS


Valid task found BCFTOOLS STRIP ANNO: FM_DFW18WG8
Valid task found BCFTOOLS STRIP ANNO: FM_0ADXQA3J
Valid task found BCFTOOLS STRIP ANNO: FM_0DB3211J
Valid task found BCFTOOLS STRIP ANNO: FM_ZFMCW3G3
Valid task found BCFTOOLS STRIP ANNO: FM_SHJNRP8S


#### Tag batch tasks

In [32]:
# Pattern selecting the completed batch tasks to tag, and the input whose metadata is copied.
prefix = 'bcftools-filter-vcf run - RM SNP'
key = 'input_vcf'
# Require an explicit confirmation before any file metadata is modified.
print("You sure tag outputs with task prefix: " + prefix + "? Type \"YASS\" if so")
check = input()
if check != "YASS":
    sys.stderr.write("User did not type YASS, skipping\n")
else:
    tasks = api.tasks.query(project=project, status="COMPLETED").all()
    for task in tasks:
        if not re.search(prefix, task.name):
            continue
        # Outputs are tagged on the batch children rather than on the parent task.
        for child in task.get_batch_children():
            tag_outputs(child, prefix, key)

You sure tag outputs with task prefix: bcftools-filter-vcf run - RM SNP? Type "YASS" if so
YASS


Valid task found bcftools-filter-vcf run - RM SNP: file: 63529c06-97ae-4d2b-b328-e773ff99fdc4.gatk.denovo.trio.INFO_stripped.vcf.gz
Valid task found bcftools-filter-vcf run - RM SNP: file: 28d7410a-55b7-482b-83b1-598efb639046.gatk.denovo.trio.INFO_stripped.vcf.gz
Valid task found bcftools-filter-vcf run - RM SNP: file: 10319470-535e-4426-a720-f583ae3924fb.gatk.denovo.trio.INFO_stripped.vcf.gz
Valid task found bcftools-filter-vcf run - RM SNP: file: 6ba066c9-ac41-46d4-a6c4-0518eb5d37cd.gatk.denovo.trio.INFO_stripped.vcf.gz
Valid task found bcftools-filter-vcf run - RM SNP: file: c42499db-13ff-4e32-9645-b279502c9604.gatk.denovo.trio.INFO_stripped.vcf.gz


## Annotation runs

In [33]:
# Paths to the manifest CSV files for the annotation runs.
# NOTE(review): these are absolute paths on one local machine - adjust before
# running elsewhere.
# stripped_vcf_manifest is not referenced by any other cell visible in this notebook.
stripped_vcf_manifest = '/Users/brownm28/Documents/2020-Mar-4_WGSA/TASK_RUN/stripped_vcf-manifest.csv'
# snp_rm_vcf_manifest is opened by the snpEff set-up cell.
snp_rm_vcf_manifest = '/Users/brownm28/Documents/2020-Mar-4_WGSA/TASK_RUN/snp_rm-manifest.csv'

### Set up snpEff run

In [39]:
# check all vars
app = project + "/snpeff-annotate"
manifest = open(snp_rm_vcf_manifest)
task_prefix = 'snpEff NO DB NO SNP refGene: '
# hg38 or GRCh38.86
ref_gene_model = "hg38"
# run gwas?
# gwas_bool = False
# run dbnsfp?
# dbnsfp_txt_bool = False
tool_name = "gatk.denovo.trio.stripped"

ref_obj = get_snpEff_refs(ref_gene_model, tool_name)
head = next(manifest)
for line in manifest:
    info = line.split(',')
    in_dict = {}
    for key in ref_obj:
        in_dict[key] = ref_obj[key]
    in_vcf = api.files.get(info[0])
    task_name = task_prefix + in_vcf.metadata['Kids First Family ID']
    in_dict['input_vcf'] = in_vcf
    draft_task(task_name, in_dict, app, project)


#### simple utility to expand ipython view in browser

In [1]:
# Widen the notebook container to 75% of the browser window.
# `display` and `HTML` are imported from the public IPython.display module;
# importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:75% !important; }</style>"))