In [1]:
import os
import pandas as pd
import sevenbridges as sbg

In [2]:
# The API token is read from the environment so it never appears in the notebook.
# Raises KeyError if CAVATICA_ZYK_TOKEN is not set.
token = os.environ['CAVATICA_ZYK_TOKEN']
# Base URL of the Cavatica REST API (v2).
url = 'https://cavatica-api.sbgenomics.com/v2/'

# Shared prefix from which every project id and the workflow id are derived.
base = 'yuankun/kf-genomics-harmonization-pilot'

# Project ids: the shared prefix plus a per-project suffix.
bcm = '{}-bcm'.format(base)
hudson = '{}-hudsonalpha'.format(base)
broad = '{}-broad'.format(base)
washu = '{}-washu'.format(base)

# App id of the alignment workflow, which lives under the base project.
workflow = '{}/kf-alignment-cavatica'.format(base)

In [3]:
# Client for the Cavatica API, built from the endpoint and token defined earlier.
api = sbg.Api(token=token, url=url)

# Look up the workflow app by id; later cells read its step list from `app.raw`.
app = api.apps.get(id=workflow)

In [4]:
def get_step_time(step, task):
    """Return the wall-clock duration, in seconds, of one workflow step.

    A step can be scattered into several jobs, so the duration is measured
    from the earliest start to the latest end among all jobs whose name
    contains ``step``.

    Args:
        step: step id; matched as a substring of each ``job.name``.
        task: object exposing ``jobs``, where every job has ``name``,
            ``start_time`` and ``end_time`` attributes.

    Returns:
        Elapsed seconds as a float, or None when no job matches ``step`` or
        a matching job has no start or end time yet.
    """
    # NOTE(review): matching is by substring, so a step id that is contained
    # in another step's job names would pick up those jobs too - confirm the
    # job naming scheme keeps step ids distinct.
    matching = [job for job in task.jobs if step in job.name]
    starts = [job.start_time for job in matching]
    ends = [job.end_time for job in matching]

    # No timing can be reported for a step that never ran or has not finished.
    if not matching or None in starts or None in ends:
        return None

    # handle scatter start/end time: span from the first start to the last end
    return (max(ends) - min(starts)).total_seconds()

def get_task_df(project):
    """Build a table of runtime, cost, status and step timings for batch children.

    Queries every task in ``project`` through the module-level ``api`` client,
    keeps only batch tasks, and records one row per batch child keyed by the
    child's name. Children that share a name overwrite one another.

    Args:
        project: project id passed to ``api.tasks.query``.

    Returns:
        pandas.DataFrame indexed by child task name with the columns 'batch'
        (name of the parent task), 'runtime' (seconds between the child's
        start and end), 'cost' (``child.price.amount``), 'status' (from the
        execution details), plus one column per step id found in
        ``app.raw['steps']`` holding the result of ``get_step_time``.
    """
    data = {}
    for task in api.tasks.query(project=project).all():
        if not task.batch:  # only check batch running
            continue
        for child in task.get_batch_children().all():
            child_detail = child.get_execution_details()
            try:
                runtime = (child.end_time - child.start_time).total_seconds()
                row = {
                    'batch': task.name,
                    'runtime': runtime,
                    'cost': child.price.amount,
                    'status': child_detail.status
                }
                data[child.name] = row
                for step in app.raw['steps']:
                    row[step['id']] = get_step_time(step['id'], child_detail)
            except Exception:
                # Best effort: a child lacking timing or price data is skipped
                # (a row stored before the failure keeps what was filled in).
                # Only Exception is caught so that KeyboardInterrupt and
                # SystemExit can still stop a long-running crawl.
                continue
    df = pd.DataFrame.from_dict(data, orient='index')
    return df

In [5]:
# Pull the per-task table for each project; every call walks that project's tasks via the API.
df_broad = get_task_df(project=broad)
df_washu = get_task_df(project=washu)
df_hudson = get_task_df(project=hudson)

In [6]:
# Stack the per-project tables into one frame (rows keep their task-name index).
df_merge = pd.concat([df_broad, df_washu, df_hudson])

# Make sure the output directory exists so the export also works on a fresh checkout.
if not os.path.isdir('data'):
    os.makedirs('data')
df_merge.to_csv('data/task.csv')

In [7]:
# Ids of all steps in the workflow, in the order the app definition lists them.
steps = [step['id'] for step in app.raw['steps']]
steps

[u'bwa_mem',
 u'checkcontamination',
 u'createsequencegrouping',
 u'gatk_applybqsr',
 u'gatk_baserecalibrator',
 u'gatk_gatherbqsrreports',
 u'gatk_haplotypecaller',
 u'gatk_validategvcf',
 u'getbasename',
 u'picard_calculatereadgroupchecksum',
 u'picard_collectaggregationmetrics',
 u'picard_collectgvcfcallingmetrics',
 u'picard_collectqualityyieldmetrics',
 u'picard_collectreadgroupbamqualitymetrics',
 u'picard_collectunsortedreadgroupbamqualitymetrics',
 u'picard_collectwgsmetrics',
 u'picard_gatherbamfiles',
 u'picard_intervallisttools',
 u'picard_markduplicates',
 u'picard_mergevcfs',
 u'picard_revertsam',
 u'picard_sortsam',
 u'samtools_coverttocram',
 u'verifybamid']