In [1]:
import os
import re
from collections import defaultdict


### Functions for parsing the yw models file. 

In [2]:
### function extracting the programs.

def extract_steps(line):
    
    #% FACT: program(program_id, program_name, qualified_program_name, begin_annotation_id, end_annotation_id).
    #print("Extracting steps.")
    ## Extract data between "(" and ")".
    data = line[line.index("(")+1:line.index(")")]
    
    data = data.split(',')
    #print(data)
    steps[data[0]] = {
                      "program_name" :data[1].strip().strip("'").strip('"'),
                      "qualified_program_name": data[2].strip().strip("'").strip('"')
                    }
    
    #print(steps)
    return

In [45]:
### function extracting the workflow.

def extract_workflows(line):
    
    #print(line)
    data = line[line.index("(")+1:line.index(")")]
    #print(data)
    workflow[data] = steps[data]['program_name']
       
    return 

In [46]:
read_yw_model(yw_model_file)

In [47]:
### function extracting the ports

def extract_ports(line):
    #print("Extracting ports.")
    ## Extract data between "(" and ")".
    data = line[line.index("(")+1:line.index(")")]
    
    #% FACT: port(port_id, port_type, port_name, qualified_port_name, port_annotation_id, data_id).
    
    data = data.split(',')
    
    ports[data[0]] ={
                     "port_type": data[1].strip().strip("'").strip('"').upper(),
                     "port_name": data[2].strip().strip("'").strip('"'),
                     "qualified_port_name" : data[3].strip().strip("'").strip('"'), 
                     "data_id":data[5].strip().strip("'").strip('"')
                    }
    return 

In [48]:
### function extracting the input ports

def input_ports(line,in_ports):
    #print("Extracting Input ports.")
    
    ## Extract data between "(" and ")".
    ## % FACT: has_in_port(step_id, port_id).

    data = line[line.index("(")+1:line.index(")")]
    
    data = data.split(',')
    
    in_ports[data[0]].append((data[0].strip(), data[1].strip()))
    
    return

In [49]:
### function extracting the output ports 

def output_ports(line,out_ports): 
    #print("Extracting output ports.")
    ## Extract data between "(" and ")".
    data = line[line.index("(")+1:line.index(")")]
    data = data.split(',')
    out_ports[data[0]].append((data[0].strip(), data[1].strip()))
    #print(data)
    return

In [50]:
### function extracting the output ports 

def get_port_data(line): 
    #print("Extracting output ports.")
    ## Extract data between "(" and ")".
    data = line[line.index("(")+1:line.index(")")]
    data = data.split(',')
    port_data[data[0]]= { 
                          "data_name" : data[1].strip(), 
                          "qual_data_name" : data[2].strip()
                        } 
    #print(data)
    return

In [51]:
### function extracting the output ports 

def get_channel(line): 
    #print("Extracting output ports.")
    ## Extract data between "(" and ")".
    data = line[line.index("(")+1:line.index(")")]
    data = data.split(',')
    channel[data[0]]= data[1].strip()
    #print(data)
    return

In [52]:
### function extracting the output ports 

def get_port_connects_to_channel(line): 
     
    data = line[line.index("(")+1 : line.index(")")]
    data = data.split(',')
    port_connects_to_channel[data[0]]= data[1].strip()
    #print(data)
    return

In [53]:
def get_inflow_connections(line): 
     
    #print(line)
    data = line[line.index("(")+1:line.index(")")]
    data = data.split(',')
    inflow_conn[data[0]]= data[1].strip()
    #print(data)
    return

In [54]:
def get_outflow_connections(line): 
    #print("Extracting output ports.")
    ## Extract data between "(" and ")".
    data = line[line.index("(")+1:line.index(")")]
    data = data.split(',')
    outflow_conn[data[0]]= data[1].strip()
    #print(data)
    return 

In [55]:
### function extracting the subprograms 

def has_subprogram(line): 
    #print("Extracting output ports.")
    ## Extract data between "(" and ")".
    data = line[line.index("(")+1:line.index(")")]
    data = data.split(',')
    list1 = []
    #print(data[0],data[1])
    if data[0] in sub_programs.keys():
        list1 = sub_programs[data[0]]
        list1.append(data[1].strip())
        sub_programs[data[0]] = list1
    else:
        list1.append(data[1].strip())
        sub_programs[data[0]]= list1
    #print(data)
    return

In [56]:
### function extracting the port alias

def port_alias(line):
    data = line[line.index("(")+1:line.index(")")]
    data = data.split(',')
    port_id = data[0].strip()
    alias = data[1].strip().strip("'")
    
    qualified_name = ports[port_id]['qualified_port_name']
    port_name = ports[port_id]['port_name'] 
    
    port_alias_name[port_id] = alias
    
    #print(port_name, port_id,qualified_name)
    
    ### regex for splitting the string and getting qualified program name
    regex = re.compile(r'>|<')
    qname = regex.split(qualified_name)
    
    pname = qname[0].split('.')[-1] 
    
    if pname.find('-') > -1:
        port_alt_name[alias] = pname[:-1]+ '/' +port_name
    else:
        port_alt_name[alias] = pname + '/' +port_name
        
    return 



In [57]:
## Method for getting the input ports 
## for a specific program 

def get_in_ports(program_id):
    #print("inside in ports")
    input_ports =[]
    for s in in_ports[program_id]:
        #print(s)
        input_ports.append(ports[s[1]]['port_name'])
        #print(ports[int(s[1])]['port_name'])
    return input_ports

In [58]:
## Method for getting the output ports 
## for a specific program 

def get_out_ports(program_id):
    output_ports =[]
    for s in out_ports[program_id]:
        output_ports.append(ports[s[1]]['port_name'])
    return output_ports

In [59]:
## Method for create a cwl file 
## It has the required header and formatting

def create_file(filename, program_id):
    header = '''
#!/usr/bin/env cwl-runner


cwlVersion: v1.0
class: CommandLineTool
'''
    with open(filename, "w+") as writer:
        input_buffer = 'inputs: \n'
        for p in get_in_ports(program_id):
            input_buffer = input_buffer + ' '+ p + ': \n'+ '  type: string \n \n'

        #print(input_buffer)
        output_buffer = 'outputs: \n'
        output_buffer.strip()

        for p in get_out_ports(program_id):
            #print(len(p))
            if p =='' : 
                print("No output ports")
            else:
                output_buffer = output_buffer + ' '+ p + ': \n'+ '  type: string \n \n'

        writer.write(header)    
        writer.write(input_buffer)
        writer.write(output_buffer)
        writer.write("\n")
        writer.write("baseCommand: ")
    
    return

In [60]:
### This function passes file name 
### for creating cwl files. 

def create_cwl_files(program_id,wf_name): 
    pname= steps[program_id]['program_name']
    filename = "yw_cwl_parser/Examples/" + wf_name +'/' + pname + ".cwl"
    create_file(filename, program_id)
    
    return 

In [61]:
### Function for exrtacting the steps in a wf. 

def get_wf_steps(wf_id, wf_name):
    
    wf_steps = "steps: \n " 
    
    ## first get the sub-programs of a workflow: 
    for s in sub_programs[wf_id]:
        prog_name= steps[s]["program_name"]
        step_outports =[]
        if s in workflow.keys():
            filename ='wf_'+ prog_name + '.cwl'
        else:
            filename =  prog_name + '.cwl'
            
        step_buffer = prog_name + ': \n ' + ' run: ' + filename + ' \n ' + ' in: \n'
        
        for pid in in_ports[s]:
            out_in_ports(s)
            pname, qual_pname= get_inports_subprograms(pid[1], wf_id)
            #print(pname, qual_pname, pid[1])
            step_buffer = step_buffer + '   ' + pname + ': ' + qual_pname + '\n' 
            '''
            Get the outports of the program.
            '''
            
            step_outports= get_out_ports(s)
            if len(step_outports) == 0: 
                print("The step " + prog_name + " doesn't have output ports. CWL model will fail.")
                
        step_buffer =  step_buffer + "  out: [" + ' , '.join(step_outports) + '] \n ' 
        
        wf_steps = wf_steps + step_buffer
    
    return wf_steps

In [62]:
def get_set_outports(program_id):
    set_outports = {}
    for p in range(int(program_id), 1, -1):
        #print(p)
        for o in out_ports[str(p)]:
            set_outports[ports[o[1]]['data_id']] = p
    
    #print(set_outports)
    return set_outports

In [63]:
def out_in_ports(program_id):
    
    #print(steps[program_id]["program_name"])
    for ip in in_ports[program_id]:
        op_set = get_set_outports(program_id)
        #print(op_set)
        data_id = ports[ip[1]]['data_id']
        common_data_id = set([data_id]).intersection(set(op_set.keys()))
      
    ## Check if input port dataid is an output port 
    ## dataid of some previous or same step
    
        if common_data_id:
            pid = list(common_data_id)[0]
            prid = op_set[pid] 
            prog_name= steps[str(prid)]["program_name"] +'/'
            #print(prog_name)
            #if program_id in sub_programs.keys():
            #    prog_name=''

            for o in out_ports[str(prid)]:
                if ports[o[1]]["data_id"] == pid:
                    port_name = ports[o[1]]["port_name"]
                    out_in_port[data_id]= prog_name + port_name
            
            '''
            if pid in port_alias_name.keys():
                port_name = ports[pid]["port_name"]
            else:
                port_name = port_data[pid]["data_name"].strip("'")
                
            out_in_port[data_id]= prog_name +'/'+ port_name
            '''

In [64]:
def get_qual_port_name(port_name):
    regex = re.compile(r'>|<')
    for port in sorted(ports):
        #print(ports[port]["port_name"])
        if ports[port]["port_name"] == port_name:
            
            temp = regex.split(ports[port]['qualified_port_name'])
            step_name= temp[0].split('.')[-1]
            qname = step_name[:-1]
            if qname == steps['1']["program_name"] and str(temp).find('.') > -1:          
                return qname
            elif qname != steps['1']["program_name"]:
                return qname 

In [65]:
## Get the inports of the subprograms

def get_inports_subprograms(port_id,wf_id):
    
    qualified_port=''
    pname = ports[port_id]["port_name"]
    data_id = ports[port_id]["data_id"]
    #print(pname)
    a = set([port_id]).intersection(set(list(port_alias_name.keys())))
    #print(a)

    if data_id in out_in_port.keys():
        qualified_port = out_in_port[data_id]
    elif pname in get_in_ports(wf_id):
        qualified_port = pname
    #elif pname in in_out_ports.keys():
    #    qualified_port = in_out_ports[pname][1]
    elif a:
        for e in a:
            #print(port_alias_name[e])
            if port_alias_name[e] in get_in_ports('1'):
                qualified_port = port_alias_name[e]
            else:
                qualified_port = get_qual_port_name(port_alias_name[e])
    
    return pname, qualified_port 

In [66]:
def get_wf_outports(port_id):
    
    pname = ports[port_id]["port_name"]   

    ch_id = outflow_conn[port_id]
    #conn_pid = 
    for p in port_connects_to_channel:
        if port_connects_to_channel[p] == ch_id:
            conn_pid = p
    port_name = ports[conn_pid]["port_name"]
    #print(port_name)
    qname = get_qual_port_name(port_name) + '/' + port_name


    return pname, qname

In [67]:
def get_in_out_ports():
    for sid in steps.keys(): 
        for id in range(0,len(steps)+1):
            a = set(get_in_ports(str(id))).intersection(set(get_out_ports(str(sid))))
            #print(a)
            if len(a) > 0 : 
                for elem in a: 
                    #print(steps[str(id)]["program_name"] , "\t\t", steps[sid]["program_name"]+"/"+ elem )
                    in_out_ports[elem] = [steps[str(id)]["program_name"], steps[sid]["program_name"]+"/"+ elem]
                

In [68]:
## function for getting the ports with the program name
def link_wf_ports(workflow_id):
    wf_out_ports=[]
    
    for sp in sub_programs[workflow_id]:
        for p in set(get_out_ports(workflow_id)).intersection(set(get_out_ports(sp))):
            wf_out_ports.append(steps[sp]['program_name']+ '/' + p)
    
    return wf_out_ports

In [69]:
def workflow_detail(program_id, wf_name):
    header = '''
#!/usr/bin/env cwl-runner
cwlVersion: v1.0
class: Workflow
requirements:
    - class: SubworkflowFeatureRequirement
'''
    input_buffer = 'inputs: \n'

    for p in get_in_ports(program_id):
        input_buffer = input_buffer.strip('') + '  '+ p + ': \n'+ '   type: string \n'
        
    output_buffer = 'outputs: \n'
    #out_wf_ports = link_wf_ports(program_id)
    
    for port in out_ports[program_id]:
        #print(port)
        pname, qname = get_wf_outports(port[1])         
        output_buffer = output_buffer + '  ' +  pname  + ': \n'+ '   type: string \n   outputSource: '+ qname +'\n'


    wf_step = get_wf_steps(program_id, wf_name)
    
    #print(wf_step)
    return header, input_buffer, output_buffer, wf_step 

In [70]:
def create_wf_file(filename, program_id,wf_name):
    #filename = 'wf_' + steps['1']["program_name"] + '.cwl'

    with open(filename, 'w+') as file_writer:
        hdr, inp_buff, out_buff, wf_step = workflow_detail(program_id,wf_name)
        file_writer.write(hdr)
        file_writer.write(inp_buff)
        file_writer.write(out_buff)
        file_writer.write(wf_step)

In [71]:
def read_yw_model(filename):
    regex = re.compile(r'^%')
    chk_channel = re.compile(r'^ch')
    with open(filename, "r") as yw_struct:
        line = yw_struct.readline()
        while line:
            #print(line)
            comments = regex.match(line)
            if comments: 
                #print(line)
                line = yw_struct.readline()
            else:  
                if("program(" in line and "has_subprogram" not in line):
                    extract_steps(line)
                elif("workflow(" in line):
                    extract_workflows(line)            
                elif("has_subprogram(" in line):
                    has_subprogram(line)            
                elif("port(" in line  and "has_in_port(" not in line and "has_out_port" not in line):
                #    print(line)
                    extract_ports(line)
                elif("has_in_port(" in line):
                    input_ports(line,in_ports)
                elif("has_out_port(" in line ):
                    output_ports(line,out_ports)
                elif ("port_alias(" in line ): 
                    port_alias(line)
                elif("data(" in line):
                    get_port_data(line)
                elif("port_connects_to_channel(" in line):
                    #print(line)
                    get_port_connects_to_channel(line)
                elif("inflow_connects_to_channel("in line):
                    #print(line)
                    get_inflow_connections(line)                    
                elif("outflow_connects_to_channel("in line):
                    get_outflow_connections(line)
                elif( chk_channel.match(line) ):
                    get_channel(line)

                line = yw_struct.readline()
        
        #get_in_out_ports()

In [72]:
steps = {}
ports = {}
workflow = {}
in_ports = defaultdict(list)
out_ports = defaultdict(list)
sub_programs = {}
port_alias_name= {}
port_alt_name={}
in_out_ports={}
out_in_port={}
cnt =0

qualified_port=''
channel={}
port_data={}
port_connects_to_channel={}
inflow_conn={}
outflow_conn={}

yw_model_file = "yw_cwl_parser/Examples/clean_name_and_date_workflow/clean_name_and_date_workflow.P"

read_yw_model(yw_model_file)
for w in sorted(workflow, reverse=True):
    dir_name = steps['1']["program_name"]
    
    wf_name = steps[w]["program_name"]
    
    #print(w)
    '''
        Get the steps name and generate the files
        for those steps.
    '''
    for sp in sub_programs[w]:
        #print(sp)
        create_cwl_files(sp,dir_name)
    '''
        Get the workflow name and generate the files
        for those workflows.
    '''
    filename = 'yw_cwl_parser/Examples/' + dir_name + '/wf_' + steps[w]["program_name"] + '.cwl'
    #print(filename)
    create_wf_file(filename,w, wf_name)


In [None]:
workflow

In [31]:
port_alias_name

{'101': '"updated_eventDate"',
 '104': '"updated_eventDate"',
 '105': '"compliant_eventDate"',
 '107': '"accepted_record_count"',
 '109': '"others"',
 '110': '"updated_eventDate"',
 '111': '"compliant_eventDate"',
 '112': '"empty_eventDate"',
 '114': '"accepted_record_count"',
 '115': '"rejected_record_count"',
 '15': '"scientificName"',
 '16': '"authorship"',
 '18': '"others"',
 '21': '"scientificName"',
 '23': '"empty_scientificName"',
 '24': '"nonEmpty_scientificName"',
 '27': '"empty_scientificName"',
 '32': '"nonEmpty_scientificName"',
 '35': '"exactMatching_local_authority_source_record"',
 '36': '"fuzzyMatching_local_authority_source_record"',
 '37': '"nonMatching_local_authority_source_record"',
 '42': '"nonEmpty_scientificName"',
 '43': '"nonMatching_local_authority_source_record"',
 '46': '"nonEmpty_scientificName"',
 '47': '"fuzzyMatching_local_authority_source_record"',
 '49': '"fuzzyMatching_local_authority_source_record"',
 '50': '"authorship"',
 '54': '"authorship"',
 '5

for w in sorted(workflow, reverse=True): 
    wf_name = steps[w]["program_name"]
    #print(wf_name)
    #wf_name ='main'
    for sp in sub_programs[w]:
        create_cwl_files(sp,wf_name)

    filename = 'Examples/' + wf_name + '/wf_' + steps[w]["program_name"] + '.cwl'
    create_wf_file(filename,w, wf_name)
    