In [1]:
import os
import re
from collections import defaultdict


### Functions for parsing the yw models file. 

In [2]:
### function extracting the programs.

def extract_steps(line):
    """Record one ``program(...)`` fact in the module-level ``steps`` dict.

    Fact layout:
    program(program_id, program_name, qualified_program_name,
            begin_annotation_id, end_annotation_id).

    The text between the first "(" and the first ")" is split on commas.
    The first field is used verbatim as the key; the second and third
    fields are stored with surrounding whitespace and quotes removed.
    """
    def unquote(text):
        # Drop padding first, then single quotes, then double quotes.
        return text.strip().strip("'").strip('"')

    start = line.index("(") + 1
    stop = line.index(")")
    fields = line[start:stop].split(',')

    entry = {}
    entry["program_name"] = unquote(fields[1])
    entry["qualified_program_name"] = unquote(fields[2])
    steps[fields[0]] = entry
    return

In [3]:
### function extracting the workflow.

def extract_workflows(line):
    """Register a workflow id in the module-level ``workflow`` dict.

    The whole text between the first "(" and the first ")" is taken as the
    id (it is not split or stripped). The id must already be a key of
    ``steps``; its program name becomes the stored value.
    """
    start = line.index("(") + 1
    workflow_id = line[start:line.index(")")]
    workflow[workflow_id] = steps[workflow_id]['program_name']
    return

In [5]:
### function extracting the ports

def extract_ports(line):
    """Record one ``port(...)`` fact in the module-level ``ports`` dict.

    Fact layout:
    port(port_id, port_type, port_name, qualified_port_name,
         port_annotation_id, data_id).

    The first field is used verbatim as the key. The port type is stored
    upper-cased; the annotation id (fifth field) is not stored.
    """
    def unquote(text):
        return text.strip().strip("'").strip('"')

    start = line.index("(") + 1
    stop = line.index(")")
    fields = line[start:stop].split(',')

    record = {}
    record["port_type"] = unquote(fields[1]).upper()
    record["port_name"] = unquote(fields[2])
    record["qualified_port_name"] = unquote(fields[3])
    record["data_id"] = unquote(fields[5])
    ports[fields[0]] = record
    return

In [6]:
### function extracting the input ports

def input_ports(line,in_ports):
    """Append one ``has_in_port(step_id, port_id)`` fact to ``in_ports``.

    ``in_ports`` maps a step id to a list of ``(step_id, port_id)`` tuples.
    The key is the first field exactly as written in the fact; the tuple
    members are whitespace-stripped.
    """
    start = line.index("(") + 1
    stop = line.index(")")
    fields = line[start:stop].split(',')

    # Look the bucket up first so a defaultdict creates it before the tuple
    # is built.
    bucket = in_ports[fields[0]]
    bucket.append((fields[0].strip(), fields[1].strip()))
    return

In [7]:
### function extracting the output ports 

def output_ports(line,out_ports):
    """Append one ``(step_id, port_id)`` tuple parsed from ``line`` to ``out_ports``.

    The text between the first "(" and the first ")" is split on commas.
    The first field, exactly as written, is the key; both tuple members
    are whitespace-stripped.
    """
    inner = line[line.index("(") + 1:line.index(")")]
    parts = inner.split(',')
    # Fetch the list before building the tuple, as a defaultdict would
    # create the entry at this point.
    targets = out_ports[parts[0]]
    targets.append((parts[0].strip(), parts[1].strip()))
    return

In [8]:
### function extracting the data facts (data name and qualified data name)

def get_port_data(line):
    """Record one fact in the module-level ``port_data`` dict.

    The text between the first "(" and the first ")" is split on commas.
    The first field (verbatim) is the key; the second and third fields are
    stored whitespace-stripped as ``data_name`` and ``qual_data_name``.
    Quotes around the names are kept.
    """
    opening = line.index("(")
    closing = line.index(")")
    fields = line[opening + 1:closing].split(',')

    names = {}
    names["data_name"] = fields[1].strip()
    names["qual_data_name"] = fields[2].strip()
    port_data[fields[0]] = names
    return

In [9]:
### function extracting the channels

def get_channel(line):
    """Store one two-field fact in the module-level ``channel`` dict.

    The first field between the parentheses (verbatim) is the key and the
    second field, whitespace-stripped, is the value.
    """
    lparen = line.index("(")
    rparen = line.index(")")
    parts = line[lparen + 1:rparen].split(',')
    channel[parts[0]] = parts[1].strip()
    return

In [10]:
### function extracting the port-to-channel connections

def get_port_connects_to_channel(line):
    """Store one fact in the module-level ``port_connects_to_channel`` dict.

    The first field between the parentheses (verbatim) is the key and the
    second field, whitespace-stripped, is the value.
    """
    body = line[line.index("(") + 1:line.index(")")]
    pieces = body.split(',')
    value = pieces[1].strip()
    port_connects_to_channel[pieces[0]] = value
    return

In [11]:
def get_inflow_connections(line):
    """Store one fact in the module-level ``inflow_conn`` dict.

    The first field between the parentheses (verbatim) is the key and the
    second field, whitespace-stripped, is the value.
    """
    first = line.index("(") + 1
    last = line.index(")")
    tokens = line[first:last].split(',')
    inflow_conn[tokens[0]] = tokens[1].strip()
    return

In [12]:
def get_outflow_connections(line):
    """Store one fact in the module-level ``outflow_conn`` dict.

    The first field between the parentheses (verbatim) is the key and the
    second field, whitespace-stripped, is the value.
    """
    inside = line[line.index("(") + 1:line.index(")")]
    items = inside.split(',')
    target = items[1].strip()
    outflow_conn[items[0]] = target
    return

In [13]:
### function extracting the subprograms 

def has_subprogram(line):
    """Add one parent -> child link to the module-level ``sub_programs`` dict.

    ``sub_programs`` maps the first field of the fact (verbatim) to the
    list of second fields (whitespace-stripped) seen so far, in the order
    the facts were read.
    """
    fields = line[line.index("(") + 1:line.index(")")].split(',')
    child_id = fields[1].strip()
    # Create the list on first sight of the parent, then extend it.
    sub_programs.setdefault(fields[0], []).append(child_id)
    return

In [14]:
### function extracting the port alias

def port_alias(line):
    """Record a port alias and the ``step/port`` name it stands for.

    The first field between the parentheses is the port id and the second
    is the alias (single quotes removed). Two module-level dicts are
    updated:

    * ``port_alias_name[port_id] = alias``
    * ``port_alt_name[alias] = "<step name>/<port name>"``

    The step name is the last dotted component of the part of the port's
    qualified name that precedes the ``<`` or ``>`` of the arrow. For a
    ``->`` arrow that part ends in ``-``, which is removed. Only a
    *trailing* dash is removed, so step names that merely contain a hyphen
    (e.g. ``my-step<-inp``) are kept intact.

    Raises KeyError if the port id has not been recorded in ``ports``.
    """
    fields = line[line.index("(") + 1:line.index(")")].split(',')
    port_id = fields[0].strip()
    alias = fields[1].strip().strip("'")

    qualified_name = ports[port_id]['qualified_port_name']
    port_name = ports[port_id]['port_name']

    port_alias_name[port_id] = alias

    # Everything before the arrow head, reduced to its last dotted part.
    owner = re.split(r'>|<', qualified_name)[0].split('.')[-1]
    if owner.endswith('-'):
        # Left-over dash of a "->" arrow.
        owner = owner[:-1]

    port_alt_name[alias] = owner + '/' + port_name
    return



In [15]:
## Method for getting the input ports 
## for a specific program 

def get_in_ports(program_id):
    """Return the names of the input ports recorded for ``program_id``.

    Each entry of ``in_ports[program_id]`` is a ``(step_id, port_id)``
    tuple; the port id is looked up in ``ports`` to get its name. The
    result is a new list in recording order.
    """
    return [ports[entry[1]]['port_name'] for entry in in_ports[program_id]]

In [16]:
## Method for getting the output ports 
## for a specific program 

def get_out_ports(program_id):
    """Return the names of the output ports recorded for ``program_id``.

    Each entry of ``out_ports[program_id]`` is a ``(step_id, port_id)``
    tuple; the port id is looked up in ``ports`` to get its name. The
    result is a new list in recording order.
    """
    return [ports[entry[1]]['port_name'] for entry in out_ports[program_id]]

In [322]:
## Method for creating a CWL file.
## It writes the required header and formatting.

def create_file(filename, program_id):
    """Write a CommandLineTool CWL skeleton for one step.

    The file contains the CWL header, one ``string`` input per input port
    of ``program_id``, one ``string`` output per output port, and a
    trailing ``baseCommand: `` line whose value is left empty.

    A summary of what was written is stored in the module-level
    ``cwl_file_data[program_id]``.

    Parameters
    ----------
    filename : path of the file to (over)write.
    program_id : key of the step in ``steps`` / ``in_ports`` / ``out_ports``.

    Raises KeyError if ``program_id`` is not in ``steps``. All lookups are
    done before the file is opened, so a failed lookup does not leave a
    truncated file behind.
    """
    # The shebang must be the very first line of the file to be honoured.
    header = (
        '#!/usr/bin/env cwl-runner\n'
        '\n'
        '\n'
        'cwlVersion: v1.0\n'
        'class: CommandLineTool\n'
    )

    program_name = steps[program_id]["program_name"]
    inp_temp = list(get_in_ports(program_id))
    out_temp = list(get_out_ports(program_id))

    input_buffer = 'inputs: \n'
    for p in inp_temp:
        input_buffer = input_buffer + ' ' + p + ': \n' + '  type: string \n \n'

    output_buffer = 'outputs: \n'
    for p in out_temp:
        if len(p) == 0:
            # An empty port name cannot be emitted as a YAML key.
            print("No output ports")
        else:
            output_buffer = output_buffer + ' ' + p + ': \n' + '  type: string \n \n'

    with open(filename, "w") as writer:
        writer.write(header)
        writer.write(input_buffer)
        writer.write(output_buffer)
        writer.write("\n")
        writer.write("baseCommand: ")

    cwl_file_data[program_id] = {"program_name": program_name,
                                 "input_ports": inp_temp,
                                 "output_ports": out_temp
                                 }
    return

In [323]:
### This function passes file name 
### for creating cwl files. 

def create_cwl_files(program_id,dir_name):
    """Create ``<dir_name>/<program name>.cwl`` for one step.

    The program name is read from ``steps[program_id]`` and the file is
    written by ``create_file``. ``dir_name`` may be given with or without
    a trailing path separator.
    """
    pname = steps[program_id]['program_name']
    # os.path.join adds the separator only when dir_name lacks one.
    filename = os.path.join(dir_name, pname + ".cwl")
    create_file(filename, program_id)

    return

In [324]:
### Function for extracting the steps in a wf. 

def get_wf_steps(wf_id, wf_name):
    """Build the ``steps:`` section of a workflow CWL file as one string.

    For every sub-program of ``wf_id`` (taken from ``sub_programs``) the
    section lists the CWL file to run, the step's inputs with their
    sources, and the step's outputs. Sub-programs that are themselves
    workflows are referenced as ``wf_<name>.cwl``.

    The outputs are collected for every step, including steps that have
    no input ports.

    ``wf_name`` is accepted for interface compatibility and is not used.

    Side effects: calls ``out_in_ports`` for each step that has input
    ports, and prints one diagnostic line per input port plus a warning
    for steps without output ports.
    """
    wf_steps = "steps: \n "

    for s in sub_programs[wf_id]:
        prog_name = steps[s]["program_name"]
        if s in workflow:
            filename = 'wf_' + prog_name + '.cwl'
        else:
            filename = prog_name + '.cwl'

        step_buffer = prog_name + ': \n ' + ' run: ' + filename + ' \n ' + ' in: \n'

        step_inports = in_ports[s]
        if step_inports:
            # One call covers every input port of the step.
            out_in_ports(s)

        for pid in step_inports:
            pname, qual_pname = get_inports_subprograms(pid[1], wf_id)
            print(pname, qual_pname, pid[1])
            step_buffer = step_buffer + '   ' + pname + ': ' + qual_pname + '\n'

        # Outputs do not depend on the inputs, so gather them once per step.
        step_outports = get_out_ports(s)
        if len(step_outports) == 0:
            print("The step " + prog_name + " doesn't have output ports. CWL model will fail.")

        step_buffer = step_buffer + "  out: [" + ' , '.join(step_outports) + '] \n '

        wf_steps = wf_steps + step_buffer

    return wf_steps

In [325]:
def get_set_outports(program_id):
    """Map the data id of every recorded output port to its step id.

    All steps in ``steps`` are scanned in order; when several output
    ports share a data id, the step scanned last wins. ``program_id`` is
    not used.
    """
    return {
        ports[entry[1]]['data_id']: step_id
        for step_id in steps.keys()
        for entry in out_ports[str(step_id)]
    }

In [326]:
def out_in_ports(program_id):
    """Link the input ports of a step to the output ports that feed them.

    For every input port of ``program_id`` whose data id is also the data
    id of some step's output port, the module-level ``out_in_port`` dict
    gets ``out_in_port[data_id] = "<producing step name>/<output port name>"``.
    Input ports whose data id is produced by no step are left alone.

    The data-id -> step map is built once per call (and not at all when
    the step has no input ports).
    """
    step_in_ports = in_ports[program_id]
    if not step_in_ports:
        return

    # Same for every input port, so build it once.
    op_set = get_set_outports(program_id)

    for ip in step_in_ports:
        data_id = ports[ip[1]]['data_id']

        # Only data ids written by some step's output port are linked.
        if data_id not in op_set:
            continue

        prid = op_set[data_id]
        prog_name = steps[str(prid)]["program_name"] + '/'

        for o in out_ports[str(prid)]:
            if ports[o[1]]["data_id"] == data_id:
                out_in_port[data_id] = prog_name + ports[o[1]]["port_name"]

In [327]:
def get_qual_port_name(port_name):
    """Return the name of the step that owns a port called ``port_name``.

    ``ports`` is scanned in order. For each port with a matching name the
    owner is the last dotted component of the part of the qualified port
    name that precedes the ``<`` or ``>`` of the arrow. For a ``->`` arrow
    that part ends in ``-``, which is removed; for a ``<-`` arrow there is
    no trailing dash and the name is used as is.

    A port is skipped when its owner equals the program name of step
    ``'1'`` and the qualified name contains no ``.``. The first port that
    is not skipped decides the result.

    Returns None when no port qualifies.
    """
    for port in ports:
        if ports[port]["port_name"] != port_name:
            continue

        temp = re.split(r'>|<', ports[port]['qualified_port_name'])
        step_name = temp[0].split('.')[-1]
        # Strip only the left-over dash of a "->" arrow.
        qname = step_name[:-1] if step_name.endswith('-') else step_name

        top_name = steps['1']["program_name"]
        if qname != top_name:
            return qname
        if '.' in str(temp):
            return qname
    return None

In [328]:
## Get the inports of the subprograms

def get_inports_subprograms(port_id,wf_id):
    """Return ``(port name, source)`` for one input port of a sub-program.

    The source is chosen by the first rule that applies:

    1. the port's data id is in ``out_in_port`` -> that entry;
    2. the port name is an input port of the workflow -> the port name;
    3. the port has an alias -> the alias if it is a workflow input port,
       otherwise whatever ``get_qual_port_name`` returns for the alias;
    4. otherwise -> the port name.
    """
    port = ports[port_id]
    pname = port["port_name"]
    data_id = port["data_id"]

    if data_id in out_in_port.keys():
        source = out_in_port[data_id]
    elif pname in get_in_ports(wf_id):
        source = pname
    elif port_id in port_alias_name:
        alias = port_alias_name[port_id]
        if alias in get_in_ports(wf_id):
            source = alias
        else:
            source = get_qual_port_name(alias)
    else:
        source = pname

    return pname, source

In [329]:
def get_wf_outports(port_id):
    """Return ``(port name, "<step>/<port>")`` for a workflow output port.

    The channel of ``port_id`` is read from ``outflow_conn``; the port
    connected to that channel is taken from ``port_connects_to_channel``
    (the last match wins when several ports share the channel). The step
    qualifier comes from ``get_qual_port_name``.

    Raises
    ------
    KeyError
        if ``port_id`` is missing from ``ports`` or ``outflow_conn``.
    LookupError
        if no port connects to the channel, or if no step qualifier can
        be determined for the connected port.
    """
    pname = ports[port_id]["port_name"]
    ch_id = outflow_conn[port_id]

    conn_pid = None
    for p, channel_id in port_connects_to_channel.items():
        if channel_id == ch_id:
            conn_pid = p
    if conn_pid is None:
        raise LookupError(
            "no port connects to channel %r (outflow of port %r)" % (ch_id, port_id)
        )

    port_name = ports[conn_pid]["port_name"]
    qualifier = get_qual_port_name(port_name)
    if qualifier is None:
        raise LookupError("no step found for port name %r" % (port_name,))

    qname = qualifier + '/' + port_name
    return pname, qname

In [330]:
def get_in_out_ports():
    """Fill the module-level ``in_out_ports`` dict.

    For every step in ``steps`` (the producer) and every integer id from 0
    to ``len(steps)`` taken as a string (the consumer), each port name
    that is both an output of the producer and an input of the consumer
    is stored as
    ``in_out_ports[name] = [consumer program name, "<producer program name>/<name>"]``.
    Later pairs overwrite earlier ones for the same port name.
    """
    for producer_id in steps.keys():
        for idx in range(0, len(steps) + 1):
            consumer_id = str(idx)
            shared = set(get_in_ports(consumer_id)).intersection(
                set(get_out_ports(str(producer_id)))
            )
            for elem in shared:
                in_out_ports[elem] = [
                    steps[consumer_id]["program_name"],
                    steps[producer_id]["program_name"] + "/" + elem,
                ]
                

In [331]:
## function for getting the ports with the program name
def link_wf_ports(workflow_id):
    """Return ``"<sub-program name>/<port>"`` for shared output ports.

    A port is listed for a sub-program when its name is an output port of
    both the workflow and that sub-program. Sub-programs are visited in
    the order stored in ``sub_programs[workflow_id]``.
    """
    return [
        steps[sp]['program_name'] + '/' + p
        for sp in sub_programs[workflow_id]
        for p in set(get_out_ports(workflow_id)).intersection(set(get_out_ports(sp)))
    ]

In [343]:
def workflow_detail(program_id, wf_name):
    """Assemble the four text sections of a workflow CWL file.

    Returns ``(header, input_buffer, output_buffer, wf_step)``:

    * header - shebang, CWL version, ``class: Workflow`` and the
      sub-workflow requirement; the shebang is the first line;
    * input_buffer - one ``string`` input per input port of the workflow;
    * output_buffer - one ``string`` output per output port, with the
      ``outputSource`` returned by ``get_wf_outports``;
    * wf_step - the ``steps:`` section built by ``get_wf_steps``.

    A summary is also stored in the module-level
    ``cwl_wf_data[program_id]``.
    """
    header = (
        '#!/usr/bin/env cwl-runner\n'
        'cwlVersion: v1.0\n'
        'class: Workflow\n'
        'requirements:\n'
        '    - class: SubworkflowFeatureRequirement\n'
    )

    input_buffer = 'inputs: \n'
    inp_temp = []
    for p in get_in_ports(program_id):
        inp_temp.append(p)
        input_buffer = input_buffer + '  ' + p + ': \n' + '   type: string \n'

    output_buffer = 'outputs: \n'
    out_temp = []
    for port in out_ports[program_id]:
        # port is a (step_id, port_id) tuple; only the port id is needed.
        pname, qname = get_wf_outports(port[1])
        out_temp.append((pname, qname))
        output_buffer = (output_buffer + '  ' + pname + ': \n'
                         + '   type: string \n   outputSource: ' + qname + '\n')

    wf_step = get_wf_steps(program_id, wf_name)

    cwl_wf_data[program_id] = {"program_name": steps[program_id]["program_name"],
                               "input_ports": inp_temp,
                               "wf_steps": wf_step,
                               "output_ports": out_temp
                               }
    return header, input_buffer, output_buffer, wf_step

In [344]:
def create_wf_file(filename, program_id,wf_name):
    """Write the workflow CWL file for ``program_id`` to ``filename``.

    The four sections returned by ``workflow_detail`` are written in
    order: header, inputs, outputs, steps. They are computed before the
    file is opened, so an error while building them does not create or
    truncate ``filename``.
    """
    hdr, inp_buff, out_buff, wf_step = workflow_detail(program_id, wf_name)

    with open(filename, 'w') as file_writer:
        file_writer.write(hdr)
        file_writer.write(inp_buff)
        file_writer.write(out_buff)
        file_writer.write(wf_step)

In [345]:
def read_yw_model(filename):
    """Read a facts file line by line and dispatch each fact to its parser.

    Lines starting with ``%`` are skipped. Every other line is handed to
    the first parser whose marker matches; the order of the checks matters
    because some markers are substrings of others (``program(`` occurs in
    ``has_subprogram(``, ``port(`` occurs in ``has_in_port(`` and
    ``has_out_port(``). Lines matching no marker are ignored.
    """
    with open(filename, "r") as yw_struct:
        for line in yw_struct:
            # Comment / documentation line of the facts file.
            if line.startswith('%'):
                continue

            if "program(" in line and "has_subprogram" not in line:
                extract_steps(line)
            elif "workflow(" in line:
                extract_workflows(line)
            elif "has_subprogram(" in line:
                has_subprogram(line)
            elif ("port(" in line
                  and "has_in_port(" not in line
                  and "has_out_port" not in line):
                extract_ports(line)
            elif "has_in_port(" in line:
                input_ports(line, in_ports)
            elif "has_out_port(" in line:
                output_ports(line, out_ports)
            elif "port_alias(" in line:
                port_alias(line)
            elif "data(" in line:
                get_port_data(line)
            elif "port_connects_to_channel(" in line:
                get_port_connects_to_channel(line)
            elif "inflow_connects_to_channel(" in line:
                get_inflow_connections(line)
            elif "outflow_connects_to_channel(" in line:
                get_outflow_connections(line)
            elif line.startswith('ch'):
                get_channel(line)

In [346]:
# Module-level state shared by every parser and generator function above.
# Re-running this cell resets all of it before the model is read again.
steps = {}                       # program id -> program_name / qualified_program_name
ports = {}                       # port id -> port_type / port_name / qualified_port_name / data_id
workflow = {}                    # workflow id -> program name
in_ports = defaultdict(list)     # step id -> [(step id, port id), ...]
out_ports = defaultdict(list)    # step id -> [(step id, port id), ...]
sub_programs = {}                # parent program id -> [child program id, ...]
port_alias_name= {}              # port id -> alias
port_alt_name={}                 # alias -> "<step name>/<port name>"
in_out_ports={}                  # filled only by get_in_out_ports()
out_in_port={}                   # data id -> "<producing step name>/<output port name>"
cnt =0                           # NOTE(review): not referenced by the functions in this notebook

qualified_port=''                # NOTE(review): get_inports_subprograms uses a local of the same name
channel={}                       # filled by get_channel()
port_data={}                     # key -> data_name / qual_data_name
port_connects_to_channel={}      # port id -> channel id
inflow_conn={}                   # filled by get_inflow_connections()
outflow_conn={}                  # port id -> channel id
cwl_file_data = {}               # step id -> summary of the generated step file
cwl_wf_data = {}                 # workflow id -> summary of the generated workflow file

# Facts file to convert.
yw_model_file = "Examples/simulate_data_collection/simulate_data_collection.P"

read_yw_model(yw_model_file)
# Workflow ids are strings, so this is a reverse lexicographic order.
for w in sorted(workflow, reverse=True):
    # The output directory is always named after program '1'.
    dir_name = 'Examples/' + steps['1']["program_name"] + '/'
    
    wf_name = steps[w]["program_name"]
    
    #print(w)
    '''
        Get the steps name and generate the files
        for those steps.
    '''
    for sp in sub_programs[w]:
        #print(sp)
        create_cwl_files(sp,dir_name)
    '''
        Get the workflow name and generate the files
        for those workflows.
    '''
    filename =  dir_name + 'wf_' + steps[w]["program_name"] + '.cwl'
    #print(filename)
    create_wf_file(filename,w, wf_name)


cassette_id cassette_id 10
sample_score_cutoff sample_score_cutoff 11
cassette_id cassette_id 13
sample_spreadsheet sample_spreadsheet 14
sample_score_cutoff sample_score_cutoff 18
data_redundancy data_redundancy 19
sample_name load_screening_results/sample_name 20
sample_quality load_screening_results/sample_quality 21
cassette_id cassette_id 26
rejected_sample calculate_strategy/rejected_sample 27
cassette_id cassette_id 29
num_images calculate_strategy/num_images 30
accepted_sample calculate_strategy/accepted_sample 31
energies calculate_strategy/energies 32
sample_id collect_data_set/sample_id 38
energy collect_data_set/energy 39
frame_number collect_data_set/frame_number 40
raw_image_path collect_data_set/raw_image_path 41
calibration_image calibration_image 42
cassette_id cassette_id 48
sample_id collect_data_set/sample_id 49
frame_number collect_data_set/frame_number 50
corrected_image_path transform_images/corrected_image_path 51
total_intensity transform_images/total_intensity

In [347]:
# Inspection cell: list the name and id of every output port of program '1'.
for op in out_ports['1']:
    
    print(ports[op[1]]['port_name'], op[1])

corrected_image 6
run_log 7
collection_log 8
rejection_log 9


In [317]:
# Scratch cell: the body of get_wf_outports() run inline for port '3',
# with extra prints. The recorded run of this cell ended in KeyError: '3'.
port_id = '3'
pname = ports[port_id]["port_name"]   

ch_id = outflow_conn[port_id]
print(ch_id)
#conn_pid = 
# The last port connected to the channel wins (no break).
for p in port_connects_to_channel:
    if port_connects_to_channel[p] == ch_id:
        conn_pid = p
port_name = ports[conn_pid]["port_name"]
print(port_name, conn_pid)
qname = get_qual_port_name(port_name) + '/' + port_name

print(qname)

KeyError: '3'

In [113]:
# Inspection cell: dump the port record of every input port of program '2'.
for i in in_ports['2']:
    print(ports[i[1]])

{'port_type': 'IN', 'port_name': 'input1_step1', 'qualified_port_name': 'loopback.step1<-input1_step1', 'data_id': '3'}
{'port_type': 'IN', 'port_name': 'input2_step1', 'qualified_port_name': 'loopback.step1<-input2_step1', 'data_id': '4'}


In [148]:
# Inspection cell: parent program id -> list of child program ids.
sub_programs

{'1': ['2', '3', '4', '5', '6', '7', '8']}

In [348]:
# Inspection cell: summary stored by workflow_detail() for each workflow.
cwl_wf_data

{'1': {'input_ports': ['cassette_id',
   'sample_score_cutoff',
   'data_redundancy',
   'sample_spreadsheet',
   'calibration_image'],
  'output_ports': [('corrected_image', 'transform_images/corrected_image'),
   ('run_log', 'initialize_run/run_log'),
   ('collection_log', 'log_average_image_intensity/collection_log'),
   ('rejection_log', 'log_rejected_sample/rejection_log')],
  'program_name': 'simulate_data_collection',
  'wf_steps': 'steps: \n initialize_run: \n  run: initialize_run.cwl \n  in: \n   cassette_id: cassette_id\n   sample_score_cutoff: sample_score_cutoff\n  out: [run_log] \n load_screening_results: \n  run: load_screening_results.cwl \n  in: \n   cassette_id: cassette_id\n   sample_spreadsheet: sample_spreadsheet\n  out: [sample_name , sample_quality , run_log] \n calculate_strategy: \n  run: calculate_strategy.cwl \n  in: \n   sample_score_cutoff: sample_score_cutoff\n   data_redundancy: data_redundancy\n   sample_name: load_screening_results/sample_name\n   sample

In [349]:
# Inspection cell: summary stored by create_file() for each step.
cwl_file_data

{'2': {'input_ports': ['cassette_id', 'sample_score_cutoff'],
  'output_ports': ['run_log'],
  'program_name': 'initialize_run'},
 '3': {'input_ports': ['cassette_id', 'sample_spreadsheet'],
  'output_ports': ['sample_name', 'sample_quality', 'run_log'],
  'program_name': 'load_screening_results'},
 '4': {'input_ports': ['sample_score_cutoff',
   'data_redundancy',
   'sample_name',
   'sample_quality'],
  'output_ports': ['accepted_sample',
   'rejected_sample',
   'num_images',
   'energies'],
  'program_name': 'calculate_strategy'},
 '5': {'input_ports': ['cassette_id', 'rejected_sample'],
  'output_ports': ['rejection_log'],
  'program_name': 'log_rejected_sample'},
 '6': {'input_ports': ['cassette_id',
   'num_images',
   'accepted_sample',
   'energies'],
  'output_ports': ['sample_id',
   'energy',
   'frame_number',
   'raw_image_path',
   'run_log'],
  'program_name': 'collect_data_set'},
 '7': {'input_ports': ['sample_id',
   'energy',
   'frame_number',
   'raw_image_path',


In [189]:
import pandas as pd


In [338]:
# One row per generated step file, indexed by step id.
df = pd.DataFrame.from_dict(cwl_file_data,orient='index')
# NOTE(review): `indexes` is not used by any cell shown in this notebook.
indexes = range(0,df.shape[0])
#cwl_file_data

In [377]:
# Plain-text dump of the per-step table.
print(df)

                  program_name  \
2               initialize_run   
3       load_screening_results   
4           calculate_strategy   
5          log_rejected_sample   
6             collect_data_set   
7             transform_images   
8  log_average_image_intensity   

                                         input_ports  \
2                 [cassette_id, sample_score_cutoff]   
3                  [cassette_id, sample_spreadsheet]   
4  [sample_score_cutoff, data_redundancy, sample_...   
5                     [cassette_id, rejected_sample]   
6  [cassette_id, num_images, accepted_sample, ene...   
7  [sample_id, energy, frame_number, raw_image_pa...   
8  [cassette_id, sample_id, frame_number, correct...   

                                        output_ports  
2                                          [run_log]  
3             [sample_name, sample_quality, run_log]  
4  [accepted_sample, rejected_sample, num_images,...  
5                                    [rejection_log]  
6  

In [350]:
# One row per generated workflow file, indexed by workflow id.
df1 = pd.DataFrame.from_dict(cwl_wf_data, orient='index')

In [272]:
# pandas warm-up example, unrelated to the CWL generation above.
# The data is kept in `brics_data` rather than `dict`, so the builtin
# `dict` stays usable for the rest of the kernel session.
brics_data = {"country": ["Brazil", "Russia", "India", "China", "South Africa"],
       "capital": ["Brasilia", "Moscow", "New Dehli", "Beijing", "Pretoria"],
       "area": [8.516, 17.10, 3.286, 9.597, 1.221],
       "population": [200.4, 143.5, 1252, 1357, 52.98] }

import pandas as pd
brics = pd.DataFrame(brics_data)
print(brics)

     area    capital       country  population
0   8.516   Brasilia        Brazil      200.40
1  17.100     Moscow        Russia      143.50
2   3.286  New Dehli         India     1252.00
3   9.597    Beijing         China     1357.00
4   1.221   Pretoria  South Africa       52.98


In [355]:
# Show at most the first two rows of the workflow table.
df1[:2]

Unnamed: 0,program_name,input_ports,wf_steps,output_ports
1,simulate_data_collection,"[cassette_id, sample_score_cutoff, data_redund...",steps: \n initialize_run: \n run: initialize_...,"[(corrected_image, transform_images/corrected_..."
