In [201]:
import pandas as pd
import numpy as np
import json
import os
import urllib3
import csv
import logging
import re
import uuid
from urllib.parse import quote
from lxml import etree
from utils import dac2idmc

# Show every column when a DataFrame is displayed (None disables the column cap)
pd.options.display.max_columns = None

# Initialise the job

In [202]:
# Send every record at DEBUG level or above to logs/console.log,
# formatted as "<timestamp>:<level>:<message>"
logging.basicConfig(
    format='%(asctime)s:%(levelname)s:%(message)s',
    level=logging.DEBUG,
    filename='logs/console.log',
)

In [203]:
# Load the job settings (IDMC connection details, local paths) from the JSON config file
logging.info('Reading the config file')
with open('config/config.json', 'r') as configFile:
    config = json.loads(configFile.read())


In [204]:
# Namespace map: both prefixes point at the same avrepository schema URI
namespaces = dict.fromkeys(
    ('aetgt', 'types1'),
    'http://schemas.active-endpoints.com/appmodules/repository/2010/10/avrepository.xsd',
)

# Load the template Taskflow and keep a handle on its root element;
# the cells below add elements to this tree in place
tree = etree.parse('templates/tf_Base_Taskflow.TASKFLOW.xml')
root = tree.getroot()

# Read the Inputs

In [205]:
# Load the execution plans: every column is read as text (empty cells stay as
# empty strings because na_filter is off), then the step order is cast to int
logging.info('Reading the execution plans')
dfPlans = pd.read_csv(
    'in/plans.csv',
    dtype='str',
    encoding='utf-8',
    na_filter=False,
).assign(plan_step_order=lambda df: df['plan_step_order'].astype(int))

# Lookup the Converted Mapping Task IDs

In [206]:
# Login to IDMC
logging.info('Logging into IDMC')
http = urllib3.PoolManager()

# Serialise the credentials with json.dumps so that quotes, backslashes or
# non-ASCII characters in the username/password cannot corrupt the JSON payload
data = json.dumps({
    'username': config['idmc']['user'],
    'password': config['idmc']['password'],
})

url = 'https://' + config['idmc']['host'] + '/saas/public/core/v3/login'
# NOTE(review): urllib3 timeouts are expressed in seconds, so 3000 is 50 minutes - confirm this is intended
r = http.request(
    'POST',
    url,
    timeout=3000,
    body=data.encode('utf-8'),
    headers={
        'Accept': 'application/json',
        'Content-Type': 'application/json'
    }
)

# Fail fast with a readable message rather than an opaque KeyError below.
# The response body is deliberately not logged.
if r.status != 200:
    logging.error('IDMC login failed with HTTP status %s', r.status)
    raise RuntimeError(f'IDMC login failed with HTTP status { r.status }')

# Parse the JSON response and keep the session ID used by the later API calls
result = json.loads(r.data.decode('utf-8'))
sessionID = result['userInfo']['sessionId']


In [207]:
# Look up the runtime environment (secure agent group) by its configured name
logging.info('Getting the secure agent ID')
url = ''.join([
    'https://',
    config['idmc']['pod'],
    '.',
    config['idmc']['host'],
    '/saas/api/v2/runtimeEnvironment/name/',
    quote(config['idmc']['agentGroupName']),
])
r = http.request(
    'GET',
    url,
    timeout=3000,
    headers={'Accept': 'application/json', 'icSessionId': sessionID},
)

# Parse the JSON response and keep the identifiers of the agent group
result = json.loads(r.data.decode('utf-8'))
agentGroupID = result['id']
agentGroupGUID = result['federatedId']
agentGroupName = config['idmc']['agentGroupName']

In [208]:
# Fetch every mapping task (type MTT) from IDMC, one page at a time
logging.info('Getting a list of the mapping tasks')
skip = 0        # number of objects already fetched; sent as the paging offset
limit = 200     # page size requested from the API
taskPages = []  # one normalised DataFrame per page, concatenated once at the end
dfResp = pd.DataFrame()

# Page through mapping task queries
while True:

    # Get the next page of mapping tasks
    url = (
        'https://' + config['idmc']['pod'] + '.' + config['idmc']['host']
        + '/saas/public/core/v3/objects?q=type==%27MTT%27'
        + '&limit=' + str(limit) + '&skip=' + str(skip)
    )
    r = http.request(
        'GET',
        url,
        timeout=3000,
        headers={
            'Accept': 'application/json',
            'INFA-SESSION-ID': sessionID
        }
    )
    if r.status != 200:
        logging.error('Listing the mapping tasks failed with HTTP status %s', r.status)
        raise RuntimeError(f'Listing the mapping tasks failed with HTTP status { r.status }')

    # Convert the page of objects into a dataframe (one row per mapping task)
    result = json.loads(r.data.decode('utf-8'))
    taskCount = int(result.get('count', 0))
    objects = result.get('objects') or []
    if objects:
        dfResp = pd.json_normalize(objects)  # dfResp only ever holds the most recent page
        taskPages.append(dfResp)

    # Advance the offset by what was actually returned; without this every
    # request would return the first page again
    skip += len(objects)

    # Stop once everything has been fetched, or if the API returns an empty
    # page (guards against looping forever on an inconsistent count)
    if not objects or skip >= taskCount:
        break

# dfTasks holds all pages
dfTasks = pd.concat(taskPages, ignore_index=True) if taskPages else pd.DataFrame()
logging.info('Retrieved %s of %s mapping tasks', len(dfTasks), taskCount)



In [209]:
# Build the task lookup from ALL retrieved mapping tasks (dfTasks), not just the
# last page held in dfResp. Duplicate ids are dropped so repeated pages cannot
# multiply plan rows. The step name is the last component of the task path.
dfTaskLookup = (
    dfTasks
    .drop_duplicates(subset='id')
    .assign(step_name=lambda df: df['path'].map(os.path.basename))
    .loc[:, ['step_name', 'id', 'path']]
    .rename(columns={'id': 'infa_id', 'path': 'infa_path'})
)

# Tasks sharing a name in different folders would duplicate plan rows in the merge
dfAmbiguous = dfTaskLookup[dfTaskLookup['step_name'].duplicated(keep=False)]
if len(dfAmbiguous.index) > 0:
    logging.warning(
        'Multiple mapping tasks share the same name; matching plan steps will be duplicated: %s',
        sorted(dfAmbiguous['infa_path'].tolist())
    )

# Join the IDMC info onto the plans. Any columns left by a previous run of this
# cell are dropped first so the cell can be re-run without producing _x/_y suffixes.
dfPlans = (
    dfPlans
    .drop(columns=['infa_id', 'infa_path'], errors='ignore')
    .merge(dfTaskLookup, how='left', on='step_name')
)
dfPlans['agent_id'] = agentGroupID
dfPlans['agent_guid'] = agentGroupGUID
dfPlans['agent_name'] = agentGroupName
dfPlans['script_dir'] = config['local']['scriptsDir']
dfPlans['script_args'] = '' # TODO placeholder for any args that need to be passed to the step script

In [210]:
# One random 32-character hex identifier per plan row
dfPlans['dac2idmc_step_id'] = dfPlans.apply(lambda _row: uuid.uuid4().hex, axis=1)

# One random identifier per distinct step order, shared by all rows with that order
# NOTE(review): the key is the step order alone, so rows from different plans with
# the same order receive the same group id - confirm this is intended
map_order_uuid = dict(
    (step_order, uuid.uuid4().hex)
    for step_order in dfPlans['plan_step_order'].unique()
)
dfPlans['dac2idmc_group_id'] = dfPlans['plan_step_order'].map(map_order_uuid)


In [211]:
# Preview the first rows of the enriched plans (IDMC ids, agent info and generated identifiers)
dfPlans.head()

Unnamed: 0,plan_wid,plan_name,plan_inactive_flag,plan_step_wid,plan_step_order,plan_step_type,step_guid,step_wid,step_cmd,step_name,infa_id,infa_path,agent_id,agent_guid,agent_name,script_dir,script_args,dac2idmc_step_id,dac2idmc_group_id
0,201C9CC7C59D167A79A2E247C6189A67,Echo Employee Snapshot Oracle R12.1.3,N,4F418CAB76FCD515C5A7F3C94D552CF,0,REGULAR,4F418CAB76FCD515C5A7F3C94D552CF,1911f67f35d9a487283f503fc7ab2ac,SDE_ORA_EmployeeDailySnapshotFact_2,SDE_ORA_EmployeeDailySnapshotFact_2,a0Ks8uNXYKLg38LRNzw6gv,Default/SDE_ORA_EmployeeDailySnapshotFact_2,010SU125000000000002,cPbb2XLzpoweqfBP9W6kOJ,AUW487V7S3-AAD,C:\Informatica\scripts,,191c8b43dc8e469ea5f4fa4b99796e0b,8f7d23925d5044e480570706998a8650
1,201C9CC7C59D167A79A2E247C6189A67,Echo Employee Snapshot Oracle R12.1.3,N,D1A6418BA9B721A3282ECC5948E8866D,0,REGULAR,,15cf56791e1279889d61b836271199a,SDE_ORA_EmployeeDailySnapshotFact_3,SDE_ORA_EmployeeDailySnapshotFact_3,8gORkCCxfM8hk1VxABRRqd,Default/SDE_ORA_EmployeeDailySnapshotFact_3,010SU125000000000002,cPbb2XLzpoweqfBP9W6kOJ,AUW487V7S3-AAD,C:\Informatica\scripts,,a1ddcf36679f48dabab02018f8397a7d,8f7d23925d5044e480570706998a8650
2,201C9CC7C59D167A79A2E247C6189A67,Echo Employee Snapshot Oracle R12.1.3,N,C4FA37D5638DC49425B9A3E8572EF1,0,REGULAR,,17d561355624ced1967afc8fb7a1836e,SDE_ORA_EmployeeDailySnapshotFact_4,SDE_ORA_EmployeeDailySnapshotFact_4,6vPlMGJ7sw6juVvUtKpxYq,Default/SDE_ORA_EmployeeDailySnapshotFact_4,010SU125000000000002,cPbb2XLzpoweqfBP9W6kOJ,AUW487V7S3-AAD,C:\Informatica\scripts,,455d14dc8ec6475ab92315a723e3ac19,8f7d23925d5044e480570706998a8650
3,201C9CC7C59D167A79A2E247C6189A67,Echo Employee Snapshot Oracle R12.1.3,N,C1CB82CC89178BE3424CEE1896ED,0,REGULAR,,88818EC577E3BDEA5FAF5D56ED6E8442,,TASK_GROUP_Extract_EmployeeDailySnapshotFact_P...,7PjDO1zxryekytQ7Oh1zEP,Default/TASK_GROUP_Extract_EmployeeDailySnapsh...,010SU125000000000002,cPbb2XLzpoweqfBP9W6kOJ,AUW487V7S3-AAD,C:\Informatica\scripts,,3be7e35490a44199aad3e2a462cdad74,8f7d23925d5044e480570706998a8650
4,201C9CC7C59D167A79A2E247C6189A67,Echo Employee Snapshot Oracle R12.1.3,N,C9EA5C489D4F8CE8B019A694E906B6B,1,REGULAR,,6bcdbdd6812a9ea517908ca566436fb,SDE_ORA_EmployeeDailySnapshotFact_1,SDE_ORA_EmployeeDailySnapshotFact_1,aeOQ6VugmCxipbYyKlDRUr,Default/SDE_ORA_EmployeeDailySnapshotFact_1,010SU125000000000002,cPbb2XLzpoweqfBP9W6kOJ,AUW487V7S3-AAD,C:\Informatica\scripts,,ae855091cdf94966bd83fd6afcdd537d,f6296dba61c749ee91214cf8c38c4f8a


In [212]:
# Plan rows whose lookup found no mapping task have a null (or empty) infa_id;
# collect them, log an error and write them to a CSV report
missingMask = dfPlans['infa_id'].isna() | (dfPlans['infa_id'] == '')
dfMissing = dfPlans[missingMask].copy()
if dfMissing.shape[0] > 0:
    logging.error('Some plans are missing a converted mapping task. Please see "out/missing_tasks.csv" for more details')
    dfMissing.to_csv('out/missing_tasks.csv', quoting=csv.QUOTE_ALL, index=False)

# Generate the Taskflows

In [213]:
#TODO remove after debugging
# Debug switch: while True, only the first plan row is kept and every later cell
# (including the generated Taskflow) sees a single task. Set to False to process
# all plan rows.
DEBUG_FIRST_ROW_ONLY = True
if DEBUG_FIRST_ROW_ONLY:
    logging.warning('DEBUG mode: restricting dfPlans to its first row; all other plan steps are ignored')
    dfPlans = dfPlans.iloc[[0]]
dfPlans

Unnamed: 0,plan_wid,plan_name,plan_inactive_flag,plan_step_wid,plan_step_order,plan_step_type,step_guid,step_wid,step_cmd,step_name,infa_id,infa_path,agent_id,agent_guid,agent_name,script_dir,script_args,dac2idmc_step_id,dac2idmc_group_id
0,201C9CC7C59D167A79A2E247C6189A67,Echo Employee Snapshot Oracle R12.1.3,N,4F418CAB76FCD515C5A7F3C94D552CF,0,REGULAR,4F418CAB76FCD515C5A7F3C94D552CF,1911f67f35d9a487283f503fc7ab2ac,SDE_ORA_EmployeeDailySnapshotFact_2,SDE_ORA_EmployeeDailySnapshotFact_2,a0Ks8uNXYKLg38LRNzw6gv,Default/SDE_ORA_EmployeeDailySnapshotFact_2,010SU125000000000002,cPbb2XLzpoweqfBP9W6kOJ,AUW487V7S3-AAD,C:\Informatica\scripts,,191c8b43dc8e469ea5f4fa4b99796e0b,8f7d23925d5044e480570706998a8650


In [214]:
# Add an empty container that the per-task temp fields are appended to later:
# <tempFields>
# </tempFields>
tmpFieldsEl = etree.Element("tempFields")
parents = root.xpath("//*[local-name() = 'taskflow'][1]")

# Report a missing parent; otherwise attach the placeholder to each match
if not parents:
    logging.error("The specified parent tag was not found.")
for parent in parents:
    parent.append(tmpFieldsEl)


In [215]:
# Add an empty container that the per-task processObject dependencies are appended to later:
# <dependencies>
# </dependencies>
tmpDepEl = etree.Element("dependencies")
parents = root.xpath("//*[local-name() = 'taskflow'][1]")

# Report a missing parent; otherwise attach the placeholder to each match
if not parents:
    logging.error("The specified parent tag was not found.")
for parent in parents:
    parent.append(tmpDepEl)


In [216]:
# Get the list of unique plans
uniquePlans = dfPlans['plan_name'].unique()


def _newId():
    """Return a random 32-character hexadecimal identifier."""
    return uuid.uuid4().hex


def _appendTo(parentXPath, element):
    """Append `element` to every node of the template matched by `parentXPath`; log an error if nothing matches."""
    matches = root.xpath(parentXPath)
    if not matches:
        logging.error("The specified parent tag was not found.")
    for match in matches:
        match.append(element)


# Attributes of the <field> elements placed in each processObject <detail>
processObjectFields = [
    {"label": "TaskProperties Parameters", "name": "taskProperties", "nullable": "true", "required": "false", "type": "reference"},
    {"label": "Output Parameters", "name": "output", "nullable": "true", "required": "false", "type": "reference"},
    {"label": "Fault", "name": "fault", "nullable": "true", "required": "false", "type": "reference"},
    {"label": "Max Wait (Seconds)", "name": "Max_Wait", "nullable": "true", "required": "false", "type": "int"},
]

# Output field name -> label used as the text of the <operation> element.
# The labels are spelled out because the last two cannot be derived by
# replacing underscores with spaces (see the example XML further down).
outputFieldLabels = {
    "Object_Name": "Object Name",
    "Run_Id": "Run Id",
    "Log_Id": "Log Id",
    "Task_Id": "Task Id",
    "Task_Status": "Task Status",
    "Success_Source_Rows": "Success Source Rows",
    "Failed_Source_Rows": "Failed Source Rows",
    "Success_Target_Rows": "Success Target Rows",
    "Failed_Target_Rows": "Failed Target Rows",
    "Start_Time": "Start Time",
    "End_Time": "End Time",
    "Error_Message": "Error Message",
    "TotalTransErrors": "Total Transformation Errors",
    "FirstErrorCode": "First Error Code",
}

# Position counters: the first/last container is decided by processing order,
# not by DataFrame index labels (which need not follow the step order)
totalTasks = len(dfPlans)
taskNum = 0

# Process each of the plans
for plan in uniquePlans:
    dfTmpPlans = dfPlans[dfPlans['plan_name'] == plan]

    # Process the step order
    uniqueSteps = np.sort( dfTmpPlans['plan_step_order'].unique() )
    for step in uniqueSteps:
        dfTmpSteps = dfTmpPlans[dfTmpPlans['plan_step_order'] == step]

        # Process the tasks for the current step
        for _, task in dfTmpSteps.iterrows():
            taskNum += 1
            isFirstContainer = taskNum == 1
            isLastContainer = taskNum == totalTasks
            containerID = _newId()
            stepID = _newId()

            # Update the start link if the first container
            # <start id="b">
            #   <link id="m6yllm0z" targetId="m6yllm1m"/>
            # </start>
            if isFirstContainer:
                startLink = root.xpath("//*[local-name() = 'flow'][1]/*[local-name() = 'start'][1]/*[local-name() = 'link'][1]")
                if startLink:
                    startLink[0].set('targetId', containerID)
                else:
                    logging.error("The start link was not found in the template.")

            # Name shared by the temp field reference, the processObject and the
            # nested taskField operation: <sanitised task name>-<IDMC task id>
            if task['plan_step_type'] == 'CREATE_QUERY_INDEXES':
                tmpName = 'INFA-commandTask'
            else:
                tmpName = re.sub( r'[^A-Za-z0-9\-]+', '-', task['step_name'] )
            refName = f"{ tmpName }-{ task['infa_id'] }"

            # Insert a new temp field element for the task
            # <field description="" name="Data Task 1" type="reference">
            #     <options>
            #         <option name="failOnNotRun">false</option>
            #         <option name="failOnFault">false</option>
            #         <option name="referenceTo">$po:mt-Example-1jY0fuy0iEUhkrHVLx78WK</option>
            #     </options>
            # </field>
            fieldEl = etree.Element("field", description="", name=task['step_name'], type="reference")
            optsEl = etree.SubElement(fieldEl, "options")
            etree.SubElement(optsEl, "option", name="failOnNotRun").text = "false"
            etree.SubElement(optsEl, "option", name="failOnFault").text = "false"
            etree.SubElement(optsEl, "option", name="referenceTo").text = f"$po:{ refName }"

            # Append the field element
            _appendTo("//*[local-name() = 'taskflow'][1]/*[local-name() = 'tempFields'][1]", fieldEl)

            # Insert a dependency for the task
            # <processObject xmlns="http://schemas.active-endpoints.com/appmodules/screenflow/2011/06/avosHostEnvironment.xsd"
            #                displayName="SDE-ORA-EmployeeDailySnapshotFact-2-a0Ks8uNXYKLg38LRNzw6gv"
            #                isByCopy="true"
            #                name="SDE-ORA-EmployeeDailySnapshotFact-2-a0Ks8uNXYKLg38LRNzw6gv">
            #    <description/>
            #    <tags/>
            #    <detail>
            #        <field label="TaskProperties Parameters"
            #                name="taskProperties"
            #                nullable="true"
            #                required="false"
            #                type="reference"/>
            #        <field label="Output Parameters"
            #                name="output"
            #                nullable="true"
            #                required="false"
            #                type="reference"/>
            #        <field label="Fault"
            #                name="fault"
            #                nullable="true"
            #                required="false"
            #                type="reference"/>
            #        <field label="Max Wait (Seconds)"
            #                name="Max_Wait"
            #                nullable="true"
            #                required="false"
            #                type="int"/>
            #    </detail>
            #</processObject>
            # Create the root element 'processObject' with attributes
            # NOTE(review): 'xmlns' is passed as a plain attribute here rather than
            # through lxml's nsmap - confirm the serialised output is what IDMC expects
            poEl = etree.Element(
                "processObject",
                xmlns="http://schemas.active-endpoints.com/appmodules/screenflow/2011/06/avosHostEnvironment.xsd",
                displayName=refName,
                isByCopy="true",
                name=refName
            )

            # Add 'description' and 'tags' sub-elements
            etree.SubElement(poEl, "description")
            etree.SubElement(poEl, "tags")

            # Add 'detail' sub-element and its fields
            detail = etree.SubElement(poEl, "detail")
            for field_attrs in processObjectFields:
                etree.SubElement(detail, "field", **field_attrs)

            # Append the dependency element
            _appendTo("//*[local-name() = 'taskflow'][1]/*[local-name() = 'dependencies'][1]", poEl)

            # Insert the container
            # <eventContainer id="m6yllm1m">
            #      <service id="m6yllm0y">
            #         <title>Data Task 1</title>
            #         <serviceName>ICSExecuteDataTask</serviceName>
            #         <serviceGUID/>
            #         <serviceInput>
            #            <parameter name="Wait for Task to Complete" source="constant" updatable="true">true</parameter>
            #            <parameter name="Max Wait" source="constant" updatable="true">604800</parameter>
            #            <parameter name="Task Name" source="constant" updatable="true">mt_Example</parameter>
            #            <parameter name="GUID" source="constant" updatable="true">1jY0fuy0iEUhkrHVLx78WK</parameter>
            #            <parameter name="Task Type" source="constant" updatable="true">MCT</parameter>
            #            <parameter name="Has Inout Parameters" source="constant" updatable="true">false</parameter>
            #            <parameter name="taskField" source="nested">
            #               <operation source="field" to="mt-Example-1jY0fuy0iEUhkrHVLx78WK">temp.Data Task 1</operation>
            #            </parameter>
            #         </serviceInput>
            #         <serviceOutput>
            #            <operation source="field" to="temp.Data Task 1/output/Object_Name">Object Name</operation>
            #            <operation source="field" to="temp.Data Task 1/output/Run_Id">Run Id</operation>
            #            <operation source="field" to="temp.Data Task 1/output/Log_Id">Log Id</operation>
            #            <operation source="field" to="temp.Data Task 1/output/Task_Id">Task Id</operation>
            #            <operation source="field" to="temp.Data Task 1/output/Task_Status">Task Status</operation>
            #            <operation source="field" to="temp.Data Task 1/output/Success_Source_Rows">Success Source Rows</operation>
            #            <operation source="field" to="temp.Data Task 1/output/Failed_Source_Rows">Failed Source Rows</operation>
            #            <operation source="field" to="temp.Data Task 1/output/Success_Target_Rows">Success Target Rows</operation>
            #            <operation source="field" to="temp.Data Task 1/output/Failed_Target_Rows">Failed Target Rows</operation>
            #            <operation source="field" to="temp.Data Task 1/output/Start_Time">Start Time</operation>
            #            <operation source="field" to="temp.Data Task 1/output/End_Time">End Time</operation>
            #            <operation source="field" to="temp.Data Task 1/output/Error_Message">Error Message</operation>
            #            <operation source="field" to="temp.Data Task 1/output/TotalTransErrors">Total Transformation Errors</operation>
            #            <operation source="field" to="temp.Data Task 1/output/FirstErrorCode">First Error Code</operation>
            #         </serviceOutput>
            #      </service>
            #      <link id="m6zv1b1q" targetId="m6zv1b2f"/>
            #      <events>
            #         <catch faultField="temp.Data Task 1/fault"
            #                id="m6yllm1k"
            #                interrupting="true"
            #                name="error">
            #            <suspend/>
            #         </catch>
            #         <catch faultField="temp.Data Task 1/fault"
            #                id="m6yllm1l"
            #                interrupting="true"
            #                name="warning"/>
            #      </events>
            #   </eventContainer>
            eventEl = etree.Element("eventContainer", id=containerID)
            svcEl = etree.SubElement(eventEl, "service", id=stepID)

            # Create child elements under 'service'
            etree.SubElement(svcEl, "title").text = task['step_name']
            etree.SubElement(svcEl, "serviceName").text = "ICSExecuteDataTask"
            etree.SubElement(svcEl, "serviceGUID")

            # Create 'serviceInput' element
            service_input = etree.SubElement(svcEl, "serviceInput")

            # Create 'parameter' elements under 'serviceInput'
            parameters = [
                {"name": "Wait for Task to Complete", "value": "true"},
                {"name": "Max Wait", "value": "604800"},
                {"name": "Task Name", "value": task['step_name']},
                {"name": "GUID", "value": task['infa_id']},
                {"name": "Task Type", "value": "MCT"},
                {"name": "Has Inout Parameters", "value": "false"},
            ]
            for param in parameters:
                etree.SubElement(service_input, "parameter", name=param["name"], source="constant", updatable="true").text = param["value"]

            # Create 'parameter' with nested 'operation' element. The 'to' attribute
            # carries the full "<name>-<id>" reference, matching the processObject name
            # and the example above (previously only the name part was written).
            nested_param = etree.SubElement(service_input, "parameter", name="taskField", source="nested")
            etree.SubElement(nested_param, "operation", source="field", to=refName).text = f"temp.{ task['step_name'] }"

            # Create 'serviceOutput' element
            service_output = etree.SubElement(svcEl, "serviceOutput")

            # Create 'operation' elements under 'serviceOutput'
            for fieldName, fieldLabel in outputFieldLabels.items():
                etree.SubElement(service_output, "operation", source="field", to=f"temp.{ task['step_name'] }/output/{fieldName}").text = fieldLabel

            # Create 'link' element
            linkID = _newId()

            # If the last container then link to the end
            if isLastContainer:
                etree.SubElement(eventEl, "link", id=linkID, targetId="c")
            # Else link to the next step
            else:
                # TODO create the link to the next container; until then no <link>
                # element is emitted for this container
                logging.warning('No outgoing link was generated for step "%s": linking to the next step is not implemented yet', task['step_name'])

            # Create 'events' element
            events = etree.SubElement(eventEl, "events")

            # Create 'catch' elements under 'events': errors suspend the flow, warnings do not
            catch1ID = _newId()
            catch1 = etree.SubElement(events, "catch", faultField=f"temp.{ task['step_name'] }/fault", id=catch1ID, interrupting="true", name="error")
            etree.SubElement(catch1, "suspend")
            catchID = _newId()
            etree.SubElement(events, "catch", faultField=f"temp.{ task['step_name'] }/fault", id=catchID, interrupting="true", name="warning")

            # Append the container element
            _appendTo("//*[local-name() = 'taskflow'][1]/*[local-name() = 'flow'][1]", eventEl)

    # NOTE(review): every plan is written into the same template tree, so with more
    # than one plan in dfPlans all containers end up in a single Taskflow - confirm
    # whether one Taskflow per plan is required



# Save the Taskflow XML

In [217]:
# Save the Taskflow to the XML file
taskflowOutPath = 'out/tf_Base_Taskflow/Explore/Default/tf_Base_Taskflow.TASKFLOW.xml'

# Create the nested output folder first; the write fails if the directory is missing
os.makedirs(os.path.dirname(taskflowOutPath), exist_ok=True)
logging.info('Writing the Taskflow to %s', taskflowOutPath)
tree.write(taskflowOutPath, pretty_print=True, xml_declaration=False, encoding='UTF-8')

# Example adding a new element

In [218]:
# Test append a new element
new_element = etree.Element("newElement")
new_element.text = "This is a new element"
parents = root.xpath("//*[local-name() = 'taskflow'][1]")

# Append the new element if found
if parents:
    for parent in parents:
        parent.append(new_element)
else:
    print("The specified parent tag was not found.")

# Test writing the xml file, then detach the test element again so the shared
# tree is left unchanged and a later re-run of the save cell does not write
# <newElement> into the real Taskflow
try:
    tree.write('out/test.xml', pretty_print=True, xml_declaration=False, encoding='UTF-8')
finally:
    attachedTo = new_element.getparent()
    if attachedTo is not None:
        attachedTo.remove(new_element)

# Testing Below This Line

In [219]:
# Smoke test: call greet() from the project-local utils.dac2idmc module to confirm the import works
dac2idmc.greet('jon')

'Hello jon!'

In [220]:
# Inspect the first plan row as a Series (one column per line)
dfPlans.iloc[0]

plan_wid                         201C9CC7C59D167A79A2E247C6189A67
plan_name                   Echo Employee Snapshot Oracle R12.1.3
plan_inactive_flag                                              N
plan_step_wid                     4F418CAB76FCD515C5A7F3C94D552CF
plan_step_order                                                 0
plan_step_type                                            REGULAR
step_guid                         4F418CAB76FCD515C5A7F3C94D552CF
step_wid                          1911f67f35d9a487283f503fc7ab2ac
step_cmd                      SDE_ORA_EmployeeDailySnapshotFact_2
step_name                     SDE_ORA_EmployeeDailySnapshotFact_2
infa_id                                    a0Ks8uNXYKLg38LRNzw6gv
infa_path             Default/SDE_ORA_EmployeeDailySnapshotFact_2
agent_id                                     010SU125000000000002
agent_guid                                 cPbb2XLzpoweqfBP9W6kOJ
agent_name                                         AUW487V7S3-AAD
script_dir