In [58]:
import pandas as pd
import re
import subprocess
import uuid
import csv
import json
import hashlib
import os
import shutil
import urllib3
from urllib.parse import quote

## Read the Config

In [59]:
# Load the general settings; the 'idmc' section is used later for the host and credentials
with open('config/config.json', mode='r') as configFile:
    config = json.load(configFile)


In [60]:
# Load the export-metadata template that each generated package is based on
with open('config/exportMetadata.v2.json', mode='r') as metadataFile:
    exportMetadata = json.load(metadataFile)


In [61]:
# Load the project descriptor template written into each package as Default.Project.json
with open('config/Default.Project.json', mode='r') as projectFile:
    defaultProject = json.load(projectFile)


## Read the plans

In [62]:
# Load the execution plans: every column is read as text and blank cells
# stay as empty strings (na_filter=False) instead of becoming NaN
dfPlans = pd.read_csv(
    'in/plans.csv',
    encoding='utf-8',
    dtype='str',
    na_filter=False,
)

In [63]:
# TODO remove below testing filter
#dfPlans = dfPlans.head(1)

In [64]:
# Collect each distinct plan name once; one import package is built per entry
plans = pd.unique(dfPlans['plan_name'])

## Look Up the Converted Mapping Task IDs

In [65]:
# Log in to IDMC and keep the session ID for the follow-up API calls
http = urllib3.PoolManager()

# Serialise the credentials with json.dumps so that quotes or backslashes in
# the user name / password are escaped instead of producing invalid JSON
data = json.dumps({
    'username': config['idmc']['user'],
    'password': config['idmc']['password'],
})

url = 'https://' + config['idmc']['host'] + '/saas/public/core/v3/login'
r = http.request(
    'POST',
    url,
    timeout=3000,  # urllib3 timeouts are expressed in seconds
    body=data,
    headers={
        'Accept': 'application/json',
        'Content-Type': 'application/json'
    }
)

# Fail with a clear message rather than a KeyError on the missing 'userInfo'
# key (the response body is deliberately left out of the message)
if r.status != 200:
    raise RuntimeError(f'IDMC login failed with HTTP status {r.status}')

# Parse the JSON response and pull out the session ID
result = json.loads(r.data.decode('utf-8'))
sessionID = result['userInfo']['sessionId']


In [66]:
# Fetch every mapping task (type MTT) from IDMC, one page at a time.
#   skip      - offset of the next page to request
#   limit     - page size sent with each request
#   taskCount - total number of matching objects as reported by the API
skip = 0
limit = 200
taskCount = 0
pages = []

# Page through mapping task queries
while True:

    # Request the page that starts at the current offset
    url = (
        'https://' + config['idmc']['pod'] + '.' + config['idmc']['host']
        + '/saas/public/core/v3/objects?q=type==%27MTT%27&limit=' + str(limit)
        + '&skip=' + str(skip)
    )
    r = http.request(
        'GET',
        url,
        timeout=3000,
        headers={
            'Accept': 'application/json',
            'INFA-SESSION-ID': sessionID
        }
    )
    if r.status != 200:
        raise RuntimeError(f'IDMC object query failed with HTTP status {r.status} (skip={skip})')

    # Flatten this page's objects into a data frame (one row per task)
    result = json.loads(r.data.decode('utf-8'))
    taskCount = int(result['count'])
    dfPage = pd.json_normalize(result.get('objects') or [])

    # An empty page means there is nothing left, whatever 'count' claims;
    # stopping here also rules out an endless loop
    if dfPage.empty:
        break
    pages.append(dfPage)

    # Advance the offset so the next request returns the NEXT page, and stop
    # once every reported record has been fetched
    skip = skip + limit
    if skip >= taskCount:
        break

# Concatenate once at the end instead of growing the frame inside the loop
dfTasks = pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()

# Keep dfResp bound to the complete result so that any later cell reading it
# sees every page rather than only the last one fetched
dfResp = dfTasks.copy()



In [67]:
# Join the task IDs onto the plans.
# The lookup is built from dfTasks (all pages accumulated by the paging loop),
# not from dfResp, which only holds the most recently fetched page.
# Tasks are de-duplicated on 'id' so a page that was fetched more than once
# cannot multiply the plan rows in the merge.
taskLookup = (
    dfTasks
    .drop_duplicates(subset='id')
    .assign(step_name=lambda df: df['path'].map(os.path.basename))
    [['step_name', 'id', 'path']]
    .rename(columns={'id': 'infa_id', 'path': 'infa_path'})
)

# Drop the columns from any previous run of this cell first, so re-running it
# does not produce infa_id_x / infa_id_y suffix columns
dfPlans = (
    dfPlans
    .drop(columns=['infa_id', 'infa_path'], errors='ignore')
    .merge(taskLookup, how='left', on='step_name')
)

In [68]:
# Write out every plan step that did not match an existing mapping task.
# After the left merge an unmatched step has a NaN (or empty) infa_id.
dfMissing = dfPlans[dfPlans['infa_id'].isna() | (dfPlans['infa_id'] == '')].copy()
if not dfMissing.empty:
    # The output directory may not exist yet on a fresh checkout
    os.makedirs('out', exist_ok=True)
    dfMissing.to_csv('out/missing_tasks.csv', index=False, quoting=csv.QUOTE_ALL)

## Create the Import Packages

In [69]:
# Build one import package (directory tree + zip) per unique plan.
#
# Each package contains:
#   ContentsofExportPackage_<name>.csv  - listing of the taskflow and project
#   exportMetadata.v2.json              - metadata for taskflow, project and mapping tasks
#   Explore/Default.Project.json        - project descriptor
#   Explore/Default/<name>.TASKFLOW.xml - taskflow generated by convert.xq (Saxon)
#   exportPackage.chksum                - SHA-256 checksums of the three files above

def _jsonClone(obj):
    """Return an independent deep copy of a JSON-compatible object."""
    return json.loads(json.dumps(obj))


def _templateFor(objectType):
    """Return a fresh copy of the first exported-object template of the given type."""
    for candidate in exportMetadata['exportedObjects']:
        if candidate['objectType'] == objectType:
            return _jsonClone(candidate)
    raise LookupError(f"No '{objectType}' template found in exportMetadata['exportedObjects']")


def _sha256Upper(path):
    """Return the upper-case hex SHA-256 digest of the file at path."""
    with open(path, 'rb') as infile:
        return hashlib.sha256(infile.read()).hexdigest().upper()


# For each unique plan
for plan in plans:

    # Create a clean version of the name for the new taskflow
    tName = re.sub(r'[^\w\d]+', '_', plan)

    # Restrict the work to THIS plan's steps; without the filter every package
    # would reference the tasks of every plan
    dfPlan = dfPlans[dfPlans['plan_name'] == plan]

    # Steps without a matching task have no usable ID: a NaN would be written
    # as the invalid JSON token NaN, so they are left out of the metadata
    dfResolved = dfPlan[dfPlan['infa_id'].notna() & (dfPlan['infa_id'] != '')]

    # Reset the output directories
    if os.path.exists(f'out/{tName}'):
        shutil.rmtree(f'out/{tName}')
    os.makedirs(f"out/{tName}/Explore/Default")

    # Generate the unique IDs
    projectID = str(uuid.uuid4()).replace('-', '')
    taskflowID = str(uuid.uuid4()).replace('-', '')

    # Create the ContentsofExportPackage File
    content = {
        'objectPath': ['/Explore/Default', '/Explore'],
        'objectName': [tName, 'Default'],
        'objectType': ['TASKFLOW', 'Project'],
        'id': [taskflowID, projectID],
    }
    contentDf = pd.DataFrame(data=content)
    contentDf.to_csv(f'out/{tName}/ContentsofExportPackage_{tName}.csv', index=False, quoting=csv.QUOTE_NONE)

    # Create the exportMetadata file from per-plan copies of the templates, so
    # the loaded exportMetadata is never mutated and the cell can be re-run
    packageMetadata = _jsonClone(exportMetadata)
    packageMetadata['name'] = tName

    taskflowObj = _templateFor('TASKFLOW')
    taskflowObj['objectGuid'] = taskflowID
    taskflowObj['objectName'] = tName
    taskflowObj['metadata']['objectRefs'] = dfResolved['infa_id'].to_list()

    projectObj = _templateFor('Project')
    projectObj['objectGuid'] = projectID

    objList = [taskflowObj, projectObj]
    for _, row in dfResolved.iterrows():
        mappingObj = _templateFor('MTT')
        mappingObj['objectGuid'] = row['infa_id']
        mappingObj['objectName'] = row['step_name']
        objList.append(mappingObj)

    packageMetadata['exportedObjects'] = objList

    with open(f'out/{tName}/exportMetadata.v2.json', 'w') as outfile:
        outfile.write(json.dumps(packageMetadata))

    # Create the Default.Project.json file (shallow copy: only 'id' is replaced)
    projectDoc = dict(defaultProject)
    projectDoc['id'] = f'Projects({projectID})'
    with open(f'out/{tName}/Explore/Default.Project.json', 'w') as outfile:
        outfile.write(json.dumps(projectDoc))

    # Create the taskflow XML file from this plan's rows.
    # NOTE(review): convert.xq presumably reads in/plans.xml - confirm it does
    # not rely on seeing the rows of other plans
    dfPlan.to_xml('in/plans.xml', index=False, row_name='row')
    console = subprocess.run(
        [
            "java", "-Xmx14000M", "-cp", "../saxon-he-10.5.jar", "net.sf.saxon.Query",
            "-q:convert.xq",
            f"-o:out/{tName}/Explore/Default/{tName}.TASKFLOW.xml",
            f"tname={tName}",
            f"tflowid={taskflowID}",
        ],
        capture_output=True,
    )
    # Surface a failed conversion here instead of a FileNotFoundError below
    if console.returncode != 0:
        raise RuntimeError(
            f"Taskflow conversion failed for plan '{plan}' (exit code {console.returncode}): "
            + console.stderr.decode('utf-8', errors='replace')
        )

    # Create the exportPackage.chksum file
    defaultProjectJsonHash = _sha256Upper(f'out/{tName}/Explore/Default.Project.json')
    exportMetadataHash = _sha256Upper(f'out/{tName}/exportMetadata.v2.json')
    taskflowHash = _sha256Upper(f'out/{tName}/Explore/Default/{tName}.TASKFLOW.xml')

    lines = [
        '#\n',
        '#Fri Dec 01 22:59:59 UTC 2023\n',
        f'Explore/Default.Project.json={defaultProjectJsonHash}\n',
        f'exportMetadata.v2.json={exportMetadataHash}\n',
        f'Explore/Default/{tName}.TASKFLOW.xml={taskflowHash}\n',
    ]
    with open(f'out/{tName}/exportPackage.chksum', 'w') as outfile:
        outfile.writelines(lines)

    # Create the import zip file
    shutil.make_archive(f'out/{tName}', 'zip', os.path.join('out', tName))
    

# TEMP BELOW:

In [70]:
# Temporary inspection: show the task IDs currently joined onto the plans
dfPlans['infa_id'].tolist()

['a0Ks8uNXYKLg38LRNzw6gv',
 '8gORkCCxfM8hk1VxABRRqd',
 '6vPlMGJ7sw6juVvUtKpxYq',
 '7PjDO1zxryekytQ7Oh1zEP',
 'aeOQ6VugmCxipbYyKlDRUr',
 'bspgWgKqNAAkdFCD8X1EGw',
 '980nZ1ZyECqlJELIoT6seu',
 '48EkOr6iGA3jPFoSNlYhmt',
 'hrqZyoLnJWAbb4e5QoamgT',
 'bacDD1nRpeZcyoy0ts7ZqA',
 '0DdyYEpJbWNbQ0MHY0se20',
 '32jnMVPICzEjhkWQonRvw9']