In [2]:
import json
from ruamel.yaml import YAML

import openai
import sys
import os
from pathlib import Path
import openai
from dotenv import load_dotenv

from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser

# Local
# filepath = "/home/UNT/ae0589/Desktop/HPCC/AutomaticWorkflowGeneration/ActionEngine/eval/answers/"

# Chameleon
filepath = "/home/cc/AutomaticWorkflowGeneration/ActionEngine/eval/answers/"
filepath_apiinfo = "/home/cc/AutomaticWorkflowGeneration/ActionEngine/db/api_info/api_information.json"

# Load environment variables (OPENAI_API_KEY is read just below) before the
# chat model is constructed.
load_dotenv()
model = ChatOpenAI(model="gpt-3.5-turbo", temperature=0, api_key=os.getenv("OPENAI_API_KEY"))

# Extend sys.path so that the `utils.*` imports in later cells can be resolved.
if "__file__" in globals():
    # Running as a script: use the directory three levels above this file.
    base_path = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    sys.path.append(base_path)
else:
    # Interactive session (e.g. Jupyter): use the parent of the working directory.
    base_path = Path().resolve().parent
    sys.path.append(str(base_path))

def read_json_to_dict(filename):
    """Parse the JSON file at ``filename`` and return the decoded object.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (dict, list, ...).
    """
    with open(filename, 'r') as handle:
        return json.load(handle)

  warn_deprecated(


# Generate Test Data

## Level Easy (1-2 nodes)

In [17]:
filename_1to2 = filepath + 'test_tasklist/tasklist_GTlabel_1-2_nodes.json'

### Function Selection

In [None]:
"""
Generate test data for both different eval data: number of APIs and domains
"""
# NOTE(review): imported mid-notebook; presumably this relies on the sys.path
# entry appended in the setup cell — confirm before moving it to the top.
from utils.func_identifier import func_identifier

# Split sizes; each value is embedded in a database name below and is also
# stored as the "type" of the corresponding result entry in later cells.
num_split = [20, 40, 60, 80, 100, 120, 140, 160, 180, 194]
domain_split = [5, 10, 15, 20, 25, 30, 35, 40, 48]
# Database names, one per split size; later cells prefix them with
# "eval_numbers/" or "eval_domains/" when calling func_identifier.
file_names_num = [f"eval_api_vec_num_{num}" for num in num_split]
file_names_dom = [f"eval_api_vec_domain_{num}" for num in domain_split]

In [12]:
"""
Read Data & Check
"""
test_data = read_json_to_dict(filename_1to2)
print(json.dumps(test_data[-1], indent=4))

{
    "id": 10,
    "user_query": "Generate a mosaic art image with the theme 'Ancient Roman architecture' and resize it to 1024x768 pixels for a digital frame.",
    "task_list": [
        {
            "task_number": 1,
            "task_description": "Generate a mosaic art image with the theme 'Ancient Roman architecture'."
        },
        {
            "task_number": 2,
            "task_description": "Resize the mosaic art image to 1024x768 pixels for a digital frame."
        }
    ],
    "selected_apis": [
        {
            "name": "tti_Mosaic_Art",
            "parameters": [
                "prompt"
            ]
        },
        {
            "name": "Image_Resizing",
            "parameters": [
                "image_file",
                "width",
                "height"
            ]
        }
    ],
    "topological_order": [
        {
            "state": "sequential",
            "task_nums": [
                1
            ]
        },
        {
            "

In [19]:

"""
number of APIs
"""
# One result entry per API-count database; each entry labels every test query
# with the APIs that func_identifier selected for its tasks.
data = []
for num, db_name in zip(num_split, file_names_num):
    selected_func = []
    for query_id, sample in enumerate(test_data, start=1):
        selected_functions, no_func, non_func_list = func_identifier(
            sample["task_list"],
            sample["user_query"],
            "eval_numbers/" + db_name,
        )
        picked = []
        for api in selected_functions:
            picked.append({"task_number": api["task_num"], "name": api["name"]})
        selected_func.append({"id": query_id, "api_names": picked})
    data.append({"type": num, "db_name": db_name, "label": selected_func})

with open('./eval_data/func_selector/api_nums/nums_1-2_nodes.json', 'w') as out_file:
    json.dump(data, out_file, indent=4)



In [20]:
"""
number of Domains
"""
# One result entry per domain-count database; each entry labels every test
# query with the APIs that func_identifier selected for its tasks.
data = []
for num, db_name in zip(domain_split, file_names_dom):
    selected_func = []
    for query_id, sample in enumerate(test_data, start=1):
        selected_functions, no_func, non_func_list = func_identifier(
            sample["task_list"],
            sample["user_query"],
            "eval_domains/" + db_name,
        )
        picked = []
        for api in selected_functions:
            picked.append({"task_number": api["task_num"], "name": api["name"]})
        selected_func.append({"id": query_id, "api_names": picked})
    data.append({"type": num, "db_name": db_name, "label": selected_func})

with open('./eval_data/func_selector/domain_nums/domains_1-2_nodes.json', 'w') as out_file:
    json.dump(data, out_file, indent=4)



### Topological Ordering

In [19]:
from utils.wf_optimizer import wf_optimizer

"""
Read Data & Check
"""
filepath = "/home/cc/AutomaticWorkflowGeneration/ActionEngine/eval/answers/"
filename_1to2 = filepath + 'test_tasklist/tasklist_GTlabel_1-2_nodes.json'
test_set = read_json_to_dict(filename_1to2)

In [20]:
"""
Generate Test Data for Topological Order
"""
def generate_node_pairs(order_list):
    """Build "a < b" ordering strings for each node against every later node.

    Args:
        order_list: Iterable of groups (each an iterable of node numbers),
            given in execution order.

    Returns:
        A list with one ``{"num": node, "pairs": [...]}`` dict per node, in
        flattened order. ``pairs`` holds ``"<node> < <later>"`` for every node
        that appears after it in the flattened sequence (including nodes from
        the same group).
    """
    sequence = []
    for group in order_list:
        sequence.extend(group)

    return [
        {
            "num": node,
            "pairs": [f"{node} < {later}" for later in sequence[position + 1:]],
        }
        for position, node in enumerate(sequence)
    ]

# Run the workflow optimizer on every test query and record the pairwise
# ordering constraints implied by the order it returns.
# The loop variable is deliberately NOT called `test_data`: that name holds the
# loaded test list used by the function-selection cells, and rebinding it to a
# single record here would break those cells when they are re-run.
data = []
for truck_id, sample in enumerate(test_set, start=1):
    semantic_wf = wf_optimizer(sample["user_query"], sample["task_list"])
    order_list = [level["task_nums"] for level in semantic_wf]
    pairs = generate_node_pairs(order_list)
    data.append({"id": truck_id, "pairs": pairs})

with open('./eval_data/topological_order/easy_1-2_nodes.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)


### Data Dependency Management

In [21]:

"""
Dependency Management
"""
from utils.data_dependency import confirm_dependency
from utils.schemas.workflow import TaskOutputDescription, DependentParams

"""
Load Data
"""
def read_apiinfo(filename):
    """Read a file holding one JSON document per line.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with the decoded object of every line, in file order.
    """
    with open(filename, 'r') as file:
        return [json.loads(line) for line in file]

filename_tasklist = filepath + 'test_tasklist/tasklist_GTlabel_1-2_nodes.json'
filename_topologicalorder = filepath + 'test_topologicalorder/topologicalorder_GTlabel_1-2_nodes.json'

api_info = read_apiinfo(filepath_apiinfo)
top_orders = read_json_to_dict(filename_topologicalorder)
data = read_json_to_dict(filename_tasklist)

In [22]:


# Attach the ground-truth topological order to each record. The two lists are
# paired by position, so fail loudly if they do not line up one-to-one.
if len(top_orders) != len(data):
    raise ValueError(
        f"top_orders has {len(top_orders)} entries but data has {len(data)}; "
        "they must line up one-to-one"
    )
for record, order in zip(data, top_orders):
    record['topological_order'] = order['topological_order']

extracted_api_names = [
    {
        'id': item['id'],
        'task_list': item["task_list"],
        'selected_apis': item['selected_apis']
    }
    for item in data
]

# Merge each task with the API at the same position in selected_apis, giving
# one dict per task that carries both the task fields and the API fields.
all_task_info = [
    [{**task, **api} for task, api in zip(entry["task_list"], entry["selected_apis"])]
    for entry in extracted_api_names
]

# Extract all api names from api repositories
all_api_names = [item["name"] for item in api_info]

# Retrieve the full repository record of every selected API.
for record, tasks in zip(data, all_task_info):
    selected_functions = []
    for task in tasks:
        if task["name"] not in all_api_names:
            continue
        for func in api_info:
            if task["name"] == func["name"]:
                # Shallow copy so the keys added below do not end up in api_info.
                selected_func = func.copy()
                selected_func["task_num"] = int(task["task_number"])
                selected_func["task_description"] = task["task_description"]
                selected_functions.append(selected_func)
    # Assigned once per record, outside the task loop, so a record with an empty
    # task list still gets the key instead of raising KeyError further down.
    record["selected_functions"] = selected_functions

# Classify every input parameter of every selected API as either user input or
# dependent on another task.
for record in data:
    # NOTE(review): dd_data below reads "dependencies"/"depended_params" from
    # record["selected_functions"], not from the list returned here; this
    # presumably relies on confirm_dependency updating those dicts in place — confirm.
    selected_functions, user_inputs, depended_params = confirm_dependency(record["topological_order"], record["selected_functions"])
    func_list = []
    for api in selected_functions:
        all_params = [component['name'] for component in api["input_parameters_with_datatype"]]
        dependent_names = [list(item.keys())[0] for item in api["depended_params"]]
        user_input_names = [param for param in all_params if param not in dependent_names]
        func_list.append({"name": api["name"], "all_params": all_params, "user_input": user_input_names, "dependent_params": dependent_names})
    record["param_dependency_management"] = func_list

# Save Data Dependency Management test data
# - classification of user_input and dependent_param
# - correctness of dependent_param
dd_data = [
    {
        'id': item['id'],
        "number_of_node": len(item["task_list"]),
        'task_list': item["task_list"],
        "topological_order": item["topological_order"],
        'selected_apis': [{
            "name": api["name"],
            "input_params": api["input_parameters_with_datatype"],
            "dependencies": api["dependencies"],
            "depended_params": api["depended_params"],
            } for api in item['selected_functions']],
        'param_dependency_management': item["param_dependency_management"]
    }
    for item in data
]

with open('./eval_data/data_dependency_management/easy_1-2_nodes.json', 'w') as json_file:
    json.dump(dd_data, json_file, indent=4)


## Level Intermediate (3-5 nodes)

In [23]:
filename_3to5 = filepath + 'test_tasklist/tasklist_GTlabel_3-5_nodes.json'

### Function Selection

In [3]:
"""
Generate test data for both different eval data: number of APIs and domains
"""
# NOTE(review): imported mid-notebook; presumably this relies on the sys.path
# entry appended in the setup cell — confirm before moving it to the top.
from utils.func_identifier import func_identifier

# Split sizes; each value is embedded in a database name below and is also
# stored as the "type" of the corresponding result entry in later cells.
num_split = [20, 40, 60, 80, 100, 120, 140, 160, 180, 194]
domain_split = [5, 10, 15, 20, 25, 30, 35, 40, 48]
# Database names, one per split size; later cells prefix them with
# "eval_numbers/" or "eval_domains/" when calling func_identifier.
file_names_num = [f"eval_api_vec_num_{num}" for num in num_split]
file_names_dom = [f"eval_api_vec_domain_{num}" for num in domain_split]

  warn_deprecated(
  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [4]:
"""
Read Data & Check
"""
test_data = read_json_to_dict(filename_3to5)
print(json.dumps(test_data[-1], indent=4))

{
    "id": 10,
    "node_number": 5,
    "user_query": "Create a pastel art image of a serene beach at sunset with palm trees. Convert this image to a JPEG, resize it to 1024x768 pixels, enhance its quality, and send it to my friend's email.",
    "task_list": [
        {
            "task_number": 1,
            "task_description": "Create a pastel art image of a serene beach at sunset with palm trees."
        },
        {
            "task_number": 2,
            "task_description": "Convert the pastel art image of a serene beach at sunset with palm trees to a JPEG format."
        },
        {
            "task_number": 3,
            "task_description": "Resize the JPEG image of a serene beach at sunset with palm trees to 1024x768 pixels."
        },
        {
            "task_number": 4,
            "task_description": "Enhance the quality of the resized JPEG image of a serene beach at sunset with palm trees."
        },
        {
            "task_number": 5,
            "task

In [None]:

"""
number of APIs
"""
# One result entry per API-count database; each entry labels every test query
# with the APIs that func_identifier selected for its tasks.
data = []
for num, db_name in zip(num_split, file_names_num):
    selected_func = []
    for query_id, sample in enumerate(test_data, start=1):
        selected_functions, no_func, non_func_list = func_identifier(
            sample["task_list"],
            sample["user_query"],
            "eval_numbers/" + db_name,
        )
        picked = []
        for api in selected_functions:
            picked.append({"task_number": api["task_num"], "name": api["name"]})
        selected_func.append({"id": query_id, "api_names": picked})
    data.append({"type": num, "db_name": db_name, "label": selected_func})

In [7]:
with open(f'./eval_data/func_selector/api_nums/nums_3-5_nodes.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

In [8]:
"""
number of Domains
"""
# One result entry per domain-count database; each entry labels every test
# query with the APIs that func_identifier selected for its tasks.
data = []
for num, db_name in zip(domain_split, file_names_dom):
    selected_func = []
    for query_id, sample in enumerate(test_data, start=1):
        selected_functions, no_func, non_func_list = func_identifier(
            sample["task_list"],
            sample["user_query"],
            "eval_domains/" + db_name,
        )
        picked = []
        for api in selected_functions:
            picked.append({"task_number": api["task_num"], "name": api["name"]})
        selected_func.append({"id": query_id, "api_names": picked})
    data.append({"type": num, "db_name": db_name, "label": selected_func})

with open('./eval_data/func_selector/domain_nums/domains_3-5_nodes.json', 'w') as out_file:
    json.dump(data, out_file, indent=4)



### Topological Ordering

In [24]:
from utils.wf_optimizer import wf_optimizer

"""
Read Data & Check
"""
filename_3to5 = filepath + 'test_tasklist/tasklist_GTlabel_3-5_nodes.json'
test_set = read_json_to_dict(filename_3to5)

In [25]:
"""
Generate Test Data for Topological Order
"""
def generate_node_pairs(order_list):
    """Build "a < b" ordering strings for each node against every later node.

    Args:
        order_list: Iterable of groups (each an iterable of node numbers),
            given in execution order.

    Returns:
        A list with one ``{"num": node, "pairs": [...]}`` dict per node, in
        flattened order. ``pairs`` holds ``"<node> < <later>"`` for every node
        that appears after it in the flattened sequence (including nodes from
        the same group).
    """
    sequence = []
    for group in order_list:
        sequence.extend(group)

    return [
        {
            "num": node,
            "pairs": [f"{node} < {later}" for later in sequence[position + 1:]],
        }
        for position, node in enumerate(sequence)
    ]

# Run the workflow optimizer on every test query and record the pairwise
# ordering constraints implied by the order it returns.
# The loop variable is deliberately NOT called `test_data`: that name holds the
# loaded test list used by the function-selection cells, and rebinding it to a
# single record here would break those cells when they are re-run.
data = []
for truck_id, sample in enumerate(test_set, start=1):
    semantic_wf = wf_optimizer(sample["user_query"], sample["task_list"])
    order_list = [level["task_nums"] for level in semantic_wf]
    pairs = generate_node_pairs(order_list)
    data.append({"id": truck_id, "pairs": pairs})

with open('./eval_data/topological_order/inter_3-5_nodes.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)


### Data Dependency Management

In [26]:

"""
Dependency Management
"""
from utils.data_dependency import confirm_dependency
from utils.schemas.workflow import TaskOutputDescription, DependentParams

"""
Load Data
"""
def read_apiinfo(filename):
    """Read a file holding one JSON document per line.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with the decoded object of every line, in file order.
    """
    with open(filename, 'r') as file:
        return [json.loads(line) for line in file]


filename_tasklist = filepath + 'test_tasklist/tasklist_GTlabel_3-5_nodes.json'
filename_topologicalorder = filepath + 'test_topologicalorder/topologicalorder_GTlabel_3-5_nodes.json'

api_info = read_apiinfo(filepath_apiinfo)
top_orders = read_json_to_dict(filename_topologicalorder)
data = read_json_to_dict(filename_tasklist)

In [27]:


# Attach the ground-truth topological order to each record. The two lists are
# paired by position, so fail loudly if they do not line up one-to-one.
if len(top_orders) != len(data):
    raise ValueError(
        f"top_orders has {len(top_orders)} entries but data has {len(data)}; "
        "they must line up one-to-one"
    )
for record, order in zip(data, top_orders):
    record['topological_order'] = order['topological_order']

extracted_api_names = [
    {
        'id': item['id'],
        'task_list': item["task_list"],
        'selected_apis': item['selected_apis']
    }
    for item in data
]

# Merge each task with the API at the same position in selected_apis, giving
# one dict per task that carries both the task fields and the API fields.
all_task_info = [
    [{**task, **api} for task, api in zip(entry["task_list"], entry["selected_apis"])]
    for entry in extracted_api_names
]

# Extract all api names from api repositories
all_api_names = [item["name"] for item in api_info]

# Retrieve the full repository record of every selected API.
for record, tasks in zip(data, all_task_info):
    selected_functions = []
    for task in tasks:
        if task["name"] not in all_api_names:
            continue
        for func in api_info:
            if task["name"] == func["name"]:
                # Shallow copy so the keys added below do not end up in api_info.
                selected_func = func.copy()
                selected_func["task_num"] = int(task["task_number"])
                selected_func["task_description"] = task["task_description"]
                selected_functions.append(selected_func)
    # Assigned once per record, outside the task loop, so a record with an empty
    # task list still gets the key instead of raising KeyError further down.
    record["selected_functions"] = selected_functions

# Classify every input parameter of every selected API as either user input or
# dependent on another task.
for record in data:
    # NOTE(review): dd_data below reads "dependencies"/"depended_params" from
    # record["selected_functions"], not from the list returned here; this
    # presumably relies on confirm_dependency updating those dicts in place — confirm.
    selected_functions, user_inputs, depended_params = confirm_dependency(record["topological_order"], record["selected_functions"])
    func_list = []
    for api in selected_functions:
        all_params = [component['name'] for component in api["input_parameters_with_datatype"]]
        dependent_names = [list(item.keys())[0] for item in api["depended_params"]]
        user_input_names = [param for param in all_params if param not in dependent_names]
        func_list.append({"name": api["name"], "all_params": all_params, "user_input": user_input_names, "dependent_params": dependent_names})
    record["param_dependency_management"] = func_list

# Save Data Dependency Management test data
# - classification of user_input and dependent_param
# - correctness of dependent_param
dd_data = [
    {
        'id': item['id'],
        "number_of_node": len(item["task_list"]),
        'task_list': item["task_list"],
        "topological_order": item["topological_order"],
        'selected_apis': [{
            "name": api["name"],
            "input_params": api["input_parameters_with_datatype"],
            "dependencies": api["dependencies"],
            "depended_params": api["depended_params"],
            } for api in item['selected_functions']],
        'param_dependency_management': item["param_dependency_management"]
    }
    for item in data
]

with open('./eval_data/data_dependency_management/inter_3-5_nodes.json', 'w') as json_file:
    json.dump(dd_data, json_file, indent=4)


## Level Hard (6-10 nodes)

In [28]:
filename_6to10 = filepath + 'test_tasklist/tasklist_GTlabel_6-10_nodes.json'

### Function Selection

In [7]:
"""
Generate test data for both different eval data: number of APIs and domains
"""
# NOTE(review): imported mid-notebook; presumably this relies on the sys.path
# entry appended in the setup cell — confirm before moving it to the top.
from utils.func_identifier import func_identifier

# Split sizes; each value is embedded in a database name below and is also
# stored as the "type" of the corresponding result entry in later cells.
num_split = [20, 40, 60, 80, 100, 120, 140, 160, 180, 194]
domain_split = [5, 10, 15, 20, 25, 30, 35, 40, 48]
# Database names, one per split size; later cells prefix them with
# "eval_numbers/" or "eval_domains/" when calling func_identifier.
file_names_num = [f"eval_api_vec_num_{num}" for num in num_split]
file_names_dom = [f"eval_api_vec_domain_{num}" for num in domain_split]

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [8]:
"""
Read Data & Check
"""
test_data = read_json_to_dict(filename_6to10)
print(json.dumps(test_data[-1], indent=4))

{
    "id": 10,
    "node_number": 10,
    "user_query": "I am working on a project that involves creating a sketch dog book. Please generate 10 diffrent images in sketching art style:Labrador Retriever, German Shepherd, Golden Retriever, Bulldog, Poodle, Beagle, Rottweiler, Dachshund, Siberian Husky, Shih Tzu",
    "task_list": [
        {
            "task_number": 1,
            "task_description": "Generate a sketching art style image of a Labrador Retriever."
        },
        {
            "task_number": 2,
            "task_description": "Generate a sketching art style image of a German Shepherd."
        },
        {
            "task_number": 3,
            "task_description": "Generate a sketching art style image of a Golden Retriever."
        },
        {
            "task_number": 4,
            "task_description": "Generate a sketching art style image of a Bulldog."
        },
        {
            "task_number": 5,
            "task_description": "Generate a sketching a

In [9]:

"""
number of APIs
"""
# One result entry per API-count database; each entry labels every test query
# with the APIs that func_identifier selected for its tasks.
data = []
for num, db_name in zip(num_split, file_names_num):
    selected_func = []
    for query_id, sample in enumerate(test_data, start=1):
        selected_functions, no_func, non_func_list = func_identifier(
            sample["task_list"],
            sample["user_query"],
            "eval_numbers/" + db_name,
        )
        picked = []
        for api in selected_functions:
            picked.append({"task_number": api["task_num"], "name": api["name"]})
        selected_func.append({"id": query_id, "api_names": picked})
    data.append({"type": num, "db_name": db_name, "label": selected_func})



In [None]:
with open(f'./eval_data/func_selector/api_nums/nums_6-10_nodes.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

In [None]:
"""
number of Domains
"""
# One result entry per domain-count database; each entry labels every test
# query with the APIs that func_identifier selected for its tasks.
data = []
for num, db_name in zip(domain_split, file_names_dom):
    selected_func = []
    for query_id, sample in enumerate(test_data, start=1):
        selected_functions, no_func, non_func_list = func_identifier(
            sample["task_list"],
            sample["user_query"],
            "eval_domains/" + db_name,
        )
        picked = []
        for api in selected_functions:
            picked.append({"task_number": api["task_num"], "name": api["name"]})
        selected_func.append({"id": query_id, "api_names": picked})
    data.append({"type": num, "db_name": db_name, "label": selected_func})

with open('./eval_data/func_selector/domain_nums/domains_6-10_nodes.json', 'w') as out_file:
    json.dump(data, out_file, indent=4)



### Topological Ordering

In [29]:
from utils.wf_optimizer import wf_optimizer

"""
Read Data & Check
"""
filename_6to10 = filepath + 'test_tasklist/tasklist_GTlabel_6-10_nodes.json'
test_set = read_json_to_dict(filename_6to10)

In [30]:
"""
Generate Test Data for Topological Order
"""
def generate_node_pairs(order_list):
    """Build "a < b" ordering strings for each node against every later node.

    Args:
        order_list: Iterable of groups (each an iterable of node numbers),
            given in execution order.

    Returns:
        A list with one ``{"num": node, "pairs": [...]}`` dict per node, in
        flattened order. ``pairs`` holds ``"<node> < <later>"`` for every node
        that appears after it in the flattened sequence (including nodes from
        the same group).
    """
    sequence = []
    for group in order_list:
        sequence.extend(group)

    return [
        {
            "num": node,
            "pairs": [f"{node} < {later}" for later in sequence[position + 1:]],
        }
        for position, node in enumerate(sequence)
    ]

# Run the workflow optimizer on every test query and record the pairwise
# ordering constraints implied by the order it returns.
# The loop variable is deliberately NOT called `test_data`: that name holds the
# loaded test list used by the function-selection cells, and rebinding it to a
# single record here would break those cells when they are re-run.
data = []
for truck_id, sample in enumerate(test_set, start=1):
    semantic_wf = wf_optimizer(sample["user_query"], sample["task_list"])
    order_list = [level["task_nums"] for level in semantic_wf]
    pairs = generate_node_pairs(order_list)
    data.append({"id": truck_id, "pairs": pairs})

# This is the Hard (6-10 nodes) level, so the file uses the "hard_" prefix like
# the other hard-level outputs; the previous "inter_" prefix was a copy-paste slip.
# NOTE(review): update any consumer that still reads inter_6-10_nodes.json.
with open('./eval_data/topological_order/hard_6-10_nodes.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)


### Data Dependency Management

In [31]:

"""
Dependency Management
"""
from utils.data_dependency import confirm_dependency
from utils.schemas.workflow import TaskOutputDescription, DependentParams

"""
Load Data
"""
def read_apiinfo(filename):
    """Read a file holding one JSON document per line.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with the decoded object of every line, in file order.
    """
    with open(filename, 'r') as file:
        return [json.loads(line) for line in file]


filename_tasklist = filepath + 'test_tasklist/tasklist_GTlabel_6-10_nodes.json'
filename_topologicalorder = filepath + 'test_topologicalorder/topologicalorder_GTlabel_6-10_nodes.json'

api_info = read_apiinfo(filepath_apiinfo)
top_orders = read_json_to_dict(filename_topologicalorder)
data = read_json_to_dict(filename_tasklist)

In [32]:


# Attach the ground-truth topological order to each record. The two lists are
# paired by position, so fail loudly if they do not line up one-to-one.
if len(top_orders) != len(data):
    raise ValueError(
        f"top_orders has {len(top_orders)} entries but data has {len(data)}; "
        "they must line up one-to-one"
    )
for record, order in zip(data, top_orders):
    record['topological_order'] = order['topological_order']

extracted_api_names = [
    {
        'id': item['id'],
        'task_list': item["task_list"],
        'selected_apis': item['selected_apis']
    }
    for item in data
]

# Merge each task with the API at the same position in selected_apis, giving
# one dict per task that carries both the task fields and the API fields.
all_task_info = [
    [{**task, **api} for task, api in zip(entry["task_list"], entry["selected_apis"])]
    for entry in extracted_api_names
]

# Extract all api names from api repositories
all_api_names = [item["name"] for item in api_info]

# Retrieve the full repository record of every selected API.
for record, tasks in zip(data, all_task_info):
    selected_functions = []
    for task in tasks:
        if task["name"] not in all_api_names:
            continue
        for func in api_info:
            if task["name"] == func["name"]:
                # Shallow copy so the keys added below do not end up in api_info.
                selected_func = func.copy()
                selected_func["task_num"] = int(task["task_number"])
                selected_func["task_description"] = task["task_description"]
                selected_functions.append(selected_func)
    # Assigned once per record, outside the task loop, so a record with an empty
    # task list still gets the key instead of raising KeyError further down.
    record["selected_functions"] = selected_functions

# Classify every input parameter of every selected API as either user input or
# dependent on another task.
for record in data:
    # NOTE(review): dd_data below reads "dependencies"/"depended_params" from
    # record["selected_functions"], not from the list returned here; this
    # presumably relies on confirm_dependency updating those dicts in place — confirm.
    selected_functions, user_inputs, depended_params = confirm_dependency(record["topological_order"], record["selected_functions"])
    func_list = []
    for api in selected_functions:
        all_params = [component['name'] for component in api["input_parameters_with_datatype"]]
        dependent_names = [list(item.keys())[0] for item in api["depended_params"]]
        user_input_names = [param for param in all_params if param not in dependent_names]
        func_list.append({"name": api["name"], "all_params": all_params, "user_input": user_input_names, "dependent_params": dependent_names})
    record["param_dependency_management"] = func_list

# Save Data Dependency Management test data
# - classification of user_input and dependent_param
# - correctness of dependent_param
dd_data = [
    {
        'id': item['id'],
        "number_of_node": len(item["task_list"]),
        'task_list': item["task_list"],
        "topological_order": item["topological_order"],
        'selected_apis': [{
            "name": api["name"],
            "input_params": api["input_parameters_with_datatype"],
            "dependencies": api["dependencies"],
            "depended_params": api["depended_params"],
            } for api in item['selected_functions']],
        'param_dependency_management': item["param_dependency_management"]
    }
    for item in data
]

with open('./eval_data/data_dependency_management/hard_6-10_nodes.json', 'w') as json_file:
    json.dump(dd_data, json_file, indent=4)


# Decompose YAML

In [8]:
yaml = YAML()
def read_yaml(filepath):
    """Load a YAML document from ``filepath`` with ruamel.yaml.

    Args:
        filepath: Path of the YAML file to read.

    Returns:
        The object produced by ``YAML().load`` for the file.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Any parsing error raised by ruamel.yaml is propagated.

    Errors are no longer converted into an "Error reading file: ..." string:
    returning that string made a failed read look like data, and the problem
    only surfaced later as a confusing failure when indexing the result
    (e.g. ``d['spec']``).
    """
    parser = YAML()
    with open(filepath, "r") as yaml_file:
        return parser.load(yaml_file)
f_path = "/home/cc/AutomaticWorkflowGeneration/ActionEngine/eval/answers/test_endtoend/ae_without_compiler/argo_workflow.yaml"
d = read_yaml(f_path)

In [9]:
d

{'apiVersion': 'argoproj.io/v1alpha1', 'kind': 'Workflow', 'metadata': {'generateName': 'image-processing-dag-'}, 'spec': {'entrypoint': 'dag-workflow', 'templates': [{'name': 'dag-workflow', 'dag': {'tasks': [{'name': 't1', 'template': 'tti-animation-art', 'arguments': {'parameters': [{'name': 'prompt', 'value': 'Generate an animated image of a dog.'}]}}, {'name': 't2', 'dependencies': ['t1'], 'template': 'image-resizing', 'arguments': {'parameters': [{'name': 'width', 'value': 500}, {'name': 'height', 'value': 500}, {'name': 'file', 'value': '{{tasks.t1.outputs.parameters.output_image}}'}]}}, {'name': 't3', 'dependencies': ['t2'], 'template': 'tti-animation-art', 'arguments': {'parameters': [{'name': 'prompt', 'value': 'Generate an animated image of a cat.'}]}}, {'name': 't4', 'dependencies': ['t3'], 'template': 'image-resizing', 'arguments': {'parameters': [{'name': 'width', 'value': 500}, {'name': 'height', 'value': 500}, {'name': 'file', 'value': '{{tasks.t3.outputs.parameters.out

In [11]:
dag = d['spec']['templates'][0]['dag']['tasks']
for item in dag:
    print(item)

{'name': 't1', 'template': 'tti-animation-art', 'arguments': {'parameters': [{'name': 'prompt', 'value': 'Generate an animated image of a dog.'}]}}
{'name': 't2', 'dependencies': ['t1'], 'template': 'image-resizing', 'arguments': {'parameters': [{'name': 'width', 'value': 500}, {'name': 'height', 'value': 500}, {'name': 'file', 'value': '{{tasks.t1.outputs.parameters.output_image}}'}]}}
{'name': 't3', 'dependencies': ['t2'], 'template': 'tti-animation-art', 'arguments': {'parameters': [{'name': 'prompt', 'value': 'Generate an animated image of a cat.'}]}}
{'name': 't4', 'dependencies': ['t3'], 'template': 'image-resizing', 'arguments': {'parameters': [{'name': 'width', 'value': 500}, {'name': 'height', 'value': 500}, {'name': 'file', 'value': '{{tasks.t3.outputs.parameters.output_image}}'}]}}
{'name': 't5', 'dependencies': ['t4'], 'template': 'send-email', 'arguments': {'parameters': [{'name': 'sender_address', 'value': 'sender@example.com'}, {'name': 'receiver_address', 'value': 'rece

### Selected API Names

In [12]:
print("number of subtask: ", len(dag))
# One {"name": <template>} entry per DAG task, in task order.
api_names = [{"name": task['template']} for task in dag]

# {selected_apis: api_names}
api_names

number of subtask:  5


[{'name': 'tti-animation-art'},
 {'name': 'image-resizing'},
 {'name': 'tti-animation-art'},
 {'name': 'image-resizing'},
 {'name': 'send-email'}]

### topological order

In [99]:
# Collect each DAG task's upstream dependencies, in task order.
dep_list = []
for item in dag:
    # A DAG task with no upstream tasks may omit the "dependencies" key
    # altogether, so treat a missing key as "no dependencies" rather than
    # raising KeyError.
    deps = item.get("dependencies", [])
    print(item["name"], ", ", deps)
    dep_list.append(deps)

t1 ,  []
t2 ,  []
t3 ,  ['t1', 't2']
t4 ,  ['t1', 't2']
t5 ,  ['t3', 't4']


In [156]:
# Group consecutive tasks (1-based numbers) into execution levels.
# A task joins the current group when it has no dependencies or when its
# dependency list equals that of the task right before it; otherwise it opens
# a new group.
task_num = []
task_nums = []

for i, deps in enumerate(dep_list):
    joins_current = len(deps) == 0 or (i > 0 and deps == dep_list[i - 1])
    if not joins_current:
        # Close the current group, but never emit an empty one (this used to
        # happen when the very first task already had dependencies).
        if task_num:
            task_nums.append(task_num)
        task_num = []
    task_num.append(i + 1)

# Flush the last group; skipped when dep_list is empty so that no empty group
# reaches the topological-order cell.
if task_num:
    task_nums.append(task_num)

In [157]:
# One entry per group: groups with more than one task are marked parallel,
# single-task groups sequential.
# NOTE(review): "paralell" is kept exactly as spelled because it is emitted
# data; confirm the spelling expected by the ground-truth files before changing it.
topological_order = [
    {
        "state": "paralell" if len(group) > 1 else "sequential",
        "task_nums": list(group),
    }
    for group in task_nums
]

## data dependency

In [160]:
import re

def extract_task_number(s):
    """Return the first ``t<digits>`` token found in ``s``.

    Args:
        s: String to search, e.g. an Argo parameter reference.

    Returns:
        The matched text (such as ``"t1"``), or ``None`` when ``s`` contains
        no ``t`` immediately followed by one or more digits.
    """
    found = re.search(r't\d+', s)
    if found is None:
        return None
    return found.group(0)

# For every DAG task, split its parameters into user inputs and parameters
# fed by another task's output.
param_dependency_management = []
for task in dag:
    # Tolerate tasks that declare no arguments/parameters at all.
    params = task.get("arguments", {}).get("parameters", [])
    user_input, dependent_paramas, dependent_params_with_source = [], [], []
    datadep = {
            "name": task["template"],
            "all_params": [
                item['name'] for item in params
            ]
        }
    for param in params:
        # YAML scalars can be non-strings (e.g. the integer 500 used for
        # width/height); a substring test on those raises TypeError, so
        # compare against the text form.
        value = str(param['value'])
        if "input" in value:
            user_input.append(param['name'])
        elif "tasks" in value:
            dependent_paramas.append(param['name'])
            dependent_params_with_source.append({str(param['name']): extract_task_number(value)})
        # NOTE(review): values containing neither marker (plain literals) end
        # up in neither list — confirm whether they should count as user input.
    datadep["user_input"] = user_input
    datadep["dependent_paramas"] = dependent_paramas
    datadep["dependent_params_with_source"] = dependent_params_with_source
    param_dependency_management.append(datadep)

In [161]:
param_dependency_management

[{'name': 'tti-animation-art',
  'all_params': ['prompt'],
  'user_input': ['prompt'],
  'dependent_paramas': [],
  'dependent_params_with_source': []},
 {'name': 'tti-animation-art',
  'all_params': ['prompt'],
  'user_input': ['prompt'],
  'dependent_paramas': [],
  'dependent_params_with_source': []},
 {'name': 'image-resizing',
  'all_params': ['width', 'height', 'file'],
  'user_input': ['width', 'height'],
  'dependent_paramas': ['file'],
  'dependent_params_with_source': [{'file': 't1'}]},
 {'name': 'image-resizing',
  'all_params': ['width', 'height', 'file'],
  'user_input': ['width', 'height'],
  'dependent_paramas': ['file'],
  'dependent_params_with_source': [{'file': 't2'}]},
 {'name': 'send-email',
  'all_params': ['sender_address',
   'receiver_address',
   'message_text',
   'message_subject',
   'file'],
  'user_input': ['sender_address',
   'receiver_address',
   'message_text',
   'message_subject'],
  'dependent_paramas': ['file'],
  'dependent_params_with_source': 

In [162]:
dag_components = {"selected_apis": api_names, "topological_order": topological_order, "param_dependency_management": param_dependency_management}
dag_components

{'selected_apis': [{'name': 'tti-animation-art'},
  {'name': 'tti-animation-art'},
  {'name': 'image-resizing'},
  {'name': 'image-resizing'},
  {'name': 'send-email'}],
 'topological_order': [{'state': 'paralell', 'task_nums': [1, 2]},
  {'state': 'paralell', 'task_nums': [3, 4]},
  {'state': 'sequential', 'task_nums': [5]}],
 'param_dependency_management': [{'name': 'tti-animation-art',
   'all_params': ['prompt'],
   'user_input': ['prompt'],
   'dependent_paramas': [],
   'dependent_params_with_source': []},
  {'name': 'tti-animation-art',
   'all_params': ['prompt'],
   'user_input': ['prompt'],
   'dependent_paramas': [],
   'dependent_params_with_source': []},
  {'name': 'image-resizing',
   'all_params': ['width', 'height', 'file'],
   'user_input': ['width', 'height'],
   'dependent_paramas': ['file'],
   'dependent_params_with_source': [{'file': 't1'}]},
  {'name': 'image-resizing',
   'all_params': ['width', 'height', 'file'],
   'user_input': ['width', 'height'],
   'depend