In [1]:
import pandas as pd
import json
import re

In [57]:
# Load toolbench bias queries and convert them into one OpenAI-format record
# per cluster, written as JSON Lines to bias_dataset_openai_format.jsonl.
# TODO: incorporate return values and api endpoint info into the description

# Decisions for converting to OpenAI format:
# 1. Function Name will be "{tool_name}_{api_name}": lower-cased, with spaces,
#    "-", "/" and "." turned into "_", every other non [a-zA-Z0-9_] character
#    dropped, runs of "_" collapsed, and the result capped at 63 characters.
# 2. Parameters will be "parameters": {"type":"object", "properties": {"{param_name}": {"type": "{type}", "description": "{description}"}}, required: ["{param_name}"]}

# For Types
# ENUM, string, STRING --> string
# DATE (YYYY-MM-DD) --> string
# BOOLEAN --> boolean
# NUMBER --> number
type_mapping = {
    "ENUM": "string",
    "DATE (YYYY-MM-DD)": "string",
    "BOOLEAN": "boolean",
    "NUMBER": "number",
    "STRING": "string",
    "string": "string",
}

# Only every 500th query is read for its api_list; each such entry yields one
# cluster. NOTE(review): presumably each cluster holds 500 consecutive queries
# sharing the same api_list -- confirm against how the input file was built.
CLUSTER_STRIDE = 500

# Generated function names are truncated to this many characters.
MAX_FUNCTION_NAME_LENGTH = 63


def _to_function_name(tool_name, api_name):
    """Build the sanitised function name for one (tool, api) pair.

    Disallowed characters are stripped *before* underscores are collapsed, so
    that removing a character sitting between two underscores cannot leave a
    double underscore behind.
    """
    name = f"{tool_name}.{api_name}".lower()
    # separators become underscores
    name = re.sub(r'[ \-/.]', '_', name)
    # remove any non-alphanumeric characters
    name = re.sub(r'[^a-zA-Z0-9_]', '', name)
    # any consecutive underscores should be replaced with a single underscore
    name = re.sub(r'_+', '_', name)
    return name[:MAX_FUNCTION_NAME_LENGTH]


def _to_properties(params, seen_types):
    """Map ToolBench parameter dicts to JSON-schema property entries.

    Every raw type encountered is recorded in ``seen_types``. A type missing
    from ``type_mapping`` raises ``KeyError`` so new types are noticed rather
    than silently mis-converted.
    """
    properties = {}
    for param in params:
        seen_types.add(param['type'])
        properties[param['name']] = {
            "type": type_mapping[param['type']],
            "description": param['description'],
        }
    return properties


with open("toolbench_bias_queries.json", "r", encoding="utf-8") as input_file:
    toolbench_bias_queries = json.load(input_file)

with open("cluster_queries.json", "r", encoding="utf-8") as cluster_queries_file:
    cluster_queries = json.load(cluster_queries_file)

types = set()
clusters = []

cluster_id = 1
for idx in range(0, len(toolbench_bias_queries), CLUSTER_STRIDE):
    apis = toolbench_bias_queries[idx]['api_list']

    formatted_tools = []
    for api in apis:
        required_properties = _to_properties(api['required_parameters'], types)
        optional_properties = _to_properties(api['optional_parameters'], types)

        formatted_tools.append({
            "name": _to_function_name(api['tool_name'], api['api_name']),
            "description": api.get('api_description', ''),
            "parameters": {
                "type": "object",
                # an optional parameter sharing a name with a required one
                # overrides its entry, as a later dict assignment would
                "properties": {**required_properties, **optional_properties},
                "required": [param['name'] for param in api['required_parameters']],
            }})

    # cluster_queries is expected to be ordered by cluster_id starting at 1
    cluster_entry = cluster_queries[cluster_id - 1]
    assert cluster_id == cluster_entry['cluster_id'], (
        f"cluster_queries entry {cluster_id - 1} has cluster_id "
        f"{cluster_entry['cluster_id']}, expected {cluster_id}"
    )
    clusters.append({
        "id": f"bias-{cluster_id}",
        "function": formatted_tools,
        "question": [[{"role": "user", "content": query}] for query in cluster_entry['queries']],
    })
    cluster_id += 1

print(types)

# generate jsonl -- the output file is opened only after every cluster has been
# converted, so a failure above does not leave a truncated/empty dataset behind
with open("bias_dataset_openai_format.jsonl", "w", encoding="utf-8") as output_file:
    for cluster in clusters:
        output_file.write(json.dumps(cluster) + "\n")
    



{'NUMBER', 'BOOLEAN', 'string', 'DATE (YYYY-MM-DD)', 'STRING', 'ENUM'}


In [55]:
import sys
import os
sys.path.append(os.path.join(os.path.dirname(os.getcwd()), '..'))
from core.tool_modifier import format_tool_for_openai_api
from jsonschema import Draft7Validator, validate
from jsonschema.exceptions import SchemaError


# Read every tool from bias_dataset_bfcl_format.jsonl and report
#   * tools whose OpenAI-formatted definition fails the Draft 7 meta-schema check
#   * tool names that are not 1-63 characters of [a-zA-Z0-9_.] starting with a
#     letter or underscore (printed together with their length)

# Compiled once; used with fullmatch() so a trailing newline is not accepted.
FUNCTION_NAME_PATTERN = re.compile(r"[a-zA-Z_][a-zA-Z0-9_.]{0,62}")

with open("bias_dataset_bfcl_format.jsonl", "r", encoding="utf-8") as file:
    for line in file:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue

        data = json.loads(line)
        for func in data["function"]:
            name = func["name"]

            # Format once and reuse the result for both validation and reporting.
            # NOTE(review): assumes format_tool_for_openai_api does not mutate
            # `func` -- confirm in core.tool_modifier.
            openai_tool = format_tool_for_openai_api(func)

            # The declared dialect matches the validator actually used below.
            # NOTE(review): the whole "function" payload is checked here; if the
            # JSON Schema proper lives under a nested key (e.g. "parameters"),
            # it is not validated by this call -- confirm the payload shape.
            schema = {
                "$schema": "http://json-schema.org/draft-07/schema#",
                **openai_tool["function"],
            }

            try:
                Draft7Validator.check_schema(schema)
                # print("Schema is valid ✅")
            except SchemaError as e:
                print(openai_tool)
                print("Invalid schema ❌")
                print(e)

            if not FUNCTION_NAME_PATTERN.fullmatch(name):
                print(name, len(name))




    


    # validate schema of bias_dataset_bfcl_regex.jsonl




address_from_to_latitude_longitude.coordinates_latitude_longitude_to_address 76
textsentai_____ai_powered_text_sentiment_analyzer_.textsentai_api_ 66
