In [1]:
import copy
import os
import json
from pprint import pp

In [2]:
# Source run directory whose generations this notebook de-duplicates.
model_dir = "/root/gorilla/berkeley-function-call-leaderboard/outputs/databricks_dbrx-instruct__structured_auto_0.7_1200_1"

# Derive the output directory by tagging the run mode ("auto" or "solution")
# with "-dedup". Only the final path component is rewritten, so a parent
# directory that happens to contain either word is never altered.
_parent_dir, _run_name = os.path.split(model_dir.rstrip(os.sep))
out_dir = os.path.join(
    _parent_dir,
    _run_name.replace("auto", "auto-dedup").replace("solution", "solution-dedup"),
)
out_dir

'/root/gorilla/berkeley-function-call-leaderboard/outputs/databricks_dbrx-instruct__structured_auto-dedup_0.7_1200_1'

In [12]:
def new_fingerprint(model_dir, out_dir):
    """Write a "-dedup" tagged copy of ``fingerprint.jsonl`` into ``out_dir``.

    Every non-blank line of ``<model_dir>/fingerprint.jsonl`` is parsed as a
    JSON object and merged into a single dict (later lines override earlier
    ones on key collisions). The ``"n_tool_calls"`` value gets ``"-dedup"``
    appended, and the merged dict is written to ``<out_dir>/fingerprint.jsonl``
    as one single-key JSON object per line.

    Args:
        model_dir: Directory containing the source ``fingerprint.jsonl``.
        out_dir: Directory receiving the new file; created if missing.

    Raises:
        FileNotFoundError: If the source fingerprint file does not exist.
        KeyError: If no line defines ``"n_tool_calls"``.
        TypeError: If ``"n_tool_calls"`` is not a string.
    """
    fingerprint_path = os.path.join(model_dir, "fingerprint.jsonl")
    fingerprint = {}
    with open(fingerprint_path, "r") as f:
        for line in f:
            # Tolerate blank lines, e.g. an extra newline at the end of the file.
            if line.strip():
                fingerprint |= json.loads(line)

    fingerprint["n_tool_calls"] += "-dedup"

    # The output directory may not exist yet if no generations were saved first.
    os.makedirs(out_dir, exist_ok=True)
    new_fingerprint_path = os.path.join(out_dir, "fingerprint.jsonl")

    with open(new_fingerprint_path, "w") as f:
        for key, value in fingerprint.items():
            f.write(json.dumps({key: value}) + "\n")

In [21]:
# Rebuild the fingerprint for the de-duplicated run: merge every JSON line of
# the source fingerprint into one dict, tag "n_tool_calls" with "-dedup", and
# write it back out as one single-key JSON object per line.
fingerprint_path = os.path.join(model_dir, "fingerprint.jsonl")
fingerprint = {}
with open(fingerprint_path, "r") as f:
    for line in f:
        # Tolerate blank lines, e.g. an extra newline at the end of the file.
        if line.strip():
            fingerprint |= json.loads(line)

fingerprint["n_tool_calls"] += "-dedup"

# When the notebook is run top to bottom, nothing has created out_dir yet at
# this point, so make sure it exists before opening the file for writing.
os.makedirs(out_dir, exist_ok=True)
new_fingerprint_path = os.path.join(out_dir, "fingerprint.jsonl")

with open(new_fingerprint_path, "w") as f:
    for key, value in fingerprint.items():
        json_line = json.dumps({key: value})
        f.write(json_line + '\n')

In [3]:
def get_files(path):
    """Return the path of every file found anywhere beneath ``path``.

    The tree is traversed with ``os.walk``; each returned entry is the
    containing directory joined with the file name, in traversal order.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(path)
        for name in names
    ]

In [4]:
def get_generations(path):
    """Load a JSON Lines file into a list of parsed objects.

    Args:
        path: Path of a file holding one JSON document per line.

    Returns:
        The parsed documents, in file order. Blank or whitespace-only lines
        are skipped instead of raising ``json.JSONDecodeError``.
    """
    with open(path, "r") as f:
        # Skip blank lines so a trailing newline or spacer line does not crash the load.
        return [json.loads(line) for line in f if line.strip()]

In [5]:
def remove_duplicates(json_list):
    """Return the items of ``json_list`` with repeated JSON values removed.

    Two items count as duplicates when they serialise to the same JSON text
    with sorted keys, so dict key order does not matter. The first occurrence
    of each value is kept and the original ordering is preserved.
    """
    first_by_signature = {}
    for item in json_list:
        # setdefault keeps the earliest item stored under each signature.
        first_by_signature.setdefault(json.dumps(item, sort_keys=True), item)
    return list(first_by_signature.values())

In [6]:
def bfcl_format(tool_calls):
    """Render tool calls as a bracketed list of Python-style call expressions.

    Args:
        tool_calls: Iterable of dicts, each with a ``"tool_name"`` string and
            a ``"tool_arguments"`` mapping of argument name to value.

    Returns:
        A string such as ``"[f(a='x', b=1), g()]"``; ``"[]"`` when there are
        no tool calls.

    Argument values are rendered with ``repr`` so the output stays a parsable
    Python expression even when a string contains quotes, backslashes or
    newlines. For plain strings and for JSON-derived non-string values
    (numbers, booleans, ``None``, lists, dicts) this is the same text the
    manual single-quote wrapping produced.
    """
    tool_strs = []
    for tool_call in tool_calls:
        tool_name = tool_call["tool_name"]
        tool_args = tool_call["tool_arguments"]
        args_string = ", ".join(f"{key}={value!r}" for key, value in tool_args.items())
        tool_strs.append(f"{tool_name}({args_string})")
    return "[" + ", ".join(tool_strs) + "]"

In [7]:
def get_new_generations(generations):
    """Return a de-duplicated deep copy of ``generations``.

    The input is never modified. Each copied record is handled as follows:

    * if the membership test ``"error" in record["result"]`` is true, the
      record is kept unchanged;
    * otherwise its ``"tool_calls"`` are passed through ``remove_duplicates``
      and its ``"result"`` is rebuilt from them with ``bfcl_format``.

    Records come back in their original order.
    """
    # Deep-copy up front so the in-place edits below never reach the caller's data.
    records = copy.deepcopy(generations)

    for record in records:
        if "error" in record["result"]:
            # Errored generations pass through untouched.
            continue

        unique_calls = remove_duplicates(record["tool_calls"])
        formatted = bfcl_format(unique_calls)
        record["result"] = formatted
        record["tool_calls"] = unique_calls

    return records

In [11]:
def save_new_generations(out_dir, generations_path, new_generations):
    """Write ``new_generations`` as JSON Lines under ``<out_dir>/generations``.

    Args:
        out_dir: Root output directory; the ``generations`` sub-directory is
            created if it does not exist.
        generations_path: Path of the source generations file. Only its file
            name is used, so the output file gets the same name.
        new_generations: Iterable of JSON-serialisable records, written one
            per line.
    """
    # os.path.basename respects the platform's path separators, whereas
    # split("/") would keep the whole source path on Windows-style paths.
    file_name = os.path.basename(generations_path)
    new_generations_path = os.path.join(out_dir, "generations", file_name)
    os.makedirs(os.path.dirname(new_generations_path), exist_ok=True)
    with open(new_generations_path, "w") as f:
        f.writelines(json.dumps(new_gen) + "\n" for new_gen in new_generations)

In [8]:
# Collect the path of every file under <model_dir>/generations
# (get_files searches the directory tree recursively).
generations_dir = os.path.join(model_dir, "generations")
generations_paths = get_files(generations_dir)

In [9]:
# De-duplicate each generations file and write the result under out_dir.
for generations_path in generations_paths:

    # Load generations (one parsed JSON record per line of the file)
    generations = get_generations(generations_path)

    # Make new generations: deep copies with duplicate tool calls removed
    # and "result" rebuilt from the remaining calls
    new_generations = get_new_generations(generations)

    # Save new generations to <out_dir>/generations/<same file name>
    save_new_generations(out_dir, generations_path, new_generations)

In [22]:
def get_directories(path):
    """Return the paths of the immediate sub-directories of ``path``.

    Entries come from ``os.listdir`` (so the search is not recursive), each
    joined onto ``path``; anything that is not a directory is dropped.
    """
    candidates = (os.path.join(path, entry) for entry in os.listdir(path))
    return [candidate for candidate in candidates if os.path.isdir(candidate)]

In [23]:
# List the immediate sub-directories of the outputs folder (non-recursive).
path = "/root/gorilla/berkeley-function-call-leaderboard/outputs"
get_directories(path)

['/root/gorilla/berkeley-function-call-leaderboard/outputs/databricks_dbrx-instruct__conditional_solution_0.7_1200',
 '/root/gorilla/berkeley-function-call-leaderboard/outputs/databricks_dbrx-instruct__structured_solution_0.7_1200',
 '/root/gorilla/berkeley-function-call-leaderboard/outputs/databricks_dbrx-instruct__structured_auto_0.7_1200_1',
 '/root/gorilla/berkeley-function-call-leaderboard/outputs/databricks_dbrx-instruct__conditional_auto_0.7_1200_1',
 '/root/gorilla/berkeley-function-call-leaderboard/outputs/databricks_dbrx-instruct__unstructured_solution_0.7_1200']