# Merge results files from different runs

Sometimes runs were cut short, or new models needed to be added.

This notebook provides a utility function to merge those results files.

In [None]:
# project root directory; every path below is resolved against it
PROJECT_ROOT = ".."

# results file that receives the merge, and the results file merged into it
MAIN_FILE_PATH = "output/specify/member/spec_mem_typo_small_2025-08-05T17:56:01.467103.json"
MERGE_FILE_PATH = "output/specify/spec_mem_typo_small_2025-08-16T10:38:22.965028.json"

# how to merge: either "tasks" or "models"
MERGE_TYPE = "models"

# models to merge; use None to merge all models
MODELS = ["gpt-4o-mini-2024-07-18"]

In [8]:
# define the utility function to merge results files!

from typing import Literal, get_args
from src import evaluate_hallucinations
from llm_cgr import load_json, save_json
from src.constants import HallucinationLevel
from src.libraries.load import DEFAULT_DOCUMENTATION_FILE, DEFAULT_PYPI_PACKAGES_FILE

# the supported ways of merging two results files
MergeTypes = Literal["tasks", "models"]


def _append_new_errors(results: dict, key: str, incoming: list) -> None:
    """
    Records the `incoming` errors under `results["errors"][key]`, skipping any
    entry that is already recorded there so that repeating a merge does not
    duplicate errors.
    """
    if not incoming:
        return

    errors = results["errors"]
    existing = errors.get(key, [])

    # compare against the pre-merge entries only, so that repeated entries
    # inside `incoming` itself are all kept
    fresh = [error for error in incoming if error not in existing]
    if fresh:
        errors[key] = existing + fresh


def merge_results(
    main_file: str,
    merge_file: str,
    merge_type: MergeTypes,
    root: str = PROJECT_ROOT,
    models: list[str] | None = None,
) -> None:
    """
    Merges two result files, in one of the following ways depending on the `merge_type`:
        - "tasks": when a run was cut short, and we want to add more tasks.
        - "models": when we want to expand the results with more models.

    The merged data is written back to `main_file` (overwriting it), and the
    merged file is then re-evaluated with `evaluate_hallucinations`.

    Args:
        main_file: results file to merge into, relative to `root`.
        merge_file: results file to merge from, relative to `root`.
        merge_type: either "tasks" or "models".
        root: directory that the file paths are relative to.
        models: only used for the "models" merge type. The models to take from
            the merge file, or None to take every model. A listed model that has
            no responses for a task in the merge file gets an empty list there.

    Raises:
        ValueError: if `merge_type` is unknown, if the two runs differ in any of
            the compared metadata fields, or (for "models") if the merge file
            contains tasks that the main file does not.
    """
    # reject an unknown merge type up front, before any file is read
    if merge_type not in get_args(MergeTypes):
        raise ValueError(
            f"Unknown merge type: {merge_type}. Use one of {get_args(MergeTypes)}."
        )

    # open both files
    main_path = f"{root}/{main_file}"
    main_data = load_json(main_path)
    merge_data = load_json(f"{root}/{merge_file}")

    # assert runs are compatible
    for key in [
        "run_id",
        "hallucination_level",
        # "dataset_file",
        "configured_temperature",
        "configured_top_p",
        "configured_max_tokens",
        "system_prompt",
        "mitigation_strategy",
    ]:
        if main_data["metadata"][key] != merge_data["metadata"][key]:
            raise ValueError(f"Cannot merge results with different {key}.")

    main_generations = main_data["generations"]
    merge_generations = merge_data["generations"]

    # merge the data
    if merge_type == "tasks":
        # tasks present in both files take the merge file's generations and errors
        main_generations.update(merge_generations)
        main_data["errors"].update(merge_data["errors"])

    else:  # merge_type == "models"
        # can only merge results if all merge keys are a subset of the main keys
        if not set(merge_generations).issubset(main_generations):
            raise ValueError("Cannot merge results with different generation keys.")

        # update the model responses for each key, in the main file's task order
        for key, main_generation in main_generations.items():
            # skip keys not in the merge data
            if key not in merge_generations:
                continue

            main_responses = main_generation["responses"]
            merge_responses = merge_generations[key]["responses"]
            task_errors = merge_data["errors"].get(key) or []

            if models is None:
                # update with everything
                main_responses.update(merge_responses)
                _append_new_errors(main_data, key, task_errors)
                continue

            # only update with specific model data
            for model in models:
                main_responses[model] = merge_responses.get(model, [])
                _append_new_errors(
                    main_data,
                    key,
                    [error for error in task_errors if error["model"] == model],
                )

    # merge the metadata
    main_data["metadata"]["end_datetime"] = merge_data["metadata"]["end_datetime"]

    # save the merged data
    save_json(
        data=main_data,
        file_path=main_path,
    )

    # determine the ground truth file based on the hallucination level
    level = HallucinationLevel(main_data["metadata"]["hallucination_level"])
    ground_truth_file = (
        DEFAULT_PYPI_PACKAGES_FILE
        if level == HallucinationLevel.LIBRARY
        else DEFAULT_DOCUMENTATION_FILE
    )

    # evaluate the merged hallucinations
    evaluate_hallucinations(
        results_file=main_path,
        ground_truth_file=f"{root}/{ground_truth_file}",
    )

In [46]:
# run the merge: bring the "deepseek-chat" results from the later run into the main file

merge_results(
    merge_type="models",
    models=["deepseek-chat"],
    main_file="output/mitigate/self_analysis/spec_lib_fabrication_2025-08-31T04:06:57.357245.json",
    merge_file="output/mitigate/spec_lib_fabrication_2025-09-03T21:31:09.200438.json",
)