In [1]:
import importlib
import csv
import os
import pandas as pd
from termcolor import colored
from utils import io_helpers
from utils import llm

# Reload the project helpers so that edits made to them on disk are picked up
# without restarting the kernel.
importlib.reload(io_helpers)
importlib.reload(llm)

# Loaded with read_relations=True; pairs_generator below reads the
# "related_docs" and "original_doc_ids" columns of this frame
# (presumably provided by that flag -- TODO confirm in io_helpers).
documents = io_helpers.get_documents(read_relations=True)

# Only "system_prompt" and "user_prompt" are read from this mapping in this cell.
prompts = io_helpers.get_prompt(name="comparison/check_for_contradictions")
system_prompt = prompts["system_prompt"]

# CSV file acting as a cache of pairwise comparison results, one row per
# (id1, id2, model) call.
FILEPATH = "data/additional_data/docs/_pairwise_comparison.csv"
FIELDNAMES = ["id1", "id2", "conflicts", "model", "prompt"]
# Shared by both branches below so the in-memory frame has the same column
# dtypes whether it was read from disk or freshly created.
RESULT_DTYPES = {"id1": "Int64", "id2": "Int64", "conflicts": "boolean"}

try:
    known_results = pd.read_csv(FILEPATH, usecols=FIELDNAMES, dtype=RESULT_DTYPES)
except FileNotFoundError:
    # First run: create a header-only results file and start from an empty frame.
    parent_dir = os.path.dirname(FILEPATH)
    if parent_dir:  # os.makedirs("") raises, so skip it for a bare filename
        os.makedirs(parent_dir, exist_ok=True)
    with open(FILEPATH, "x", newline="") as f:
        csv.DictWriter(f, FIELDNAMES).writeheader()
    known_results = pd.DataFrame(columns=FIELDNAMES).astype(RESULT_DTYPES)


def pairs_generator(doc_id: int, docs: "pd.DataFrame | None" = None):
    """Yield ``(id1, id2, expected_result)`` for a document and each related document.

    Args:
        doc_id: Identifier of the source document. It is compared as an
            integer against the ``doc_id`` column.
        docs: Frame to look documents up in. It must provide the columns
            ``doc_id``, ``related_docs`` and ``original_doc_ids``. When omitted,
            the notebook-level ``documents`` frame is used (resolved at call
            time), which keeps existing one-argument calls working.

    Yields:
        Tuples ``(id1, id2, expected_result)``: ``id1`` is the source
        document's id, ``id2`` the id of one document whose ``doc_id`` appears
        in the source document's ``related_docs``, and ``expected_result`` is
        ``True`` when either document lists the other in ``original_doc_ids``
        or both share at least one entry there.

        Nothing is yielded when ``doc_id`` is not found or when no related
        document is present in the frame. If several rows carry the same
        ``doc_id``, the first one is used.
    """
    if docs is None:
        docs = documents

    matches = docs[docs["doc_id"].astype(int) == int(doc_id)]
    if matches.empty:
        return
    doc = matches.iloc[0]

    related_docs = docs[docs["doc_id"].isin(doc["related_docs"])]
    if related_docs.empty:
        return

    # Loop-invariant values of the source document, computed once.
    id1 = doc["doc_id"]
    doc_originals = doc["original_doc_ids"]
    doc_originals_set = set(doc_originals)

    for _, related_doc in related_docs.iterrows():
        id2 = related_doc["doc_id"]
        related_originals = related_doc["original_doc_ids"]
        expected_result = (
            id2 in doc_originals
            or id1 in related_originals
            # the two documents share at least one original document
            or not doc_originals_set.isdisjoint(related_originals)
        )
        yield (id1, id2, expected_result)


MODELS = ["gpt-4.1-mini", "gpt-4o"]
DOC_IDS_TO_CHECK = [300001, 100134, 134, 400192, 78]

# For every model, compare each listed document with its related documents.
# Pairs already recorded for the same model (in either id order) are skipped;
# the stored prompt text is not part of that lookup. The results file is
# rewritten after every new answer, so an interrupted run can be resumed.
for MODEL in MODELS:
    for DOC_ID in DOC_IDS_TO_CHECK:
        for id1, id2, expected_result in pairs_generator(DOC_ID):
            same_order = (known_results["id1"] == id1) & (known_results["id2"] == id2)
            swapped_order = (known_results["id1"] == id2) & (known_results["id2"] == id1)
            same_model = known_results["model"] == MODEL
            previous_rows = known_results[(same_order | swapped_order) & same_model]
            if not previous_rows.empty:
                print(f"-- {id1} & {id2} -- processed before")
                continue

            # Fetch the "content" field of both documents of the pair.
            text1, text2 = (
                documents[documents["doc_id"].astype(int) == int(pair_id)].squeeze()["content"]
                for pair_id in (id1, id2)
            )

            user_prompt = llm.format_user_prompt(prompts["user_prompt"], text1=text1, text2=text2)
            response = llm.call_openai(system_prompt, user_prompt, MODEL)
            actual_result = response.contradictory_info_found

            # Report whether the model's answer agrees with the expectation.
            print(f"-- {id1} & {id2} -- ", end="")
            answer_matches = expected_result == actual_result
            if not answer_matches:
                print(colored("Wrong!", "red"))
                print(f"expected result: {expected_result}")
                print(f"actual result: {actual_result}")
            else:
                print(colored("Check!", "green"))

            new_row = dict(
                id1=id1,
                id2=id2,
                conflicts=actual_result,
                model=MODEL,
                prompt=prompts["user_prompt"],
            )
            new_row_frame = pd.DataFrame([new_row])
            known_results = pd.concat([known_results, new_row_frame], ignore_index=True)

            # Persist immediately so the answer survives a later failure.
            known_results.to_csv(FILEPATH, columns=FIELDNAMES, mode="w", index=False)

-- 300001 & 41 -- processed before
-- 300001 & 42 -- processed before
-- 300001 & 43 -- processed before
-- 300001 & 44 -- processed before
-- 300001 & 45 -- processed before
-- 300001 & 46 -- processed before
-- 300001 & 47 -- processed before
-- 300001 & 48 -- processed before
-- 300001 & 49 -- processed before
-- 300001 & 51 -- processed before
-- 300001 & 52 -- processed before
-- 300001 & 53 -- processed before
-- 300001 & 54 -- processed before
-- 300001 & 55 -- processed before
-- 300001 & 56 -- processed before
-- 300001 & 57 -- processed before
-- 300001 & 58 -- processed before
-- 300001 & 59 -- processed before
-- 300001 & 60 -- processed before
-- 300001 & 61 -- processed before
-- 300001 & 62 -- processed before
-- 300001 & 63 -- processed before
-- 300001 & 64 -- processed before
-- 300001 & 65 -- processed before
-- 300001 & 66 -- processed before
-- 300001 & 68 -- processed before
-- 300001 & 69 -- processed before
-- 300001 & 71 -- processed before
-- 300001 & 72 -- pr

In [16]:
### Evaluate the models' recorded comparison results
# The evaluation helper is imported in this cell and reloaded together with
# io_helpers so that on-disk edits to either module are picked up without
# restarting the kernel.
from utils import evaluation

importlib.reload(evaluation)
importlib.reload(io_helpers)

# NOTE(review): this rebinds the notebook-level `documents`, here loaded
# without read_relations=True (the first cell passes that flag). Any code run
# after this cell that reads `documents` sees this version -- confirm that is
# intended.
documents = io_helpers.get_documents()

# Evaluates the results file at FILEPATH (defined in the first cell) against
# `documents`. The rendered output lists accuracy and count per model;
# presumably the helper returns that table -- TODO confirm in utils.evaluation.
evaluation.evaluate_llm_comparison(documents, FILEPATH)

Unnamed: 0,model,accuracy,count
0,gpt-4o,0.958716,218.0
1,gpt-4.1-mini,0.93578,218.0
2,gpt-4.1-nano,0.717391,46.0
3,gpt-4o-mini,0.640625,64.0
