## Boolean Comparison

(Do the two documents contain conflicting information or not?)


In [19]:
### comparison functions
import importlib
import csv
import os
import pandas as pd
from termcolor import colored
from utils import io_helpers
from utils import llm

importlib.reload(io_helpers)
importlib.reload(llm)

# Location of the stored boolean comparison results and the columns kept there.
FILEPATH = "evaluation/_pairwise_comparison_bool.csv"
FIELDNAMES = ["id1", "id2", "conflicts", "model", "prompt"]


def processed_before() -> pd.DataFrame:
    """Load the results recorded so far, creating the results file on first use.

    Returns:
        The rows read from ``FILEPATH``, restricted to ``FIELDNAMES``. When the
        file does not exist yet, it is created containing only the header row
        and an empty frame with the same columns is returned.
    """
    column_types = {"id1": "Int64", "id2": "Int64", "conflicts": "boolean"}
    try:
        return pd.read_csv(FILEPATH, usecols=FIELDNAMES, dtype=column_types)
    except FileNotFoundError:
        # Nothing stored yet: fall through and create the file below.
        pass

    os.makedirs(os.path.dirname(FILEPATH), exist_ok=True)
    # Mode "x" refuses to overwrite a file that appeared in the meantime.
    with open(FILEPATH, "x", newline="") as handle:
        csv.writer(handle).writerow(FIELDNAMES)
    return pd.DataFrame(columns=FIELDNAMES)


def pairs_generator(doc_id: int, documents: pd.DataFrame):
    """Yield each (document, related document) pair for ``doc_id`` with its expected label.

    Args:
        doc_id: Identifier of the document whose related documents are paired.
        documents: Frame with the columns ``doc_id``, ``related_docs`` and
            ``original_doc_ids``; the latter two hold a collection of ids per row.

    Yields:
        Tuples ``(id1, id2, expected_result)``. ``expected_result`` is True when
        either document lists the other in its ``original_doc_ids`` or when
        both share at least one entry in ``original_doc_ids``.
    """
    anchor = documents[documents["doc_id"].astype(int) == int(doc_id)].squeeze()
    neighbours = documents[documents["doc_id"].isin(anchor["related_docs"])]

    for _, neighbour in neighbours.iterrows():
        first_id, second_id = anchor["doc_id"], neighbour["doc_id"]

        # All three checks are evaluated before they are combined.
        second_is_source = second_id in anchor["original_doc_ids"]
        first_is_source = first_id in neighbour["original_doc_ids"]
        common_sources = set(anchor["original_doc_ids"]) & set(neighbour["original_doc_ids"])
        share_source = len(common_sources) > 0  # share the same original_doc

        yield (first_id, second_id, second_is_source or first_is_source or share_source)


def compare_with_llm(
    doc_id1: int,
    doc_id2: int,
    expected_result: bool,
    model: str,
    documents: pd.DataFrame,
    prompts: object,
    print_results: bool = True,
):
    """Ask an LLM whether two documents contradict each other and store the answer.

    The pair is skipped when a result for the same model is already stored for
    these two ids, in either order.

    Args:
        doc_id1: Id of the first document.
        doc_id2: Id of the second document.
        expected_result: Expected answer, used only for the printed check.
        model: Model name passed to ``llm.call_any_llm`` and stored with the result.
        documents: Frame with at least the columns ``doc_id`` and ``content``.
        prompts: Mapping providing ``"system_prompt"`` and ``"user_prompt"``.
        print_results: Whether to print if the answer matched ``expected_result``.

    Returns:
        None. The result is appended to the stored results and the whole table
        is written back to ``FILEPATH``.
    """
    known_results = processed_before()

    same_order = (known_results["id1"] == doc_id1) & (known_results["id2"] == doc_id2)
    swapped_order = (known_results["id1"] == doc_id2) & (known_results["id2"] == doc_id1)
    same_model = known_results["model"] == model
    earlier_rows = known_results[(same_order | swapped_order) & same_model]
    if not earlier_rows.empty:
        print(f"-- {doc_id1} & {doc_id2} -- processed before")
        return

    system_prompt = prompts["system_prompt"]

    doc_ids = documents["doc_id"].astype(int)
    text1 = documents[doc_ids == int(doc_id1)].squeeze()["content"]
    text2 = documents[doc_ids == int(doc_id2)].squeeze()["content"]

    user_prompt = llm.format_user_prompt(prompts["user_prompt"], text1=text1, text2=text2)

    answer = llm.call_any_llm(
        system_prompt,
        user_prompt,
        model,
        response_format_pydantic=llm.LLMDocumentComparisonCheckResponse,
    )
    actual_result = answer.contradictory_info_found

    if print_results:
        print(f"-- {doc_id1} & {doc_id2} -- ", end="")
        if expected_result != actual_result:
            print(colored("Wrong!", "red"))
            print(f"expected result: {expected_result}")
            print(f"actual result: {actual_result}")
        else:
            print(colored("Check!", "green"))

    # The stored prompt is the unformatted user prompt template.
    addition = pd.DataFrame(
        [
            {
                "id1": doc_id1,
                "id2": doc_id2,
                "conflicts": actual_result,
                "model": model,
                "prompt": prompts["user_prompt"],
            }
        ]
    )
    known_results = pd.concat([known_results, addition], ignore_index=True)

    known_results.to_csv(FILEPATH, columns=FIELDNAMES, mode="w", index=False)

In [22]:
### evaluate different models
# Runs the boolean comparison for the selected documents and then evaluates
# the stored results.
from utils import evaluation

importlib.reload(evaluation)
importlib.reload(io_helpers)

# Same results path that the comparison-functions cell assigns to FILEPATH.
FILEPATH = "evaluation/_pairwise_comparison_bool.csv"

documents = io_helpers.get_documents(read_relations=True)
prompts = io_helpers.get_prompt(name="comparison/check_for_contradictions")

# DOC_IDS_TO_CHECK = [300002, 100134, 134, 400192]
# With an empty list the loop below makes no LLM calls; only the evaluation runs.
DOC_IDS_TO_CHECK = []

# compare_with_llm skips pairs that are already stored for this model.
model = "gpt-4o"

for doc_id in DOC_IDS_TO_CHECK:
    for id1, id2, gt_result in pairs_generator(doc_id, documents):
        compare_with_llm(id1, id2, gt_result, model=model, documents=documents, prompts=prompts, print_results=True)

evaluation.evaluate_llm_comparison(documents, FILEPATH)

Unnamed: 0,model,accuracy,count
0,gpt-4.1,0.955556,45.0
1,gpt-4o,0.955556,45.0
2,gpt-4.1-mini,0.933333,45.0


## Compare and extract information


### Evaluate model capabilities


In [1]:
### Define functions
import importlib
import csv
import os
import pandas as pd
from termcolor import colored
from utils import io_helpers
from utils import llm
import ast

importlib.reload(io_helpers)
importlib.reload(llm)

# Location of the stored comparison/extraction results and the columns kept there.
FILEPATH = "evaluation/_pairwise_comparison.csv"
FIELDNAMES = ["model", "comparison_type", "id1", "id2", "contain_conflicts", "conflicting_passages"]


def processed_before() -> pd.DataFrame:
    """Return the stored extraction results, creating the results file if missing.

    Returns:
        The rows of ``FILEPATH`` limited to ``FIELDNAMES``. If the file is
        missing, it is created with just the header row and an empty frame
        with the same columns is returned.
    """
    typed_columns = {"id1": "Int64", "id2": "Int64", "contain_conflicts": "boolean"}
    try:
        stored = pd.read_csv(FILEPATH, usecols=FIELDNAMES, dtype=typed_columns)
    except FileNotFoundError:
        stored = None

    if stored is not None:
        return stored

    os.makedirs(os.path.dirname(FILEPATH), exist_ok=True)
    # Mode "x" fails instead of truncating if the file exists by now.
    with open(FILEPATH, "x", newline="") as out:
        csv.writer(out).writerow(FIELDNAMES)
    return pd.DataFrame(columns=FIELDNAMES)


def compare_and_extract_llm(
    doc_id1: int,
    doc_id2: int,
    model: str,
    comparison_type: str,
    documents: pd.DataFrame,
    prompts: object,
):
    """Let an LLM extract contradictions between two documents and store the result.

    The pair is skipped when a result for the same model is already stored for
    these two ids, in either order.

    Args:
        doc_id1: Id of the first document.
        doc_id2: Id of the second document.
        model: Model name passed to ``llm.call_any_llm`` and stored with the result.
        comparison_type: Label stored unchanged in the ``comparison_type`` column.
        documents: Frame with at least the columns ``doc_id`` and ``content``.
        prompts: Mapping providing ``"system_prompt"`` and ``"user_prompt"``.

    Returns:
        None. The result is appended to the stored results and the whole table
        is written back to ``FILEPATH``.
    """
    known_results = processed_before()

    forward = (known_results["id1"] == doc_id1) & (known_results["id2"] == doc_id2)
    backward = (known_results["id1"] == doc_id2) & (known_results["id2"] == doc_id1)
    for_this_model = known_results["model"] == model
    if not known_results[(forward | backward) & for_this_model].empty:
        print(f"-- {doc_id1} & {doc_id2} -- processed before")
        return

    system_prompt = prompts["system_prompt"]

    doc_ids = documents["doc_id"].astype(int)
    text1 = documents[doc_ids == int(doc_id1)].squeeze()["content"]
    text2 = documents[doc_ids == int(doc_id2)].squeeze()["content"]

    user_prompt = llm.format_user_prompt(prompts["user_prompt"], text1=text1, text2=text2)

    # Progress line: the pair is printed before the call, "Done" after it returns.
    print(f"-- {doc_id1} & {doc_id2} -- ", end="")
    extraction: llm.LLMDocumentComparisonExtractResponse = llm.call_any_llm(
        system_prompt,
        user_prompt,
        model,
        response_format_pydantic=llm.LLMDocumentComparisonExtractResponse,
    )
    print("Done")

    addition = pd.DataFrame(
        [
            {
                "model": model,
                "comparison_type": comparison_type,
                "id1": doc_id1,
                "id2": doc_id2,
                "contain_conflicts": extraction.contradictory_info_found,
                "conflicting_passages": extraction.contradictions,
            }
        ]
    )
    known_results = pd.concat([known_results, addition], ignore_index=True)

    known_results.to_csv(FILEPATH, columns=FIELDNAMES, mode="w", index=False)



In [None]:
### Define doc IDs to compare for testing
# Pairs of document ids, grouped first by the "with_conflicts" /
# "without_conflicts" key and then by a category key ("single", "multi", "tabular").
docs_to_compare = {
    "with_conflicts": {
        "single": [(134, 100134), (46, 100046), (179, 100179)],
        "multi": [(139, 400139), (205, 400205), (42, 400042)],
        "tabular": [(47, 300001), (136, 300002), (181, 300003)],
    },  # 3 single, 3 multi, 3 tabular pairs
    "without_conflicts": {
        "single": [(180, 181), (56, 58), (130, 138)],
        "multi": [(400204, 400075), (400192, 400194)],
        "tabular": [(300003, 66), (300003, 400192), (300001, 300002)],
    },  # 3 single, 2 multi, 3 tabular pairs
}

In [None]:
### Execute comparison
importlib.reload(llm)

documents = io_helpers.get_documents(read_relations=True)
prompts = io_helpers.get_prompt(name="comparison/extract_contradictions")

# DOC_IDS_TO_CHECK = [300001, 100134, 134, 400192, 78]

# Choose the group of pairs once and derive the stored label from that choice,
# so the label cannot drift from the pairs that are actually compared
# (previously the tabular pairs were stored under the label "with_multi").
conflict_group = "with_conflicts"  # or "without_conflicts"
doc_kind = "tabular"  # or "single" / "multi"
comparison_type = f"{conflict_group.split('_')[0]}_{doc_kind}"  # e.g. "with_tabular"

models = ["gpt-4.1", "gpt-4o"]
for id1, id2 in docs_to_compare[conflict_group][doc_kind]:
    for model in models:
        compare_and_extract_llm(
            id1, id2, comparison_type=comparison_type, model=model, prompts=prompts, documents=documents
        )

# model = "gpt-4.1"

# text1 = documents[documents["doc_id"].astype(int) == int(doc_id1)].squeeze()["content"]
# text2 = documents[documents["doc_id"].astype(int) == int(doc_id2)].squeeze()["content"]

# system_prompt = prompts["system_prompt"]
# user_prompt = llm.format_user_prompt(prompts["user_prompt"], text1=text1, text2=text2)

# response = llm.call_any_llm(
#     system_prompt, user_prompt, model, response_format_pydantic=llm.LLMDocumentComparisonExtractResponse
# )

# print(response)

-- 47 & 300001 -- calling openai
Done
-- 47 & 300001 -- calling openai
Done
-- 136 & 300002 -- calling openai
Done
-- 136 & 300002 -- calling openai
Done
-- 181 & 300003 -- calling openai
Done
-- 181 & 300003 -- calling openai
Done


### Extract and save all contradictions


In [21]:
import importlib
import csv
import os
import pandas as pd
from utils import io_helpers
from utils import llm

importlib.reload(io_helpers)
importlib.reload(llm)

# NOTE(review): this constant is not referenced by save_result or
# extract_conflicts in this cell. Its "doc_id1"/"doc_id2" names also differ from
# the "id1"/"id2" keys extract_conflicts writes, and it has no "conflicts_found"
# entry — confirm whether it is still needed.
FIELDNAMES = ["doc_id1", "doc_id2", "conflicting_passage_doc1", "conflicting_passage_doc2", "model"]


def save_result(filepath: str, data: pd.DataFrame):
    """Append ``data`` to the CSV at ``filepath``, creating the file if it is missing.

    Args:
        filepath: Path of the CSV file to extend.
        data: Rows to add. When the file already exists, its current content is
            read, ``data`` is concatenated below it and the file is rewritten.
    """
    try:
        previous = pd.read_csv(filepath)
    except FileNotFoundError:
        # No file yet: the new rows become the whole content.
        combined = data
    else:
        combined = pd.concat([previous, data], ignore_index=True)
    combined.to_csv(filepath, mode="w", index=False)


def extract_conflicts(
    doc_id1: int,
    doc_id2: int,
    documents: pd.DataFrame,
    prompts: object,
    filepath_results: str,
):
    """Extract the contradicting passages of two documents with an LLM and store them.

    The pair is skipped when the results file already holds a row for these two
    ids, in either order. Otherwise one row per reported contradiction is
    written. When no contradiction is reported, a single row with empty
    passages and ``conflicts_found=False`` is written, so the pair counts as
    processed and is not sent to the LLM again on the next run.

    Args:
        doc_id1: Id of the first document.
        doc_id2: Id of the second document.
        documents: Frame with at least the columns ``doc_id`` and ``content``.
        prompts: Mapping providing ``"system_prompt"`` and ``"user_prompt"``.
        filepath_results: Path of the CSV file the results are stored in. This
            was previously written as ``filepath_results=str``, which made the
            ``str`` type the default value rather than annotating the parameter.

    Returns:
        None. Results are appended to ``filepath_results`` via ``save_result``.
    """
    model = "gpt-4.1"
    result_columns = [
        "id1",
        "id2",
        "model",
        "conflicting_passage_doc1",
        "conflicting_passage_doc2",
        "conflicts_found",
    ]

    try:
        existing_data = pd.read_csv(filepath_results, dtype={"id1": "Int64", "id2": "Int64", "conflicts_found": bool})
    except FileNotFoundError:
        existing_data = pd.DataFrame(columns=result_columns)

    same_order = (existing_data["id1"] == doc_id1) & (existing_data["id2"] == doc_id2)
    swapped_order = (existing_data["id1"] == doc_id2) & (existing_data["id2"] == doc_id1)
    if not existing_data[same_order | swapped_order].empty:
        print(f"-- {doc_id1} & {doc_id2} -- processed before")
        return

    system_prompt = prompts["system_prompt"]

    doc_ids = documents["doc_id"].astype(int)
    text1 = documents[doc_ids == int(doc_id1)].squeeze()["content"]
    text2 = documents[doc_ids == int(doc_id2)].squeeze()["content"]

    user_prompt = llm.format_user_prompt(prompts["user_prompt"], text1=text1, text2=text2)

    print(f"-- {doc_id1} & {doc_id2} --", end=" ")
    llm_response: llm.LLMDocumentComparisonExtractResponse = llm.call_any_llm(
        system_prompt, user_prompt, model, response_format_pydantic=llm.LLMDocumentComparisonExtractResponse
    )
    contradictions = llm_response.contradictions
    print(len(contradictions) if contradictions else "No", end=" ")
    print("contradictions found.")

    pair_info = {"id1": doc_id1, "id2": doc_id2, "model": model}
    if contradictions:
        rows = [
            {
                **pair_info,
                "conflicting_passage_doc1": conflict.quote_from_document1,
                "conflicting_passage_doc2": conflict.quote_from_document2,
                "conflicts_found": True,
            }
            for conflict in contradictions
        ]
    else:
        # Marker row: records that the pair was checked and nothing was found.
        rows = [
            {
                **pair_info,
                "conflicting_passage_doc1": None,
                "conflicting_passage_doc2": None,
                "conflicts_found": False,
            }
        ]

    # One write per pair instead of re-reading and rewriting the file per conflict.
    save_result(filepath_results, pd.DataFrame(rows, columns=result_columns))

In [10]:
### get all documents pairs that actually contain conflicts

documents = io_helpers.get_documents()
pairs = []

# Ids that are additionally paired with documents whose id starts with "100".
TABULAR_DOC_IDS = (300001, 300002, 300003)


def get_conflicting_docs(row):
    """Append the conflicting document pairs of one ``documents`` row to ``pairs``.

    For every entry of the row's ``original_doc_ids`` the pair
    ``(doc_id, original_doc_id)`` is added. If the row's ``doc_id`` starts with
    "100", the row is also paired with each id in ``TABULAR_DOC_IDS`` whose own
    ``original_doc_ids`` contain the row's first original id.

    All pairs hold plain ``int`` ids. Previously the pairs with 300001-300003
    kept the raw ``row["doc_id"]`` value while the other pairs were cast to int.

    Args:
        row: One row of ``documents`` with the fields ``doc_id`` and
            ``original_doc_ids``.

    Returns:
        None. Results are collected in the module-level ``pairs`` list.
    """
    original_ids = row["original_doc_ids"]
    if len(original_ids) == 0:
        return

    doc_id = int(row["doc_id"])
    pairs.extend((doc_id, int(original_id)) for original_id in original_ids)

    if not str(row["doc_id"]).startswith("100"):
        return

    first_original = original_ids[0]
    for tabular_id in TABULAR_DOC_IDS:
        tabular_sources = documents.loc[documents["doc_id"] == tabular_id].squeeze()["original_doc_ids"]
        if first_original in tabular_sources:
            pairs.append((doc_id, tabular_id))


documents.apply(get_conflicting_docs, axis=1)
print(len(pairs))

120


In [None]:
### Extract and save conflicting passages
# Runs extract_conflicts for every entry of `pairs` and stores the results in
# data/additional_data/docs/_conflicts.csv.

importlib.reload(llm)

documents = io_helpers.get_documents(read_relations=True)
prompts = io_helpers.get_prompt(name="comparison/extract_contradictions")

# NOTE(review): this list is not used by the loop below, which iterates `pairs`.
# It also rebinds `docs_to_compare`, which an earlier cell defines as a dict —
# confirm whether this assignment is still wanted.
docs_to_compare = [(208, 100208), (179, 100179)]

for id1, id2 in pairs:
    extract_conflicts(
        id1,
        id2,
        prompts=prompts,
        documents=documents,
        filepath_results="data/additional_data/docs/_conflicts.csv",
    )