In [None]:
import importlib
import csv
import os
import pandas as pd
from termcolor import colored
from utils import io_helpers
from utils import llm

# Reload the project helper modules so the kernel uses their current source.
importlib.reload(io_helpers)
importlib.reload(llm)

# CSV that stores one row per already-evaluated document pair, and its column order.
# NOTE(review): a later cell assigns FILEPATH and FIELDNAMES again with different
# values, so the definitions in effect depend on which cell was executed last.
FILEPATH = "evaluation/_pairwise_comparison_bool.csv"
FIELDNAMES = ["id1", "id2", "conflicts", "model", "prompt"]


def get_known_results() -> pd.DataFrame:
    """Load the stored pairwise-comparison results from ``FILEPATH``.

    If the CSV does not exist yet, it is created containing only the header row
    built from ``FIELDNAMES`` and an empty frame with those columns is returned.

    Returns:
        A DataFrame restricted to the ``FIELDNAMES`` columns. When read from disk,
        ``id1``/``id2`` use the nullable ``Int64`` dtype and ``conflicts`` the
        nullable ``boolean`` dtype.
    """
    try:
        known_results = pd.read_csv(
            FILEPATH,
            usecols=FIELDNAMES,
            dtype={"id1": "Int64", "id2": "Int64", "conflicts": "boolean"},
        )
    except FileNotFoundError:
        # A bare filename has an empty dirname and os.makedirs("") raises
        # FileNotFoundError, so only create the parent directory when there is one.
        parent_dir = os.path.dirname(FILEPATH)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Mode "x" refuses to overwrite a file that appeared in the meantime.
        with open(FILEPATH, "x", newline="") as f:
            writer = csv.DictWriter(f, FIELDNAMES)
            writer.writeheader()
        known_results = pd.DataFrame(columns=FIELDNAMES)
    return known_results


def pairs_generator(doc_id: int, documents: pd.DataFrame):
    """Yield ``(id1, id2, expected_result)`` for a document and each of its related documents.

    Args:
        doc_id: Id of the anchor document; matched against ``documents["doc_id"]``
            after casting both sides to ``int``.
        documents: Frame with the columns ``doc_id``, ``related_docs`` and
            ``original_doc_ids`` read by this function.

    Yields:
        A tuple of the anchor's ``doc_id``, the related document's ``doc_id`` and a
        bool that is True when either document lists the other in its
        ``original_doc_ids`` or both lists have at least one entry in common.
    """
    is_anchor = documents["doc_id"].astype(int) == int(doc_id)
    anchor = documents[is_anchor].squeeze()
    candidates = documents[documents["doc_id"].isin(anchor["related_docs"])]

    for _, candidate in candidates.iterrows():
        id1 = anchor["doc_id"]
        id2 = candidate["doc_id"]
        anchor_sources = anchor["original_doc_ids"]
        candidate_sources = candidate["original_doc_ids"]

        # Evaluate all three checks up front, exactly like the non-short-circuit original.
        anchor_built_from_candidate = id2 in anchor_sources
        candidate_built_from_anchor = id1 in candidate_sources
        share_an_original = bool(set(anchor_sources) & set(candidate_sources))

        yield (
            id1,
            id2,
            anchor_built_from_candidate or candidate_built_from_anchor or share_an_original,
        )


def compare_and_extract_llm(
    doc_id1: int,
    doc_id2: int,
    expected_result: bool,
    model: str,
    documents: pd.DataFrame,
    prompts: object,
    expect_extraction: bool = False,
):
    """Ask an LLM whether two documents contradict each other and record the answer.

    The pair is skipped (with a console message) when the stored results already
    contain a row for it, in either id order, for the same ``model``. Otherwise the
    LLM is called, the answer is compared with ``expected_result`` on the console,
    and the results CSV at ``FILEPATH`` is rewritten with the new row appended.

    Args:
        doc_id1: Id of the first document.
        doc_id2: Id of the second document.
        expected_result: Value the LLM's ``contradictory_info_found`` is compared to.
        model: Model name passed to ``llm.call_any_llm`` and stored with the row.
        documents: Frame with ``doc_id`` and ``content`` columns.
        prompts: Mapping providing ``"system_prompt"`` and ``"user_prompt"``.
        expect_extraction: Selects the extract response format instead of the
            check response format and makes the function return the contradictions.

    Returns:
        ``llm_response.contradictions`` when ``expect_extraction`` is true,
        otherwise ``None`` (also ``None`` when the pair was processed before).
    """
    known_results = get_known_results()

    # A pair counts as known regardless of the order in which its ids were stored.
    same_order = (known_results["id1"] == doc_id1) & (known_results["id2"] == doc_id2)
    swapped_order = (known_results["id1"] == doc_id2) & (known_results["id2"] == doc_id1)
    previous_rows = known_results[(same_order | swapped_order) & (known_results["model"] == model)]
    if not previous_rows.empty:
        print(f"-- {doc_id1} & {doc_id2} -- processed before")
        return

    system_prompt = prompts["system_prompt"]

    numeric_ids = documents["doc_id"].astype(int)
    text1 = documents[numeric_ids == int(doc_id1)].squeeze()["content"]
    text2 = documents[numeric_ids == int(doc_id2)].squeeze()["content"]

    user_prompt = llm.format_user_prompt(prompts["user_prompt"], text1=text1, text2=text2)

    if expect_extraction:
        response_format = llm.LLMDocumentComparisonExtractResponse
    else:
        response_format = llm.LLMDocumentComparisonCheckResponse
    llm_response = llm.call_any_llm(system_prompt, user_prompt, model, response_format_pydantic=response_format)
    actual_result = llm_response.contradictory_info_found

    # Console verdict: green when the LLM agrees with the expectation, red otherwise.
    print(f"-- {doc_id1} & {doc_id2} -- ", end="")
    if expected_result == actual_result:
        print(colored("Check!", "green"))
    else:
        print(colored("Wrong!", "red"))
        print(f"expected result: {expected_result}")
        print(f"actual result: {actual_result}")

    new_rows = pd.DataFrame(
        [
            {
                "id1": doc_id1,
                "id2": doc_id2,
                "conflicts": actual_result,
                "model": model,
                "prompt": prompts["user_prompt"],
            }
        ]
    )
    known_results = pd.concat([known_results, new_rows], ignore_index=True)

    # The whole file is rewritten, not appended to.
    known_results.to_csv(FILEPATH, columns=FIELDNAMES, mode="w", index=False)

    return llm_response.contradictions if expect_extraction else None

In [85]:
import importlib
import csv
import os
import pandas as pd
from termcolor import colored
from utils import io_helpers
from utils import llm
import ast

# Reload the project helper modules so the kernel uses their current source.
importlib.reload(io_helpers)
importlib.reload(llm)

# CSV that stores one row per evaluated document pair (including the extracted
# passages), and its column order.
# NOTE(review): an earlier cell assigns FILEPATH and FIELDNAMES with different
# values; running this cell replaces them for every cell executed afterwards.
FILEPATH = "evaluation/_pairwise_comparison.csv"
FIELDNAMES = ["model", "comparison_type", "id1", "id2", "contain_conflicts", "conflicting_passages"]


def get_known_results() -> pd.DataFrame:
    """Load the stored pairwise-comparison results from ``FILEPATH``.

    If the CSV does not exist yet, it is created containing only the header row
    built from ``FIELDNAMES`` and an empty frame with those columns is returned.

    Returns:
        A DataFrame restricted to the ``FIELDNAMES`` columns. When read from disk,
        ``id1``/``id2`` use the nullable ``Int64`` dtype and ``contain_conflicts``
        the nullable ``boolean`` dtype.
    """
    try:
        known_results = pd.read_csv(
            FILEPATH,
            usecols=FIELDNAMES,
            dtype={"id1": "Int64", "id2": "Int64", "contain_conflicts": "boolean"},
        )
    except FileNotFoundError:
        # A bare filename has an empty dirname and os.makedirs("") raises
        # FileNotFoundError, so only create the parent directory when there is one.
        parent_dir = os.path.dirname(FILEPATH)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # Mode "x" refuses to overwrite a file that appeared in the meantime.
        with open(FILEPATH, "x", newline="") as f:
            writer = csv.DictWriter(f, FIELDNAMES)
            writer.writeheader()
        known_results = pd.DataFrame(columns=FIELDNAMES)
    return known_results


def compare_and_extract_llm(
    doc_id1: int, doc_id2: int, model: str, comparison_type: str, documents: pd.DataFrame, prompts: object
):
    """Have an LLM extract contradictions between two documents and store the outcome.

    The pair is skipped (with a console message) when the stored results already
    contain a row for it, in either id order, for the same ``model``;
    ``comparison_type`` is not part of that check. Otherwise the LLM is called and
    the results CSV at ``FILEPATH`` is rewritten with the new row appended.

    Args:
        doc_id1: Id of the first document.
        doc_id2: Id of the second document.
        model: Model name passed to ``llm.call_any_llm`` and stored with the row.
        comparison_type: Label stored in the ``comparison_type`` column.
        documents: Frame with ``doc_id`` and ``content`` columns.
        prompts: Mapping providing ``"system_prompt"`` and ``"user_prompt"``.

    Returns:
        ``None`` in every case; the outcome is only persisted to the CSV.
    """
    known_results = get_known_results()

    # A pair counts as known regardless of the order in which its ids were stored.
    same_order = (known_results["id1"] == doc_id1) & (known_results["id2"] == doc_id2)
    swapped_order = (known_results["id1"] == doc_id2) & (known_results["id2"] == doc_id1)
    previous_rows = known_results[(same_order | swapped_order) & (known_results["model"] == model)]
    if not previous_rows.empty:
        print(f"-- {doc_id1} & {doc_id2} -- processed before")
        return

    system_prompt = prompts["system_prompt"]

    numeric_ids = documents["doc_id"].astype(int)
    text1 = documents[numeric_ids == int(doc_id1)].squeeze()["content"]
    text2 = documents[numeric_ids == int(doc_id2)].squeeze()["content"]

    user_prompt = llm.format_user_prompt(prompts["user_prompt"], text1=text1, text2=text2)

    # The progress line is opened before the call and closed with "Done" after it.
    print(f"-- {doc_id1} & {doc_id2} -- ", end="")
    llm_response: llm.LLMDocumentComparisonExtractResponse = llm.call_any_llm(
        system_prompt,
        user_prompt,
        model,
        response_format_pydantic=llm.LLMDocumentComparisonExtractResponse,
    )
    print("Done")

    # NOTE(review): `contradictions` is handed to to_csv unchanged; presumably it is
    # stored as the objects' text representation -- confirm it can be read back as needed.
    new_rows = pd.DataFrame(
        [
            {
                "model": model,
                "comparison_type": comparison_type,
                "id1": doc_id1,
                "id2": doc_id2,
                "contain_conflicts": llm_response.contradictory_info_found,
                "conflicting_passages": llm_response.contradictions,
            }
        ]
    )
    known_results = pd.concat([known_results, new_rows], ignore_index=True)

    # The whole file is rewritten, not appended to.
    known_results.to_csv(FILEPATH, columns=FIELDNAMES, mode="w", index=False)

In [74]:
# Load all documents, then keep the rows whose `original_doc_ids` is non-empty.
documents = io_helpers.get_documents()
filtered_docs = documents[documents["original_doc_ids"].apply(lambda source_ids: len(source_ids) > 0)]

# Only the last expression of a cell is rendered, so `filtered_docs` is evaluated
# here without being displayed; the cell output shows `documents`.
filtered_docs
documents

Unnamed: 0,doc_id,domain,content,original_doc_ids
0,40,Finance,Acme Government Solutions is a government indu...,[]
1,41,Finance,Entertainment Enterprises Inc. is an entertain...,[]
2,42,Finance,"Advanced Manufacturing Solutions Inc., establi...",[]
3,43,Finance,"EcoGuard Solutions, established on April 15, 2...",[]
4,44,Finance,"Green Fields Agriculture Ltd., established on ...",[]
...,...,...,...,...
28,400116,Law,In a significant legal proceeding at the Cedar...,[116]
29,400059,Finance,"Retail Emporium, a well-established retail gia...",[59]
0,300001,Finance,Changes that occurred in senior management of ...,"[46, 47, 52, 59, 66, 71, 72, 77, 78, 79]"
1,300002,Law,Chief judge according to the court judgment of...,"[134, 136, 139, 112, 114, 115, 119, 123, 125, ..."


In [87]:
# Document-id pairs to compare, keyed first by "with_conflicts" / "without_conflicts"
# and then by "single" / "multi" / "tabular". Each entry is an (id1, id2) tuple.
docs_to_compare = {
    # pair counts -- single: 3, multi: 3, tabular: 3
    "with_conflicts": {
        "single": [
            (134, 100134),
            (46, 100046),
            (179, 100179),
        ],
        "multi": [
            (139, 400139),
            (205, 400205),
            (42, 400042),
        ],
        "tabular": [
            (47, 300001),
            (136, 300002),
            (181, 300003),
        ],
    },
    # pair counts -- single: 3, multi: 2, tabular: 3
    "without_conflicts": {
        "single": [
            (180, 181),
            (56, 58),
            (130, 138),
        ],
        "multi": [
            (400204, 400075),
            (400192, 400194),
        ],
        "tabular": [
            (300003, 66),
            (300003, 400192),
            (300001, 300002),
        ],
    },
}

In [94]:
# Reload the LLM helper module so the kernel uses its current source.
importlib.reload(llm)

# Documents are loaded with read_relations=True here; the prompt set supplies the
# "system_prompt" and "user_prompt" entries used by compare_and_extract_llm.
documents = io_helpers.get_documents(read_relations=True)
prompts = io_helpers.get_prompt(name="comparison/extract_contradictions")

# DOC_IDS_TO_CHECK = [300001, 100134, 134, 400192, 78]

# Run every pair of the selected group against every model; pairs already stored
# for a model are skipped inside compare_and_extract_llm.
# NOTE(review): the pairs come from the "tabular" list but are recorded with
# comparison_type="with_multi" -- confirm the label is intended.
models = ["gpt-4.1", "gpt-4o"]
for id1, id2 in docs_to_compare["with_conflicts"]["tabular"]:
    for model in models:
        compare_and_extract_llm(
            id1, id2, comparison_type="with_multi", model=model, prompts=prompts, documents=documents
        )

# Disabled snippet for calling the LLM on a single pair by hand.
# NOTE(review): it reads doc_id1 / doc_id2, which this cell does not define.
# model = "gpt-4.1"

# text1 = documents[documents["doc_id"].astype(int) == int(doc_id1)].squeeze()["content"]
# text2 = documents[documents["doc_id"].astype(int) == int(doc_id2)].squeeze()["content"]

# system_prompt = prompts["system_prompt"]
# user_prompt = llm.format_user_prompt(prompts["user_prompt"], text1=text1, text2=text2)

# response = llm.call_any_llm(
#     system_prompt, user_prompt, model, response_format_pydantic=llm.LLMDocumentComparisonExtractResponse
# )

# print(response)

-- 47 & 300001 -- calling openai
Done
-- 47 & 300001 -- calling openai
Done
-- 136 & 300002 -- calling openai
Done
-- 136 & 300002 -- calling openai
Done
-- 181 & 300003 -- calling openai
Done
-- 181 & 300003 -- calling openai
Done


In [61]:
# Print the raw list of contradictions, then each pair of quotes on separate lines.
# NOTE(review): in the visible cells `response` is only assigned inside a
# commented-out snippet, so this cell relies on a value left in the kernel and
# fails on a fresh Restart & Run All.
print(response.contradictions)
for contradiction in response.contradictions:
    print(f"text1: {contradiction.quote_from_document1}")
    print(f"text2: {contradiction.quote_from_document2}")
    print("-------")

[Contradiction(quote_from_document1='L. Rogers, a 60-year-old Caucasian woman, was admitted to Knoxville General Hospital on January 6th.', quote_from_document2='Age: 56'), Contradiction(quote_from_document1='The patient, who works as a nurse in a local clinic and is married with two healthy children, has a generally healthy medical history.', quote_from_document2='Occupation: Teacher\n...\nOccupation and Working Conditions: Teacher at a local high school, generally well-lit and comfortable working conditions'), Contradiction(quote_from_document1='Upon examination, Dr. Nelson confirmed a preliminary diagnosis of glaucoma, which was initially suspected two months prior by an optometrist.', quote_from_document2='Diagnosis and Treatment History: Visited optometrist 2 months ago, diagnosed with early-stage cataract. Prescribed new glasses with minimal relief; no other treatments received prior to hospital admission.'), Contradiction(quote_from_document1='This was further validated by a sli

In [48]:
### evaluate models
from utils import evaluation

# Reload the project modules so the kernel uses their current source.
importlib.reload(evaluation)
importlib.reload(io_helpers)

# Documents are loaded without read_relations here.
documents = io_helpers.get_documents()

# NOTE(review): FILEPATH is assigned in two earlier cells with different CSV paths,
# so the file evaluated here depends on which of those cells was executed last.
evaluation.evaluate_llm_comparison(documents, FILEPATH)

Unnamed: 0,model,accuracy,count
0,gpt-4o,0.947674,172.0
1,gpt-4.1-mini,0.936047,172.0
2,gemini-2.5-flash-preview-05-20,0.843023,172.0
3,gpt-4.1-nano,0.666667,39.0
4,gpt-4o-mini,0.633333,60.0
