In [1]:
import importlib
import csv
import os
import pandas as pd
from termcolor import colored
from utils import io_helpers
from utils import llm

# Re-execute the local helper modules so that edits made to them on disk are
# picked up by the running kernel without a restart.
importlib.reload(io_helpers)
importlib.reload(llm)

# CSV file in which the pairwise comparison results are persisted between runs.
FILEPATH = "data/additional_data/docs/_pairwise_comparison.csv"
# Column names of that CSV, in the order the header row is written; they are
# also the keys of every result row stored by compare_with_llm.
FIELDNAMES = ["id1", "id2", "conflicts", "model", "prompt"]


def get_known_results() -> pd.DataFrame:
    """Load the stored pairwise comparison results from ``FILEPATH``.

    When the file does not exist yet, its parent directory is created, a CSV
    holding only the header row is written, and an empty frame with the
    ``FIELDNAMES`` columns is returned instead.

    Returns:
        pd.DataFrame: One row per stored comparison. When read from disk,
        ``id1``/``id2`` use the nullable ``Int64`` dtype and ``conflicts`` the
        nullable ``boolean`` dtype.
    """
    column_dtypes = {"id1": "Int64", "id2": "Int64", "conflicts": "boolean"}
    try:
        return pd.read_csv(FILEPATH, usecols=FIELDNAMES, dtype=column_dtypes)
    except FileNotFoundError:
        # First run: bootstrap an empty results file.
        parent_dir = os.path.dirname(FILEPATH)
        os.makedirs(parent_dir, exist_ok=True)
        # Mode "x" fails rather than overwriting a file that appeared meanwhile.
        with open(FILEPATH, "x", newline="") as csv_file:
            csv.DictWriter(csv_file, FIELDNAMES).writeheader()
        return pd.DataFrame(columns=FIELDNAMES)


def pairs_generator(doc_id: int, documents: pd.DataFrame):
    """Yield one comparison pair for every document related to ``doc_id``.

    Args:
        doc_id: Id of the anchor document; compared with ``documents["doc_id"]``
            after both sides are cast to ``int``.
        documents: Frame providing the columns ``doc_id``, ``related_docs`` and
            ``original_doc_ids``.

    Yields:
        tuple: ``(id1, id2, expected_result)``. ``id1`` is the anchor's id and
        ``id2`` the id of one related document. ``expected_result`` is true
        when either document's ``original_doc_ids`` contains the other's id, or
        when the two ``original_doc_ids`` collections have an entry in common.
    """
    is_anchor = documents["doc_id"].astype(int) == int(doc_id)
    doc = documents[is_anchor].squeeze()
    is_related = documents["doc_id"].isin(doc["related_docs"])

    for _, related_doc in documents[is_related].iterrows():
        id1 = doc["doc_id"]
        id2 = related_doc["doc_id"]
        own_origins = doc["original_doc_ids"]
        other_origins = related_doc["original_doc_ids"]

        # All three checks are evaluated eagerly, in this order.
        anchor_lists_other = id2 in own_origins
        other_lists_anchor = id1 in other_origins
        share_an_origin = bool(set(own_origins) & set(other_origins))

        yield (id1, id2, anchor_lists_other | other_lists_anchor | share_an_origin)


def _lookup_content(documents: pd.DataFrame, doc_id: int):
    """Return the ``content`` value of the ``documents`` row whose ``doc_id`` equals ``doc_id``."""
    return documents[documents["doc_id"].astype(int) == int(doc_id)].squeeze()["content"]


def compare_with_llm(
    doc_id1: int,
    doc_id2: int,
    expected_result: bool,
    model: str,
    documents: pd.DataFrame,
    prompts: object,
    expect_extraction: bool = False,
):
    """Ask an LLM whether two documents contradict each other and store the answer.

    The pair is skipped when the results file already holds a row for the same
    two ids (in either order) and the same ``model``; the prompt is not part of
    that check. Otherwise the model is called, the outcome is printed next to
    ``expected_result``, and one row is appended to the CSV at ``FILEPATH``.

    Args:
        doc_id1: Id of the first document.
        doc_id2: Id of the second document.
        expected_result: Expected value of the model's ``contradictory_info_found``.
        model: Model name handed to ``llm.call_any_llm`` and stored with the row.
        documents: Frame with at least the columns ``doc_id`` and ``content``.
        prompts: Mapping with the keys ``"system_prompt"`` and ``"user_prompt"``.
        expect_extraction: When true, request
            ``llm.LLMDocumentComparisonExtractResponse`` and return the
            response's ``contradictions``; otherwise request
            ``llm.LLMDocumentComparisonCheckResponse``.

    Returns:
        ``llm_response.contradictions`` when ``expect_extraction`` is true and
        the model was called; ``None`` otherwise (including the skip path).
    """
    known_results = get_known_results()

    # A pair counts as processed regardless of the order of its two ids.
    same_order = (known_results["id1"] == doc_id1) & (known_results["id2"] == doc_id2)
    swapped_order = (known_results["id1"] == doc_id2) & (known_results["id2"] == doc_id1)
    same_model = known_results["model"] == model
    if not known_results[(same_order | swapped_order) & same_model].empty:
        print(f"-- {doc_id1} & {doc_id2} -- processed before")
        return None

    system_prompt = prompts["system_prompt"]
    text1 = _lookup_content(documents, doc_id1)
    text2 = _lookup_content(documents, doc_id2)
    user_prompt = llm.format_user_prompt(prompts["user_prompt"], text1=text1, text2=text2)

    response_format = (
        llm.LLMDocumentComparisonExtractResponse if expect_extraction else llm.LLMDocumentComparisonCheckResponse
    )
    llm_response = llm.call_any_llm(system_prompt, user_prompt, model, response_format_pydantic=response_format)
    actual_result = llm_response.contradictory_info_found

    print(f"-- {doc_id1} & {doc_id2} -- ", end="")
    if expected_result == actual_result:
        print(colored("Check!", "green"))
    else:
        print(colored("Wrong!", "red"))
        print(f"expected result: {expected_result}")
        print(f"actual result: {actual_result}")

    new_row = {
        "id1": doc_id1,
        "id2": doc_id2,
        "conflicts": actual_result,
        "model": model,
        "prompt": prompts["user_prompt"],
    }
    # Append only the new row instead of rewriting the whole file: the cost per
    # call stays constant, an interrupted write cannot destroy earlier results,
    # and no concat with a possibly empty frame is needed. The row is written in
    # the column order read from the file so it lines up with the header.
    # Assumes the existing file ends with a line terminator, which holds for
    # files written by get_known_results and by this function.
    pd.DataFrame([new_row]).to_csv(
        FILEPATH,
        columns=list(known_results.columns),
        mode="a",
        header=False,
        index=False,
    )

    if expect_extraction:
        return llm_response.contradictions
    return None



In [7]:
# Pick up the latest on-disk version of the llm helper module.
importlib.reload(llm)

# Documents (loaded with read_relations=True) and the prompt set used for
# contradiction extraction.
documents = io_helpers.get_documents(read_relations=True)
prompts = io_helpers.get_prompt(name="comparison/extract_contradictions")

# DOC_IDS_TO_CHECK = [300001, 100134, 134, 400192, 78]

# The single pair inspected by hand in this cell, and the model to query.
doc_id1, doc_id2 = 192, 400192
model = "gpt-4o-mini"

# Content of both documents, looked up in the order (doc_id1, doc_id2).
text1, text2 = (
    documents[documents["doc_id"].astype(int) == int(doc_id)].squeeze()["content"]
    for doc_id in (doc_id1, doc_id2)
)

print(text1, text2, sep="\n")

system_prompt = prompts["system_prompt"]
user_prompt = llm.format_user_prompt(prompts["user_prompt"], text1=text1, text2=text2)

# Request the structured extraction response for this pair.
response = llm.call_any_llm(
    system_prompt,
    user_prompt,
    model,
    response_format_pydantic=llm.LLMDocumentComparisonExtractResponse,
)

print(response)

Hospitalization Record

Basic Information:
Name: L. Rogers
Gender: Female
Age: 56
Ethnicity: Caucasian
Marital Status: Married
Occupation: Teacher
Address: 89, Greenfield street, Knoxville
Admission Time: 6th, January
Record Time: 6th, January
Historian: Dr. James Nelson
Hospital Name: Knoxville General Hospital

Chief Complaint:
Blurry, foggy vision and decreased visual acuity for the past 3 months

Present Illness:
Onset: 3 months ago, at home, gradual onset, no specific prodromal symptoms identified. Possible aging-related onset.
Main Symptoms: Blurry, foggy vision affecting both eyes, progressively worsening over time. Decreased visual acuity, significant especially in low-light conditions. No known relieving factors; glare worsens the symptoms.
Accompanying Symptoms: Increased glare and halos around lights, occasional monocular double or multiple vision, difficulty seeing in bright light.
Diagnosis and Treatment History: Visited optometrist 2 months ago, diagnosed with early-stage

In [3]:
# Show the raw list first, then each contradiction as a pair of quotes
# followed by a separator line.
print(response.contradictions)
for contradiction in response.contradictions:
    print(
        f"text1: {contradiction.quote_from_document1}",
        f"text2: {contradiction.quote_from_document2}",
        "-------",
        sep="\n",
    )

[]


In [2]:
### evaluate models
# NOTE(review): this import lives in a later cell, while the notebook's other
# imports are in the first cell.
from utils import evaluation

# Re-execute the helper modules so on-disk edits are picked up by the running
# kernel. evaluation is reloaded before io_helpers.
importlib.reload(evaluation)
importlib.reload(io_helpers)

# Reload the documents with get_documents' default arguments (no
# read_relations=True here, unlike the extraction cell).
documents = io_helpers.get_documents()

# Evaluate the comparison results stored at FILEPATH (defined in the first
# cell). As the last expression of the cell, the return value is what the
# notebook renders; the saved output has model, accuracy and count columns.
evaluation.evaluate_llm_comparison(documents, FILEPATH)

Unnamed: 0,model,accuracy,count
0,gpt-4o,0.958716,218.0
1,gpt-4.1-mini,0.93578,218.0
2,gemini-2.5-flash-preview-05-20,0.866972,218.0
3,gpt-4.1-nano,0.717391,46.0
4,gpt-4o-mini,0.640625,64.0
