In [5]:
import pandas as pd
from importlib import reload
import ast
import importlib

from dotenv import load_dotenv

# Load environment variables from a dotenv file before any helper module is used.
# NOTE(review): this is an absolute, user-specific path, so the cell only works on the
# author's machine — consider making the location configurable.
_ = load_dotenv("/Users/leon/.env")

## Single Textual Manipulation


In [None]:
from utils import io_helpers, data_helpers, llm

# Pick up edits made to utils/io_helpers.py without restarting the kernel.
importlib.reload(io_helpers)

# Name handed to the io_helpers save / "already manipulated" helpers by execute_manipulation.
FILENAME = "single_textual_manipulations"

doc_ids = io_helpers.get_documents_to_manipulate("single_textual_manipulation")
doc_query_mapping: pd.DataFrame = io_helpers.get_doc_query_mapping("textual")

# Sanity check: the selected documents and the doc/query mapping must name exactly the same IDs.
selected_ids = set(doc_ids)
mapped_ids = set(doc_query_mapping["doc_id"])
if selected_ids != mapped_ids:
    raise RuntimeError("Doc IDs don't match!")

# Full corpora; execute_manipulation looks up individual rows in these frames.
documents = io_helpers.get_documents()
queries = io_helpers.get_queries()


def execute_manipulation(df_row):
    """Create and save one manipulated document/query pair for a mapping row.

    Reads the module-level ``documents``, ``queries`` and ``FILENAME``.

    Args:
        df_row: Row of the doc/query mapping; must provide ``"doc_id"`` and
            ``"query_id"``.

    Returns:
        None. The manipulated document and query are handed to
        ``io_helpers.easy_save_manipulated_doc`` / ``easy_save_manipulated_query``.
        Nothing is saved (and no LLM call is made) when
        ``io_helpers.file_has_been_manipulated`` reports either new ID as present.
    """
    doc_id = df_row["doc_id"]
    query_id = df_row["query_id"]

    doc_id_new = data_helpers.get_id_for_manipulated_doc_or_query(doc_id, prefix_number=1)
    query_id_new = data_helpers.get_id_for_manipulated_doc_or_query(query_id, prefix_number=1)

    # Skip before the (paid) LLM call if either half of the pair already exists.
    if io_helpers.file_has_been_manipulated("doc", FILENAME, doc_id_new) or io_helpers.file_has_been_manipulated(
        "query", FILENAME, query_id_new
    ):
        print(f"Doc with ID '{doc_id}' has been manipulated before.")
        return None

    # squeeze() turns the single matching row into a Series.
    # NOTE(review): assumes each ID matches exactly one row — confirm.
    doc_entry = documents.loc[documents["doc_id"] == doc_id].squeeze()
    query_entry = queries.loc[queries["query.query_id"] == query_id].squeeze()

    system_prompt, user_prompt = io_helpers.get_prompt("manipulations/manipulation_single_textual")
    user_prompt = llm.format_user_prompt_single_textual(
        user_prompt,
        text=doc_entry["content"],
        question=query_entry["query.content"],
        answer=query_entry["ground_truth.content"],
        references=query_entry["ground_truth.references"],
    )
    response = llm.call_openai(
        system_prompt,
        user_prompt,
        model="gpt-4.1",
        response_format_pydantic=llm.SingleTextualManipulationResponse,
        temperature=0.2,
    )

    # Series.update only overwrites labels that already exist, so the new
    # "original_doc_ids" field is assigned separately.
    doc_entry_new = doc_entry.copy()
    doc_entry_new.update({"doc_id": doc_id_new, "content": response.text_new})
    doc_entry_new["original_doc_ids"] = [doc_id]

    # Keypoints are reset to an empty list here; the "Generate Keypoints" cell fills them later.
    query_entry_new = query_entry.copy()
    query_entry_new.update(
        {
            "query.query_id": query_id_new,
            "ground_truth.content": response.answer_new,
            "ground_truth.doc_ids": [doc_id_new],
            "ground_truth.references": response.references_new,
            "ground_truth.keypoints": [],
        }
    )
    query_entry_new["query.original_query_id"] = query_id

    io_helpers.easy_save_manipulated_doc(FILENAME, doc_entry_new)
    io_helpers.easy_save_manipulated_query(FILENAME, query_entry_new)


# _ = doc_query_mapping.apply(func=execute_manipulation, axis=1)

## Single Tabular Manipulation


In [None]:
### manipulate documents and save rows and queries
from utils import io_helpers, data_helpers, llm

# Pick up edits made to utils/io_helpers.py without restarting the kernel.
importlib.reload(io_helpers)

# Name under which the per-row manipulated documents are saved; the
# "aggregate tabular rows" cell later reads single_tabular_manipulations_rows.csv.
FILENAME = "single_tabular_manipulations_rows"

# Every manipulated row of a domain receives the same fixed document ID.
DOMAIN_TO_ID = {
    "Medical": 300001,
    "Finance": 300002,
    "Law": 300003,
}

doc_ids = io_helpers.get_documents_to_manipulate("single_tabular_manipulation")
doc_query_mapping: pd.DataFrame = io_helpers.get_doc_query_mapping("tabular")

# Sanity check: the selected documents and the doc/query mapping must name exactly the same IDs.
selected_ids = set(doc_ids)
mapped_ids = set(doc_query_mapping["doc_id"])
if selected_ids != mapped_ids:
    raise RuntimeError("Doc IDs don't match!")

# Full corpora; execute_manipulation looks up individual rows in these frames.
documents = io_helpers.get_documents()
queries = io_helpers.get_queries()


# NOTE(review): not referenced anywhere else in the visible notebook — candidate for removal.
new_documents = pd.DataFrame()


def execute_manipulation(df_row):
    """Create and save one manipulated tabular row and its query for a mapping row.

    Reads the module-level ``documents``, ``queries``, ``DOMAIN_TO_ID`` and
    ``FILENAME``.

    Args:
        df_row: Row of the doc/query mapping; must provide ``"doc_id"`` and
            ``"query_id"``.

    Returns:
        None. The new row document is saved under ``FILENAME`` and the new
        query under ``"single_tabular_manipulations"``. Nothing is saved (and no
        LLM call is made) when ``io_helpers.file_has_been_manipulated`` reports
        the new query ID as present.

    Raises:
        KeyError: If the document's ``"domain"`` is not a key of ``DOMAIN_TO_ID``.
    """
    doc_id = df_row["doc_id"]
    query_id = df_row["query_id"]

    # squeeze() turns the single matching row into a Series.
    # NOTE(review): assumes each ID matches exactly one row — confirm.
    doc_entry = documents.loc[documents["doc_id"] == doc_id].squeeze()
    query_entry = queries.loc[queries["query.query_id"] == query_id].squeeze()

    # All rows of one domain share a document ID, so only the query ID is unique per row;
    # that is why the "already done" check below is keyed on the query.
    doc_id_new = DOMAIN_TO_ID[doc_entry["domain"]]
    query_id_new = data_helpers.get_id_for_manipulated_doc_or_query(query_id, prefix_number=3)

    if io_helpers.file_has_been_manipulated("query", "single_tabular_manipulations", query_id_new):
        print(f"Doc with ID '{doc_id}' has been manipulated before.")
        return None

    system_prompt, user_prompt = io_helpers.get_prompt("manipulations/manipulation_single_tabular")
    entity = data_helpers.get_entity_by_doc_id(doc_id, documents)
    user_prompt = llm.format_user_prompt_single_tabular(
        user_prompt=user_prompt,
        question=query_entry["query.content"],
        answer=query_entry["ground_truth.content"],
        entity=entity,
    )

    response: llm.SingleTabularManipulationResponse = llm.call_openai(
        system_prompt,
        user_prompt,
        model="gpt-4o",
        response_format_pydantic=llm.SingleTabularManipulationResponse,
        temperature=0.2,
    )

    # One table row: "<description> | <value>". It is both the new document content
    # and the query's single ground-truth reference.
    reference = f"{response.description} | {response.value}"

    # Series.update only overwrites labels that already exist, so the new
    # "original_doc_ids" field is assigned separately.
    doc_entry_new = doc_entry.copy()
    doc_entry_new.update({"doc_id": doc_id_new, "content": reference})
    doc_entry_new["original_doc_ids"] = [doc_id]

    query_entry_new = query_entry.copy()
    query_entry_new.update(
        {
            "query.query_id": query_id_new,
            "ground_truth.content": response.answer_new,
            "ground_truth.doc_ids": [doc_id_new],
            "ground_truth.references": [reference],
            "ground_truth.keypoints": [],
        }
    )
    query_entry_new["query.original_query_id"] = query_id

    io_helpers.easy_save_manipulated_doc(FILENAME, doc_entry_new)
    io_helpers.easy_save_manipulated_query("single_tabular_manipulations", query_entry_new)


# Run the manipulation once per mapping row (axis=1). Each row that is not skipped
# triggers an LLM call inside execute_manipulation; the result is bound to `_` so the
# notebook does not display the returned Series.
_ = doc_query_mapping.apply(func=execute_manipulation, axis=1)

In [58]:
### aggregate tabular rows and save docs
import ast

# Load the per-row tabular documents. "original_doc_ids" is run through
# ast.literal_eval so downstream code receives Python objects instead of strings.
tabular_docs = pd.read_csv(
    filepath_or_buffer="additional_data/docs/single_tabular_manipulations_rows.csv",
    converters={"original_doc_ids": ast.literal_eval},
)


def to_list(series):
    """Collect a group's values into a list.

    Returns an empty list when every value is missing (or the group is empty);
    otherwise returns all values, missing ones included.
    """
    has_any_value = not series.dropna().empty
    return series.tolist() if has_any_value else []


def flatten_lists(series):
    """Concatenate all list-valued entries of ``series`` into one flat list.

    Entries that are not ``list`` instances (missing values, strings, tuples, ...)
    are ignored.
    """
    flattened = []
    for entry in series:
        if isinstance(entry, list):
            flattened.extend(entry)
    return flattened


# How each column is reduced when the rows of one domain are merged into a single document:
# keep the first doc_id, join the row contents line by line, and collect the rest into lists.
agg_funcs = {
    "doc_id": "first",
    "content": "\n".join,
    "company_name": to_list,
    "court_name": to_list,
    "hospital_patient_name": to_list,
    "original_doc_ids": flatten_lists,
}

# The entity columns now hold lists, so they are renamed to their plural form.
plural_column_names = {
    "company_name": "company_names",
    "court_name": "court_names",
    "hospital_patient_name": "hospital_patient_names",
}

aggregation = (
    tabular_docs.groupby("domain")
    .agg(agg_funcs)
    .reset_index()
    .rename(columns=plural_column_names)
)

aggregation.to_csv("additional_data/docs/single_tabular_manipulations.csv", index=False)

## Multi Textual Manipulation


In [1]:
### helper methods
from utils import data_helpers

# Re-import data_helpers so edits to the module take effect in the running kernel.
# NOTE(review): `reload` is only imported in the first cell (`from importlib import reload`),
# so this cell depends on that cell having been run.
reload(data_helpers)


def make_manipulated_query(query, qa_pairs):
    """Build the manipulated version of ``query`` from the LLM's new QA pairs.

    The matching pair is found by comparing questions case-insensitively and
    ignoring surrounding whitespace.

    Args:
        query: Query row (``pd.Series``) providing ``"query.content"`` and
            ``"query.query_id"``.
        qa_pairs: Iterable of objects with ``question``, ``answer`` and ``quote``
            attributes.

    Returns:
        A copy of ``query`` with a new ID, the pair's answer as ground-truth
        content, the pair's quote as the only reference, and the original ID
        stored in ``"query.original_query_id"``.

    Raises:
        ValueError: If no pair's question matches the query's question
            (previously this surfaced as a bare ``StopIteration`` without any
            hint about which query failed).
    """
    original_query_id = query["query.query_id"]
    question = query["query.content"]
    normalized_question = question.lower().strip()

    qa_pair = next(
        (pair for pair in qa_pairs if pair.question.lower().strip() == normalized_question),
        None,
    )
    if qa_pair is None:
        raise ValueError(
            f"No manipulated QA pair matches the question of query '{original_query_id}': {question!r}"
        )

    manipulated_query = query.copy()
    # NOTE(review): unlike the single-manipulation cells, "ground_truth.doc_ids" and
    # "ground_truth.keypoints" are left untouched here — confirm that is intended.
    manipulated_query.update(
        {
            "query.query_id": data_helpers.get_id_for_manipulated_doc_or_query(original_query_id, 4),
            "ground_truth.content": qa_pair.answer,
            "ground_truth.references": [qa_pair.quote],
        }
    )
    # Series.update only overwrites existing labels, so the new field is assigned separately.
    manipulated_query["query.original_query_id"] = original_query_id
    return manipulated_query


def make_manipulated_doc(doc: pd.Series, text_new: str):
    """Return a copy of ``doc`` carrying the manipulated text and a new ID.

    The copy's ``"doc_id"`` is replaced by the ID derived via
    ``data_helpers.get_id_for_manipulated_doc_or_query(..., 4)``, its ``"content"``
    by ``text_new``, and the source ID is recorded in ``"original_doc_ids"``.
    """
    source_id = doc["doc_id"]
    result = doc.copy()
    overrides = {
        "doc_id": data_helpers.get_id_for_manipulated_doc_or_query(source_id, 4),
        "content": text_new,
    }
    result.update(overrides)
    # Assigned separately because Series.update only touches labels that already exist.
    result["original_doc_ids"] = [source_id]
    return result


KeyboardInterrupt



In [107]:
### Manipulate and save
from utils import io_helpers, data_helpers, llm
from importlib import reload
from typing import Literal
import pandas as pd

# Name passed to the io_helpers lookup / delete / save helpers inside manipulate_and_save.
FILENAME = "multi_textual_manipulations"

# Reload the project modules so edits made on disk are picked up without restarting the kernel.
reload(io_helpers)
reload(data_helpers)
reload(llm)


# for doc_id in doc_ids:
def manipulate_and_save(doc_id: int, on_exist: Literal["skip", "override"] = "skip"):
    """Manipulate one document together with all of its queries and save the results.

    Reads the module-level ``FILENAME`` and relies on ``make_manipulated_doc`` /
    ``make_manipulated_query`` from the helper cell.

    Args:
        doc_id: ID of the original document.
        on_exist: What to do when ``io_helpers.file_has_been_manipulated`` reports
            that the manipulated document already exists: ``"skip"`` returns
            without calling the LLM; ``"override"`` deletes the existing document
            and queries before saving the new ones.

    Returns:
        None.

    Raises:
        ValueError: If ``on_exist`` is neither ``"skip"`` nor ``"override"``.
            (Any other value used to fall through both branches and save a
            second copy next to the existing one.)
    """
    if on_exist not in ("skip", "override"):
        raise ValueError(f"on_exist must be 'skip' or 'override', got {on_exist!r}")

    doc_id_new = data_helpers.get_id_for_manipulated_doc_or_query(doc_id, 4)
    doc_has_been_manipulated = io_helpers.file_has_been_manipulated("doc", FILENAME, doc_id_new)
    if doc_has_been_manipulated and on_exist == "skip":
        print(f"Document with ID '{doc_id}' has been manipulated and saved before. Skipping this one.")
        return

    # Loaded only after the skip check so skipped documents do not pay for reading the corpora.
    documents = io_helpers.get_documents()
    queries = io_helpers.get_queries()

    doc: pd.Series = data_helpers.get_doc_by_id(doc_id, documents)
    entity = data_helpers.get_entity_by_doc_id(doc_id, documents)
    text = doc["content"]

    related_queries: pd.DataFrame = data_helpers.get_queries_by_doc_id(doc_id, queries)
    qa_pairs = data_helpers.make_qa_pairs(related_queries)

    system_prompt, user_prompt = io_helpers.get_prompt("manipulations/manipulation_multi_textual_v03")
    user_prompt = llm.format_user_prompt_multi_textual_v02(user_prompt, entity, text, qa_pairs)

    llm_response: llm.MultiTextualManipulationResponseV02 = llm.call_openai(
        system_prompt,
        user_prompt,
        model="gpt-4o",
        response_format_pydantic=llm.MultiTextualManipulationResponseV02,
        temperature=0.8,
    )

    # Build every new record before touching stored data, so a failure here
    # (e.g. an unmatched question) leaves the existing files as they were.
    manipulated_doc: pd.Series = make_manipulated_doc(doc, llm_response.text_new)
    manipulated_queries: pd.DataFrame = related_queries.apply(
        make_manipulated_query, args=(llm_response.qa_pairs_new,), axis=1
    )

    if doc_has_been_manipulated and on_exist == "override":
        # Delete the existing doc and queries before writing the replacements.
        query_ids_new = [
            data_helpers.get_id_for_manipulated_doc_or_query(query_id, 4)
            for query_id in related_queries["query.query_id"].to_list()
        ]
        io_helpers.delete_existing_doc_and_queries(doc_id_new, query_ids_new, FILENAME)

    io_helpers.easy_save_manipulated_doc(FILENAME, manipulated_doc)
    for _, manipulated_query in manipulated_queries.iterrows():
        io_helpers.easy_save_manipulated_query(FILENAME, manipulated_query)

In [191]:
# Wrap the document IDs in an iterator: a loop over `id_iter` consumes it, so an
# interrupted loop continues after the last consumed ID when run again instead of
# starting over. Re-run this cell to start from the first ID.
id_iter = iter(io_helpers.get_documents_to_manipulate("multi_textual_manipulation"))

In [None]:
# for doc_id in id_iter:
#     manipulate_and_save(doc_id, on_exist="skip")

Document with ID '128' has been manipulated and saved before. Skipping this one.
Document with ID '132' has been manipulated and saved before. Skipping this one.
Document with ID '133' has been manipulated and saved before. Skipping this one.
Document with ID '134' has been manipulated and saved before. Skipping this one.
Document with ID '136' has been manipulated and saved before. Skipping this one.
Document with ID '205' has been manipulated and saved before. Skipping this one.
Document with ID '139' has been manipulated and saved before. Skipping this one.
Document with ID '40' has been manipulated and saved before. Skipping this one.
Document with ID '42' has been manipulated and saved before. Skipping this one.
Document with ID '52' has been manipulated and saved before. Skipping this one.
Document with ID '53' has been manipulated and saved before. Skipping this one.
Document with ID '183' has been manipulated and saved before. Skipping this one.
Saved document to 'additional_da

## Generate Keypoints


In [None]:
import pandas as pd
import ast
import importlib
from utils import io_helpers, llm

importlib.reload(llm)
importlib.reload(io_helpers)


def keypoints_for_row(data):
    """Ask the LLM for the keypoints of one query's ground-truth answer.

    Args:
        data: Query row providing ``"query.content"`` (the question) and
            ``"ground_truth.content"`` (the answer).

    Returns:
        The ``keypoints`` attribute of the parsed
        ``llm.KeypointsGenerationResponse``.
    """
    question, answer = data["query.content"], data["ground_truth.content"]

    system_prompt, prompt_template = io_helpers.get_prompt("manipulations/keypoints")
    filled_prompt = llm.format_user_prompt_keypoints(prompt_template, question=question, answer=answer)

    response: llm.KeypointsGenerationResponse = llm.call_openai(
        system_prompt,
        filled_prompt,
        model="gpt-4.1",
        response_format_pydantic=llm.KeypointsGenerationResponse,
        temperature=0.2,
    )
    return response.keypoints


def generate_keypoints(queries_path):
    """Generate keypoints for every query in a CSV file and write them back.

    The file at ``queries_path`` is read, its ``"ground_truth.keypoints"`` column
    is replaced by the result of ``keypoints_for_row`` for each row, and the file
    is overwritten in place (without the index).

    Args:
        queries_path: Path of the queries CSV to update.

    Returns:
        The updated queries ``DataFrame``, so callers can inspect the result
        without re-reading the file.
    """
    queries = pd.read_csv(queries_path, converters={"ground_truth.keypoints": ast.literal_eval})
    queries["ground_truth.keypoints"] = queries.apply(keypoints_for_row, axis=1)
    queries.to_csv(queries_path, index=False)
    return queries


# Regenerate the keypoints of every query in this file; generate_keypoints overwrites
# the CSV in place and makes one LLM call per row (via keypoints_for_row).
queries = generate_keypoints("additional_data/queries/single_textual_manipulations.csv")

In [10]:
queries

Unnamed: 0,domain,ground_truth.content,ground_truth.doc_ids,ground_truth.keypoints,ground_truth.references,query.content,query.query_id,query.query_type,query.original_query_id
0,Finance,$250 million.,[100046],[JetWing Aviation's total liabilities amounted...,"[""Total liabilities for JetWing Aviation amoun...",What was the total amount of JetWing Aviation'...,102856,Factual Question,2856
1,Finance,Mr. John Doe.,[100071],[Mr. John Doe was appointed as CEO of ABC Educ...,"['In terms of senior management changes, Mr. J...",Who was appointed as CEO of ABC Education Corp...,102538,Factual Question,2538
2,Medical,"Fever, cough, and chest pain for 5 days.",[100208],"[V. Lewis's chief complaint is fever, cough, a...","['Chief Complaint: Fever, cough, and chest pai...",According to the hospitalization records of Ne...,106225,Factual Question,6225
3,Law,"19th July, 1964",[100119],"[H. Walker's birthdate is 19th July, 1964 acco...","['Date of Birth: 19th July, 1964']","According to the court judgment of Riverton, H...",104466,Factual Question,4466
4,Law,"22, Maple Avenue, Quarryville",[100123],"[The residence of Z. Torres was 22, Maple Aven...","['- Residence: 22, Maple Avenue, Quarryville']","According to the court judgment of Sterling, Q...",104480,Factual Question,4480
5,Law,Project Manager at Sterling Public Works Depar...,[100111],"[N. Adams was a Project Manager., N. Adams wor...",['- **Occupation:** Project Manager at Sterlin...,"According to the court judgment of Brighton, S...",104361,Factual Question,4361
6,Law,30th June 2023.,[100113],[The date of the court judgment was 30th June ...,['Under the provisions of Article 232 and rela...,"According to the court judgment of Preston, La...",104411,Factual Question,4411
7,Law,25th March 2023,[100122],[The judgment date was 25th March 2023.],['**Judgment Issued on 25th March 2023**'],"According to the court judgment of Trenton, Va...",104502,Factual Question,4502
8,Law,D. Morgan,[100124],[The defendant in the court judgment of Norwoo...,['- **Defendant Name**: D. Morgan'],"According to the court judgment of Norwood, Un...",104490,Factual Question,4490
9,Law,Six years of fixed-term imprisonment,[100129],[G. Torres was sentenced to six years of fixed...,"['Consequently, this court sentences G. Torres...","According to the court judgment of Vandalia, B...",104540,Factual Question,4540
