In [32]:
import pandas as pd
import ast

from dotenv import load_dotenv

load_dotenv("/Users/leon/.env")

True

In [31]:
#### static variables

COLUMNS_DOCS = [
    "doc_id",
    "language",
    "domain",
    "content",
    "company_name",
    "court_name",
    "hospital_patient_name",
]

COLUMNS_DOCS_MANIPULATED_TEXTUAL = [
    *COLUMNS_DOCS,
    "original_doc_id",
]

COLUMNS_DOCS_MANIPULATED_TABULAR_ROW = [
    *COLUMNS_DOCS,
    "original_doc_id",
    "query.original_query_id",
    "ground_truth.content",
]

COLUMNS_DOCS_MANIPULATED_TABULAR = [
    "doc_id",
    "language",
    "domain",
    "content",
    "company_names",
    "court_names",
    "hospital_patient_names",
    "original_doc_ids",
]

COLUMNS_QUERIES = [
    "domain",
    "ground_truth.content",
    "ground_truth.doc_ids",
    "ground_truth.keypoints",
    "ground_truth.references",
    "language",
    "prediction",
    "query.content",
    "query.query_id",
    "query.query_type",
]

COLUMNS_QUERIES_MANIPULATED = [*COLUMNS_QUERIES, "query.original_query_id"]

In [None]:
### helper functions/classes for manipulation
from typing import Tuple, List, Dict
from pydantic import BaseModel
import pandas as pd
import ast
import csv
import os
from typing import Literal
import sys
import json

DOCUMENTS = pd.read_csv("DRAGONball/en/docs.csv")
QUERIES = pd.read_csv(
    "DRAGONball/en/queries_flattened.csv",
    converters={
        "ground_truth.doc_ids": ast.literal_eval,
        "ground_truth.keypoints": ast.literal_eval,
        "ground_truth.references": ast.literal_eval,
    },
)


class SingleTextualManipulationResponse(BaseModel):
    text_new: str
    answer_new: str
    references_new: list[str]

class QAPair(BaseModel):
    question: str
    answer: str
    references: List[str]

class MultiTextualManipulationResponse(BaseModel):
    text_new: str
    qa_pairs_new: List[QAPair]

class SingleTabularManipulationResponse(BaseModel):
    answer_new: str
    description: str
    value: str

def read_prompt(path: str | os.PathLike) -> Dict:
    """Reads from JSON-file"""
    with open(path, "r") as f:
        return json.load(f)


def format_user_prompt_single_textual(user_prompt: str, text: str, question: str, answer: str, references: str) -> str:
    return user_prompt.format(text=text, question=question, answer=answer, references=references)


def format_user_prompt_single_tabular(user_prompt: str, question: str, answer: str, entity: str) -> str:
    return user_prompt.format(question=question, answer=answer, entity=entity)

def format_user_prompt_multi_textual(user_prompt: str, text: str, qa_pairs: List[Dict]) -> str:
    """qa_pairs must be of format: [ { question: '...', answer: '...' }, ... ]"""
    questions_str = ""
    for id, qa in enumerate(qa_pairs, start=1):
        questions_str += f"question{id}: {qa["question"]}\n"
        questions_str += f"answer{id}: {qa["answer"]}\n"

    return user_prompt.format(text=text, questions=questions_str)


def get_query_ids_for_doc(doc_id: int | str, query_types: list[str] = ["Factual Question"]) -> list[int]:
    """Selects query_ids for queries related to that doc and with a specified type."""
    return QUERIES[
        QUERIES["ground_truth.doc_ids"].apply(lambda doc_ids: int(doc_id) in doc_ids)
        & QUERIES["query.query_type"].isin(query_types)
    ]["query.query_id"].to_list()


def get_doc_query_mapping_single(target: Literal["textual", "tabular"]) -> List[Dict[str, int]]:
    with open("doc_query_mapping_single.csv", "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        return [
            {
                "doc_id": int(row["doc_id"]),
                "query_id": int(row["query_id_single"]) if target == "textual" else int(row["query_id_multi"]),
            }
            for row in reader
        ]

def get_doc_query_mapping_multi() -> List[Dict]:
    with open("docs_to_manipulate.csv", "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        doc_ids = [row["doc_id"] for row in reader if int(row["multi_textual_manipulation"]) == 1]
    mapping = []
    for id in doc_ids: 
        entry = {
            "doc_id": id,
            "query_ids": get_query_ids_for_doc(id, query_types=["Factual Question"])
            }
        mapping.append(entry)
    return mapping


def get_query_properties(
    query_id,
    properties: list = ["ground_truth.content", "ground_truth.keypoints", "ground_truth.references", "query.content"],
) -> Tuple:
    """Select columns for query_id from queries dataframe."""
    row = QUERIES[QUERIES["query.query_id"] == query_id]
    return tuple(row[prop].iloc[0] for prop in properties)


def get_doc_properties(
    doc_id,
    properties,
) -> Tuple:
    """Select columns for doc_id from docs dataframe."""
    row: pd.DataFrame = DOCUMENTS[DOCUMENTS["doc_id"] == doc_id].dropna(axis=1)
    return tuple(row[prop].iloc[0] for prop in properties if prop in row.columns)


def get_doc_text(doc_id: int | str) -> str:
    return DOCUMENTS[DOCUMENTS["doc_id"] == int(doc_id)]["content"].iloc[0]


def get_query_by_id(query_id: int) -> pd.Series:
    return QUERIES[QUERIES["query.query_id"].astype(int) == query_id].iloc[0]


def get_queries_by_id(query_ids: List[int]) -> pd.DataFrame:
    return QUERIES[QUERIES["query.query_id"].astype(int).isin(query_ids)]


def get_doc_by_id(doc_id: int | str) -> pd.Series:
    return DOCUMENTS[DOCUMENTS["doc_id"].astype(int) == int(doc_id)].iloc[0]


def openai_interface(system_prompt, user_prompt, response_format_pydantic=SingleTextualManipulationResponse):
    """execute openai LLM call"""
    from openai import OpenAI

    client = OpenAI()
    print("DEBUG: Attempting LLM call")
    completion = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}],
        response_format=response_format_pydantic,
        temperature=0,
    ).choices[0]
    print("DEBUG: Finished LLM call")
    return completion.message.parsed


def get_prompts_for_textual_single(doc_id: int, query_id: int) -> Tuple:
    PROMPT_TYPE = "manipulation_factual"
    prompt = [
        prompt for prompt in read_prompt("prompts/json/manipulate_docs.json") if prompt["prompt_type"] == PROMPT_TYPE
    ][0]
    text = get_doc_text(doc_id)
    answer, keypoints, references, question = get_query_properties(query_id)
    system_prompt = prompt["system_prompt"]
    user_prompt = format_user_prompt_single_textual(
        user_prompt=prompt["user_prompt"], text=text, answer=answer, question=question, references=references
    )
    return (system_prompt, user_prompt)


def get_prompts_for_tabular_single(query_id: int, doc_id: int) -> Tuple:
    prompt_obj = read_prompt("prompts/json/manipulation_tabular.json")
    system_prompt = prompt_obj["system_prompt"]
    user_prompt = prompt_obj["user_prompt"]
    answer, question = get_query_properties(query_id, properties=["ground_truth.content", "query.content"])
    (entity,) = get_doc_properties(doc_id, ["hospital_patient_name", "company_name", "court_name"])
    user_prompt = format_user_prompt_single_tabular(user_prompt, question, answer, entity)
    return (system_prompt, user_prompt)


def get_prompts_for_textual_multi(doc_id: int, query_ids: List[int]) -> Tuple:
    prompt_obj = read_prompt("../prompts/json/manipulation_multi_textual.json")
    text = get_doc_text(doc_id)
    qa_pairs = []
    for query_id in query_ids:
        question, answer = get_query_properties(query_id, ["query.content","ground_truth.content"])
        qa_pairs.append({"question": question, "answer": answer})
    
    system_prompt = prompt_obj["system_prompt"]
    user_prompt = format_user_prompt_multi_textual(user_prompt=prompt_obj["user_prompt"], text=text, qa_pairs=qa_pairs)
    
    return (system_prompt, user_prompt)


def get_id_for_manipulated_doc_or_query(original_doc_id: int, prefix_number=1) -> int:
    id_str = str(prefix_number) + str(original_doc_id).zfill(5)
    return int(id_str)


def save_manipulated_doc(filename: os.PathLike | str, fieldnames: List[str], **kwargs):
    """Saves a manipulated doc to csv.
    If an entry with that doc_id already exists in the csv, the new entry is NOT saved.
    """
    if filename is None:
        raise RuntimeError("Must specify a filename!")
    
    print(f"Saving Doc with ID '{kwargs["doc_id"]}'")
    is_empty = not os.path.exists(filename) or os.stat(filename).st_size == 0
    id_exists = False
    with open(filename, "a+", newline="") as f:
        if not is_empty:
            f.seek(0)
            reader = csv.DictReader(f, fieldnames=fieldnames)
            ids_present = {int(row["doc_id"]) for row in list(reader)[1:]}
            id_exists = int(kwargs["doc_id"]) in ids_present

        if id_exists == True:
            print(f"WARN: Row with ID {kwargs["doc_id"]} already exists. Did not write new document to '{filename}'.")
            return

        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        if is_empty:
            writer.writeheader()
        writer.writerow(kwargs)


def save_manipulated_query(
    filename: os.PathLike | str,
    fieldnames: List[str],
    **kwargs
):
    """Saves a manipulated query to csv.
    If an entry with that doc_id already exists in the csv, the new entry is NOT saved.
    """
    if filename is None:
        raise RuntimeError("Must specify a filename!")
    
    print(f"Saving Query with ID '{kwargs["query.query_id"]}'")
    is_empty = not os.path.exists(filename) or os.stat(filename).st_size == 0
    id_exists = False
    with open(filename, "a+", newline="") as f:
        if not is_empty:
            f.seek(0)
            reader = csv.DictReader(f, fieldnames=fieldnames)
            ids_present = {int(row["query.query_id"]) for row in list(reader)[1:]}
            id_exists = int(kwargs["query.query_id"]) in ids_present

        if id_exists == True:
            print(f"WARN: Row with ID {kwargs["query.query_id"]} already exists. Did not write new query to '{filename}'.")
            return

        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        if is_empty:
            writer.writeheader()
        writer.writerow(kwargs)



In [12]:
### describe documents and queries

print("Documents:\n", DOCUMENTS.columns)
print()
print("Queries:\n", QUERIES.columns)

Documents:
 Index(['hospital_patient_name', 'language', 'doc_id', 'domain', 'content',
       'company_name', 'court_name'],
      dtype='object')

Queries:
 Index(['domain', 'ground_truth.content', 'ground_truth.doc_ids',
       'ground_truth.keypoints', 'ground_truth.references', 'language',
       'prediction', 'query.content', 'query.query_id', 'query.query_type'],
      dtype='object')


## Single Textual Manipulation


In [None]:
### manipulate documents (textual)
def save_single_textual(doc_entry, query_entry, completion_parsed):
    # -- doc
    original_doc_id = doc_entry.doc_id
    manipulated_doc_entry = doc_entry.copy()
    manipulated_doc_entry.doc_id = get_id_for_manipulated_doc_or_query(original_doc_id)
    manipulated_doc_entry.content = completion_parsed.text_new
    manipulated_doc_entry = pd.concat([manipulated_doc_entry, pd.Series([original_doc_id], index=["original_doc_id"])])

    save_manipulated_doc(**manipulated_doc_entry)

    # -- query
    original_query_id = query_entry["query.query_id"]
    manipulated_query_entry = query_entry.copy()
    manipulated_query_entry["ground_truth.content"] = completion_parsed.answer_new
    manipulated_query_entry["ground_truth.references"] = completion_parsed.references_new
    manipulated_query_entry["ground_truth.keypoints"] = []
    manipulated_query_entry["query.query_id"] = get_id_for_manipulated_doc_or_query(original_query_id)
    manipulated_query_entry = pd.concat(
        [manipulated_query_entry, pd.Series([original_query_id], index=["query.original_query_id"])]
    )
    manipulated_query_entry.index = manipulated_query_entry.index.str.replace(".", "__", regex=False)

    save_manipulated_query(**manipulated_query_entry)


doc_query_mapping = get_doc_query_mapping_single()

for mapping in doc_query_mapping:
    doc_id = mapping["doc_id"]
    query_id = mapping["query_id"]

    doc_entry = get_doc_by_id(doc_id)
    query_entry = get_query_by_id(query_id)

    system_prompt, user_prompt = get_prompts_for_textual_single(doc_id, query_id)

    # call openai
    completion_parsed = openai_interface(system_prompt, user_prompt)

    save_single_textual(doc_entry, query_entry, completion_parsed)
    print(f"Finished processing doc {doc_id} and query {query_id}.")

## Single Tabular Manipulation


In [None]:
### manipulate documents (tabular) and save rows
mapping = get_doc_query_mapping_single("tabular")

doc_entries = []

for id_pair in mapping:
    doc_id = id_pair["doc_id"]
    query_id = id_pair["query_id"]
    system_prompt, user_prompt = get_prompts_for_tabular_single(query_id, doc_id)

    response: SingleTabularManipulationResponse = openai_interface(
        system_prompt, user_prompt, SingleTabularManipulationResponse
    )

    manipulated_doc_entry = get_doc_by_id(doc_id).copy()
    manipulated_doc_entry.doc_id = get_id_for_manipulated_doc_or_query(doc_id, prefix_number=2)
    manipulated_doc_entry.content = " | ".join([response.description, response.value])
    additional_fields = pd.Series(
        [doc_id, query_id, response.answer_new],
        index=["original_doc_id", "query.original_query_id", "ground_truth.content"],
    )
    manipulated_doc_entry = pd.concat([manipulated_doc_entry, additional_fields])

    save_manipulated_doc(
        filename="additional_data/docs/tabular_manipulations_result_rows.csv",
        fieldnames=COLUMNS_DOCS_MANIPULATED_TABULAR_ROW,
        **manipulated_doc_entry,
    )

In [None]:
### aggregate tabular rows and save docs
tabular_docs = pd.read_csv("additional_data/docs/tabular_manipulations_result_rows.csv")


def list_or_none(series):
    if series.dropna().empty:
        return None
    return series.tolist()


agg_funcs = {
    "doc_id": lambda x: 0,
    "language": "first",
    "content": "\n".join,
    "company_name": list_or_none,
    "court_name": list_or_none,
    "hospital_patient_name": list_or_none,
    "original_doc_id": list_or_none,
    "query.original_query_id": lambda x: None,
    "ground_truth.content": lambda x: None,
}

aggregation = tabular_docs.groupby("domain").agg(agg_funcs).reset_index()
aggregation["doc_id"] = [get_id_for_manipulated_doc_or_query(id, prefix_number=3) for id in [1, 2, 3]]
aggregation.rename(
    columns={
        "company_name": "company_names",
        "court_name": "court_names",
        "hospital_patient_name": "hospital_patient_names",
        "original_doc_id": "original_doc_ids",
    },
    inplace=True,
)

for row_dict in aggregation.to_dict(orient="records"):
    save_manipulated_doc(
        "additional_data/docs/tabular_manipulations_result.csv",
        fieldnames=COLUMNS_DOCS_MANIPULATED_TABULAR,
        **row_dict
    )

In [None]:
### save manipulated queries for aggregated tabular docs
tabular_docs = pd.read_csv("additional_data/docs/tabular_manipulations_result_rows.csv")

mapping_domain_doc_id = pd.read_csv(
    "additional_data/docs/tabular_manipulations_result.csv", usecols=["domain", "doc_id"]
)
mapping_dict = mapping_domain_doc_id.set_index("domain")["doc_id"].to_dict()

for row_dict in tabular_docs.to_dict(orient="records"):
    original_query_id = row_dict["query.original_query_id"]
    manipulated_query_entry = get_query_by_id(original_query_id).copy()

    manipulated_query_entry["ground_truth.doc_ids"] = [mapping_dict[row_dict["domain"]]]
    manipulated_query_entry["ground_truth.content"] = row_dict["ground_truth.content"]
    manipulated_query_entry["ground_truth.references"] = row_dict["content"]
    manipulated_query_entry["ground_truth.keypoints"] = []
    manipulated_query_entry["query.query_id"] = get_id_for_manipulated_doc_or_query(original_query_id, prefix_number=3)
    manipulated_query_entry = pd.concat(
        [manipulated_query_entry, pd.Series([original_query_id], index=["query.original_query_id"])]
    )

    save_manipulated_query(
        filename="additional_data/queries/tabular_manipulations_result.csv",
        fieldnames=COLUMNS_QUERIES_MANIPULATED,
        **manipulated_query_entry
    )

## Multi Textual Manipulation


In [62]:
def save_multi_textual(doc_entry, query_entries: pd.DataFrame, text_new: str, qa_pairs: List[QAPair]):
    # -- doc
    original_doc_id = doc_entry["doc_id"]
    manipulated_doc_entry = doc_entry.copy()
    manipulated_doc_entry["doc_id"] = get_id_for_manipulated_doc_or_query(original_doc_id, prefix_number=4)
    manipulated_doc_entry["content"] = text_new
    manipulated_doc_entry = pd.concat([manipulated_doc_entry, pd.Series([original_doc_id], index=["original_doc_id"])])

    save_manipulated_doc(
        filename="additional_data/docs/multi_textual_manipulations.csv",
        fieldnames=COLUMNS_DOCS_MANIPULATED_TEXTUAL,
        **manipulated_doc_entry,
    )

    # -- queries
    for qa_pair in qa_pairs:
        try:
            query_entry = query_entries[
                query_entries["query.content"].apply(str.strip).apply(str.lower) == qa_pair.question.strip().lower()
            ].iloc[0]
        except IndexError:
            raise RuntimeError("No matching question found!")

        original_query_id = query_entry["query.query_id"]
        manipulated_query_entry = query_entry.copy()
        manipulated_query_entry["ground_truth.content"] = qa_pair.answer
        manipulated_query_entry["ground_truth.references"] = qa_pair.references
        manipulated_query_entry["ground_truth.keypoints"] = []
        manipulated_query_entry["ground_truth.doc_ids"] = [manipulated_doc_entry["doc_id"]]
        manipulated_query_entry["query.query_id"] = get_id_for_manipulated_doc_or_query(
            original_query_id, prefix_number=4
        )
        manipulated_query_entry = pd.concat(
            [manipulated_query_entry, pd.Series([original_query_id], index=["query.original_query_id"])]
        )

        save_manipulated_query(
            filename="additional_data/queries/multi_textual_manipulations.csv",
            fieldnames=COLUMNS_QUERIES_MANIPULATED,
            **manipulated_query_entry,
        )

In [76]:
doc_queries_mapping = get_doc_query_mapping_multi()

for index, mapping in enumerate(doc_queries_mapping, start=1):
    print(f"--- {index}/{len(doc_queries_mapping)} Start processing document with ID '{doc_id}' ---")
    doc_id = mapping["doc_id"]
    query_ids = mapping["query_ids"]

    doc_entry = get_doc_by_id(doc_id)
    query_entries = QUERIES.loc[QUERIES["query.query_id"].isin(query_ids)]

    system_prompt, user_prompt = get_prompts_for_textual_multi(doc_id, query_ids)

    # call openai
    completion_parsed: MultiTextualManipulationResponse = openai_interface(
        system_prompt, user_prompt, MultiTextualManipulationResponse
    )

    save_multi_textual(
        doc_entry, query_entries, text_new=completion_parsed.text_new, qa_pairs=completion_parsed.qa_pairs_new
    )
    print(f"Finished processing document with ID '{doc_id}'. Changed {len(query_ids)} queries.")

--- 1/30 Start processing document with ID '128' ---
DEBUG: Attempting LLM call
DEBUG: Finished LLM call
Saving Doc with ID '400128'
WARN: Row with ID 400128 already exists. Did not write new document to 'additional_data/docs/multi_textual_manipulations.csv'.
Saving Query with ID '404558'
WARN: Row with ID 404558 already exists. Did not write new query to 'additional_data/queries/multi_textual_manipulations.csv'.
Saving Query with ID '404559'
WARN: Row with ID 404559 already exists. Did not write new query to 'additional_data/queries/multi_textual_manipulations.csv'.
Finished processing document with ID '128'. Changed 2 queries.
--- 2/30 Start processing document with ID '128' ---
DEBUG: Attempting LLM call
DEBUG: Finished LLM call
Saving Doc with ID '400132'
Saving Query with ID '404566'
Saving Query with ID '404567'
Saving Query with ID '404568'
Saving Query with ID '404569'
Saving Query with ID '404570'
Finished processing document with ID '132'. Changed 5 queries.
--- 3/30 Start pr