In [1]:
import pandas as pd
import ast

from dotenv import load_dotenv

# Load environment variables from a dotenv file before any client that needs them is created.
# NOTE(review): this is an absolute, user-specific path; it only resolves on a machine that has
# this home directory. Confirm whether a repo-relative or default `.env` lookup is intended.
load_dotenv("/Users/leon/.env")

True

In [2]:
#### static variables
# Column layouts of the CSV files handled in this notebook. They are passed as `fieldnames`
# to the save helpers, so the order of each list is the column order of the written file.

# A document row of the corpus.
COLUMNS_DOCS = [
    "doc_id", "language", "domain", "content",
    "company_name", "court_name", "hospital_patient_name",
]

# A textually manipulated document: corpus columns plus a back-reference to its source document.
COLUMNS_DOCS_MANIPULATED_TEXTUAL = COLUMNS_DOCS + ["original_doc_id"]

# One generated tabular row: corpus columns plus source document, source query and new answer.
COLUMNS_DOCS_MANIPULATED_TABULAR_ROW = COLUMNS_DOCS + [
    "original_doc_id",
    "query.original_query_id",
    "ground_truth.content",
]

# An aggregated tabular document: the entity and back-reference columns are pluralised.
COLUMNS_DOCS_MANIPULATED_TABULAR = [
    "doc_id", "language", "domain", "content",
    "company_names", "court_names", "hospital_patient_names",
    "original_doc_ids",
]

# A (flattened) query row.
COLUMNS_QUERIES = [
    "domain",
    "ground_truth.content", "ground_truth.doc_ids", "ground_truth.keypoints", "ground_truth.references",
    "language",
    "prediction",
    "query.content", "query.query_id", "query.query_type",
]

# A manipulated query: query columns plus a back-reference to its source query.
COLUMNS_QUERIES_MANIPULATED = COLUMNS_QUERIES + ["query.original_query_id"]

In [14]:
### helper functions/classes for manipulation
from typing import Tuple, List, Dict
from pydantic import BaseModel
import pandas as pd
import ast
import csv
import os
from typing import Literal
import sys
import json

# Load the corpus and its queries once; the lookup helpers below read these two module-level frames.
# NOTE(review): this path has no "data/" prefix, whereas the random-selection cell reads
# "data/DRAGONball/en/docs.csv" — confirm which working directory the notebook is meant to run from.
DOCUMENTS = pd.read_csv("DRAGONball/en/docs.csv")
QUERIES = pd.read_csv(
    "DRAGONball/en/queries_flattened.csv",
    # Parse these three columns with ast.literal_eval so the cells hold Python objects instead of
    # their string form (presumably list literals — `get_query_ids_for_doc` tests membership on them).
    converters={
        "ground_truth.doc_ids": ast.literal_eval,
        "ground_truth.keypoints": ast.literal_eval,
        "ground_truth.references": ast.literal_eval,
    },
)


class FactualQuestionResponse(BaseModel):
    """Structured LLM response for the textual manipulation of a document.

    Used as the default ``response_format_pydantic`` of ``openai_interface``.
    """

    # Rewritten document text; stored as the manipulated document's `content`.
    text_new: str
    # New answer; stored as the manipulated query's `ground_truth.content`.
    answer_new: str
    # New references; stored as the manipulated query's `ground_truth.references`.
    references_new: list[str]


class TabularDataResponse(BaseModel):
    """Structured LLM response for the tabular manipulation of a document."""

    # New answer; stored in the row's `ground_truth.content` column.
    answer_new: str
    # `description` and `value` are joined with " | " to form the row's `content`.
    description: str
    value: str

def read_prompt(path: str | os.PathLike) -> Dict:
    """Reads from JSON-file"""
    with open(path, "r") as f:
        return json.load(f)


def format_user_prompt_textual(user_prompt: str, text: str, question: str, answer: str, references: str) -> str:
    """Substitute the named fields ``text``, ``question``, ``answer`` and ``references`` into a template."""
    fields = {
        "text": text,
        "question": question,
        "answer": answer,
        "references": references,
    }
    return user_prompt.format(**fields)


def format_user_prompt_tabular(user_prompt: str, question: str, answer: str, entity: str) -> str:
    """Substitute the named fields ``question``, ``answer`` and ``entity`` into a template."""
    fields = {"question": question, "answer": answer, "entity": entity}
    return user_prompt.format(**fields)


def get_query_ids_for_doc(doc_id: int, query_types: list[str] = ["Factual Question"]) -> list[int]:
    """Return the ids of all queries of the given types that list ``doc_id`` as a ground-truth document.

    ``query_types`` is only read, never mutated, so the shared default list is safe here.
    """
    references_doc = QUERIES["ground_truth.doc_ids"].apply(lambda linked_ids: doc_id in linked_ids)
    has_wanted_type = QUERIES["query.query_type"].isin(query_types)
    selected = QUERIES[references_doc & has_wanted_type]
    return selected["query.query_id"].to_list()


def get_doc_query_mapping(target: Literal["textual", "tabular"] = "textual") -> List[Dict[str, int]]:
    """Read the document/query pairs to manipulate from ``doc_query_mapping_multi.csv``.

    Args:
        target: Which query column to pair with each document: ``"textual"`` selects
            ``query_id_single``, ``"tabular"`` selects ``query_id_multi``. Defaults to
            ``"textual"`` so that the argument-less call in the textual manipulation cell works.

    Returns:
        One ``{"doc_id": ..., "query_id": ...}`` dict per CSV row, both values as ``int``.

    Raises:
        ValueError: If ``target`` is neither ``"textual"`` nor ``"tabular"``.
    """
    column_by_target = {"textual": "query_id_single", "tabular": "query_id_multi"}
    if target not in column_by_target:
        # Previously every unknown value silently fell through to the tabular column.
        raise ValueError(f"target must be 'textual' or 'tabular', got {target!r}")
    query_column = column_by_target[target]

    with open("doc_query_mapping_multi.csv", "r", newline="") as f:
        return [
            {"doc_id": int(row["doc_id"]), "query_id": int(row[query_column])}
            for row in csv.DictReader(f)
        ]


def get_query_properties(
    query_id,
    properties: list = ["ground_truth.content", "ground_truth.keypoints", "ground_truth.references", "query.content"],
) -> Tuple:
    """Return the requested column values of the first query whose ``query.query_id`` equals ``query_id``.

    The values come back in the order given by ``properties``. ``properties`` is only read,
    so the shared default list is never modified.
    """
    matching_rows = QUERIES[QUERIES["query.query_id"] == query_id]
    values = []
    for column in properties:
        values.append(matching_rows[column].iloc[0])
    return tuple(values)


def get_doc_properties(
    doc_id,
    properties,
) -> Tuple:
    """Return the non-missing values of ``properties`` for the document with id ``doc_id``.

    Columns that contain a missing value in the matching row(s) are dropped before the lookup,
    so the returned tuple can be shorter than ``properties``.
    """
    matching_rows = DOCUMENTS[DOCUMENTS["doc_id"] == doc_id]
    row: pd.DataFrame = matching_rows.dropna(axis=1)
    available_columns = row.columns
    return tuple(row[column].iloc[0] for column in properties if column in available_columns)


def get_doc_text(doc_id: int) -> str:
    """Return the ``content`` of the first document whose ``doc_id`` equals ``doc_id``."""
    matching_rows = DOCUMENTS[DOCUMENTS["doc_id"] == doc_id]
    contents = matching_rows["content"]
    return contents.iloc[0]


def get_query_by_id(query_id: int) -> pd.Series:
    """Return the first row of ``QUERIES`` whose ``query.query_id``, cast to int, equals ``query_id``."""
    stored_ids = QUERIES["query.query_id"].astype(int)
    matching_rows = QUERIES[stored_ids == query_id]
    return matching_rows.iloc[0]


def get_queries_by_id(query_ids: List[int]) -> pd.DataFrame:
    """Return all rows of ``QUERIES`` whose ``query.query_id``, cast to int, is contained in ``query_ids``."""
    stored_ids = QUERIES["query.query_id"].astype(int)
    is_requested = stored_ids.isin(query_ids)
    return QUERIES[is_requested]


def get_doc_by_id(doc_id: int) -> pd.Series:
    """Return the first row of ``DOCUMENTS`` whose ``doc_id``, cast to int, equals ``doc_id``."""
    stored_ids = DOCUMENTS["doc_id"].astype(int)
    matching_rows = DOCUMENTS[stored_ids == doc_id]
    return matching_rows.iloc[0]


def openai_interface(system_prompt, user_prompt, response_format_pydantic=FactualQuestionResponse):
    """Send one system/user message pair to ``gpt-4o`` and return the parsed structured response.

    A new ``OpenAI()`` client is created on every call (constructed without arguments, so it
    presumably takes its credentials from the environment). The request uses ``temperature=0``
    and asks for ``response_format_pydantic`` as the response format; the ``parsed`` attribute
    of the first choice's message is returned.
    """
    from openai import OpenAI

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    response = OpenAI().beta.chat.completions.parse(
        model="gpt-4o",
        messages=messages,
        response_format=response_format_pydantic,
        temperature=0,
    )
    first_choice = response.choices[0]
    return first_choice.message.parsed


def get_prompts_for_textual_manipulation(doc_id: int, query_id: int) -> Tuple:
    """Build the ``(system_prompt, user_prompt)`` pair for the textual manipulation of one document.

    The prompt entry with ``prompt_type == "manipulation_factual"`` is taken from
    ``prompts/json/manipulate_docs.json``; its user prompt is filled with the document text and
    with the question, answer and references of the query.
    """
    wanted_type = "manipulation_factual"
    candidates = []
    for entry in read_prompt("prompts/json/manipulate_docs.json"):
        if entry["prompt_type"] == wanted_type:
            candidates.append(entry)
    template = candidates[0]

    doc_text = get_doc_text(doc_id)
    # The keypoints are part of the default property set but are not used in the prompt.
    answer, _keypoints, references, question = get_query_properties(query_id)

    system_prompt = template["system_prompt"]
    filled_user_prompt = format_user_prompt_textual(
        user_prompt=template["user_prompt"],
        text=doc_text,
        question=question,
        answer=answer,
        references=references,
    )
    return (system_prompt, filled_user_prompt)


def get_prompts_for_tabular_manipulation(query_id: int, doc_id: int) -> Tuple:
    """Build the ``(system_prompt, user_prompt)`` pair for the tabular manipulation of one document.

    The prompt is read from ``prompts/json/manipulation_tabular.json`` and its user prompt is
    filled with the query's question and answer and with the document's entity name. The
    single-element unpacking requires exactly one of the three entity columns to be non-missing.
    """
    template = read_prompt("prompts/json/manipulation_tabular.json")
    system_prompt = template["system_prompt"]
    user_template = template["user_prompt"]

    answer, question = get_query_properties(query_id, properties=["ground_truth.content", "query.content"])
    entity_columns = ["hospital_patient_name", "company_name", "court_name"]
    (entity,) = get_doc_properties(doc_id, entity_columns)

    filled_user_prompt = format_user_prompt_tabular(user_template, question, answer, entity)
    return (system_prompt, filled_user_prompt)


def get_id_for_manipulated_doc_or_query(original_doc_id: int, prefix_number=1) -> int:
    """Derive the id of a manipulated doc/query: ``prefix_number`` followed by the original id.

    The original id is left-padded with zeros to at least five characters, e.g. ``134`` with
    prefix ``1`` becomes ``100134``.
    """
    padded_original = str(original_doc_id).zfill(5)
    return int(f"{prefix_number}{padded_original}")


def save_manipulated_doc(filename: os.PathLike | str, fieldnames: List[str], **kwargs):
    """Saves a manipulated doc to csv.
    If an entry with that doc_id already exists in the csv, the new entry is NOT saved.
    """

    is_empty = not os.path.exists(filename) or os.stat(filename).st_size == 0
    id_exists = False
    with open(filename, "a+", newline="") as f:
        if not is_empty:
            f.seek(0)
            reader = csv.DictReader(f, fieldnames=fieldnames)
            ids_present = {int(row["doc_id"]) for row in list(reader)[1:]}
            id_exists = int(kwargs["doc_id"]) in ids_present

        if id_exists == True:
            print(f"WARN: Row with ID {kwargs["doc_id"]} already exists. Did not write new document to '{filename}'.")
            return

        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        if is_empty:
            writer.writeheader()
        writer.writerow(kwargs)


def save_manipulated_query(
    filename: os.PathLike | str,
    fieldnames: List[str],
    **kwargs
):
    """Saves a manipulated doc to csv. Adds column "original_doc_id.
    If an entry with that doc_id already exists in the csv, the new entry is NOT saved.
    """
    is_empty = not os.path.exists(filename) or os.stat(filename).st_size == 0
    id_exists = False
    with open(filename, "a+", newline="") as f:
        if not is_empty:
            f.seek(0)
            reader = csv.DictReader(f, fieldnames=fieldnames)
            ids_present = {int(row["query.query_id"]) for row in list(reader)[1:]}
            id_exists = int(kwargs["query.query_id"]) in ids_present

        if id_exists == True:
            print(f"WARN: Row with ID {kwargs["query.query_id"]} already exists. Did not write new query to '{filename}'.")
            return

        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        if is_empty:
            writer.writeheader()
        writer.writerow(kwargs)

In [13]:
### describe documents and queries
# Show the column index of both frames, separated by one blank line.
print("Documents:\n", DOCUMENTS.columns, end="\n\n")
print("Queries:\n", QUERIES.columns)

Documents:
 Index(['hospital_patient_name', 'language', 'doc_id', 'domain', 'content',
       'company_name', 'court_name'],
      dtype='object')

Queries:
 Index(['domain', 'ground_truth.content', 'ground_truth.doc_ids',
       'ground_truth.keypoints', 'ground_truth.references', 'language',
       'prediction', 'query.content', 'query.query_id', 'query.query_type'],
      dtype='object')


In [22]:
### Randomly select documents from corpus
import os
import random
import csv

DOCS_PER_DOMAIN = 10  # number of documents sampled from each domain
SAMPLE_SEED = 42  # fixed seed so that a run on a fresh checkout selects the same documents
DOCS_TO_MANIPULATE_CSV = "docs_to_manipulate.csv"

# Sample from the corpus frame that the helper functions use. This cell used to re-read the CSV
# from "data/DRAGONball/en/docs.csv", a different relative path than the one DOCUMENTS is loaded from.
documents = DOCUMENTS

# Group the document ids by domain, keeping domains in order of first appearance.
ids_by_domain = {}
for row in documents.itertuples(index=False):
    ids_by_domain.setdefault(row.domain, []).append(row.doc_id)

if os.path.exists(DOCS_TO_MANIPULATE_CSV):
    # Never overwrite an existing selection.
    print("File exists. Did nothing.")
else:
    # Draw the sample BEFORE creating the file, so a failure cannot leave an empty file behind
    # that would make every later run report "File exists".
    rng = random.Random(SAMPLE_SEED)
    all_rand_ids = []
    for domain, id_list in ids_by_domain.items():
        print(f"{domain}: {len(id_list)}")
        sample_size = min(DOCS_PER_DOMAIN, len(id_list))
        if sample_size < DOCS_PER_DOMAIN:
            print(f"WARN: Domain '{domain}' has only {len(id_list)} documents; sampling all of them.")
        rand_ids = rng.sample(id_list, k=sample_size)
        print(f"\t{rand_ids}")
        all_rand_ids += rand_ids

    try:
        # "x" mode keeps the creation atomic: it fails if the file appeared in the meantime.
        with open(DOCS_TO_MANIPULATE_CSV, "x", encoding="utf-8", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["doc_id"])
            writer.writerows([doc_id] for doc_id in all_rand_ids)
    except FileExistsError:
        print("File exists. Did nothing.")

File exists. Did nothing.


In [12]:
### Describe documents to manipulate
# Collect the selected doc ids (first column of every row after the header) ...
with open("docs_to_manipulate.csv", "r", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip header
    ids = {int(row[0]) for row in reader}

# ... and keep only the corpus rows with one of those ids.
is_selected = DOCUMENTS["doc_id"].isin(ids)
documents_to_manipulate = DOCUMENTS[is_selected]
# documents_to_manipulate[["doc_id", "domain", "company_name", "court_name", "hospital_patient_name"]]

In [None]:
### -DEPRECATED- Create doc-query-query mapping
# Extends every (doc_id, query_id) pair of doc_query_mapping.csv with a second, different
# "Factual Question" query of the same document and writes doc_query_mapping_multi.csv.
# Does nothing if the output file already exists.

# Output columns. Defined before the loop so that the writer below also works when the input
# file contains no data rows.
keys = ["doc_id", "query_id_single", "query_id_multi"]

try:
    if os.path.exists("doc_query_mapping_multi.csv"):
        raise FileExistsError
    with open("doc_query_mapping.csv", "r", newline="") as f:
        reader = csv.DictReader(f, fieldnames=["doc_id", "query_id"])
        next(reader)  # skip the header row
        doc_queries_mapping = []
        for row in reader:
            doc_id = int(row["doc_id"])
            query_id = int(row["query_id"])
            additional_query_id = int(query_id)

            options = get_query_ids_for_doc(doc_id, query_types=["Factual Question"])
            # Re-draw until the second query differs from the first one.
            while additional_query_id == query_id:
                if len(options) <= 1:
                    # No alternative available: keep the same query id for both columns.
                    print(f"WARN: For doc '{doc_id}', only {len(options)} options are available.")
                    break
                additional_query_id = random.sample(options, 1)[0]
            values = [doc_id, query_id, additional_query_id]
            doc_queries_mapping.append(dict(zip(keys, values)))

    # Sort once after all rows are collected instead of after every append.
    doc_queries_mapping.sort(key=lambda entry: entry["doc_id"])

    with open("doc_query_mapping_multi.csv", "x", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(doc_queries_mapping)
except FileExistsError:
    print("File exists. Did nothing.")

File exists. Did nothing.


In [None]:
### -DEPRECATED- Select a single query per document of type "Factual Question"

# Doc ids selected for manipulation (first column of every row after the header).
doc_ids_for_man = set()

with open("docs_to_manipulate.csv", "r", encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    next(reader)  # skip header

    for row in reader:
        doc_ids_for_man.add(int(row[0]))

# Draw one random "Factual Question" query for every selected document.
doc_query_mapping = []

for doc_id in doc_ids_for_man:
    factual_questions = get_query_ids_for_doc(doc_id, query_types=["Factual Question"])
    (query_id,) = random.sample(factual_questions, 1)
    doc_query_mapping.append({"doc_id": doc_id, "query_id": query_id})


# "x" mode: write the mapping only if the file does not exist yet.
try:
    with open("doc_query_mapping.csv", "x", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=list(doc_query_mapping[0]))
        writer.writeheader()
        writer.writerows(doc_query_mapping)
except FileExistsError:
    print("File exists. Did nothing.")

File exists. Did nothing.


In [3]:
### manipulate documents (textual)

# Output files of the textual manipulation.
# NOTE(review): the queries path is the file name shown in this cell's saved output, without the
# "data/" prefix (the tabular cells also use paths without that prefix). The docs path is an
# assumption made by analogy with the tabular cells, which use the same base name under
# docs/ and queries/ — TODO confirm both before running.
TEXTUAL_DOCS_CSV = "additional_data/docs/fact_single_manipulations.csv"
TEXTUAL_QUERIES_CSV = "additional_data/queries/fact_single_manipulations.csv"


def save_doc_and_query(doc_entry, query_entry, completion_parsed):
    """Persist the manipulated copy of one document and of its query.

    Args:
        doc_entry: Original document row (``pd.Series``).
        query_entry: Original query row (``pd.Series``).
        completion_parsed: Parsed LLM response providing ``text_new``, ``answer_new`` and
            ``references_new``.

    Both copies get a new id derived from the original one and an extra column holding the
    original id. Rows whose id is already present in the target CSV are skipped by the save helpers.
    """
    # -- doc
    original_doc_id = doc_entry["doc_id"]
    manipulated_doc_entry = doc_entry.copy()
    manipulated_doc_entry["doc_id"] = get_id_for_manipulated_doc_or_query(original_doc_id)
    manipulated_doc_entry["content"] = completion_parsed.text_new
    manipulated_doc_entry = pd.concat([manipulated_doc_entry, pd.Series([original_doc_id], index=["original_doc_id"])])

    # The save helpers require the target file and the column layout in addition to the row values.
    save_manipulated_doc(
        filename=TEXTUAL_DOCS_CSV,
        fieldnames=COLUMNS_DOCS_MANIPULATED_TEXTUAL,
        **manipulated_doc_entry,
    )

    # -- query
    original_query_id = query_entry["query.query_id"]
    manipulated_query_entry = query_entry.copy()
    manipulated_query_entry["ground_truth.content"] = completion_parsed.answer_new
    manipulated_query_entry["ground_truth.references"] = completion_parsed.references_new
    manipulated_query_entry["ground_truth.keypoints"] = []
    # NOTE(review): `ground_truth.doc_ids` still points at the ORIGINAL document here, whereas the
    # tabular query cell re-points it at the manipulated document — confirm this is intended.
    manipulated_query_entry["query.query_id"] = get_id_for_manipulated_doc_or_query(original_query_id)
    manipulated_query_entry = pd.concat(
        [manipulated_query_entry, pd.Series([original_query_id], index=["query.original_query_id"])]
    )
    # The dotted labels are kept as they are: `save_manipulated_query` looks up "query.query_id"
    # and the dotted names in COLUMNS_QUERIES_MANIPULATED; renaming "." to "__" broke both.

    save_manipulated_query(
        filename=TEXTUAL_QUERIES_CSV,
        fieldnames=COLUMNS_QUERIES_MANIPULATED,
        **manipulated_query_entry,
    )


doc_query_mapping = get_doc_query_mapping("textual")

for mapping in doc_query_mapping:
    doc_id = mapping["doc_id"]
    query_id = mapping["query_id"]

    doc_entry = get_doc_by_id(doc_id)
    query_entry = get_query_by_id(query_id)

    system_prompt, user_prompt = get_prompts_for_textual_manipulation(doc_id, query_id)

    # call openai
    completion_parsed = openai_interface(system_prompt, user_prompt)

    save_doc_and_query(doc_entry, query_entry, completion_parsed)
    print(f"Finished processing doc {doc_id} and query {query_id}.")

WARN: Row with ID 104620 already exists. Did not write new query to 'data/additional_data/queries/fact_single_manipulations.csv'.
Finished processing doc 134 and query 4620.


In [None]:
### manipulate documents (tabular) and save rows
# For every (doc, query) pair: ask the LLM for a description/value pair and a new answer, then
# store them as one row that is derived from the original document row.
mapping = get_doc_query_mapping("tabular")

doc_entries = []

for id_pair in mapping:
    doc_id, query_id = id_pair["doc_id"], id_pair["query_id"]
    system_prompt, user_prompt = get_prompts_for_tabular_manipulation(query_id, doc_id)

    response: TabularDataResponse = openai_interface(system_prompt, user_prompt, TabularDataResponse)

    # Start from a copy of the original row; replace its id (prefix 2) and its content.
    manipulated_doc_entry = get_doc_by_id(doc_id).copy()
    manipulated_doc_entry["doc_id"] = get_id_for_manipulated_doc_or_query(doc_id, prefix_number=2)
    manipulated_doc_entry["content"] = " | ".join((response.description, response.value))

    # Back-references to the source doc/query and the new answer.
    additional_fields = pd.Series(
        {
            "original_doc_id": doc_id,
            "query.original_query_id": query_id,
            "ground_truth.content": response.answer_new,
        }
    )
    manipulated_doc_entry = pd.concat([manipulated_doc_entry, additional_fields])

    save_manipulated_doc(
        filename="additional_data/docs/tabular_manipulations_result_rows.csv",
        fieldnames=COLUMNS_DOCS_MANIPULATED_TABULAR_ROW,
        **manipulated_doc_entry,
    )

{'hospital_patient_name': nan, 'language': 'en', 'doc_id': 200134, 'domain': 'Law', 'content': 'Chief judge according to the court judgment of Danbury, Pinehurst, Court | J. Smith', 'company_name': nan, 'court_name': 'Danbury, Pinehurst, Court', 'original_doc_id': 134, 'query.original_query_id': 4622, 'ground_truth.content': 'J. Smith'}
{'hospital_patient_name': nan, 'language': 'en', 'doc_id': 200136, 'domain': 'Law', 'content': 'Residence of F. Williams according to the court judgment of Upton, Georgetown, Court | 45, Maple Avenue, Georgetown.', 'company_name': nan, 'court_name': 'Upton, Georgetown, Court', 'original_doc_id': 136, 'query.original_query_id': 4609, 'ground_truth.content': '45, Maple Avenue, Georgetown.'}
{'hospital_patient_name': nan, 'language': 'en', 'doc_id': 200139, 'domain': 'Law', 'content': 'Defense lawyer for Y. Nelson according to the judgment of Glenwood, Quailwood, Court | J. Smith', 'company_name': nan, 'court_name': 'Glenwood, Quailwood, Court', 'original_

In [None]:
### aggregate tabular rows and save docs
# Merge all tabular rows of a domain into ONE document per domain: the row contents are joined
# line by line and the entity names / original doc ids are collected into lists.
tabular_docs = pd.read_csv("additional_data/docs/tabular_manipulations_result_rows.csv")


def list_or_none(series):
    """Return the values of ``series`` as a list, or ``None`` if every value is missing."""
    if series.dropna().empty:
        return None
    return series.tolist()


agg_funcs = {
    "doc_id": lambda x: 0,  # placeholder; the real ids are assigned after the aggregation
    "language": "first",
    "content": "\n".join,
    "company_name": list_or_none,
    "court_name": list_or_none,
    "hospital_patient_name": list_or_none,
    "original_doc_id": list_or_none,
    # Per-row query information has no meaning for an aggregated document; these columns are
    # not part of COLUMNS_DOCS_MANIPULATED_TABULAR and are dropped when saving.
    "query.original_query_id": lambda x: None,
    "ground_truth.content": lambda x: None,
}

aggregation = (
    tabular_docs.groupby("domain")
    .agg(agg_funcs)
    .reset_index()
    .rename(
        columns={
            "company_name": "company_names",
            "court_name": "court_names",
            "hospital_patient_name": "hospital_patient_names",
            "original_doc_id": "original_doc_ids",
        }
    )
)

# Number the aggregated documents 1..n (prefix 3). Derived from the number of domains actually
# present instead of a hard-coded [1, 2, 3], which failed for any other number of domains.
aggregation["doc_id"] = [
    get_id_for_manipulated_doc_or_query(position, prefix_number=3)
    for position in range(1, len(aggregation) + 1)
]

for row_dict in aggregation.to_dict(orient="records"):
    save_manipulated_doc(
        "additional_data/docs/tabular_manipulations_result.csv",
        fieldnames=COLUMNS_DOCS_MANIPULATED_TABULAR,
        **row_dict
    )

WARN: Row with ID 300001 already exists. Did not write new document to 'data/additional_data/docs/tabular_manipulations_result.csv'.
WARN: Row with ID 300002 already exists. Did not write new document to 'data/additional_data/docs/tabular_manipulations_result.csv'.
WARN: Row with ID 300003 already exists. Did not write new document to 'data/additional_data/docs/tabular_manipulations_result.csv'.


In [None]:
### save manipulated queries for aggregated tabular docs
# For every tabular row, store a copy of its source query that points at the aggregated document
# of the row's domain and carries the new answer.
tabular_docs = pd.read_csv("additional_data/docs/tabular_manipulations_result_rows.csv")

mapping_domain_doc_id = pd.read_csv(
    "additional_data/docs/tabular_manipulations_result.csv", usecols=["domain", "doc_id"]
)
# domain -> doc_id of the aggregated document of that domain
mapping_dict = mapping_domain_doc_id.set_index("domain")["doc_id"].to_dict()

for row_dict in tabular_docs.to_dict(orient="records"):
    original_query_id = row_dict["query.original_query_id"]
    manipulated_query_entry = get_query_by_id(original_query_id).copy()

    manipulated_query_entry["ground_truth.doc_ids"] = [mapping_dict[row_dict["domain"]]]
    manipulated_query_entry["ground_truth.content"] = row_dict["ground_truth.content"]
    # `ground_truth.references` is list-valued everywhere else (it is parsed with
    # ast.literal_eval when the queries are loaded, and the textual manipulation stores a list),
    # so the single reference row is wrapped in a list instead of being stored as a bare string.
    manipulated_query_entry["ground_truth.references"] = [row_dict["content"]]
    manipulated_query_entry["ground_truth.keypoints"] = []
    manipulated_query_entry["query.query_id"] = get_id_for_manipulated_doc_or_query(original_query_id, prefix_number=3)
    manipulated_query_entry = pd.concat(
        [manipulated_query_entry, pd.Series([original_query_id], index=["query.original_query_id"])]
    )

    save_manipulated_query(
        filename="additional_data/queries/tabular_manipulations_result.csv",
        fieldnames=COLUMNS_QUERIES_MANIPULATED,
        **manipulated_query_entry
    )