In [1]:
import pandas as pd
import ast

from dotenv import load_dotenv

import os

# Load environment variables from the `.env` file in the current user's home
# directory. The path is resolved via `~` instead of a hard-coded, user-specific
# absolute path so the notebook also runs on other machines.
# NOTE(review): presumably this provides the OpenAI credentials used further
# down — confirm.
load_dotenv(os.path.expanduser("~/.env"))

True

In [None]:
### helper functions/classes for manipulation
import ast
import csv
import os
from typing import Dict, List, Optional, Tuple

import pandas as pd
from pydantic import BaseModel

from utils.utils import read_prompt

# Column order used when exporting document rows to CSV (passed as `columns=`
# to `DataFrame.to_csv` further down in this notebook).
doc_columns_ordered = [
    "doc_id",
    "language",
    "domain",
    "content",
    "company_name",
    "court_name",
    "hospital_patient_name",
]

# Corpus and queries are loaded once when this cell runs and are used as
# module-level lookup tables by the helper functions below.
DOCUMENTS = pd.read_csv("data/DRAGONball/en/docs.csv")
QUERIES = pd.read_csv(
    "data/DRAGONball/en/queries_flattened.csv",
    # These columns are stored as Python-literal strings in the CSV;
    # `ast.literal_eval` turns them back into Python objects on load.
    converters={
        "ground_truth.doc_ids": ast.literal_eval,
        "ground_truth.keypoints": ast.literal_eval,
        "ground_truth.references": ast.literal_eval,
    },
)


# Structured LLM response for the factual manipulation step; it is the default
# `response_format_pydantic` of `openai_interface`.
class FactualQuestionResponse(BaseModel):
    text_new: str  # manipulated document text; saved as the new doc's `content`
    answer_new: str  # saved as the manipulated query's `ground_truth.content`
    references_new: list[str]  # saved as the manipulated query's `ground_truth.references`


# Structured LLM response for the tabular step; the two fields are joined with
# " | " to form the content of the tabular document variant.
class TabularDataResponse(BaseModel):
    description: str  # first part of the joined content
    value: str  # second part of the joined content


def format_prompt_man_factual(user_prompt: str, text: str, question: str, answer: str, references: str) -> str:
    """Fill the factual-manipulation prompt template.

    The template may reference the named placeholders ``{text}``,
    ``{question}``, ``{answer}`` and ``{references}``; each is substituted
    with the argument of the same name via ``str.format``.
    """
    substitutions = {
        "text": text,
        "question": question,
        "answer": answer,
        "references": references,
    }
    return user_prompt.format(**substitutions)


def format_prompt_keypoints(user_prompt: str, question: str, answer: str) -> str:
    """Fill the keypoints prompt template.

    ``question`` is substituted for ``{question}`` and ``answer`` for
    ``{ground_truth}``.
    """
    substitutions = {"question": question, "ground_truth": answer}
    return user_prompt.format(**substitutions)


def format_prompt_tabular(user_prompt: str, question: str, answer: str) -> str:
    """Fill the tabular prompt template's ``{question}`` and ``{answer}`` placeholders."""
    substitutions = {"question": question, "answer": answer}
    return user_prompt.format(**substitutions)


def get_query_ids_for_doc(doc_id: int, query_types: Optional[list[str]] = None) -> list[int]:
    """Return the ids of all queries that reference ``doc_id`` and have one of ``query_types``.

    Args:
        doc_id: Document id that must be contained in the query's
            ``ground_truth.doc_ids``.
        query_types: Accepted values of ``query.query_type``. Defaults to
            ``["Factual Question"]``.

    Returns:
        The matching ``query.query_id`` values (empty list if none match).
    """
    # A fresh list per call instead of a shared mutable default argument.
    if query_types is None:
        query_types = ["Factual Question"]

    references_doc = QUERIES["ground_truth.doc_ids"].apply(lambda doc_ids: doc_id in doc_ids)
    has_wanted_type = QUERIES["query.query_type"].isin(query_types)
    return QUERIES.loc[references_doc & has_wanted_type, "query.query_id"].to_list()


def get_doc_query_mapping() -> List[Dict[str, int]]:
    """Load the doc/query pairs stored in ``doc_query_mapping.csv``.

    Returns:
        One ``{"doc_id": int, "query_id": int}`` dict per data row, in file order.
    """
    pairs: List[Dict[str, int]] = []
    with open("doc_query_mapping.csv", "r", newline="") as mapping_file:
        for record in csv.DictReader(mapping_file):
            pairs.append({"doc_id": int(record["doc_id"]), "query_id": int(record["query_id"])})
    return pairs


def get_query_properties(
    query_id,
    properties: Optional[list] = None,
) -> Tuple:
    """Return selected column values of the query with id ``query_id``.

    Args:
        query_id: Value to look up in the ``query.query_id`` column of QUERIES.
        properties: Column names to read. Defaults to
            ``["ground_truth.content", "ground_truth.keypoints",
            "ground_truth.references", "query.content"]``.

    Returns:
        A tuple with one scalar per requested property, in the order given.

    Raises:
        IndexError: If no query with ``query_id`` exists.
    """
    # A fresh list per call instead of a shared mutable default argument.
    if properties is None:
        properties = [
            "ground_truth.content",
            "ground_truth.keypoints",
            "ground_truth.references",
            "query.content",
        ]

    row = QUERIES[QUERIES["query.query_id"] == query_id]
    if row.empty:
        # Same exception type `.iloc[0]` would raise, but with a usable message.
        raise IndexError(f"No query with query_id {query_id!r} found in QUERIES.")

    # `.iloc[0]` yields the scalar of the first match instead of a one-element Series.
    return tuple(row[prop].iloc[0] for prop in properties)


def get_doc_text(doc_id: int) -> str:
    """Return the ``content`` of the first DOCUMENTS row whose ``doc_id`` equals ``doc_id``."""
    matching_contents = DOCUMENTS.loc[DOCUMENTS["doc_id"] == doc_id, "content"]
    return matching_contents.iloc[0]


def get_docs_to_manipulate() -> List[int]:
    """Read the unique document ids listed in ``docs_to_manipulate.csv``.

    The first row of the file is treated as a header and skipped; the id is
    taken from the first column of every remaining row.

    Returns:
        The distinct ids in ascending order, so the result is the same on
        every run. An empty or header-only file yields ``[]``.
    """
    doc_ids_for_man = set()
    with open("docs_to_manipulate.csv", "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header; the default avoids StopIteration on an empty file

        for row in reader:
            # csv.reader yields [] for blank lines (e.g. a trailing newline)
            if row and row[0].strip():
                doc_ids_for_man.add(int(row[0]))
    return sorted(doc_ids_for_man)


def get_query_by_id(query_id: int) -> pd.Series:
    """Return the first QUERIES row whose ``query.query_id`` (cast to int) equals ``query_id``."""
    id_matches = QUERIES["query.query_id"].astype(int) == query_id
    matching_rows = QUERIES[id_matches]
    return matching_rows.iloc[0]


def get_queries_by_id(query_ids: List[int]) -> pd.DataFrame:
    """Return all QUERIES rows whose ``query.query_id`` (cast to int) is contained in ``query_ids``."""
    id_is_wanted = QUERIES["query.query_id"].astype(int).isin(query_ids)
    return QUERIES[id_is_wanted]


def get_doc_by_id(doc_id: int) -> pd.Series:
    """Return the first DOCUMENTS row whose ``doc_id`` (cast to int) equals ``doc_id``."""
    id_matches = DOCUMENTS["doc_id"].astype(int) == doc_id
    matching_rows = DOCUMENTS[id_matches]
    return matching_rows.iloc[0]


def openai_interface(system_prompt, user_prompt, response_format_pydantic=FactualQuestionResponse):
    """Send one system + user prompt pair to ``gpt-4o`` and return the parsed answer.

    A new ``OpenAI`` client is created on every call. The request uses
    ``temperature=0`` and passes ``response_format_pydantic`` as the response
    format; the ``parsed`` attribute of the first choice's message is returned.
    """
    from openai import OpenAI

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    response = OpenAI().beta.chat.completions.parse(
        model="gpt-4o",
        messages=messages,
        response_format=response_format_pydantic,
        temperature=0,
    )

    first_choice = response.choices[0]
    return first_choice.message.parsed


def get_prompts_man_fact_single(doc_id: int, query_id: int) -> Tuple:
    """Build the (system_prompt, user_prompt) pair for manipulating one doc/query pair.

    The template is the first entry of ``prompts/json/manipulate_docs.json``
    whose ``prompt_type`` is ``"manipulation_factual"``. Its user prompt is
    filled with the document text and the query's question, answer and
    references.
    """
    PROMPT_TYPE = "manipulation_factual"
    candidates = [
        entry for entry in read_prompt("prompts/json/manipulate_docs.json") if entry["prompt_type"] == PROMPT_TYPE
    ]
    template = candidates[0]

    doc_text = get_doc_text(doc_id)
    # keypoints are part of the default property set but are not used in this prompt
    answer, _keypoints, references, question = get_query_properties(query_id)

    system_prompt = template["system_prompt"]
    user_prompt = format_prompt_man_factual(
        user_prompt=template["user_prompt"],
        text=doc_text,
        question=question,
        answer=answer,
        references=references,
    )
    return (system_prompt, user_prompt)


def get_prompts_man_tabular(query_id: int) -> Tuple:
    """Build the (system_prompt, user_prompt) pair for the tabular transformation of one query.

    The user prompt template is the ``"prompt"`` entry of
    ``prompts/json/create_tabular_docs.json``, filled with the query's question
    and ground-truth answer. The system prompt is a fixed string.
    """
    template = read_prompt("prompts/json/create_tabular_docs.json")["prompt"]
    system_prompt = "You are an expert in manipulation and transformation of textual data."

    wanted_columns = ["ground_truth.content", "query.content"]
    answer, question = get_query_properties(query_id, properties=wanted_columns)

    return (system_prompt, format_prompt_tabular(template, question, answer))


def get_id_for_manipulated_doc_or_query(original_doc_id: int, prefix_number=1) -> int:
    """Derive the id of a manipulated doc/query from the original id.

    The original id is left-padded with zeros to at least five characters and
    ``prefix_number`` is written in front of it, e.g. ``134 -> 100134``.
    """
    padded_original = str(original_doc_id).zfill(5)
    return int(f"{prefix_number}{padded_original}")


def _append_csv_row_if_new(filename: str, row: dict, id_field: str, kind: str) -> None:
    """Append ``row`` to the CSV ``filename`` unless a row with the same id already exists.

    Args:
        filename: Target CSV. It is created (including missing parent
            directories) and gets a header row if it does not exist or is empty.
        row: Column name -> value. The key order defines the column order.
        id_field: Key in ``row`` whose integer value identifies the entry.
        kind: Word used in the warning message ("document" / "query").
    """
    fieldnames = list(row.keys())

    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    is_empty = not os.path.exists(filename) or os.stat(filename).st_size == 0

    if not is_empty:
        with open(filename, "r", newline="", encoding="utf-8") as f:
            # Columns are mapped by position to `fieldnames`; the first row is the header.
            reader = csv.DictReader(f, fieldnames=fieldnames)
            next(reader, None)
            ids_present = {int(existing[id_field]) for existing in reader}

        if int(row[id_field]) in ids_present:
            print(f"WARN: Row with ID {row[id_field]} already exists. Did not write new {kind} to '{filename}'.")
            return

    # utf-8 is written explicitly so the file does not depend on the platform default encoding.
    with open(filename, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction="ignore")
        if is_empty:
            writer.writeheader()
        writer.writerow(row)


def save_manipulated_doc(
    hospital_patient_name: str,
    language: str,
    doc_id: int,
    domain: str,
    content: str,
    company_name: str,
    court_name: str,
    original_doc_id: int,
    filename: str = "data/additional_data/docs/fact_multi_manipulations.csv",
):
    """Append a manipulated document to a CSV file.

    The row consists of the original document columns plus ``original_doc_id``.
    If a row with the same ``doc_id`` already exists in the file, a warning is
    printed and nothing is written.

    Args:
        filename: Target CSV. Pass it explicitly instead of editing the
            function when a different manipulation set is written.
        (all other arguments): Values of the columns of the same name.
    """
    # Explicit dict (instead of `locals()`) keeps the column order stable and
    # keeps `filename` out of the written row.
    row = {
        "hospital_patient_name": hospital_patient_name,
        "language": language,
        "doc_id": doc_id,
        "domain": domain,
        "content": content,
        "company_name": company_name,
        "court_name": court_name,
        "original_doc_id": original_doc_id,
    }
    _append_csv_row_if_new(filename, row, id_field="doc_id", kind="document")


def save_manipulated_query(
    domain: str,
    ground_truth__content: str,
    ground_truth__doc_ids: List[int],
    ground_truth__keypoints: List[str],
    ground_truth__references: List[str],
    language: str,
    prediction: str,
    query__content: str,
    query__query_id: int,
    query__query_type: str,
    query__original_query_id: int,
    filename: str = "data/additional_data/queries/fact_single_manipulations.csv",
):
    """Append a manipulated query to a CSV file.

    Parameter names use ``__`` where the CSV column name has a ``.`` (e.g.
    ``query__query_id`` -> ``query.query_id``). The row consists of the
    original query columns plus ``query.original_query_id``. If a row with the
    same ``query.query_id`` already exists in the file, a warning is printed
    and nothing is written.

    Args:
        filename: Target CSV.
        (all other arguments): Values of the corresponding columns.
    """
    # Explicit dict: calling `locals()` inside a comprehension only sees the
    # function's arguments on Python >= 3.12 (PEP 709) and fails before that.
    row = {
        "domain": domain,
        "ground_truth.content": ground_truth__content,
        "ground_truth.doc_ids": ground_truth__doc_ids,
        "ground_truth.keypoints": ground_truth__keypoints,
        "ground_truth.references": ground_truth__references,
        "language": language,
        "prediction": prediction,
        "query.content": query__content,
        "query.query_id": query__query_id,
        "query.query_type": query__query_type,
        "query.original_query_id": query__original_query_id,
    }
    _append_csv_row_if_new(filename, row, id_field="query.query_id", kind="query")

In [229]:
### describe documents and queries

# Print the column index of both tables, separated by one blank line.
print("Documents:\n", DOCUMENTS.columns, end="\n\n")
print("Queries:\n", QUERIES.columns)

Documents:
 Index(['hospital_patient_name', 'language', 'doc_id', 'domain', 'content',
       'company_name', 'court_name'],
      dtype='object')

Queries:
 Index(['domain', 'ground_truth.content', 'ground_truth.doc_ids',
       'ground_truth.keypoints', 'ground_truth.references', 'language',
       'prediction', 'query.content', 'query.query_id', 'query.query_type'],
      dtype='object')


In [22]:
### Randomly select documents from corpus
import random
import csv

DOCS_PER_DOMAIN = 10  # how many documents are drawn from each domain
SAMPLE_SEED = 42  # fixed seed so a fresh run reproduces the same selection

documents = pd.read_csv("data/DRAGONball/en/docs.csv")

# domain -> doc_ids of that domain, in corpus order
ids_by_domain = {}
for row in documents.itertuples(index=False):
    ids_by_domain.setdefault(row.domain, []).append(row.doc_id)

# Draw the sample BEFORE the output file is created. The sample size is capped
# at the domain size, so sampling can no longer fail and leave behind an empty
# file that would block every later run with "File exists".
# A dedicated RNG keeps the global `random` state untouched.
rng = random.Random(SAMPLE_SEED)
sampled_ids_by_domain = {
    domain: rng.sample(id_list, k=min(DOCS_PER_DOMAIN, len(id_list)))
    for domain, id_list in ids_by_domain.items()
}

try:
    # "x" mode: the selection is write-once and is never overwritten by a re-run.
    with open("docs_to_manipulate.csv", "x", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["doc_id"])

        for domain, rand_ids in sampled_ids_by_domain.items():
            print(f"{domain}: {len(ids_by_domain[domain])}")
            if len(rand_ids) < DOCS_PER_DOMAIN:
                print(f"\tWARN: only {len(rand_ids)} documents available, wanted {DOCS_PER_DOMAIN}.")
            print(f"\t{rand_ids}")
            writer.writerows([sampled_id] for sampled_id in rand_ids)
except FileExistsError:
    print("File exists. Did nothing.")

File exists. Did nothing.


In [230]:
### Describe documents to manipulate
# Collect the selected doc ids (first column, header row skipped) ...
with open("docs_to_manipulate.csv", "r", newline="") as f:
    id_rows = csv.reader(f)
    next(id_rows)  # skip header
    ids = {int(id_row[0]) for id_row in id_rows}

# ... and export the full corpus rows of those documents in the canonical column order.
is_selected = documents["doc_id"].isin(ids)
documents_to_manipulate = documents[is_selected]
documents_to_manipulate.to_csv("docs_to_manipulate_all_info.csv", index=False, columns=doc_columns_ordered)
print("Output written to 'docs_to_manipulate_all_info.csv'")

Output written to 'docs_to_manipulate_all_info.csv'


In [226]:
### Create doc-query-query mapping
# For every (doc, query) pair in doc_query_mapping.csv, pick a second, different
# "Factual Question" query of the same doc and store the triple in
# doc_query_mapping_multi.csv. The output is write-once: if the file already
# exists, this cell does nothing.
keys = ["doc_id", "query_id_single", "query_id_multi"]  # defined up front so it also exists for an empty mapping

if os.path.exists("doc_query_mapping_multi.csv"):
    print("File exists. Did nothing.")
else:
    doc_queries_mapping = []
    for pair in get_doc_query_mapping():
        doc_id = pair["doc_id"]
        query_id = pair["query_id"]

        options = get_query_ids_for_doc(doc_id, query_types=["Factual Question"])
        # Choose directly among the other queries instead of re-sampling until
        # the draw happens to differ from `query_id`.
        alternatives = [option for option in options if option != query_id]
        if alternatives:
            additional_query_id = random.choice(alternatives)
        else:
            print(f"WARN: For doc '{doc_id}', only {len(options)} options are available.")
            additional_query_id = query_id  # no alternative: fall back to the single query

        doc_queries_mapping.append(dict(zip(keys, [doc_id, query_id, additional_query_id])))

    # Sort once after the loop rather than on every iteration.
    doc_queries_mapping.sort(key=lambda entry: entry["doc_id"])

    with open("doc_query_mapping_multi.csv", "x", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=keys)
        writer.writeheader()
        writer.writerows(doc_queries_mapping)

File exists. Did nothing.


In [42]:
### Select a single query per document of type "Factual Question"
# NOTE(review): this cell produces doc_query_mapping.csv, which the
# "Create doc-query-query mapping" cell above reads — on a fresh checkout it
# has to be run first.

doc_ids_for_man = set(get_docs_to_manipulate())

doc_query_mapping = []

# Sorted iteration makes the row order of the output file deterministic.
for doc_id in sorted(doc_ids_for_man):
    factual_questions = get_query_ids_for_doc(doc_id, query_types=["Factual Question"])
    if not factual_questions:
        # Nothing to choose from; skip instead of crashing on an empty sample.
        print(f"WARN: Doc '{doc_id}' has no queries of type 'Factual Question'. Skipped.")
        continue
    doc_query_mapping.append({"doc_id": doc_id, "query_id": random.choice(factual_questions)})


try:
    # "x" mode: an existing mapping is never overwritten by a re-run.
    with open("doc_query_mapping.csv", "x", newline="") as f:
        # Explicit field names so an empty mapping still produces a valid header-only file.
        writer = csv.DictWriter(f, fieldnames=["doc_id", "query_id"])
        writer.writeheader()
        writer.writerows(doc_query_mapping)
except FileExistsError:
    print("File exists. Did nothing.")

File exists. Did nothing.


In [None]:
### manipulate documents (fact single)
def save_doc_and_query(doc_entry, query_entry, completion_parsed):
    """Persist the manipulated copies of one document and its query.

    Both entries are copied, given a new id derived from the original one and
    updated with the fields of ``completion_parsed``; the original id is
    appended as an extra field. The copies are then handed to
    ``save_manipulated_doc`` / ``save_manipulated_query``.

    NOTE(review): `save_manipulated_doc` writes to
    'data/additional_data/docs/fact_multi_manipulations.csv', while the
    recorded output of this cell mentions '.../docs/fact_single_manipulations.csv'
    — confirm which file the fact-single documents should go to.
    """
    # -- doc
    source_doc_id = doc_entry.doc_id
    new_doc = doc_entry.copy()
    new_doc["doc_id"] = get_id_for_manipulated_doc_or_query(source_doc_id)
    new_doc["content"] = completion_parsed.text_new
    doc_origin = pd.Series([source_doc_id], index=["original_doc_id"])
    new_doc = pd.concat([new_doc, doc_origin])

    save_manipulated_doc(**new_doc)

    # -- query
    source_query_id = query_entry["query.query_id"]
    new_query = query_entry.copy()
    new_query["ground_truth.content"] = completion_parsed.answer_new
    new_query["ground_truth.references"] = completion_parsed.references_new
    new_query["ground_truth.keypoints"] = []
    new_query["query.query_id"] = get_id_for_manipulated_doc_or_query(source_query_id)
    query_origin = pd.Series([source_query_id], index=["query.original_query_id"])
    new_query = pd.concat([new_query, query_origin])
    # "." is not valid in keyword names, so labels become e.g. `query__query_id`
    new_query.index = new_query.index.str.replace(".", "__", regex=False)

    save_manipulated_query(**new_query)


# Run the factual manipulation for every (doc, query) pair of doc_query_mapping.csv.
# NOTE(review): the LLM is called for every pair, including pairs whose
# manipulated rows already exist — the save helpers only skip the write.
doc_query_mapping = get_doc_query_mapping()

for mapping in doc_query_mapping:
    doc_id, query_id = mapping["doc_id"], mapping["query_id"]
    doc_entry, query_entry = get_doc_by_id(doc_id), get_query_by_id(query_id)

    # build the prompts and call openai
    system_prompt, user_prompt = get_prompts_man_fact_single(doc_id, query_id)
    completion_parsed = openai_interface(system_prompt, user_prompt)

    save_doc_and_query(doc_entry, query_entry, completion_parsed)
    print(f"Finished processing doc {doc_id} and query {query_id}.")

WARN: Row with ID 100134 already exists. Did not write new document to 'data/additional_data/docs/fact_single_manipulations.csv'.
WARN: Row with ID 104620 already exists. Did not write new document to 'data/additional_data/queries/fact_single_manipulations.csv'.
Finished processing doc 134 and query 4620.
Finished processing doc 136 and query 4612.
Finished processing doc 139 and query 4643.
Finished processing doc 46 and query 2856.
Finished processing doc 47 and query 2246.
Finished processing doc 179 and query 5959.
Finished processing doc 52 and query 2509.
Finished processing doc 181 and query 5978.
Finished processing doc 59 and query 2698.
Finished processing doc 66 and query 2933.
Finished processing doc 198 and query 6143.
Finished processing doc 71 and query 2538.
Finished processing doc 72 and query 3189.
Finished processing doc 199 and query 6149.
Finished processing doc 77 and query 2378.
Finished processing doc 78 and query 2833.
Finished processing doc 79 and query 2728.

In [25]:
# Create a tabular document variant for the "multi" query of every document:
# the LLM turns the query's question/answer into a description and a value,
# which are joined with " | " and stored as the content of a copy of the doc.
with open("doc_query_mapping_multi.csv", "r", newline="") as f:
    reader = csv.DictReader(f)  # the header row supplies the field names
    mapping = [{"doc_id": int(row["doc_id"]), "query_id": int(row["query_id_multi"])} for row in reader]


# Iterate over ALL pairs. The header is already consumed by DictReader, so
# slicing off the first element would silently drop the first document.
# Documents that were already written are skipped by `save_manipulated_doc`.
for id_pair in mapping:
    doc_id = id_pair["doc_id"]
    query_id = id_pair["query_id"]
    system_prompt, user_prompt = get_prompts_man_tabular(query_id)
    response: TabularDataResponse = openai_interface(system_prompt, user_prompt, TabularDataResponse)

    manipulated_doc_entry = get_doc_by_id(doc_id).copy()
    # prefix 2 keeps these ids apart from the ids built with the default prefix 1
    manipulated_doc_entry["doc_id"] = get_id_for_manipulated_doc_or_query(doc_id, prefix_number=2)
    manipulated_doc_entry["content"] = " | ".join([response.description, response.value])
    manipulated_doc_entry = pd.concat([manipulated_doc_entry, pd.Series([doc_id], index=["original_doc_id"])])

    save_manipulated_doc(**manipulated_doc_entry)