## Retrieval


### Recall


In [None]:
### Calculate Recall
from langchain_core.load import loads
import ast
import numpy as np
import pandas as pd
import string
from typing import Literal


def calculate_recall(
    top_k: int, with_dq: bool = True, print_info_if_wrong: bool = False, emb: Literal["_large", ""] = ""
):
    """Compute retrieval recall over a stored evaluation run.

    A query counts as a hit only if *every* ground-truth reference occurs
    (case-insensitively, after stripping surrounding whitespace) as a substring
    of the newline-joined page contents of its retrieved chunks.

    Args:
        top_k: Number embedded in the name of the evaluation CSV to load.
        with_dq: Load the ``with_DQ`` file when true, the ``without_DQ`` file otherwise.
        print_info_if_wrong: Print the query, the reference, the chunk metadata and
            the chunk texts for each reference that is missing from the context
            even after surrounding punctuation has been stripped from it.
        emb: Not used by this function; kept so existing callers keep working.

    Returns:
        The mean of the per-query hit flags as returned by ``np.mean``.
    """
    variant = "with_DQ" if with_dq else "without_DQ"
    filename = f"evaluation/_queries_with_context_{variant}_{top_k}.csv"
    data = pd.read_csv(
        filename,
        converters={"context": ast.literal_eval, "ground_truth.references": ast.literal_eval},
    )
    # Each context entry is a serialized document; revive it into an object.
    data["context"] = data["context"].apply(lambda chunks: [loads(c) for c in chunks])

    hits = []
    for _, row in data.iterrows():
        chunks = row["context"]
        complete_context_txt = "\n".join(chunk.page_content for chunk in chunks).lower()
        hit_in_row = True
        for ref in row["ground_truth.references"]:
            needle = ref.lower().strip()
            if needle not in complete_context_txt:
                hit_in_row = False
            # NOTE: the diagnostic output uses a looser match (surrounding punctuation
            # stripped) than the hit criterion above, so misses that are caused only by
            # leading/trailing punctuation are counted but not printed.
            if print_info_if_wrong and needle.strip(string.punctuation) not in complete_context_txt:
                print("------")
                # Single quotes inside the f-string keep this valid before Python 3.12.
                print(f"{row['query.content']}: {row['ground_truth.content']}, {row['ground_truth.doc_ids']}")
                print("Reference:")
                print(needle)
                print()
                print("Metadata:")
                print("\n".join(str(c.metadata) for c in chunks))
                print("Context:")
                print("\n- ".join(c.page_content for c in chunks))
        hits.append(hit_in_row)
    return np.mean(hits)


# Report recall at top-5 as a percentage for both retrieval variants.
print(f"{100 * calculate_recall(top_k=5, with_dq=True):.2f} %\t(with DQ assessment)")
print(f"{100 * calculate_recall(top_k=5, with_dq=False):.2f} %\t(without DQ assessment)")

73.04 %	(with DQ assessment)
62.25 %	(without DQ assessment)


  data["context"] = data["context"].apply(lambda chunks: [loads(c) for c in chunks])


In [None]:
### Calculate Precision
from langchain_core.load import loads
import ast
import numpy as np
import pandas as pd
from typing import Literal


def calculate_precision(
    top_k: int, with_dq: bool = True, print_info_if_wrong: bool = False, emb: Literal["_large", ""] = ""
):
    """Compute mean retrieval precision over a stored evaluation run.

    A retrieved chunk is relevant when at least one ground-truth reference occurs
    (case-insensitively, after stripping surrounding whitespace) as a substring of
    its page content. Per-query precision is the share of relevant chunks among
    the retrieved ones; the result is the mean of those per-query values.

    Args:
        top_k: Number embedded in the name of the evaluation CSV to load.
        with_dq: Load the ``with_DQ`` file when true, the ``without_DQ`` file otherwise.
        print_info_if_wrong: Not used by this function; kept for signature parity
            with ``calculate_recall``.
        emb: Not used by this function; kept so existing callers keep working.

    Returns:
        The mean per-query precision as returned by ``np.mean``. Queries without
        any retrieved chunk contribute a precision of 0.0.
    """
    variant = "with_DQ" if with_dq else "without_DQ"
    filename = f"evaluation/_queries_with_context_{variant}_{top_k}.csv"
    data = pd.read_csv(
        filename,
        converters={"context": ast.literal_eval, "ground_truth.references": ast.literal_eval},
    )
    # Each context entry is a serialized document; revive it into an object.
    data["context"] = data["context"].apply(lambda chunks: [loads(c) for c in chunks])

    precision_list = []
    for _, row in data.iterrows():
        chunks = row["context"]
        if not chunks:
            # np.mean([]) is NaN and would turn the overall mean into NaN as well;
            # with nothing retrieved there is no relevant chunk, so score the row 0.
            precision_list.append(0.0)
            continue
        # Normalise the references once per query instead of once per chunk.
        needles = [ref.lower().strip() for ref in row["ground_truth.references"]]
        chunk_relevance = []
        for chunk in chunks:
            chunk_text = str(chunk.page_content).lower()
            chunk_relevance.append(any(needle in chunk_text for needle in needles))
        precision_list.append(np.mean(chunk_relevance))

    return np.mean(precision_list)


# Report precision at top-5 as a percentage for both retrieval variants.
print(f"{100 * calculate_precision(top_k=5, with_dq=True):.2f} %\t(with DQ assessment)")
print(f"{100 * calculate_precision(top_k=5, with_dq=False):.2f} %\t(without DQ assessment)")

14.21 %	(with DQ assessment)
13.14 %	(without DQ assessment)


  data["context"] = data["context"].apply(lambda chunks: [loads(c) for c in chunks])


## Generation


### Completeness


In [3]:
### Generate keypoint coverage estimation
import pandas as pd
import numpy as np
import ast
import importlib
from utils import llm, io_helpers

importlib.reload(llm)


def calc_keypoint_coverage(row, model: str = "gpt-4.1"):
    """Ask an LLM how well a generated answer covers the ground-truth keypoints.

    Args:
        row: Mapping-like record providing ``query.content``,
            ``ground_truth.keypoints`` and ``generated_response``.
        model: Model identifier forwarded to ``llm.call_any_llm``.

    Returns:
        The mean of the response's ``keypoint_coverage`` values as a plain
        Python scalar (presumably one value per keypoint; confirm against
        ``llm.LLMKeypointEvaluationResponse``).
    """
    # Read the row fields up front so a missing column fails before any prompt is loaded.
    prompt_fields = {
        "question": row["query.content"],
        "keypoints": row["ground_truth.keypoints"],
        "generated_answer": row["generated_response"],
    }

    system_prompt, user_template = io_helpers.get_prompts("keypoints/keypoints_validation")
    filled_user_prompt = llm.format_user_prompt_keypoint_validation(user_template, **prompt_fields)

    evaluation = llm.call_any_llm(
        system_prompt,
        filled_user_prompt,
        model=model,
        response_format_pydantic=llm.LLMKeypointEvaluationResponse,
    )

    mean_coverage = np.mean(evaluation.keypoint_coverage)
    return mean_coverage.item()


def generate_keypoint_coverage(filepath: str, filepath_new: str):
    """Add a ``keypoint_coverage`` column to a generations CSV and save it elsewhere.

    Args:
        filepath: CSV to read; its ``ground_truth.keypoints`` column is parsed
            with ``ast.literal_eval``.
        filepath_new: Destination CSV. Must differ from ``filepath`` so the
            input file is never overwritten.

    Raises:
        RuntimeError: If both paths are equal.
    """
    if filepath == filepath_new:
        raise RuntimeError("Paths must differ!")
    # Read from the argument, not from a notebook-level global, so the function
    # processes the file the caller actually asked for.
    data = pd.read_csv(filepath, converters={"ground_truth.keypoints": ast.literal_eval})
    # One coverage score per row, computed by calc_keypoint_coverage.
    data["keypoint_coverage"] = data.apply(calc_keypoint_coverage, axis=1)
    data.to_csv(filepath_new, index=False)


# Input generations file and the destination for the copy that carries the
# additional keypoint-coverage column.
filename = "evaluation/_queries_with_context_without_DQ_5_generations.csv"
filename_new = "evaluation/keypoint_eval/_queries_with_context_without_DQ_5_generations.csv"
# The generation call is disabled here; presumably it is only enabled on demand
# because it queries an LLM for every row (confirm before re-running).
# _ = generate_keypoint_coverage(filename, filename_new)



In [4]:
### Calculate Completeness

import pandas as pd
import numpy as np

# Keypoint-evaluation results for both retrieval variants.
filename_with_dq = "evaluation/keypoint_eval/_queries_with_context_with_DQ_5_generations.csv"
filename_without_dq = "evaluation/keypoint_eval/_queries_with_context_without_DQ_5_generations.csv"

# Completeness is the mean of the per-query keypoint coverage column.
completeness_with_dq = np.mean(
    pd.read_csv(
        filename_with_dq,
        usecols=["keypoint_coverage"],
        dtype={"keypoint_coverage": "float64"},
    ).loc[:, "keypoint_coverage"]
)
completeness_without_dq = np.mean(
    pd.read_csv(
        filename_without_dq,
        usecols=["keypoint_coverage"],
        dtype={"keypoint_coverage": "float64"},
    ).loc[:, "keypoint_coverage"]
)

print(f"{100 * completeness_with_dq:.2f} %\t(with DQ assessment)")
print(f"{100 * completeness_without_dq:.2f} %\t(without DQ assessment)")

55.11 %	(with DQ assessment)
27.98 %	(without DQ assessment)
