## Retrieval


### Recall


In [5]:
### Calculate Recall
from langchain_core.load import loads
import ast
import numpy as np
import pandas as pd
import string
from typing import Literal


def calculate_recall(
    top_k: int, with_dq: bool = True, print_info_if_wrong: bool = False, emb: Literal["_large", ""] = ""
):
    """Compute retrieval recall over one evaluation CSV.

    A query counts as a hit only if *every* entry of its
    ``ground_truth.references`` occurs (lower-cased, surrounding whitespace
    trimmed) as a substring of the lower-cased, newline-joined
    ``page_content`` of its retrieved chunks. A query with no references
    counts as a hit.

    Args:
        top_k: Inserted into the input file name.
        with_dq: Selects the ``with_DQ`` (True) or ``without_DQ`` (False)
            input file.
        print_info_if_wrong: If True, print the query, the reference and the
            retrieved chunks for each reference that is still missing after
            its surrounding punctuation has been stripped.
        emb: Currently unused by this function; kept so existing callers
            keep working.

    Returns:
        The mean of the per-query hit flags, or NaN if the file has no rows.
    """
    filename = (
        f"evaluation/_queries_with_context_with_DQ_{top_k}.csv"
        if with_dq
        else f"evaluation/_queries_with_context_without_DQ_{top_k}.csv"
    )
    data = pd.read_csv(
        filename,
        converters={"context": ast.literal_eval, "ground_truth.references": ast.literal_eval},
    )
    # Each context cell is a list of serialized chunks; revive them into objects.
    data["context"] = data["context"].apply(lambda chunks: [loads(c) for c in chunks])

    hits = []
    for _, row in data.iterrows():
        chunks = row["context"]
        complete_context_txt = "\n".join(chunk.page_content for chunk in chunks).lower()
        hit_in_row = True
        for ref in row["ground_truth.references"]:
            normalized_ref = ref.lower().strip()
            if normalized_ref not in complete_context_txt:
                hit_in_row = False
            # NOTE(review): the diagnostic check also strips surrounding
            # punctuation, so a reference that misses only because of its
            # punctuation lowers recall without being printed - confirm that
            # this asymmetry is intended.
            if print_info_if_wrong and normalized_ref.strip(string.punctuation) not in complete_context_txt:
                print("------")
                # Single-quoted keys keep this f-string valid before Python 3.12.
                print(f"{row['query.content']}: {row['ground_truth.content']}, {row['ground_truth.doc_ids']}")
                print("Reference:")
                print(normalized_ref)
                print()
                print("Metadata:")
                print("\n".join([str(c.metadata) for c in chunks]))
                print("Context:")
                print("\n- ".join([c.page_content for c in chunks]))
        hits.append(hit_in_row)

    if not hits:
        # np.mean([]) would also give NaN, but with a RuntimeWarning.
        return float("nan")
    return np.mean(hits)


# Report top-5 recall as a percentage for both retrieval variants.
for use_dq, variant_label in ((True, "with DQ assessment"), (False, "without DQ assessment")):
    recall_pct = calculate_recall(5, with_dq=use_dq) * 100
    print(f"{recall_pct:.2f} %\t({variant_label})")

73.04 %	(with DQ assessment)
62.25 %	(without DQ assessment)


## Generation


### Completeness


In [None]:
### Generate keypoint coverage estimation
import pandas as pd
import numpy as np
import ast
import importlib
from utils import llm, io_helpers

# Re-execute the already-imported utils.llm module; presumably so that edits
# made to it during this notebook session take effect without a kernel restart.
importlib.reload(llm)

# CSV read below as the input of the keypoint-coverage evaluation.
filename = "evaluation/_queries_with_context_with_DQ_5_generations.csv"
# CSV the input rows plus the added "keypoint_coverage" column are written to.
filename_new = "evaluation/keypoint_eval/_queries_with_context_with_DQ_5_generations.csv"


def calc_keypoint_coverage(row, model: str = "gpt-4.1"):
    """Score how well one generated answer covers its ground-truth keypoints.

    Reads ``query.content``, ``ground_truth.keypoints`` and
    ``generated_response`` from ``row``, fills the
    ``keypoints/keypoints_validation`` user prompt with them, and asks
    ``model`` (via ``llm.call_any_llm``) for an
    ``llm.LLMKeypointEvaluationResponse``.

    Args:
        row: Mapping-like record providing the three fields named above.
        model: Model identifier forwarded to ``llm.call_any_llm``.

    Returns:
        The mean of ``response.keypoint_coverage`` as a plain Python scalar.
        (Presumably one flag per keypoint, making this the covered fraction;
        verify against ``LLMKeypointEvaluationResponse``.)
    """
    query_text = row["query.content"]
    expected_keypoints = row["ground_truth.keypoints"]
    answer_text = row["generated_response"]

    prompts = io_helpers.get_prompts("keypoints/keypoints_validation")
    system_prompt, user_template = prompts

    filled_user_prompt = llm.format_user_prompt_keypoint_validation(
        user_template,
        question=query_text,
        keypoints=expected_keypoints,
        generated_answer=answer_text,
    )

    evaluation = llm.call_any_llm(
        system_prompt,
        filled_user_prompt,
        model=model,
        response_format_pydantic=llm.LLMKeypointEvaluationResponse,
    )

    coverage_mean = np.mean(evaluation.keypoint_coverage)
    return coverage_mean.item()


from pathlib import Path

# Load the generations; the keypoints column holds stringified Python lists.
data = pd.read_csv(filename, converters={"ground_truth.keypoints": ast.literal_eval})

# Create the output directory before the evaluation starts: DataFrame.to_csv
# does not create missing parent directories, and failing only at the final
# write would discard the results of every per-row LLM call made below.
Path(filename_new).parent.mkdir(parents=True, exist_ok=True)

# One calc_keypoint_coverage call (and thus one LLM request) per row.
data["keypoint_coverage"] = data.apply(calc_keypoint_coverage, axis=1)

data.to_csv(filename_new, index=False)

[True]
1.0


0    None
dtype: object