In [None]:
import lancedb
import importlib
from langchain.vectorstores import LanceDB
from langchain.embeddings import OpenAIEmbeddings

# DB specifications: where the LanceDB database lives and how its tables are named.
LANCEDB_DIR = "..."  # Set database path (the "..." is a placeholder and must be replaced)
TABLE_NAME_DOCS = "documents"  # not referenced by the cells visible in this notebook section
TABLE_NAME_CHUNKS = "chunks"  # table opened by the vector store and by retrieve_newer_chunks

# Shared connection; later cells open TABLE_NAME_CHUNKS through this object.
db = lancedb.connect(LANCEDB_DIR)

In [207]:
# Embedding model handed to the vector store below.
# NOTE(review): presumably this must be the model the chunks table was embedded with — confirm.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# LangChain vector store over the existing chunks table, opened through the shared `db` connection.
vector_store = LanceDB(
    uri=LANCEDB_DIR,
    embedding=embeddings,
    table=db.open_table(TABLE_NAME_CHUNKS),
)

In [208]:
from langchain_community.document_loaders import DataFrameLoader
from typing import List, Tuple
import pandas as pd


def retrieve_newer_chunks(retrieved_chunks) -> List:
    """Extend the retrieved chunks with the chunks that supersede them.

    A chunk may list, under the ``outdated_by_chunk_ids`` metadata key, the ids
    of chunks that replace it. Those chunks are fetched from the chunks table
    and appended. The lookup is repeated because a fetched chunk can itself be
    outdated by yet another chunk.

    Args:
        retrieved_chunks: List of document-like objects whose ``metadata``
            mapping contains ``chunk_id`` and, optionally,
            ``outdated_by_chunk_ids``. The list is extended in place.

    Returns:
        The same list object, with the superseding chunks appended.
    """
    table = None  # opened lazily and only once; untouched when nothing is outdated
    previous_missing = None

    while True:
        outdated_by_ids = {
            id_ for chunk in retrieved_chunks for id_ in chunk.metadata.get("outdated_by_chunk_ids", [])
        }
        chunk_ids = {chunk.metadata["chunk_id"] for chunk in retrieved_chunks}
        missing_ids = outdated_by_ids - chunk_ids

        # Stop when everything is resolved, or when the last lookup made no
        # progress (the referenced ids do not exist in the table). Without the
        # second check a dangling reference is re-queried forever.
        if not missing_ids or missing_ids == previous_missing:
            break
        previous_missing = missing_ids

        if table is None:
            table = db.open_table(TABLE_NAME_CHUNKS)

        # Render plain integer literals: interpolating a tuple uses repr(),
        # which for NumPy scalars can yield "np.int64(3)" (invalid SQL).
        # "IN (...)" also covers the single-id case without a special branch.
        # NOTE(review): the filter uses the table's `id` column while presence
        # is checked through metadata["chunk_id"] — presumably the same values.
        id_list = ", ".join(str(id_) for id_ in sorted(int(id_) for id_ in missing_ids))
        table_data = table.search().where(f"id IN ({id_list})").to_pandas()
        if table_data.empty:
            break

        # Flatten metadata column into top-level columns
        metadata_df = table_data["metadata"].apply(pd.Series)

        # Drop the original nested metadata column
        table_data = table_data.drop(columns=["metadata"])

        # Merge the expanded metadata back; every column except "text" becomes
        # document metadata when loaded through DataFrameLoader.
        flattened_df = pd.concat([table_data, metadata_df], axis=1)
        loader = DataFrameLoader(flattened_df, page_content_column="text")
        retrieved_chunks.extend(loader.load())

    return retrieved_chunks


def filter_outdated_chunks(chunks: List) -> Tuple[List, List]:
    """
    Partition chunks by whether another chunk supersedes them.

    A chunk is outdated when its ``outdated_by_chunk_ids`` metadata entry is
    non-empty; the input order is kept inside both groups.

    returns:
    - (chunks_current, chunks_outdated)
    """
    current: List = []
    outdated: List = []
    for item in chunks:
        superseded_by = item.metadata["outdated_by_chunk_ids"]
        bucket = current if len(superseded_by) == 0 else outdated
        bucket.append(item)
    return (current, outdated)


def sort_chunks(chunks: List) -> List:
    """Order chunks from newest to oldest by their ``creation_date`` metadata.

    The list is sorted in place and the same list object is returned.
    """

    def creation_date(chunk):
        return chunk.metadata["creation_date"]

    chunks.sort(key=creation_date, reverse=True)
    return chunks

In [209]:
def retrieve(query: str, k=5):
    """Run a similarity search for ``query`` on the vector store and return its ``k`` results."""
    return vector_store.similarity_search(query, k=k)

In [None]:
### Evaluate retrieval
from utils import io_helpers
import pandas as pd
from langchain_core.load import dumps, loads, load, serializable

# Re-import the helper module so local edits take effect without restarting the kernel.
importlib.reload(io_helpers)

# Query set passed row by row to retrieve_context below; each row must provide
# a "query.content" field.
# NOTE(review): presumably only the queries touching manipulated documents —
# confirm against io_helpers.get_queries.
queries_manipulated = io_helpers.get_queries("manipulated_only")


def retrieve_context(query_entry: pd.Series, top_k: int, with_dq: bool = True):
    """Retrieves context from lancedb and saves it to csv

    The text in ``query_entry["query.content"]`` is sent to ``retrieve``. When
    ``with_dq`` is set, the result is extended with the chunks that supersede
    outdated ones and sorted newest-first. The chunks are serialized with
    ``dumps`` into a ``context`` column and the row is appended to
    ``evaluation/_queries_with_context_with_DQ_{top_k}.csv`` (or the
    ``without_DQ`` variant).

    Args:
        query_entry: One query row; must contain ``"query.content"``. The
            caller's row is not modified.
        top_k: Number of chunks requested from the vector store.
        with_dq: Whether to add superseding chunks and sort by creation date.

    Returns:
        None; the result is written to the CSV file.
    """
    from pathlib import Path

    filename = (
        f"evaluation/_queries_with_context_with_DQ_{top_k}.csv"
        if with_dq
        else f"evaluation/_queries_with_context_without_DQ_{top_k}.csv"
    )

    chunks = retrieve(query_entry["query.content"], k=top_k)
    if with_dq:
        chunks = retrieve_newer_chunks(chunks)
        chunks = sort_chunks(chunks)

    for chunk in chunks:
        chunk.metadata.pop("creation_date", None)
        chunk.metadata.pop("vector", None)
        # np.array not allowed in the serialized metadata: store a list of
        # plain ints. A missing key is treated as "not outdated", matching
        # retrieve_newer_chunks.
        chunk.metadata["outdated_by_chunk_ids"] = [
            int(chunk_id) for chunk_id in chunk.metadata.get("outdated_by_chunk_ids", [])
        ]

    # Work on a copy: mutating the row object handed in by DataFrame.apply is
    # not supported by pandas.
    entry = query_entry.copy()
    entry["context"] = [dumps(chunk) for chunk in chunks]
    new_row = pd.DataFrame([entry])

    # to_csv does not create missing directories.
    Path(filename).parent.mkdir(parents=True, exist_ok=True)
    try:
        # The whole file is re-read and re-written per call; acceptable for the
        # small evaluation sets used here.
        data = pd.concat([pd.read_csv(filename), new_row], sort=False)
    except FileNotFoundError:
        data = new_row
    data.to_csv(filename, index=False)


# Retrieve top-5 context for every row of queries_manipulated, first without
# and then with the newer-chunk lookup. retrieve_context appends to an existing
# CSV, so re-running this cell adds the same rows again — remove the files
# under evaluation/ first when a clean run is needed.
_ = queries_manipulated.apply(retrieve_context, args=(5, False), axis=1)
_ = queries_manipulated.apply(retrieve_context, args=(5, True), axis=1)

# query = queries_manipulated.sample(n=100, random_state=1)
# _ = query.apply(retrieve_context, args=(3, False), axis=1)
# _ = query.apply(retrieve_context, args=(3, True), axis=1)
# _ = query.apply(retrieve_context, args=(5, False), axis=1)
# _ = query.apply(retrieve_context, args=(5, True), axis=1)
# _ = query.apply(retrieve_context, args=(7, False), axis=1)
# _ = query.apply(retrieve_context, args=(7, True), axis=1)
# _ = query.apply(retrieve_context, args=(10, False), axis=1)
# _ = query.apply(retrieve_context, args=(10, True), axis=1)

In [None]:
import pandas as pd
import ast
import importlib
from utils import io_helpers, llm
from langchain_core.load import loads

# Re-import the helper modules so local edits take effect without restarting the kernel.
importlib.reload(io_helpers)
importlib.reload(llm)


def generate(row, with_dq_assesment: bool = True, model: str = "gpt-4.1"):
    """Answer one query with an LLM, using the context chunks stored in the row.

    Args:
        row: Mapping with ``"query.content"`` (the question) and ``"context"``
            (an iterable of serialized chunks readable by ``loads``).
        with_dq_assesment: Selects the ``qa/qa_with_dq`` prompt pair when true,
            otherwise ``qa/qa_without_dq``.
        model: Model name forwarded to ``llm.call_any_llm``.

    Returns:
        The ``answer`` attribute of the LLM response.
    """
    question = row["query.content"]

    # Deserialize every chunk, then join their texts with a visible separator.
    deserialized = list(map(loads, row["context"]))
    context = "\n\n-- new chunk --\n".join(chunk.page_content for chunk in deserialized)

    prompt_name = "qa/qa_with_dq" if with_dq_assesment else "qa/qa_without_dq"
    system_prompt, prompt_template = io_helpers.get_prompts(prompt_name)

    filled_prompt = llm.format_user_prompt_qa(prompt_template, question=question, context=context)
    llm_response = llm.call_any_llm(
        system_prompt,
        filled_prompt,
        model=model,
        response_format_pydantic=llm.LLMQAResponse,
    )
    return llm_response.answer


def produce_generations(queries_path: str, **generate_kwargs):
    """Generate an answer for every query in a CSV and save the result.

    Reads ``queries_path`` (the ``context`` column is parsed with
    ``ast.literal_eval``), calls ``generate`` once per row, stores the answers
    in a ``generated_response`` column and writes all columns to
    ``<queries_path without its extension>_generations.csv``.

    Args:
        queries_path: Path of a CSV with at least the columns ``generate``
            reads (``query.content`` and ``context``).
        **generate_kwargs: Optional keyword arguments forwarded to
            ``generate`` (e.g. ``with_dq_assesment=False``, ``model=...``).
            When none are given ``generate`` runs with its defaults.
    """
    import os

    queries = pd.read_csv(queries_path, converters={"context": ast.literal_eval})
    queries["generated_response"] = queries.apply(lambda row: generate(row, **generate_kwargs), axis=1)

    # splitext strips only the final extension. The previous split(".")[0] cut
    # the path at its FIRST dot (wrong for "../x.csv" or dotted directories),
    # and its nested double quotes inside the f-string are a SyntaxError before
    # Python 3.12.
    output_path = f"{os.path.splitext(queries_path)[0]}_generations.csv"
    queries.to_csv(output_path, index=False)


# produce_generations("evaluation/_queries_with_context_with_DQ_5.csv")
# produce_generations("evaluation/_queries_with_context_without_DQ_5.csv")



In [None]:
# Prepare for completeness evaluation: copy only the columns the keypoint
# evaluation needs into a compact CSV.
# NOTE(review): the input lives under evaluation/keypoint_eval/ and already has
# a "keypoint_coverage" column; neither is produced by the cells visible here —
# presumably the result of a separate evaluation step.
filepath = "evaluation/keypoint_eval/_queries_with_context_with_DQ_5_generations.csv"
filepath_new = "evaluation/keypoint_eval/with_DQ.csv"

# One list drives both the read filter and the output column order.
keypoint_eval_columns = [
    "query.content",
    "ground_truth.keypoints",
    "generated_response",
    "keypoint_coverage",
]

data = pd.read_csv(filepath, usecols=keypoint_eval_columns)
data.to_csv(filepath_new, index=False, columns=keypoint_eval_columns)

Unnamed: 0,ground_truth.keypoints,query.content,generated_response,keypoint_coverage
0,"[""JetWing Aviation's total liabilities amounte...",What was the total amount of JetWing Aviation'...,$250 million,1.0
1,['Mr. John Doe was appointed as CEO of ABC Edu...,Who was appointed as CEO of ABC Education Corp...,Mr. John Doe was appointed as CEO of ABC Educa...,1.0
2,"[""V. Lewis's chief complaint is fever, cough, ...",According to the hospitalization records of Ne...,"Fever, cough, and chest pain for 5 days.",1.0
3,"[""H. Walker's birthdate is 19th July, 1964 acc...","According to the court judgment of Riverton, H...","19th July, 1964",1.0
4,"['The residence of Z. Torres was 22, Maple Ave...","According to the court judgment of Sterling, Q...","22, Maple Avenue, Quarryville",1.0
...,...,...,...,...
199,['Retail Emporium uses a risk management syste...,What system does Retail Emporium have in place...,Retail Emporium conducts regular audits and ri...,1.0
200,['Retail Emporium was established in November ...,When was Retail Emporium established?,December 2005,0.0
201,['Retail Emporium distributed a dividend of $4...,How much dividend did Retail Emporium distribu...,$4 million,1.0
202,['Retail Emporium opened three new stores in M...,How many new stores did Retail Emporium open i...,Retail Emporium opened three new stores in Mar...,1.0
