In [189]:
import lancedb
import importlib
from langchain.vectorstores import LanceDB
from langchain.embeddings import OpenAIEmbeddings

# DB specifications
LANCEDB_DIR = "/Users/leon/Documents/study/MA/lancedb"
TABLE_NAME_DOCS = "documents"
TABLE_NAME_CHUNKS = "chunks_emb-large"

db = lancedb.connect(LANCEDB_DIR)

In [190]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = LanceDB(
    uri=LANCEDB_DIR,
    embedding=embeddings,
    table=db.open_table(TABLE_NAME_CHUNKS),
)

In [191]:
from langchain_community.document_loaders import DataFrameLoader
from typing import List, Tuple
import pandas as pd


def retrieve_newer_chunks(retrieved_chunks) -> List:
    while True:
        outdated_by_ids = {
            id_ for chunk in retrieved_chunks for id_ in chunk.metadata.get("outdated_by_chunk_ids", [])
        }
        chunk_ids = {chunk.metadata["chunk_id"] for chunk in retrieved_chunks}
        outdated_by_ids -= chunk_ids

        if len(outdated_by_ids) == 0:
            break

        table = db.open_table(TABLE_NAME_CHUNKS)
        if len(outdated_by_ids) == 1:
            table_data = table.search().where(f"id == {next(iter(outdated_by_ids))}").to_pandas()
        else:
            table_data = table.search().where(f"id IN {tuple(outdated_by_ids)}").to_pandas()

        # Flatten metadata column into top-level columns
        metadata_df = table_data["metadata"].apply(pd.Series)

        # Drop the original nested metadata column
        table_data = table_data.drop(columns=["metadata"])

        # Merge the expanded metadata back
        flattened_df = pd.concat([table_data, metadata_df], axis=1)
        loader = DataFrameLoader(flattened_df, page_content_column="text")
        additional_chunks = loader.load()
        retrieved_chunks.extend(additional_chunks)

    return retrieved_chunks


def filter_outdated_chunks(chunks: List) -> Tuple[List, List]:
    """
    returns:
    - (chunks_current, chunks_outdated)
    """
    chunks_outdated = []
    chunks_current = []
    for chunk in chunks:
        if len(chunk.metadata["outdated_by_chunk_ids"]) != 0:
            chunks_outdated.append(chunk)
        else:
            chunks_current.append(chunk)
    return (chunks_current, chunks_outdated)


def sort_chunks(chunks: List) -> List:
    chunks.sort(reverse=True, key=lambda c: c.metadata["creation_date"])
    return chunks

In [174]:
def retrieve(query: str, k=5):
    retrieved_docs = vector_store.similarity_search(query, k=k)
    return retrieved_docs

In [175]:
from typing import List
from utils import llm

importlib.reload(llm)


def generate(query: str, context: List, use_dq_assessment=True, model: str = "gpt-4.1"):
    docs_content = "\n\n-- new chunk --\n".join(f"{doc.page_content}" for doc in context)

    system_prompt = "You are an expert in question answering from a given context."
    if use_dq_assessment:
        user_prompt = f'Answer the given question as short as possible based on the given context. The context consist of multiple chunks which are devided by "-- new chunk --". The chunks are ordered based on their currency with the most current one at the beginning. If they include multiple conflictings answers to the question, take the one that appears first in the context.\nContext:\n{docs_content}\n\nQuestion: {query}'
    else:
        user_prompt = f"Answer the given question as short as possible based on the context, which has been retrieved by similarity search from a knowledge base.\nContext:\n{docs_content}\n\nQuestion: {query}"

    return llm.call_openai(
        system_prompt,
        user_prompt,
        model,
        response_format_pydantic=llm.LLMQAResponse,
    )

In [None]:
### Evaluate retrieval
from utils import io_helpers
import pandas as pd
from langchain_core.load import dumps, loads, load, serializable

importlib.reload(io_helpers)

queries_manipulated = io_helpers.get_queries("manipulated_only")


def retrieve_context(query_entry: pd.Series, top_k: int, with_dq: bool = True):
    """Retrieves context from lancedb and saves it to csv"""
    filename = (
        f"evaluation/retrieval_results/_query_with_context_with_DQ_{top_k}.csv"
        if with_dq
        else f"evaluation/retrieval_results/_query_with_context_without_DQ_{top_k}.csv"
    )

    chunks = retrieve(query_entry["query.content"], k=top_k)
    if with_dq:
        chunks = retrieve_newer_chunks(chunks)
        chunks = sort_chunks(chunks)

    for chunk in chunks:
        chunk.metadata.pop("creation_date", None)
        chunk.metadata.pop("vector", None)
        chunk.metadata["outdated_by_chunk_ids"] = list(chunk.metadata["outdated_by_chunk_ids"])  # np.array not allowed

    chunks_s = [dumps(chunk) for chunk in chunks]

    query_entry["context"] = chunks_s
    try:
        data = pd.read_csv(filename)
        data = pd.concat([data, pd.DataFrame([query_entry])], sort=False)
    except FileNotFoundError:
        data = pd.DataFrame([query_entry])
    data.to_csv(filename, index=False)


query = queries_manipulated.sample(n=100, random_state=1)

_ = query.apply(retrieve_context, args=(3, False), axis=1)
_ = query.apply(retrieve_context, args=(3, True), axis=1)
# _ = query.apply(retrieve_context, args=(5, False), axis=1)
# _ = query.apply(retrieve_context, args=(5, True), axis=1)
# _ = query.apply(retrieve_context, args=(7, False), axis=1)
# _ = query.apply(retrieve_context, args=(7, True), axis=1)
# _ = query.apply(retrieve_context, args=(10, False), axis=1)
# _ = query.apply(retrieve_context, args=(10, True), axis=1)

[90m[[0m2025-06-15T18:23:17Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet
[90m[[0m2025-06-15T18:23:17Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet
[90m[[0m2025-06-15T18:23:18Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet
[90m[[0m2025-06-15T18:23:19Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet
[90m[[0m2025-06-15T18:23:20Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet
[90m[[0m2025-06-15T18:23:21Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet
[90m[[0m2025-06-15T18:23:23Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet
[90m[[0m2025-06-15T18:23:

In [205]:
from langchain_core.load import loads
import ast
import numpy as np
import string
from typing import Literal


def calculate_recall(
    top_k: int, with_dq: bool = True, print_info_if_wrong: bool = False, emb: Literal["_large", ""] = ""
):
    filename = (
        f"evaluation/retrieval_results/_query_with_context_with_DQ_{top_k}{emb}.csv"
        if with_dq
        else f"evaluation/retrieval_results/_query_with_context_without_DQ_{top_k}{emb}.csv"
    )
    data = pd.read_csv(
        filename,
        converters={"context": ast.literal_eval, "ground_truth.references": ast.literal_eval},
    )
    data["context"] = data["context"].apply(lambda chunks: [loads(c) for c in chunks])

    hits = []
    for _, row in data.iterrows():
        complete_context_txt = "\n".join([chunk.page_content for chunk in row["context"]]).lower()
        hit_in_row = True
        for ref in row["ground_truth.references"]:
            hit_in_row = hit_in_row and (ref.lower().strip() in complete_context_txt)
            if print_info_if_wrong and (not (ref.lower().strip().strip(string.punctuation) in complete_context_txt)):
                print("------")
                print(f"{row["query.content"]}: {row["ground_truth.content"]}, {row["ground_truth.doc_ids"]}")
                print("Reference:")
                print(ref.lower().strip())
                print()
                print("Metadata:")
                print("\n".join([str(c.metadata) for c in row["context"]]))
                print("Context:")
                print("\n- ".join([c.page_content for c in row["context"]]))
        hits.append(hit_in_row)
    recall = np.mean(hits)
    return recall


for k in [3, 5, 7, 10]:
    print(f"Top_k: {k}")
    print(
        f"small: {calculate_recall(k, True)*100:.2f} %\tlarge: {calculate_recall(k, True, emb="_large")*100:.2f} %\t(with DQ)"
    )
    print(
        f"small: {calculate_recall(k, False)*100:.2f} %\tlarge: {calculate_recall(k, False, emb="_large")*100:.2f} %\t(without DQ)"
    )
    print()

Top_k: 3
small: 63.00 %	large: 63.00 %	(with DQ)
small: 46.00 %	large: 46.00 %	(without DQ)

Top_k: 5
small: 74.00 %	large: 66.00 %	(with DQ)
small: 62.00 %	large: 56.00 %	(without DQ)

Top_k: 7
small: 76.00 %	large: 69.00 %	(with DQ)
small: 65.00 %	large: 60.00 %	(without DQ)

Top_k: 10
small: 78.00 %	large: 72.00 %	(with DQ)
small: 71.00 %	large: 66.00 %	(without DQ)



In [178]:
query_content = "According to the judgment of Glenwood, Quailwood, Court, what was the occupation of Y. Nelson?"


print("Without DQ:")
context = retrieve(query)
response = generate(query, context, use_dq_assessment=False)
print(response.answer)
print()

print("With DQ:")
context = retrieve(query, k=5)
context = retrieve_newer_chunks(context)
context = sort_chunks(context)
response = generate(query, context=context)
print(response.answer)

Without DQ:


TypeError: expected string or buffer

In [None]:
for c in context:
    print(c.metadata["doc_id"])
    print(c.metadata["creation_date"])
    print(f"{c.metadata["chunk_id"]} - outdated by {c.metadata["outdated_by_chunk_ids"]}")
    print("- content:")
    print(c.page_content)
    print("-----------")

print(response.answer)

300002
2015-01-01 00:00:00
3705 - outdated by []
- content:
Chief judge according to the court judgment of Danbury, Pinehurst, Court | J. Smith
Residence of F. Williams according to the court judgment of Upton, Georgetown, Court | 45, Maple Avenue, Georgetown.
Defense lawyer for Y. Nelson according to the judgment of Glenwood, Quailwood, Court | J. Smith
Date of the court judgment of Bayside, Roseville, Court | 15th November, 2022
Name of the chief judge according to the court judgment of Trenton, Eastwood, Court | J. Smith
-----------
400139
2010-01-01 00:00:00
3544 - outdated by []
- content:
In a significant case adjudicated by the 9th Judicial Circuit of Glenwood, the Glenwood, Quailwood Court delivered a judgment on June 10, 2023, against a female defendant, Y. Nelson. The court proceedings were overseen by Chief Judge Hon. H. Ruiz and Presiding Judge Hon. E. Collins, with K. Kelly serving as the court clerk.
-----------
400139
2010-01-01 00:00:00
3549 - outdated by []
- content:
