In [43]:
import importlib
import os

import lancedb
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import LanceDB

# DB specifications
# The database location can be overridden with the LANCEDB_DIR environment
# variable so the notebook is not tied to one machine's directory layout;
# without the variable the original local path is used.
LANCEDB_DIR = os.environ.get("LANCEDB_DIR", "/Users/leon/Documents/study/MA/lancedb")
TABLE_NAME_DOCS = "documents"
TABLE_NAME_CHUNKS = "chunks"

# Connection used by later cells to open tables.
db = lancedb.connect(LANCEDB_DIR)

In [44]:
# Embedding model handed to the vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# LangChain vector store wrapping the already existing chunks table.
vector_store = LanceDB(
    table=db.open_table(TABLE_NAME_CHUNKS),
    embedding=embeddings,
    uri=LANCEDB_DIR,
)

In [45]:
# Scratch check: a one-element tuple is rendered with a trailing comma.
# NOTE(review): presumably explored for building SQL "IN (...)" filters from
# a tuple's repr, where that trailing comma would be a problem - confirm.
tuple([300])

(300,)

In [65]:
from langchain_community.document_loaders import DataFrameLoader
from typing import List, Tuple
import pandas as pd


def _sql_literal(value) -> str:
    """Render a single id as a literal for a ``where`` filter string."""
    if isinstance(value, str):
        # Quote strings and double any embedded single quote (SQL escaping).
        return "'" + value.replace("'", "''") + "'"
    # str() rather than repr(): NumPy scalars then print as plain numbers.
    return str(value)


def retrieve_newer_chunks(retrieved_chunks) -> List:
    """Add the chunks that supersede the given chunks, transitively.

    Every chunk may list, under ``metadata["outdated_by_chunk_ids"]``, the ids
    of chunks that outdate it. Ids that are not yet part of
    ``retrieved_chunks`` are looked up in the ``TABLE_NAME_CHUNKS`` table and
    the resulting documents are appended. The lookup is repeated because the
    newly added chunks can themselves reference further chunks.

    Args:
        retrieved_chunks: List of document objects exposing a ``metadata``
            dict with a ``chunk_id`` entry and, optionally, an
            ``outdated_by_chunk_ids`` entry. The list is extended in place.

    Returns:
        The same list object that was passed in, including the added chunks.

    Raises:
        KeyError: If a chunk has no ``chunk_id`` in its metadata.
    """
    while True:
        known_ids = set()
        newer_ids = set()
        for chunk in retrieved_chunks:
            known_ids.add(chunk.metadata["chunk_id"])
            superseding = chunk.metadata.get("outdated_by_chunk_ids")
            if superseding is not None:
                newer_ids.update(superseding)

        pending_ids = newer_ids - known_ids
        if not pending_ids:
            break

        # A single IN (...) list covers the one-id and the many-id case alike.
        id_list = ", ".join(_sql_literal(id_) for id_ in pending_ids)
        table = db.open_table(TABLE_NAME_CHUNKS)
        table_data = table.search().where(f"id IN ({id_list})").to_pandas()
        if table_data.empty:
            # The referenced chunks are not in the table; asking again would
            # return the same empty result forever.
            break

        # Flatten metadata column into top-level columns
        metadata_df = table_data["metadata"].apply(pd.Series)

        # Drop the original nested metadata column
        table_data = table_data.drop(columns=["metadata"])

        # Merge the expanded metadata back
        flattened_df = pd.concat([table_data, metadata_df], axis=1)
        loader = DataFrameLoader(flattened_df, page_content_column="text")
        additional_chunks = loader.load()

        # Keep only chunks that are really new. If a round adds nothing, the
        # pending ids can never be resolved, so stop instead of looping.
        new_chunks = [
            chunk for chunk in additional_chunks if chunk.metadata.get("chunk_id") not in known_ids
        ]
        if not new_chunks:
            break
        retrieved_chunks.extend(new_chunks)

    return retrieved_chunks


def filter_outdated_chunks(chunks: List) -> Tuple[List, List]:
    """Split chunks into current ones and ones outdated by another chunk.

    A chunk counts as outdated when ``metadata["outdated_by_chunk_ids"]``
    holds at least one id. A chunk whose metadata has no such entry (or has
    ``None`` there) is treated as current instead of raising ``KeyError``.

    Args:
        chunks: Document objects exposing a ``metadata`` dict.

    Returns:
        ``(chunks_current, chunks_outdated)``, each in the input order.
    """
    chunks_outdated = []
    chunks_current = []
    for chunk in chunks:
        newer_ids = chunk.metadata.get("outdated_by_chunk_ids")
        # len() rather than truthiness so array-like values (e.g. NumPy
        # arrays) are handled as well as plain lists.
        if newer_ids is not None and len(newer_ids) != 0:
            chunks_outdated.append(chunk)
        else:
            chunks_current.append(chunk)
    return (chunks_current, chunks_outdated)


def sort_chunks(chunks: List) -> List:
    """Order ``chunks`` by ``metadata["creation_date"]``, newest first.

    The sort happens in place: the list passed in is reordered and that same
    list object is returned.
    """

    def creation_date(chunk):
        return chunk.metadata["creation_date"]

    chunks.sort(key=creation_date, reverse=True)
    return chunks

In [47]:
def retrieve(query: str, k=5):
    """Run a similarity search for ``query`` on ``vector_store``.

    Args:
        query: Text to search for.
        k: Number of results requested from the vector store.

    Returns:
        Whatever ``vector_store.similarity_search`` returns for the query.
    """
    return vector_store.similarity_search(query, k=k)

In [None]:
from typing import List
from utils import llm

# Reload the llm module so edits made to it take effect without restarting the kernel.
importlib.reload(llm)


def generate(query: str, context: List, use_dq_assessment=True, model: str = "gpt-4.1"):
    """Answer ``query`` with an LLM, using the given chunks as context.

    The ``page_content`` of every chunk is joined with a "-- new chunk --"
    separator and embedded in the user prompt.

    Args:
        query: The question to answer.
        context: Chunks (objects with a ``page_content`` attribute), in the
            order they should appear in the prompt.
        use_dq_assessment: If true, the prompt tells the model that chunks are
            ordered by currency and that the first of several conflicting
            answers wins; otherwise a plain context prompt is used.
        model: Model name passed on to ``llm.call_openai``.

    Returns:
        The value returned by ``llm.call_openai``, requested in the
        ``llm.LLMQAResponse`` response format.
    """
    chunk_texts = [f"{doc.page_content}" for doc in context]
    docs_content = "\n\n-- new chunk --\n".join(chunk_texts)

    if use_dq_assessment:
        instructions = (
            "Answer the given question as short as possible based on the given context. "
            'The context consist of multiple chunks which are devided by "-- new chunk --". '
            "The chunks are ordered based on their currency with the most current one at the beginning. "
            "If they include multiple conflictings answers to the question, "
            "take the one that appears first in the context."
        )
    else:
        instructions = (
            "Answer the given question as short as possible based on the context, "
            "which has been retrieved by similarity search from a knowledge base."
        )

    user_prompt = f"{instructions}\nContext:\n{docs_content}\n\nQuestion: {query}"
    system_prompt = "You are an expert in question answering from a given context."

    return llm.call_openai(system_prompt, user_prompt, model, response_format_pydantic=llm.LLMQAResponse)

In [None]:
### Evaluate retrieval and answer generation (without vs. with DQ)

In [None]:
query = "According to the judgment of Glenwood, Quailwood, Court, what was the occupation of Y. Nelson?"

# Baseline: default retrieval and the plain prompt.
print("Without DQ:")
context = retrieve(query)
response = generate(query, context, use_dq_assessment=False)
print(response.answer, end="\n\n")

# DQ variant: wider retrieval, then add the chunks that outdate retrieved
# ones and order everything newest first before prompting.
print("With DQ:")
context = sort_chunks(retrieve_newer_chunks(retrieve(query, k=10)))
response = generate(query, context=context)
print(response.answer)

Without DQ:
The occupation of Y. Nelson is not specified in the judgment.

With DQ:
The judgment does not specify the occupation of Y. Nelson.


In [96]:
# Print every chunk currently held in `context` (ids, creation date, text),
# followed by the answer stored in `response`.
for chunk in context:
    metadata = chunk.metadata
    print(metadata["doc_id"])
    print(metadata["creation_date"])
    # Single quotes inside the f-string: reusing the enclosing quote character
    # in a replacement field only parses on Python 3.12 and newer.
    print(f"{metadata['chunk_id']} - outdated by {metadata['outdated_by_chunk_ids']}")
    print("-----------")
    print(chunk.page_content)
    print("-----------")

print(response.answer)

300002
2015-01-01 00:00:00
3705 - outdated by []
-----------
Chief judge according to the court judgment of Danbury, Pinehurst, Court | J. Smith
Residence of F. Williams according to the court judgment of Upton, Georgetown, Court | 45, Maple Avenue, Georgetown.
Defense lawyer for Y. Nelson according to the judgment of Glenwood, Quailwood, Court | J. Smith
Date of the court judgment of Bayside, Roseville, Court | 15th November, 2022
Name of the chief judge according to the court judgment of Trenton, Eastwood, Court | J. Smith
-----------
400139
2010-01-01 00:00:00
3544 - outdated by []
-----------
In a significant case adjudicated by the 9th Judicial Circuit of Glenwood, the Glenwood, Quailwood Court delivered a judgment on June 10, 2023, against a female defendant, Y. Nelson. The court proceedings were overseen by Chief Judge Hon. H. Ruiz and Presiding Judge Hon. E. Collins, with K. Kelly serving as the court clerk.
-----------
400139
2010-01-01 00:00:00
3549 - outdated by []
---------