In [16]:
from importlib import reload
import lancedb
import pandas as pd
from dotenv import load_dotenv

load_dotenv("/Users/leon/.env")

# DB specifications
LANCEDB_DIR = "/Users/leon/Documents/study/MA/lancedb"
TABLE_NAME_DOCS = "documents"
TABLE_NAME_CHUNKS = "chunks_emb-large"

db = lancedb.connect(LANCEDB_DIR)

In [17]:
from utils import io_helpers

reload(io_helpers)


def add_fictional_creation_date(row):
    if str(row["doc_id"]).startswith("400"):
        return pd.to_datetime("2010-01-01")
    if str(row["doc_id"]).startswith("300"):
        return pd.to_datetime("2015-01-01")
    if str(row["doc_id"]).startswith("100"):
        return pd.to_datetime("2005-01-01")
    return pd.to_datetime("2000-01-01")


def get_documents_with_creation_date() -> pd.DataFrame:
    documents = io_helpers.get_documents(read_embeddings=True).drop(columns="original_doc_ids")
    documents["creation_date"] = documents.apply(add_fictional_creation_date, axis=1)
    return documents

In [18]:
documents = get_documents_with_creation_date()

table = db.create_table(TABLE_NAME_DOCS, data=documents, exist_ok=True)
db[TABLE_NAME_DOCS].head(1)

[90m[[0m2025-06-15T17:35:39Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet


pyarrow.Table
doc_id: int64
domain: string
content: string
embedding: list<item: double>
  child 0, item: double
creation_date: timestamp[ns]
----
doc_id: [[40]]
domain: [["Finance"]]
content: [["Acme Government Solutions is a government industry company established on June 1, 2001 in Washington, D.C., specializing in providing comprehensive government services and solutions.
In January 2021, Acme Government Solutions made a significant decision to distribute $5 million of dividends to its shareholders. This move not only enhanced shareholder returns but also showcased the company's commitment to rewarding its investors. This dividend distribution was a result of the company's successful acquisition of a major government contract worth $100 million in March 2021. This acquisition expanded Acme Government Solutions' service portfolio and increased its revenue potential. Furthermore, in April 2021, the company announced plans to establish regional offices in three new states, thereby exp

In [19]:
from langchain_community.document_loaders import DataFrameLoader

table_data: pd.DataFrame = table.search().to_pandas()

loader = DataFrameLoader(table_data, page_content_column="content")
lc_documents = loader.load()

[90m[[0m2025-06-15T17:35:39Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet


In [20]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

lc_chunks = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""], chunk_size=512, chunk_overlap=128, strip_whitespace=True, keep_separator=False
).split_documents(lc_documents)


print(len(lc_chunks))
for i in range(len(lc_chunks)):
    lc_chunks[i].metadata["chunk_id"] = i
    lc_chunks[i].id = i

3720


In [21]:
from typing import List
from langchain_core.documents.base import Document
import string


def normalize_string(input: str) -> str:
    translator = str.maketrans({p: "" for p in string.punctuation})
    return input.lower().strip().translate(translator)


def flag_outdated_chunk(data: pd.Series, chunks: List[Document]) -> List[Document]:
    documents = get_documents_with_creation_date()
    creation_date1 = documents[documents["doc_id"] == data["id1"]].squeeze()["creation_date"]
    creation_date2 = documents[documents["doc_id"] == data["id2"]].squeeze()["creation_date"]

    if creation_date1 < creation_date2:  # doc2 is newer
        doc_id_old = data["id1"]
        doc_id_new = data["id2"]
        passage_old = data["conflicting_passage_doc1"]
        passage_new = data["conflicting_passage_doc2"]
    else:  # doc1 is newer
        doc_id_old = data["id2"]
        doc_id_new = data["id1"]
        passage_old = data["conflicting_passage_doc2"]
        passage_new = data["conflicting_passage_doc1"]

    chunk_ids_old = []
    chunk_ids_new = []

    for chunk in chunks:
        if chunk.metadata["doc_id"] not in [doc_id_old, doc_id_new]:
            continue
        if normalize_string(passage_old) in normalize_string(chunk.page_content):
            chunk_ids_old.append(chunk.metadata["chunk_id"])
        elif normalize_string(passage_new) in normalize_string(chunk.page_content):
            chunk_ids_new.append(chunk.metadata["chunk_id"])

    for chunk in chunks:
        if chunk.metadata["chunk_id"] in chunk_ids_old:
            chunk.metadata["outdated_by_chunk_ids"] = chunk_ids_new

    return chunks


conflicts = pd.read_csv(
    "data/additional_data/docs/_conflicts.csv",
    usecols=["id1", "id2", "model", "conflicting_passage_doc1", "conflicting_passage_doc2"],
    dtype={"id1": "Int64", "id2": "Int64"},
)

for _, data in conflicts.iterrows():
    lc_chunks = flag_outdated_chunk(data, lc_chunks)

In [22]:
from langchain.vectorstores import LanceDB
from langchain.embeddings import OpenAIEmbeddings

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

try:
    vector_store = LanceDB(uri=LANCEDB_DIR, embedding=embeddings, table=db.open_table(TABLE_NAME_CHUNKS))
except ValueError:
    print("Table not existent")
    vector_store = LanceDB(uri=LANCEDB_DIR, embedding=embeddings, table_name=TABLE_NAME_CHUNKS)

Table not existent


In [23]:
chunks_from_here = lc_chunks.copy()

ids = [chunk.metadata["chunk_id"] for chunk in chunks_from_here]
texts = [chunk.page_content for chunk in chunks_from_here]
metadatas = [chunk.metadata for chunk in chunks_from_here]

for meta in metadatas:
    meta.pop("embedding", None)
    if meta.get("outdated_by_chunk_ids", None) is None:
        meta["outdated_by_chunk_ids"] = []

_ = vector_store.add_texts(texts=texts, metadatas=metadatas, ids=ids)

In [24]:
chunks_table = db["chunks"].search().to_pandas()
print(len(chunks_table))

chunks_table.head(1)

3720


[90m[[0m2025-06-15T17:40:14Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet


Unnamed: 0,vector,id,text,metadata
0,"[0.016151262, -0.020787708, 0.08296303, 0.0547...",0,Acme Government Solutions is a government indu...,"{'chunk_id': 0, 'creation_date': 2000-01-01 00..."
