In [71]:
import os
from importlib import reload

import lancedb
import pandas as pd
from dotenv import load_dotenv

# Load environment variables from a dotenv file (presumably the API credentials used by
# the embedding client further down — verify). Set DOTENV_PATH to use a different file;
# the default keeps the original location.
load_dotenv(os.environ.get("DOTENV_PATH", "/Users/leon/.env"))

# DB specifications
# LANCEDB_DIR can be overridden through the environment (or the dotenv file loaded above)
# so the notebook is not tied to one machine's directory layout.
LANCEDB_DIR = os.environ.get("LANCEDB_DIR", "/Users/leon/Documents/study/MA/lancedb")
TABLE_NAME_DOCS = "documents"
TABLE_NAME_CHUNKS = "chunks"

db = lancedb.connect(LANCEDB_DIR)

In [72]:
from utils import io_helpers

# Re-import the helper module so that edits made to it while the kernel is running take effect.
reload(io_helpers)


def add_fictional_creation_date(row):
    """Assign a made-up creation date to a document based on the prefix of its ``doc_id``.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing a ``doc_id`` entry.

    Returns:
        A ``pd.Timestamp``: 2010-01-01 for ids starting with "400", 2015-01-01 for
        "300", 2005-01-01 for "100" and 2000-01-01 for every other id.
    """
    date_by_prefix = {
        "400": "2010-01-01",
        "300": "2015-01-01",
        "100": "2005-01-01",
    }
    doc_id_text = str(row["doc_id"])
    for prefix, date in date_by_prefix.items():
        if doc_id_text.startswith(prefix):
            return pd.to_datetime(date)
    # No known prefix: fall back to the oldest date.
    return pd.to_datetime("2000-01-01")


def get_documents_with_creation_date() -> pd.DataFrame:
    """Load the documents (including embeddings) and attach a fictional creation date.

    Returns:
        The frame returned by ``io_helpers.get_documents(read_embeddings=True)`` without
        its ``original_doc_ids`` column and with an added ``creation_date`` column that
        is computed row by row by ``add_fictional_creation_date``.
    """
    loaded = io_helpers.get_documents(read_embeddings=True)
    docs = loaded.drop(columns="original_doc_ids")
    docs["creation_date"] = docs.apply(add_fictional_creation_date, axis=1)
    return docs

In [73]:
# Build the document frame (embeddings plus fictional creation dates).
documents = get_documents_with_creation_date()

# NOTE(review): with exist_ok=True an already existing table is presumably opened as-is
# instead of being refilled from `documents` — confirm against the LanceDB docs before
# relying on this cell to refresh stored data.
table = db.create_table(TABLE_NAME_DOCS, data=documents, exist_ok=True)
# This value is neither assigned nor the cell's last expression, so the notebook does not display it.
db[TABLE_NAME_DOCS].head(1)

# Last expression of the cell: rendered as the cell output.
documents

[90m[[0m2025-06-11T18:48:54Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet


Unnamed: 0,doc_id,domain,content,embedding,creation_date
0,40,Finance,Acme Government Solutions is a government indu...,"[0.035415836, 0.015197343, 0.08163272, 0.02999...",2000-01-01
1,41,Finance,Entertainment Enterprises Inc. is an entertain...,"[0.050379667, -0.0005295727, 0.04373168, 0.038...",2000-01-01
2,42,Finance,"Advanced Manufacturing Solutions Inc., establi...","[0.023992022, 0.011302027, 0.050733544, 0.0336...",2000-01-01
3,43,Finance,"EcoGuard Solutions, established on April 15, 2...","[0.0594782, 0.01698723, 0.061538648, 0.0547162...",2000-01-01
4,44,Finance,"Green Fields Agriculture Ltd., established on ...","[0.017018009, 0.014309261, 0.082911275, 0.0502...",2000-01-01
...,...,...,...,...,...
166,400116,Law,In a significant legal proceeding at the Cedar...,"[0.028841885, -0.034240857, 0.044584155, 0.031...",2010-01-01
167,400059,Finance,"Retail Emporium, a well-established retail gia...","[0.03652241, 0.02207698, 0.021544848, 0.039533...",2010-01-01
168,300001,Finance,Changes that occurred in senior management of ...,"[-0.004240031, -0.007925675, 0.018725948, 0.03...",2015-01-01
169,300002,Law,Chief judge according to the court judgment of...,"[-0.0014971758, 0.015988875, 0.023612805, 0.02...",2015-01-01


In [103]:
from langchain_community.document_loaders import DataFrameLoader

# Read the documents table into a DataFrame; search() is called without a query.
table_data: pd.DataFrame = table.search().to_pandas()

# Wrap each row as a LangChain document with the "content" column as page content.
# The other columns show up in the document metadata (see the metadata printed further down).
loader = DataFrameLoader(table_data, page_content_column="content")
lc_documents = loader.load()

[90m[[0m2025-06-11T19:58:03Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet


In [99]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split every document into overlapping chunks (chunk_size=1024, chunk_overlap=128).
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    chunk_size=1024,
    chunk_overlap=128,
    strip_whitespace=True,
    keep_separator=False,
)
lc_chunks = text_splitter.split_documents(lc_documents)


print(len(lc_chunks))
# Number the chunks consecutively; the position is stored both as chunk_id metadata and as document id.
for i, chunk in enumerate(lc_chunks):
    chunk.metadata["chunk_id"] = i
    chunk.id = i

1727


In [100]:
from typing import List
from langchain_core.documents.base import Document
import string


def normalize_string(input: str) -> str:
    """Return ``input`` lower-cased, stripped of surrounding whitespace and without punctuation.

    Whitespace is stripped before the ``string.punctuation`` characters are removed, so
    whitespace that only becomes leading/trailing through that removal is kept.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    lowered = input.lower()
    trimmed = lowered.strip()
    return trimmed.translate(punctuation_remover)


def _creation_date_of(documents: pd.DataFrame, doc_id):
    """Return the ``creation_date`` of the single row of ``documents`` whose ``doc_id`` matches.

    Raises:
        ValueError: If no row or more than one row carries ``doc_id``.
    """
    dates = documents.loc[documents["doc_id"] == doc_id, "creation_date"]
    if len(dates) != 1:
        raise ValueError(f"Expected exactly one document with doc_id {doc_id!r}, found {len(dates)}")
    return dates.iloc[0]


def flag_outdated_chunk(
    data: pd.Series, chunks: List[Document], documents: "pd.DataFrame | None" = None
) -> List[Document]:
    """Mark the chunks holding the older side of a conflict as outdated by the newer side.

    The two documents named in ``data`` are ordered by their ``creation_date``. Chunks of
    the older document that contain its conflicting passage get the ids of the newer
    document's chunks containing the newer passage added to
    ``metadata["outdated_by_chunk_ids"]``. Passages and chunk texts are compared after
    ``normalize_string``. Ids recorded by earlier calls are kept, so a chunk that takes
    part in several conflicts accumulates all of them.

    Args:
        data: One conflict record with the entries ``id1``, ``id2``,
            ``conflicting_passage_doc1`` and ``conflicting_passage_doc2``.
        chunks: Chunks whose metadata carries ``doc_id`` and ``chunk_id``. Their
            metadata is modified in place.
        documents: Optional frame with ``doc_id`` and ``creation_date`` columns. When
            omitted it is loaded through ``get_documents_with_creation_date()``; pass it
            explicitly when flagging many conflicts to avoid reloading it on every call.

    Returns:
        The same ``chunks`` list that was passed in.

    Raises:
        ValueError: If ``id1`` or ``id2`` does not identify exactly one document.
    """
    if documents is None:
        documents = get_documents_with_creation_date()
    creation_date1 = _creation_date_of(documents, data["id1"])
    creation_date2 = _creation_date_of(documents, data["id2"])

    if creation_date1 < creation_date2:  # doc2 is newer
        doc_id_old = data["id1"]
        doc_id_new = data["id2"]
        passage_old = data["conflicting_passage_doc1"]
        passage_new = data["conflicting_passage_doc2"]
    else:  # doc1 is newer; equal dates also end up here
        doc_id_old = data["id2"]
        doc_id_new = data["id1"]
        passage_old = data["conflicting_passage_doc2"]
        passage_new = data["conflicting_passage_doc1"]

    # Normalise the passages once instead of once per chunk.
    passage_old_norm = normalize_string(passage_old)
    passage_new_norm = normalize_string(passage_new)

    outdated_chunks = []
    chunk_ids_new = []

    for chunk in chunks:
        doc_id = chunk.metadata["doc_id"]
        if doc_id not in [doc_id_old, doc_id_new]:
            continue
        content = normalize_string(chunk.page_content)
        # Each passage is only looked for in the document it was taken from. An empty
        # passage is skipped because "" is a substring of every text.
        if doc_id == doc_id_old and passage_old_norm and passage_old_norm in content:
            outdated_chunks.append(chunk)
        elif doc_id == doc_id_new and passage_new_norm and passage_new_norm in content:
            chunk_ids_new.append(chunk.metadata["chunk_id"])

    for chunk in outdated_chunks:
        # Build a fresh list per chunk (no list shared between chunks) and keep the ids
        # recorded for earlier conflicts.
        merged_ids = list(chunk.metadata.get("outdated_by_chunk_ids") or [])
        for chunk_id in chunk_ids_new:
            if chunk_id not in merged_ids:
                merged_ids.append(chunk_id)
        chunk.metadata["outdated_by_chunk_ids"] = merged_ids

    return chunks


# Conflict records: each row names two documents (id1, id2) and — judging by the column
# names — the passage of each document that conflicts with the other. The ids are read
# as nullable integers.
conflicts = pd.read_csv(
    "data/additional_data/docs/_conflicts.csv",
    usecols=["id1", "id2", "model", "conflicting_passage_doc1", "conflicting_passage_doc2"],
    dtype={"id1": "Int64", "id2": "Int64"},
)

# Flag the outdated chunks one conflict at a time; flag_outdated_chunk returns the list it was given.
for _, data in conflicts.iterrows():
    lc_chunks = flag_outdated_chunk(data, lc_chunks)

In [104]:
from langchain.vectorstores import LanceDB
from langchain.embeddings import OpenAIEmbeddings
from openai import RateLimitError
from tenacity import retry, retry_if_exception_type, wait_random, stop_after_attempt

# Embedding model handed to the LangChain vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Reuse the chunks table when it can be opened; otherwise create the wrapper from the table name only.
# NOTE(review): relies on db.open_table raising ValueError for a missing table — confirm
# for the installed lancedb version.
try:
    vector_store = LanceDB(uri=LANCEDB_DIR, embedding=embeddings, table=db.open_table(TABLE_NAME_CHUNKS))
except ValueError:
    print("Table not existent")
    vector_store = LanceDB(uri=LANCEDB_DIR, embedding=embeddings, table_name=TABLE_NAME_CHUNKS)

# Ids that add_chunk_to_vector_store has already written; re-running this cell resets the list.
ids_processed = []

Table not existent


In [78]:
# Inspect the metadata of the first two chunks.
print(lc_chunks[0].metadata)
print(lc_chunks[1].metadata)

{'doc_id': 40, 'domain': 'Finance', 'embedding': array([ 0.03541584,  0.01519734,  0.08163272, ..., -0.01773023,
        0.01715001, -0.00257613]), 'creation_date': Timestamp('2000-01-01 00:00:00'), 'chunk_id': 0, 'outdated_by_chunk_ids': [1652]}
{'doc_id': 40, 'domain': 'Finance', 'embedding': array([ 0.03541584,  0.01519734,  0.08163272, ..., -0.01773023,
        0.01715001, -0.00257613]), 'creation_date': Timestamp('2000-01-01 00:00:00'), 'chunk_id': 1}


In [95]:
# Show the first chunk (page content and metadata).
print(lc_chunks[0])

page_content='Acme Government Solutions is a government industry company established on June 1, 2001 in Washington, D.C., specializing in providing comprehensive government services and solutions.
In January 2021, Acme Government Solutions made a significant decision to distribute $5 million of dividends to its shareholders. This move not only enhanced shareholder returns but also showcased the company's commitment to rewarding its investors. This dividend distribution was a result of the company's successful acquisition of a major government contract worth $100 million in March 2021. This acquisition expanded Acme Government Solutions' service portfolio and increased its revenue potential. Furthermore, in April 2021, the company announced plans to establish regional offices in three new states, thereby expanding its presence and market reach. This strategic move allowed Acme Government Solutions to tap into new geographic markets, increasing its market share and potential customer bas

In [None]:
chunks_from_here = lc_chunks.copy()

ids = [chunk.metadata["chunk_id"] for chunk in chunks_from_here]
texts = [chunk.page_content for chunk in chunks_from_here]
# Copy every metadata dict: list.copy() above is shallow, so editing chunk.metadata
# directly would also change the metadata of the chunks in lc_chunks.
metadatas = [dict(chunk.metadata) for chunk in chunks_from_here]

for meta in metadatas:
    # The embedding is dropped from the metadata that would be passed to the store.
    meta.pop("embedding", None)
    # Give every chunk an (empty) list so the key is always present.
    if meta.get("outdated_by_chunk_ids", None) is None:
        meta["outdated_by_chunk_ids"] = []

# vector_store.add_texts(texts, metadatas, ids)

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [None]:
@retry(
    retry=retry_if_exception_type(RateLimitError),
    wait=wait_random(min=30, max=60),
    stop=stop_after_attempt(6),
)
def add_chunk_to_vector_store(id, text, metadata):
    """Add one chunk to the vector store unless its id has already been processed.

    The call is retried on ``RateLimitError`` with a random wait (``min=30``, ``max=60``)
    between attempts and gives up after six attempts.

    Args:
        id: Identifier of the chunk; also recorded in ``ids_processed``.
        text: Chunk text passed to ``vector_store.add_texts``.
        metadata: Metadata dict stored alongside the text.
    """
    if id in ids_processed:
        print(f"Skipping ID '{id}'")
        # Return here: without it the chunk was written again despite the message.
        return
    vector_store.add_texts([text], [metadata], [id])
    ids_processed.append(id)


for chunk in lc_chunks:
    id = chunk.metadata["chunk_id"]
    text = chunk.page_content

    # Work on a copy so the chunk's own metadata stays untouched.
    metadata = chunk.metadata.copy()
    if metadata.get("outdated_by_chunk_ids", None) is None:
        metadata["outdated_by_chunk_ids"] = []
    # Remove the entries that are not passed along as metadata; missing keys are ignored.
    for unwanted_key in ("embedding", "chunk_id"):
        metadata.pop(unwanted_key, None)
    # add_chunk_to_vector_store(id, text, metadata)

Skipping ID '0'
Skipping ID '1'
Skipping ID '2'


In [108]:
# Spot check: request ten results (k=10) for a sample question from the vector store.
result = vector_store.similarity_search(
    "Who is the defense lawyer for Y. Nelson according to the judgment of Glenwood, Quailwood, Court?", k=10
)

print(len(result))

# Print every returned document (page content plus metadata).
for res in result:
    # print(res.id, ": ", res.metadata)
    print(res)

10
page_content='Glenwood, Quailwood Court
9th Judicial Circuit
State of Glenwood

Case No: 2023-458-CR

Chief Judge: Hon. H. Ruiz
Presiding Judge: Hon. E. Collins
Court Clerk: K. Kelly

---

**JUDGMENT**

**The People of Glenwood vs. Y. Nelson**

**1. Court and Prosecutor Information:**

*Court:* Glenwood, Quailwood Court  
*Prosecutor:* Glenwood, Quailwood Procuratorate  

*Chief Judge:* Hon. H. Ruiz  
*Presiding Judge:* Hon. E. Collins  
*Court Clerk:* K. Kelly  

**2. Defendant and Defense Lawyer Information:**

*Defendant:* Y. Nelson  
*Gender:* Female  
*Birthdate:* December 5, 1981  
*Residence:* 79 Yorkshire Street, Quailwood  
*Ethnicity:* Caucasian  
*Occupation:* Barista  

*Defense Lawyer:* Y. Parker  
*Law Firm:* Parker & Associates  

**3. Case Procedures:**' metadata={'chunk_id': 952, 'creation_date': datetime.datetime(2000, 1, 1, 0, 0), 'doc_id': 139, 'domain': 'Law', 'outdated_by_chunk_ids': []}
page_content='Glenwood, Quailwood Court
9th Judicial Circuit
State of Glen

In [97]:
# Load the chunks table into a DataFrame to check what was written.
# Uses TABLE_NAME_CHUNKS (instead of a repeated "chunks" literal) like the other cells.
chunks_table = db[TABLE_NAME_CHUNKS].search().to_pandas()
print(len(chunks_table))
print(chunks_table)

1727
                                                 vector    id  \
0     [0.03436723, -0.019573476, 0.08167088, 0.05522...     0   
1     [0.010202037, -0.0018611825, 0.07961098, 0.016...     1   
2     [0.020225868, -0.016554173, 0.09119935, 0.0184...     2   
3     [0.027389277, -0.009073669, 0.08853623, 0.0296...     3   
4     [0.036613952, 0.0014768749, 0.09441433, 0.0236...     4   
...                                                 ...   ...   
1722  [0.035874065, 0.0018244, 0.06389988, 0.0454944...  1722   
1723  [-0.004057323, -0.007902364, 0.018557545, 0.03...  1723   
1724  [-0.0014811668, 0.016003372, 0.023641666, 0.02...  1724   
1725  [-0.0036014558, -0.0070924377, 0.013750344, 0....  1725   
1726  [-0.01616679, -0.00020412497, 0.013650815, 0.0...  1726   

                                                   text  \
0     Acme Government Solutions is a government indu...   
1     In May 2021, Acme Government Solutions forged ...   
2     In February 2021, Acme Governme

[90m[[0m2025-06-11T19:50:47Z [33mWARN [0m lance::dataset::scanner[90m][0m nprobes is not set because nearest has not been called yet


In [None]:
# Check which table class the connection returns for the chunks table.
print(type(db[TABLE_NAME_CHUNKS]))

<class 'lancedb.table.LanceTable'>


In [None]:
from lancedb.embeddings import get_registry
from lancedb.pydantic import LanceModel, Vector

from tenacity import (
    retry,
    wait_random_exponential,
    retry_if_exception_type,
    stop_after_attempt,
)