In [None]:
from pathlib import Path
from langchain.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from sentence_transformers import SentenceTransformer

from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer

class HuggingFaceEmbeddings(Embeddings):
    """Embeddings implementation that delegates to a SentenceTransformer model."""

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        """
        Build the underlying SentenceTransformer.
        Args:
            model_name (str): Model identifier handed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """
        Encode a batch of documents.
        Args:
            texts (List[str]): Document strings to encode.
        Returns:
            List[List[float]]: One embedding per document.
        """
        vectors = self.model.encode(texts, convert_to_tensor=False)
        return vectors.tolist()

    def embed_query(self, text):
        """
        Encode one query string.
        Args:
            text (str): The query to encode.
        Returns:
            List[float]: The query's embedding.
        """
        # encode() is called with a one-element batch; unwrap its only row.
        batch = self.model.encode([text], convert_to_tensor=False)
        query_vector = batch[0]
        return query_vector.tolist()

# Embedder built with the class's default model_name; it is later passed to
# Chroma as its embedding_function.
embedding = HuggingFaceEmbeddings()



In [2]:
# Directory passed to Chroma as persist_directory.
CHROMA_PATH = "chroma"
# Root directory passed to load_pdfs_recursively() to look for PDF files.
DATA_PATH = "data"

# Disabled alternative embedding setup (Ollama), left here for reference:
# ef = OllamaEmbeddings(
#     model="mxbai-embed-large"
# )

In [3]:
def load_pdfs_recursively(directory):
    """
    Load every PDF found under ``directory``, searching sub-folders too.

    Files are visited in sorted path order. ``Path.rglob`` yields entries in
    arbitrary order, so sorting keeps the progress log and the order of the
    returned documents the same from run to run.

    Args:
        directory (str | Path): Root folder to search for ``*.pdf`` files.
    Returns:
        list: Everything returned by ``PyPDFLoader(...).load()`` for each
        file, concatenated in path order. Empty when no PDF is found.
    """
    documents = []
    # rglob gives no ordering guarantee, so sort for reproducible runs.
    pdf_files = sorted(Path(directory).rglob("*.pdf"))

    for pdf_file in pdf_files:
        print(f"Processing file: {pdf_file}")  # progress log, one line per file
        loader = PyPDFLoader(str(pdf_file))
        documents.extend(loader.load())

    return documents

In [4]:
# Load every PDF under DATA_PATH; the loader prints one "Processing file" line per PDF.
all_documents = load_pdfs_recursively(DATA_PATH)

Processing file: data\India\Auxilo\Auxilo_article_1.pdf
Processing file: data\India\Auxilo\Auxilo_article_2.pdf
Processing file: data\India\Auxilo\Auxilo_article_3.pdf
Processing file: data\India\Auxilo\Auxilo_article_4.pdf
Processing file: data\India\Auxilo\Auxilo_article_5.pdf
Processing file: data\India\Auxilo\Auxilo_article_6.pdf
Processing file: data\India\Auxilo\Auxilo_article_7.pdf
Processing file: data\India\dezerv\dezerv._Transcript_6q1qHBdJ84s.pdf
Processing file: data\India\dezerv\dezerv._Transcript_E_A8_osOSCI.pdf
Processing file: data\India\dezerv\dezerv._Transcript_Fr6xPP_Xw5o.pdf
Processing file: data\India\dezerv\dezerv._Transcript_gdN6c2xuemU.pdf
Processing file: data\India\dezerv\dezerv._Transcript_U6TpRCZ4NIo.pdf
Processing file: data\India\dezerv\dezerv_article_1.pdf
Processing file: data\India\dezerv\dezerv_article_10.pdf
Processing file: data\India\dezerv\dezerv_article_2.pdf
Processing file: data\India\dezerv\dezerv_article_3.pdf
Processing file: data\India\dezer

In [5]:
# Splitter configuration: chunk_size and chunk_overlap are measured with len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
    is_separator_regex=False,
)

# Split every loaded document into chunks.
chunks = text_splitter.split_documents(all_documents)

In [24]:
def calculate_chunk_ids(chunks):
    """
    Assign an ID to every chunk and store it in ``chunk.metadata["id"]``.

    IDs look like "data/monopoly.pdf:6:2", i.e.
    Page Source : Page Number : Chunk Index, where the index counts the
    chunks already seen for that source/page pair.

    The counter is kept per page rather than only against the previous chunk,
    so chunks of the same page still get distinct IDs when they are not
    adjacent in ``chunks``. When each page's chunks are contiguous, the IDs
    are the same as with an index that resets on every page change.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict. The
            "source" and "page" keys are read with ``.get``, so a missing key
            shows up as "None" in the ID.
    Returns:
        The same ``chunks`` object; each chunk's metadata is updated in place.
    """
    # Next free chunk index for every "source:page" key seen so far.
    next_index_by_page = {}

    for chunk in chunks:
        source = chunk.metadata.get("source")
        page = chunk.metadata.get("page")
        current_page_id = f"{source}:{page}"

        current_chunk_index = next_index_by_page.get(current_page_id, 0)
        next_index_by_page[current_page_id] = current_chunk_index + 1

        # Add it to the page meta-data.
        chunk.metadata["id"] = f"{current_page_id}:{current_chunk_index}"

    return chunks

def chunk_list(lst, chunk_size):
    """
    Yield consecutive slices of ``lst``, each at most ``chunk_size`` long.

    The last slice holds whatever remains and may be shorter.
    """
    starts = range(0, len(lst), chunk_size)
    yield from (lst[start:start + chunk_size] for start in starts)

# Chroma vector store persisted under CHROMA_PATH, using `embedding` to embed.
db = Chroma(
    embedding_function=embedding,
    persist_directory=CHROMA_PATH,
)

# Tag every chunk with its "source:page:index" ID.
chunks_with_ids = calculate_chunk_ids(chunks)

# Look up what is already stored (IDs are always included by default).
existing_items = db.get(include=[])
existing_ids = set(existing_items["ids"])
print(f"Number of existing documents in DB: {len(existing_ids)}")

# Keep only the chunks whose ID is not in the DB yet.
new_chunks = [
    chunk for chunk in chunks_with_ids
    if chunk.metadata["id"] not in existing_ids
]

if not new_chunks:
    print("✅ No new documents to add")
else:
    print(f"👉 Adding new documents: {len(new_chunks)}")
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]

    # Insert in slices of at most batch_size, persisting after every slice.
    batch_size = 5460  # Set to the maximum allowed batch size
    for batch, batch_ids in zip(
        chunk_list(new_chunks, batch_size),
        chunk_list(new_chunk_ids, batch_size),
    ):
        db.add_documents(batch, ids=batch_ids)
        db.persist()

Number of existing documents in DB: 100
👉 Adding new documents: 7256
