In [None]:
%pip install langchain_chroma

In [None]:
import os
import fitz  # PyMuPDF
import chromadb
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from langchain.embeddings import HuggingFaceEmbeddings

In [2]:
### e-invoice PDF indexing

def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a single PDF file.

    Args:
        pdf_path: Path of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        str: The text of all pages in page order, each page followed by a
        newline, with leading/trailing whitespace stripped from the result.
    """
    # Open the document as a context manager so it is closed even when a page
    # fails to extract; otherwise every indexed PDF leaves a handle open.
    with fitz.open(pdf_path) as doc:
        # Gather the per-page text and join once instead of growing a string
        # page by page.
        page_texts = [page.get_text("text") + "\n" for page in doc]
    return "".join(page_texts).strip()

def extract_all_pdf_content(directory):
    """Load every PDF directly inside ``directory`` as one ``Document`` per file.

    Args:
        directory: Folder to scan; sub-folders are not descended into.

    Returns:
        list: One ``Document`` per PDF, in file-name order. ``page_content`` is
        the text returned by ``extract_text_from_pdf`` and ``metadata`` is
        ``{"source": <file name>}``.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document (and therefore chunk) order reproducible between runs.
    for file_name in sorted(os.listdir(directory)):
        # Match the extension case-insensitively so e.g. "SCAN.PDF" is kept.
        if not file_name.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(directory, file_name)
        documents.append(Document(
            page_content=extract_text_from_pdf(file_path),
            metadata={"source": file_name},
        ))
    return documents

def split_text(documents, chunk_size=800, chunk_overlap=100):
    """Split documents into overlapping chunks with a tiktoken-based splitter.

    Args:
        documents: Documents to split; handed to ``split_documents``.
        chunk_size: Maximum chunk size passed to the splitter (default 800).
            NOTE(review): with ``from_tiktoken_encoder`` this is presumably
            measured in tokens rather than characters - confirm in LangChain docs.
        chunk_overlap: Overlap between consecutive chunks (default 100), in
            the same unit as ``chunk_size``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns for
        ``documents`` (the chunked documents).
    """
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)

def store_in_vector_db(chunks, persist_path, collection_name,
                       embedding_model=None, id_prefix="", batch_size=1000):
    """Embed ``chunks`` and add them to a persistent Chroma collection.

    NOTE: this notebook defines ``store_in_vector_db`` in two cells; the
    definition from the cell that ran last is the one in effect.

    Args:
        chunks: Objects exposing ``page_content`` (text) and ``metadata``.
        persist_path: Directory passed to ``chromadb.PersistentClient``.
        collection_name: Collection to fetch, or create if it is missing.
        embedding_model: Object with ``embed_documents(texts)``. When ``None``
            the notebook-level ``embeddings`` variable is used, as before.
        id_prefix: String prepended to each positional ID. IDs are
            ``f"{id_prefix}{i}"`` and restart at 0 on every call, so pass a
            distinct prefix when adding several corpora to one collection in
            separate calls. The default ``""`` keeps the IDs "0", "1", ...
        batch_size: Maximum number of records sent per ``collection.add`` call.

    Returns:
        The ``chromadb.PersistentClient`` that was opened.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    client = chromadb.PersistentClient(path=persist_path)
    collection = client.get_or_create_collection(name=collection_name)

    # Nothing to embed or add: skip the add() call entirely (Chroma rejects
    # empty ID lists) and leave the collection as it is.
    if not chunks:
        print("⚠️ No chunks to store; vector DB left unchanged.")
        return client

    # Fall back to the notebook-global model only when none was passed in.
    model = embeddings if embedding_model is None else embedding_model

    texts = [chunk.page_content for chunk in chunks]
    metadatas = [chunk.metadata for chunk in chunks]
    embedded_chunks = model.embed_documents(texts)
    ids = [f"{id_prefix}{i}" for i in range(len(texts))]

    # Add in slices so a large corpus does not exceed the per-call batch limit
    # that Chroma enforces.
    for start in range(0, len(texts), batch_size):
        end = start + batch_size
        collection.add(
            ids=ids[start:end],
            embeddings=embedded_chunks[start:end],
            documents=texts[start:end],
            metadatas=metadatas[start:end],
        )

    print("✅ Stored all documents in vector DB.")
    return client

# Base directory: the kernel's current working directory (os.getcwd()), which
# is not necessarily the folder that contains this notebook.
BASE_DIR = os.getcwd()

# === CONFIGURATION ===
# Folder scanned for e-invoice PDFs.
PDF_DIRECTORY = os.path.join(BASE_DIR, "datasource", "finance", "E-invoice-pdf")

# === RUN THE WORKFLOW ===
# Notebook-level embedding model; store_in_vector_db reads this `embeddings` name.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# One Document per PDF, then split into chunks (`chunks` is stored by a later cell).
documents = extract_all_pdf_content(PDF_DIRECTORY)
chunks = split_text(documents)


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# 1. Load and structure content from .txt files
def extract_structured_txt_content(directory):
    """Load ``.txt`` files and emit one ``Document`` per ``--- URL ---`` section.

    Each file is expected to contain marker lines of the form
    ``--- https://example.com/page ---``; the text following a marker (up to
    the next marker or end of file) becomes that URL's content. Text before the
    first marker is discarded, and sections whose content is empty after
    stripping are skipped.

    Args:
        directory: Folder to scan; sub-folders are not descended into.

    Returns:
        list: ``Document`` objects in file-name then in-file order, with
        metadata ``{"source": <file name>, "url": <marker URL>}``.
    """
    documents = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # document (and therefore chunk) order reproducible between runs.
    for file_name in sorted(os.listdir(directory)):
        # Match the extension case-insensitively so e.g. "TAX.TXT" is kept.
        if not file_name.lower().endswith(".txt"):
            continue
        file_path = os.path.join(directory, file_name)
        # Read everything first so the file is closed before parsing starts.
        with open(file_path, "r", encoding="utf-8") as file:
            full_text = file.read()

        # Split by --- URL ---; because the URL is a capture group, the result
        # is [preamble, url1, content1, url2, content2, ...].
        sections = re.split(r'^--- (https?://[^\s]+) ---$', full_text, flags=re.MULTILINE)
        for raw_url, raw_content in zip(sections[1::2], sections[2::2]):
            content = raw_content.strip()
            if content:
                documents.append(Document(
                    page_content=content,
                    metadata={"source": file_name, "url": raw_url.strip()},
                ))
    return documents

# 2. Smart chunking using LangChain text splitter
def split_text_by_structure(documents, chunk_size=800, chunk_overlap=100):
    """Split documents into overlapping chunks with a recursive text splitter.

    Args:
        documents: Documents to split; handed to ``split_documents``.
        chunk_size: Maximum chunk size passed to the splitter (default 800).
            NOTE(review): with the plain constructor this is presumably
            measured in characters - confirm in LangChain docs.
        chunk_overlap: Overlap between consecutive chunks (default 100), in
            the same unit as ``chunk_size``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns for
        ``documents`` (the chunked documents).
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents)

# 3. Store into Chroma DB
def store_in_vector_db(chunks, persist_path, collection_name,
                       embedding_model=None, id_prefix="", batch_size=1000):
    """Embed ``chunks`` and add them to a persistent Chroma collection.

    NOTE: this notebook defines ``store_in_vector_db`` in two cells; the
    definition from the cell that ran last is the one in effect.

    Args:
        chunks: Objects exposing ``page_content`` (text) and ``metadata``.
        persist_path: Directory passed to ``chromadb.PersistentClient``.
        collection_name: Collection to fetch, or create if it is missing.
        embedding_model: Object with ``embed_documents(texts)``. When ``None``
            the notebook-level ``embeddings`` variable is used, as before.
        id_prefix: String prepended to each positional ID. IDs are
            ``f"{id_prefix}{i}"`` and restart at 0 on every call, so pass a
            distinct prefix when adding several corpora to one collection in
            separate calls. The default ``""`` keeps the IDs "0", "1", ...
        batch_size: Maximum number of records sent per ``collection.add`` call.

    Returns:
        The ``chromadb.PersistentClient`` that was opened.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Initialize Chroma DB
    client = chromadb.PersistentClient(path=persist_path)
    collection = client.get_or_create_collection(name=collection_name)

    # Nothing to embed or add: skip the add() call entirely (Chroma rejects
    # empty ID lists) and leave the collection as it is.
    if not chunks:
        print("⚠️ No chunks to store; vector DB left unchanged.")
        return client

    # Fall back to the notebook-global model only when none was passed in.
    model = embeddings if embedding_model is None else embedding_model

    texts = [chunk.page_content for chunk in chunks]
    metadatas = [chunk.metadata for chunk in chunks]

    # Create embeddings
    embedded_chunks = model.embed_documents(texts)
    ids = [f"{id_prefix}{i}" for i in range(len(texts))]

    # Add in slices so a large corpus does not exceed the per-call batch limit
    # that Chroma enforces.
    for start in range(0, len(texts), batch_size):
        end = start + batch_size
        collection.add(
            ids=ids[start:end],
            embeddings=embedded_chunks[start:end],
            documents=texts[start:end],
            metadatas=metadatas[start:end],
        )

    print("✅ Stored all documents in vector DB.")
    return client

# Base directory: the kernel's current working directory (os.getcwd()), which
# is not necessarily the folder that contains this notebook.
BASE_DIR = os.getcwd()

# === CONFIGURATION ===
# Folder scanned for the "--- URL ---"-structured .txt files.
TXT_DIRECTORY = os.path.join(BASE_DIR, "datasource", "finance", "tax")
# On-disk location handed to chromadb.PersistentClient by store_in_vector_db.
PERSIST_DIRECTORY = os.path.join(BASE_DIR, "vector-db", "lhdn_db_new")
COLLECTION_NAME = "lhdn"

# === RUN THE WORKFLOW ===
# NOTE: rebinds `documents`, replacing the PDF documents from the earlier cell;
# the PDF chunks remain available as `chunks`.
documents = extract_structured_txt_content(TXT_DIRECTORY)
tax_chunks = split_text_by_structure(documents)


In [None]:
# Store the PDF chunks and the tax .txt chunks in a single call:
# store_in_vector_db numbers IDs from "0" on every call, so storing the two
# lists separately in the same collection would reuse the same IDs.
combined_chunks = chunks + tax_chunks
store_in_vector_db(combined_chunks, PERSIST_DIRECTORY, COLLECTION_NAME)

✅ Stored all documents in vector DB.


<chromadb.api.client.Client at 0x320286d10>