In [None]:
from pathlib import Path
import fitz  # PyMuPDF
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings  # Or replace with Together/HuggingFace

In [None]:
# Step 1: Extract text from all PDFs
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of one PDF as a single string.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``fitz.open``.

    Returns:
        The ``get_text()`` output of each page, concatenated in page order.
        An empty string if the document has no pages.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids repeated string reallocation from += in a loop.
        return "".join(page.get_text() for page in doc)

In [None]:
# Load all PDF text
pdf_dir = Path("../data")  # Adjust path if needed

# glob() yields files in filesystem-dependent order; sorting keeps the
# combined text (and therefore the chunks built from it) reproducible.
pdf_files = sorted(pdf_dir.glob("*.pdf"))

# Join with a blank line so the last word of one PDF is not fused to the
# first word of the next one.
all_text = "\n\n".join(extract_text_from_pdf(file) for file in pdf_files)

if pdf_files:
    print("✅ Loaded all text.")
else:
    # Make an empty/mistyped directory visible instead of reporting success.
    print(f"No PDF files found in {pdf_dir}; all_text is empty.")

In [None]:
# Step 2: Chunk the text
# CharacterTextSplitter only cuts on one separator ("\n\n" by default), so
# text that lacks blank lines can produce chunks far larger than chunk_size.
# RecursiveCharacterTextSplitter falls back to progressively smaller
# separators, which keeps chunks within the requested size.
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = splitter.split_text(all_text)

print(f"✅ Total chunks: {len(chunks)}")

In [None]:
# Step 3: Embed and store in Chroma
# Directory handed to Chroma as its persist_directory.
persist_dir = "../db"

# Embedding backend used to vectorize each chunk.
embedding = OpenAIEmbeddings()  # Replace if you use Together.ai

# Build the vector store from the chunk list, then call persist() on it.
db = Chroma.from_texts(
    texts=chunks,
    embedding=embedding,
    persist_directory=persist_dir,
)
db.persist()

print("✅ Chunks embedded and saved to vector DB.")