In [None]:
from pathlib import Path
import fitz  # PyMuPDF
import os
from dotenv import load_dotenv  # Optional for secure API key handling

from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

In [None]:
# ✅ Configuration: read the OpenAI key (from .env if one exists, else the
# process environment) and fail fast when it is missing.
load_dotenv()

openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise ValueError("❌ OPENAI_API_KEY not found. Please set it in .env or environment variables.")

# ✅ Embedding model used when building the vector store
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [None]:
# ✅ Step 1: Extract text from all PDFs
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page in one PDF.

    Args:
        pdf_path: Path object of the PDF to read; its ``name`` attribute is
            printed as a progress message.

    Returns:
        str: The page texts joined in page order with nothing inserted
        between them (an empty string if the document yields no pages).
    """
    print(f"📄 Reading: {pdf_path.name}")
    # The context manager closes the document when the block exits, even if
    # extracting a page raises, so no file handle is left open per PDF.
    with fitz.open(pdf_path) as doc:
        # Join once rather than growing a string with repeated +=.
        return "".join(page.get_text() for page in doc)

In [None]:
# ✅ Load all text from /data folder
pdf_dir = Path("../data")
# Sort the matches: glob yields entries in a filesystem-dependent order, and
# the order of concatenation determines the chunks produced from all_text.
pdf_files = sorted(pdf_dir.glob("*.pdf"))

if not pdf_files:
    raise FileNotFoundError("❌ No PDF files found in /data folder.")

# Build the combined text in one join instead of repeated += on a growing string.
all_text = "".join(extract_text_from_pdf(file) for file in pdf_files)

print("✅ All PDF content loaded.")

In [None]:
# ✅ Step 2: Chunk the text
# NOTE(review): CharacterTextSplitter presumably splits only on its separator
# (default believed to be a blank line), so chunks may exceed chunk_size when
# the extracted text has few blank lines — confirm against the LangChain docs.
splitter = CharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
)
chunks = splitter.split_text(all_text)

print(f"✅ Total chunks created: {len(chunks)}")

In [None]:
# ✅ Step 3: Embed and store in ChromaDB
# NOTE(review): if ../db already holds a collection, re-running this cell may
# add the same chunks again — TODO confirm and clear the directory if needed.
persist_dir = "../db"
db = Chroma.from_texts(
    texts=chunks,
    embedding=embedding,
    persist_directory=persist_dir,
)
db.persist()

print(f"✅ Vector database saved to {persist_dir}")