In [1]:
import os
from pathlib import Path

import fitz  # PyMuPDF
from dotenv import load_dotenv
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma


In [2]:
# ✅ Configuration: input/output locations, then environment variables
pdf_dir = Path("data/tutorials")  # Adjust path to your actual PDF folder
persist_dir = "db"  # passed to Chroma as its persist_directory further down
load_dotenv()  # pull variables from a .env file into the process environment

In [4]:
# ✅ Build the HuggingFace embedding model used to vectorise the text chunks
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Volumes/My Volume/askpy-rag/ragenv/lib/python3.12/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Volumes/My Volume/askpy-rag/ragenv/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Volumes/My Volume/askpy-rag/ragenv/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 739, in start
    s

In [11]:
# ✅ Extract text from a single PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of one PDF, joined in page order.

    Args:
        pdf_path: Location of the PDF, as a ``pathlib.Path`` or a plain string.

    Returns:
        The concatenated page text, or ``""`` if the file could not be opened
        or read. In that case the failure is printed rather than raised.
    """
    # Accept plain strings as well; ``.name`` below requires a Path.
    pdf_path = Path(pdf_path)
    print(f"📄 Reading: {pdf_path.name}")
    try:
        # The context manager closes the document (and its file handle) even
        # when extracting a page raises part-way through.
        with fitz.open(pdf_path) as doc:
            # Join once instead of growing a string page by page.
            return "".join(page.get_text() for page in doc)
    except Exception as e:
        # Best-effort: report the failure and hand back an empty string
        # instead of propagating the error to the caller.
        print(f"❌ Failed to read {pdf_path.name}: {e}")
        return ""

In [None]:
# ✅ Load all PDFs
# `pdf_dir` and `Path` come from the import/configuration cells at the top of
# the notebook; they are not redefined here so there is one place to change them.

# glob() yields matches in arbitrary, filesystem-dependent order; sorting makes
# the concatenated text (and therefore the chunks) reproducible between runs.
pdf_files = sorted(pdf_dir.glob("*.pdf"))
if not pdf_files:
    # Show the resolved directory that was actually searched: a relative
    # `pdf_dir` depends on the kernel's working directory.
    raise FileNotFoundError(f"❌ No PDF files found in '{pdf_dir.resolve()}'")

# Extract each file once, then join in a single pass instead of `+=` in a loop.
pdf_texts = [extract_text_from_pdf(pdf_file) for pdf_file in pdf_files]

# Skip files that produced no text and put a blank line between documents so
# the end of one PDF does not run straight into the start of the next.
all_text = "\n\n".join(text for text in pdf_texts if text)

print("✅ All PDF content loaded.")

FileNotFoundError: ❌ No PDF files found in 'data/tutorials/'

In [None]:
# ✅ Chunk the text
# RecursiveCharacterTextSplitter tries paragraph breaks first and then falls
# back to line breaks, spaces and single characters, so chunks stay within
# chunk_size even when the extracted PDF text has no blank lines.
# CharacterTextSplitter splits only on its single separator ("\n\n" by default)
# and can therefore emit chunks far larger than chunk_size.
splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = splitter.split_text(all_text)
if not chunks:
    # Fail here with a clear message instead of handing an empty list to the
    # vector store in the next cell.
    raise ValueError("❌ No text chunks were produced; the PDFs may contain no extractable text.")
print(f"✅ Total chunks created: {len(chunks)}")

In [None]:
# ✅ Embed every chunk, index it in Chroma, and write the index to disk
# NOTE(review): re-running this cell against an existing persist directory may
# add the same chunks a second time — confirm before re-ingesting.
db = Chroma.from_texts(
    texts=chunks,
    embedding=embedding,
    persist_directory=persist_dir,
)
db.persist()
print(f"✅ Vector database saved to '{persist_dir}'")