In [1]:
from pathlib import Path
import fitz  # PyMuPDF
import os
from dotenv import load_dotenv

from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings


In [2]:
# ‚úÖ Load environment variables
load_dotenv()
persist_dir = "../db"
pdf_dir = Path("../data/tutorials")  # Adjust path to your actual PDF folder

In [3]:
# ‚úÖ Set up embedding model using HuggingFace
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# ‚úÖ Extract text from a single PDF
def extract_text_from_pdf(pdf_path):
    print(f"üìÑ Reading: {pdf_path.name}")
    try:
        doc = fitz.open(pdf_path)
        text = ""
        for page in doc:
            text += page.get_text()
        return text
    except Exception as e:
        print(f"‚ùå Failed to read {pdf_path.name}: {e}")
        return ""

In [5]:
from pathlib import Path

# ‚úÖ Set path to folder containing PDFs
pdf_dir = Path("../data/tutorials")

# ‚úÖ Load all .pdf files
pdf_files = list(pdf_dir.glob("*.pdf"))
if not pdf_files:
    raise FileNotFoundError("‚ùå No PDF files found in 'data/tutorials/'")

all_text = ""
for file in pdf_files:
    all_text += extract_text_from_pdf(file)

print("‚úÖ All PDF content loaded.")


üìÑ Reading: Python Crash Course, 3rd Edition A Hands-On, Pr.pdf
üìÑ Reading: The Big Book Of Small Python Projects.pdf
üìÑ Reading: Python for data science 2022.pdf
üìÑ Reading: Automate_the_Boring_Stuff_with_Python,_2nd_Edition_@RedBlueHit.pdf
‚úÖ All PDF content loaded.


In [6]:
# ‚úÖ Chunk the text
splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=100)
chunks = splitter.split_text(all_text)
print(f"‚úÖ Total chunks created: {len(chunks)}")

‚úÖ Total chunks created: 1


In [None]:
# ‚úÖ Embed and store in Chroma
persist_dir = "../db"  # Go up one level from 'notebooks/' to the main folder
db = Chroma.from_texts(chunks, embedding, persist_directory=persist_dir)
print(f"‚úÖ Vector database saved to '{persist_dir}'")

‚úÖ Vector database saved to '../db'


  db.persist()
