### This file will contain the code to ingest PDF documents and convert them into vectors using LangChain and OpenAI embeddings.

In [None]:
import os
import langchain

from langchain_core.documents import Document
from langchain_community.document_loaders import UnstructuredPDFLoader
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_community.document_loaders import PyPDFLoader
import dotenv

# Locate the .env file in the current working directory.
# NOTE: os.path.dirname('.env') evaluates to '' (a bare filename has no
# directory part), so joining on it produced an empty path that never exists
# and load_dotenv() was silently skipped. Anchor on the working directory.
dotenv_path = os.path.join(os.getcwd(), '.env')
if os.path.exists(dotenv_path):
    dotenv.load_dotenv(dotenv_path)
    print('sucessfully loaded the env')
else:
    # Report the miss instead of claiming success when nothing was loaded.
    print(f'no .env file found at {dotenv_path}')

# May still be None when neither the .env file nor the process environment
# defines the key.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

#### Using PyMuPDF to load PDF files


In [None]:
# Load the guidebook PDF through the PyMuPDF-backed loader and print a quick
# summary of what came back.
pdfLoader = PyMuPDFLoader("data/pdf/AI Agents guidebook.pdf")
pdfDocuments = pdfLoader.load()

section_rule = '-------------------------------------'

# Overall shape of the loader result.
print(f"Number of documents: {len(pdfDocuments)}")
print(f"Type of documents: {type(pdfDocuments)}")

# Inspect the first returned document: its type, source path and a preview
# of its text (first 500 characters).
first_document = pdfDocuments[0]
print(f"Type of first document: {type(first_document)}")
print(section_rule)
print(f"Metadata of first document: {first_document.metadata['file_path']}")
print(section_rule)
print(f"Content of first document: {first_document.page_content[:500]}")

### We need a smart chunking strategy that removes empty lines, blank pages, etc. from the PDF text before chunking.

In [None]:
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter

def extract_clean_text_from_pdf(pdf_path) -> str:
    """Extract the text of a PDF with blank lines and blank pages removed.

    Each page's plain text is split into lines, every line is stripped of
    leading/trailing whitespace, and lines that end up empty are dropped.
    Pages left with no lines at all are skipped entirely.

    Args:
        pdf_path: Location of the PDF file, passed straight to ``fitz.open``.

    Returns:
        The cleaned text of all non-blank pages. Lines within a page are
        joined with a single newline; pages are joined with a blank line
        (two newlines). Returns an empty string when no page has text.
    """
    page_texts = []
    # The context manager closes the document even if extraction raises;
    # the previous explicit close() was skipped on any error.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text("text")
            # Strip each line once and keep only the ones with content.
            lines = [line for line in map(str.strip, text.splitlines()) if line]
            # Skip pages that contain no text at all.
            if lines:
                page_texts.append("\n".join(lines))
    # A blank line between pages gives the splitter a paragraph boundary.
    return "\n\n".join(page_texts)

# Re-read the same PDF that the loader opened, this time through the
# cleaning helper so the text has no blank lines or blank pages.
pdf_path = pdfDocuments[0].metadata['file_path']
clean_text = extract_clean_text_from_pdf(pdf_path)

# Separator priority for the splitter: paragraphs first, then lines,
# then words, and finally individual characters.
separator_priority = ["\n\n", "\n", " ", ""]
splitter = RecursiveCharacterTextSplitter(
    separators=separator_priority,
    chunk_size=500,
    chunk_overlap=200,
    length_function=len,
)
chunks = splitter.split_text(clean_text)

# Dump every chunk, numbered from 1, with a 40-dash rule after each one.
chunk_rule = '-' * 40
print(f"Total chunks: {len(chunks)}")
for chunk_number, chunk in enumerate(chunks, start=1):
    print(f"Chunk {chunk_number}:\n{chunk}\n{chunk_rule}")