## Load PDFs

In [5]:
from langchain_community.document_loaders import (
    PyPDFLoader,
    PyMuPDFLoader,
    UnstructuredPDFLoader,
)

In [10]:
# --- PyPDFLoader demo -------------------------------------------------
# Load the sample PDF and show how many documents came back, plus a short
# preview of the first one. Any failure is reported instead of raised so the
# notebook keeps running.
print("PyPDFLoader")
try:
    loader = PyPDFLoader("data/pdf/lbdl.pdf")
    docs = loader.load()
    print(f"Number of documents: {len(docs)}")
    # Indexing happens after the count is printed, so an empty result still
    # reports its size before the lookup fails.
    first_doc = docs[0]
    print(f"Content: {first_doc.page_content[:100]}")
    print(f"Metadata: {first_doc.metadata}")
except Exception as exc:
    print(f"Error with PyPDFLoader: {exc}")

PyPDFLoader
Number of documents: 185
Content: The Little Book
of
Deep Learning
François Fleuret
Metadata: {'producer': 'LaTeX and TikZ', 'creator': 'pdflatex', 'creationdate': '2024-07-23T16:31:04+02:00', 'author': 'François Fleuret', 'title': 'The Little Book of Deep Learning', 'subject': 'A short introduction to deep learning for readers with a STEM background. It aims at providing the necessary context to understand key AI models for image generation and language processing.', 'keywords': 'deep learning,machine learning,computer vision,natural language processing', 'moddate': '2024-07-23T16:31:04+02:00', 'trapped': '/False', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Debian) kpathsea version 6.3.4', 'source': 'data/pdf/lbdl.pdf', 'total_pages': 185, 'page': 0, 'page_label': '1'}


In [None]:
### PyMuPDFLoader
# Same experiment as the PyPDFLoader cell, but using the PyMuPDF backend so
# the two loaders' output can be compared on the same file.
print("PyMuPDFLoader")
try:
    loader = PyMuPDFLoader("data/pdf/lbdl.pdf")
    docs = loader.load()
    print(f"Number of documents: {len(docs)}")
    print(f"Content: {docs[0].page_content[:100]}")
    print(f"Metadata: {docs[0].metadata}")
except Exception as e:
    # Name the loader that actually failed (the message previously blamed
    # PyPDFLoader, which is not used in this cell).
    print(f"Error with PyMuPDFLoader: {e}")

In [11]:
# Sample text imitating raw PDF extraction output: uneven indentation,
# runs of blank lines, hard-wrapped sentences, and a page footer.
# Each source line is spelled out with its newline so the stray whitespace
# is visible at a glance.
print("Raw PDF Extraction")
raw_pdf_text = (
    "Company Financial Report\n"
    "\n"
    "    The financial performance for fiscal year 2024\n"
    "    shows significant growth in profitability.\n"
    "\n"
    "\n"
    "\n"
    "    Revenue increased by 25%.\n"
    "\n"
    "The company's efficiency improved due to workflow\n"
    "optimization.\n"
    "\n"
    "Page 1 of 10\n"
)

# Cleaning helper for extracted PDF text.

# Latin presentation-form ligatures (U+FB00-U+FB06) mapped to the plain
# letters they stand for. Extracted PDF text can contain these single code
# points instead of the letter sequences, which breaks search and matching.
_LIGATURE_TABLE = str.maketrans({
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
    "\ufb05": "st",
    "\ufb06": "st",
})


def clean_text(text):
    """Normalise whitespace and expand ligatures in extracted PDF text.

    Args:
        text: Raw text, e.g. the content of one extracted PDF page.

    Returns:
        The text with every run of whitespace (spaces, tabs, newlines)
        collapsed to a single space, leading/trailing whitespace removed,
        and Latin ligature characters replaced by their plain letters.
        An empty or whitespace-only input yields an empty string.
    """
    # split() with no argument splits on any whitespace run and drops empty
    # pieces, so re-joining with one space collapses and trims in one step.
    text = " ".join(text.split())

    # Expand ligatures; previously only "fi" and "fl" were handled, leaving
    # words such as "efficiency" (ffi) or "offer" (ff) unrepaired.
    return text.translate(_LIGATURE_TABLE)

# Run the cleaner on the sample and show the first 100 characters of each
# version; repr() makes the newlines and indentation visible.
cleaned = clean_text(raw_pdf_text)
print("BEFORE:", repr(raw_pdf_text[:100]), sep="\n")
print("\nAFTER:", repr(cleaned[:100]), sep="\n")

Raw PDF Extraction
BEFORE:
'Company Financial Report\n\n    The financial performance for fiscal year 2024\n    shows significant g'

AFTER:
'Company Financial Report The financial performance for fiscal year 2024 shows significant growth in '


In [13]:
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
class SmartPDFProcessor:
    """Load a PDF page by page, clean each page, and split it into chunks.

    Every page returned by ``PyPDFLoader`` is whitespace-normalised, skipped
    if it is nearly empty, and then split with a
    ``RecursiveCharacterTextSplitter``. Each chunk carries the page's original
    metadata plus a few extra fields (see ``process_pdf``).

    No exceptions are caught here: loader and splitter errors propagate to
    the caller.

    Args:
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
        min_page_chars: Pages whose cleaned text is shorter than this many
            characters are dropped. The default of 50 keeps the previous
            behaviour; pass 0 to disable the filter.
    """

    # Latin presentation-form ligatures (U+FB00-U+FB06) mapped to the plain
    # letters they stand for.
    _LIGATURE_TABLE = str.maketrans({
        "\ufb00": "ff",
        "\ufb01": "fi",
        "\ufb02": "fl",
        "\ufb03": "ffi",
        "\ufb04": "ffl",
        "\ufb05": "st",
        "\ufb06": "st",
    })

    def __init__(self, chunk_size=1000, chunk_overlap=200, min_page_chars=50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        # clean_text() collapses all whitespace to single spaces before
        # splitting, so a space is the only separator left in the text.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            separators=[" "]
        )

    def process_pdf(self, pdf_path: str) -> "list[Document]":
        """Split one PDF into cleaned chunks with enhanced metadata.

        Args:
            pdf_path: Path of the PDF file handed to ``PyPDFLoader``.

        Returns:
            The chunks of all kept pages, in page order. Each chunk's
            metadata is the page's loader metadata overlaid with:
            ``page`` (1-based position of the page in the loaded list),
            ``total_pages`` (number of loaded pages), ``chunk_method``
            (always ``"smart_pdf_processor"``) and ``char_count`` (length of
            the whole cleaned page, not of the individual chunk).
        """
        loader = PyPDFLoader(pdf_path)
        pages = loader.load()
        total_pages = len(pages)

        processed_chunks = []
        # Number pages from 1; skipped pages still consume a number, so
        # "page" always matches the position in the loaded document.
        for page_num, page in enumerate(pages, start=1):
            cleaned_text = self.clean_text(page.page_content)

            # Drop nearly empty pages (threshold configurable per instance).
            if len(cleaned_text) < self.min_page_chars:
                continue

            # Keys listed after **page.metadata override same-named loader keys.
            chunks = self.text_splitter.create_documents(
                texts=[cleaned_text],
                metadatas=[{
                    **page.metadata,
                    "page": page_num,
                    "total_pages": total_pages,
                    "chunk_method": "smart_pdf_processor",
                    "char_count": len(cleaned_text)
                }]
            )

            processed_chunks.extend(chunks)
        return processed_chunks

    def clean_text(self, text):
        """Normalise whitespace and expand ligatures in extracted text.

        Args:
            text: Raw text of one page.

        Returns:
            The text with every whitespace run collapsed to one space,
            leading/trailing whitespace removed, and Latin ligature
            characters (ff, fi, fl, ffi, ffl, st) replaced by plain letters.
        """
        # split() with no argument drops empty pieces, so this both
        # collapses and trims whitespace.
        text = " ".join(text.split())

        # Expand ligatures (previously only "fi" and "fl" were handled).
        return text.translate(self._LIGATURE_TABLE)


In [18]:
# Run the processor on the sample PDF and inspect one chunk's metadata.
processor = SmartPDFProcessor()
try:
    chunks = processor.process_pdf("data/pdf/lbdl.pdf")
    print(f"Number of chunks: {len(chunks)}")
    if chunks:
        # Show chunk 200 when the document is long enough; otherwise fall
        # back to the last chunk instead of raising IndexError on short PDFs.
        sample_chunk = chunks[min(200, len(chunks) - 1)]
        print("\nSample chunk metadata")
        for key, value in sample_chunk.metadata.items():
            print(f"{key}: {value}")
except Exception as e:
    # Report the component that is actually exercised in this cell.
    print(f"Error with SmartPDFProcessor: {e}")

Number of chunks: 302

Sample chunk metadata
producer: LaTeX and TikZ
creator: pdflatex
creationdate: 2024-07-23T16:31:04+02:00
author: François Fleuret
title: The Little Book of Deep Learning
subject: A short introduction to deep learning for readers with a STEM background. It aims at providing the necessary context to understand key AI models for image generation and language processing.
keywords: deep learning,machine learning,computer vision,natural language processing
moddate: 2024-07-23T16:31:04+02:00
trapped: /False
ptex.fullbanner: This is pdfTeX, Version 3.141592653-2.6-1.40.24 (TeX Live 2022/Debian) kpathsea version 6.3.4
source: data/pdf/lbdl.pdf
total_pages: 185
page: 120
page_label: 120
chunk_method: smart_pdf_processor
char_count: 1191
