### Load PDF files

### PyPDFLoader

In [2]:
from langchain_community.document_loaders import PyPDFLoader

# Load the PDF with PyPDFLoader and preview the first returned document.
# Any failure is reported as a one-line message instead of a traceback.
try:
    pypdf_loader = PyPDFLoader(file_path="data/pdf/attention.pdf")
    pypdf_docs = pypdf_loader.load()
    print(f"Loaded {len(pypdf_docs)} pages")

    # Show the leading 100 characters of the first document, then its metadata.
    first_pypdf_page = pypdf_docs[0]
    print(f"Page 1 : {first_pypdf_page.page_content[:100]}")
    print(f"Metadata: {first_pypdf_page.metadata}")
except Exception as err:
    print(f"Error : {err}")

Loaded 15 pages
Page 1 : Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and
Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/pdf/attention.pdf', 'total_pages': 15, 'page': 0, 'page_label': '1'}


### PyMuPDFLoader

In [6]:
from langchain_community.document_loaders import PyMuPDFLoader

# Same walkthrough as the PyPDFLoader cell, this time using PyMuPDFLoader so
# the two loaders' text and metadata can be compared side by side.
try:
    pymupdf_loader = PyMuPDFLoader(file_path="data/pdf/attention.pdf")
    pymupdf_docs = pymupdf_loader.load()
    print(f"Loaded {len(pymupdf_docs)} pages")

    # Show the leading 100 characters of the first document, then its metadata.
    first_pymupdf_page = pymupdf_docs[0]
    print(f"Page 1: {first_pymupdf_page.page_content[:100]}")
    print(f"Metadata: {first_pymupdf_page.metadata}")
except Exception as err:
    print(f"Error: {err}")

Loaded 15 pages
Page 1: Provided proper attribution is provided, Google hereby grants permission to
reproduce the tables and
Metadata: {'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'source': 'data/pdf/attention.pdf', 'file_path': 'data/pdf/attention.pdf', 'total_pages': 15, 'format': 'PDF 1.5', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'trapped': '', 'modDate': 'D:20240410211143Z', 'creationDate': 'D:20240410211143Z', 'page': 0}


### Handling PDF Challenges
PDFs are harder to extract clean text from than plain-text files because they:
- Store text in complex ways (not just simple text)
- Can have formatting issues
- May contain scanned images (requiring OCR)
- Often have extraction artifacts

### Example of raw PDF extraction


In [9]:
# Hand-written sample imitating raw PDF extraction output: it contains
# ligature glyphs (U+FB01 "fi", U+FB02 "fl"), irregular indentation, runs of
# blank lines, and a "Page 1 of 10" footer line.
raw_pdf_text = """Company Financial Report


    The ﬁnancial performance for ﬁscal year 2024
    shows signiﬁcant growth in proﬁtability.
    
    
    
    Revenue increased by 25%.
    
The company's efﬁciency improved due to workﬂow
optimization.


Page 1 of 10
"""

# Cleaning helper for raw PDF text.
# Single-code-point Latin ligatures (U+FB00-U+FB06) mapped to the plain
# letters they stand for. Escapes are used because the glyphs are easy to
# confuse with ordinary letter pairs when reading the source.
_LIGATURE_TABLE = str.maketrans({
    "\ufb00": "ff",
    "\ufb01": "fi",
    "\ufb02": "fl",
    "\ufb03": "ffi",
    "\ufb04": "ffl",
    "\ufb05": "st",  # long s + t
    "\ufb06": "st",
})


def clean_text(text: str) -> str:
    """Normalise whitespace and expand ligature glyphs in extracted text.

    Args:
        text: Raw text, e.g. as returned by a PDF extractor.

    Returns:
        The text with every run of whitespace (spaces, tabs, newlines)
        collapsed to a single space, leading/trailing whitespace removed, and
        each ligature in U+FB00-U+FB06 replaced by its plain letters.
    """
    # str.split() with no arguments splits on any whitespace run and drops
    # empty pieces, so re-joining with " " collapses and trims in one step.
    text = " ".join(text.split())

    # One pass over the string handles every ligature, including the
    # three-letter forms (ffi, ffl).
    return text.translate(_LIGATURE_TABLE)

# Apply the cleaning function to the sample, then show both versions through
# repr() so newlines and repeated spaces are visible in the output.
cleaned_pdf_text = clean_text(raw_pdf_text)
for _label, _shown in (("BEFORE: ", raw_pdf_text), ("AFTER: ", cleaned_pdf_text)):
    print(_label, repr(_shown))

BEFORE:  "Company Financial Report\n\n\n    The ﬁnancial performance for ﬁscal year 2024\n    shows signiﬁcant growth in proﬁtability.\n\n\n\n    Revenue increased by 25%.\n\nThe company's efﬁciency improved due to workﬂow\noptimization.\n\n\nPage 1 of 10\n"
AFTER:  "Company Financial Report The financial performance for fiscal year 2024 shows significant growth in profitability. Revenue increased by 25%. The company's efficiency improved due to workflow optimization. Page 1 of 10"


In [13]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from typing import List

class SmartPDFProcessor:
    """Load a PDF, clean each page's text and split it into chunks.

    Every chunk carries the page's loader metadata plus a few extra keys
    (see ``process_pdf``). Pages whose cleaned text is shorter than
    ``min_page_chars`` are skipped.

    Args:
        chunk_size: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``.
        chunk_overlap: Passed to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``.
        min_page_chars: Minimum length of a page's cleaned text for the page
            to be chunked; shorter pages are treated as nearly empty.
    """

    # Single-code-point Latin ligatures (U+FB00-U+FB06) mapped to the plain
    # letters they stand for. Escapes are used because the glyphs are easy
    # to confuse with ordinary letter pairs when reading the source.
    _LIGATURES = str.maketrans({
        "\ufb00": "ff",
        "\ufb01": "fi",
        "\ufb02": "fl",
        "\ufb03": "ffi",
        "\ufb04": "ffl",
        "\ufb05": "st",  # long s + t
        "\ufb06": "st",
    })

    def __init__(self, chunk_size=1000, chunk_overlap=100, min_page_chars=50):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_page_chars = min_page_chars
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap,
            # _clean_text collapses all whitespace to single spaces, so a
            # space is the only separator left in the text to split on.
            separators=[" "],
        )

    def process_pdf(self, pdf_path) -> "List[Document]":
        """Load ``pdf_path`` and return its cleaned text as chunk documents.

        Args:
            pdf_path: Location of the PDF, forwarded to ``PyPDFLoader``.

        Returns:
            A list of chunk documents in page order. Each chunk's metadata is
            the page's loader metadata with these keys set:

            * ``page`` - 1-based position of the page in the loaded list
              (overrides any ``page`` value supplied by the loader).
            * ``total_pages`` - number of pages the loader returned.
            * ``chunk_method`` - the literal ``"SmartPDFProcessor.process_pdf"``.
            * ``char_count`` - length of the whole cleaned page text, not of
              the individual chunk.

        Errors raised by the loader or the splitter are not caught here.
        """
        pages = PyPDFLoader(pdf_path).load()
        total_pages = len(pages)

        processed_chunks = []
        for page_number, page in enumerate(pages, start=1):
            cleaned_text = self._clean_text(page.page_content)

            # Skip nearly empty pages.
            if len(cleaned_text.strip()) < self.min_page_chars:
                continue

            # Later keys win, so the values below replace same-named keys
            # coming from the loader's metadata.
            page_metadata = {
                **page.metadata,
                "page": page_number,
                "total_pages": total_pages,
                "chunk_method": "SmartPDFProcessor.process_pdf",
                "char_count": len(cleaned_text),
            }

            processed_chunks.extend(
                self.text_splitter.create_documents(
                    texts=[cleaned_text],
                    metadatas=[page_metadata],
                )
            )
        return processed_chunks

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace runs to single spaces and expand ligatures.

        Args:
            text: Raw page text.

        Returns:
            The text with leading/trailing whitespace removed, every
            whitespace run replaced by one space, and each ligature in
            U+FB00-U+FB06 replaced by its plain letters.
        """
        # split() with no arguments drops empty pieces, so joining with " "
        # both collapses and trims.
        text = " ".join(text.split())

        # One pass handles every ligature, including the three-letter forms.
        return text.translate(self._LIGATURES)

In [None]:
# Build a processor with its default settings.
pdf_processor = SmartPDFProcessor()

# Chunk the PDF; a failure is printed as a message rather than raised.
try:
    chunks = pdf_processor.process_pdf(pdf_path="data/pdf/attention.pdf")
    chunk_total = len(chunks)
    print(f"Processed into {chunk_total} chunks")
except Exception as err:
    print(f"Error: {err}")

Processed into 49 chunks


In [21]:
# Inspect the enhanced metadata attached to the first chunk
first_chunk = chunks[0]
print(first_chunk.metadata)

{'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2024-04-10T21:11:43+00:00', 'author': '', 'keywords': '', 'moddate': '2024-04-10T21:11:43+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'data/pdf/attention.pdf', 'total_pages': 15, 'page': 1, 'page_label': '1', 'chunk_method': 'SmartPDFProcessor.process_pdf', 'char_count': 2857}


In [None]:
# Show which page each chunk came from, numbering the chunks from 1
for position, chunk in enumerate(chunks, start=1):
    page_no = chunk.metadata['page']
    print(f"{position} - Chunk metadata Page- {page_no}")

1 - Chunk metadata Page- 1
2 - Chunk metadata Page- 1
3 - Chunk metadata Page- 1
4 - Chunk metadata Page- 1
5 - Chunk metadata Page- 2
6 - Chunk metadata Page- 2
7 - Chunk metadata Page- 2
8 - Chunk metadata Page- 2
9 - Chunk metadata Page- 2
10 - Chunk metadata Page- 3
11 - Chunk metadata Page- 3
12 - Chunk metadata Page- 4
13 - Chunk metadata Page- 4
14 - Chunk metadata Page- 4
15 - Chunk metadata Page- 5
16 - Chunk metadata Page- 5
17 - Chunk metadata Page- 5
18 - Chunk metadata Page- 5
19 - Chunk metadata Page- 6
20 - Chunk metadata Page- 6
21 - Chunk metadata Page- 6
22 - Chunk metadata Page- 6
23 - Chunk metadata Page- 7
24 - Chunk metadata Page- 7
25 - Chunk metadata Page- 7
26 - Chunk metadata Page- 7
27 - Chunk metadata Page- 8
28 - Chunk metadata Page- 8
29 - Chunk metadata Page- 8
30 - Chunk metadata Page- 8
31 - Chunk metadata Page- 9
32 - Chunk metadata Page- 9
33 - Chunk metadata Page- 9
34 - Chunk metadata Page- 9
35 - Chunk metadata Page- 10
36 - Chunk metadata Page- 10