In [2]:
import os
from pathlib import Path
from typing import List, Dict
from unstructured.partition.auto import partition
from unstructured.partition.pdf import partition_pdf
from pdf2image import convert_from_path
from PIL import Image
import uuid

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class DocumentProcessor:
    """Turn PDFs into a flat list of text / image / table records.

    Every record is a dict of the form
    ``{'type': 'text' | 'image' | 'table', 'content': str, 'metadata': dict}``.
    For image records ``content`` is the path of the extracted image file;
    for text and table records it is the element's text.
    """

    # Element categories stored as ``type='text'`` records.
    # "CompositeElement" is the category of the chunks produced when a
    # chunking strategy is passed to ``partition_pdf`` (as it is below), and
    # "UncategorizedText" is the category reported by plain ``Text`` elements;
    # without these two, chunked text is silently dropped.
    TEXT_CATEGORIES = frozenset({
        "Title",
        "NarrativeText",
        "Text",
        "UncategorizedText",
        "ListItem",
        "CompositeElement",
    })

    def __init__(self, output_dir: str = "data.processed"):
        """Remember the output locations and create the images directory.

        Args:
            output_dir: Root directory for extracted artefacts. Images go to
                ``<output_dir>/images``, which is created if missing.

        NOTE(review): the default is the single directory name
        "data.processed" (with a dot), not "data/processed". Kept unchanged
        for backward compatibility -- confirm which one is intended.
        """
        self.output_dir = Path(output_dir)
        self.images_dir = self.output_dir / "images"
        self.images_dir.mkdir(parents=True, exist_ok=True)

    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Extract text AND embedded images from a PDF

        Args:
            pdf_path: Path of the PDF file to partition.

        Returns:
            The records extracted from the file, in element order. Extraction
            is best-effort: if partitioning raises, the error is printed and
            whatever was collected so far (possibly nothing) is returned.
        """
        print(f"Processing: {pdf_path}")

        documents: List[Dict] = []
        filename = Path(pdf_path).stem

        try:
            # Extract everything including embedded images
            elements = partition_pdf(
                filename=pdf_path,

                # KEY: This extracts embedded images! important function to enable!
                extract_images_in_pdf=True,
                image_output_dir_path=str(self.images_dir),

                # Optional: Smart chunking
                infer_table_structure=True,
                chunking_strategy="by_title",  # important for Oil and gas document extraction
                max_characters=4000,
                new_after_n_chars=3800,
                combine_text_under_n_chars=2000,
            )

            for index, element in enumerate(elements):
                documents.extend(self._element_to_documents(element, filename, index))

        except Exception as e:
            # Deliberately broad: one bad file must not abort a directory run.
            print(f"PDF extraction failed: {e}")
            print(f"Make sure the file is a valid PDF: {pdf_path}")

        counts = {'text': 0, 'image': 0, 'table': 0}
        for record in documents:
            counts[record['type']] += 1

        print(f"✓ Extracted {len(documents)} elements from {filename}")
        print(f"  - Text: {counts['text']}")
        print(f"  - Images: {counts['image']}")
        print(f"  - Tables: {counts['table']}")

        return documents

    def _element_to_documents(self, element, filename: str, index: int) -> List[Dict]:
        """Convert one partitioned element into zero or more records."""
        category = element.category

        if category in self.TEXT_CATEGORIES:
            records: List[Dict] = []
            text = element.text
            if text and text.strip():
                records.append({
                    'type': 'text',
                    'content': text,
                    'metadata': {
                        'source': filename,
                        'chunk_id': f"{filename}_text_{index}",
                        'element_type': category,
                    },
                })
            if category == "CompositeElement":
                # Chunking folds Image elements into the chunk, so they never
                # show up at the top level. Recover them from the pre-chunk
                # elements when the installed unstructured version exposes
                # them on the chunk metadata (absent -> nothing to do).
                originals = getattr(element.metadata, "orig_elements", None) or []
                for position, original in enumerate(originals):
                    if getattr(original, "category", None) == "Image":
                        records.extend(self._image_documents(
                            original, filename, f"{filename}_img_{index}_{position}"
                        ))
            return records

        # Embedded images (diagrams, photos in PDF)
        if category == "Image":
            return self._image_documents(element, filename, f"{filename}_img_{index}")

        # Tables
        if category == "Table":
            return [{
                'type': 'table',
                'content': element.text,
                'metadata': {
                    'source': filename,
                    'chunk_id': f"{filename}_table_{index}",
                },
            }]

        return []

    @staticmethod
    def _image_documents(element, filename: str, chunk_id: str) -> List[Dict]:
        """Build the image record for ``element``; empty list if it has no saved image path."""
        metadata = element.metadata
        image_path = getattr(metadata, "image_path", None)
        if not image_path:
            return []
        return [{
            'type': 'image',
            'content': image_path,
            'metadata': {
                'source': filename,
                'chunk_id': chunk_id,
                'coordinates': getattr(metadata, 'coordinates', None),
            },
        }]

    def process_directory(self, directory: str) -> List[Dict]:
        """Process all PDFs in a directory

        Args:
            directory: Directory searched (non-recursively) for ``*.pdf`` files.

        Returns:
            The concatenated records of every PDF, files visited in sorted
            path order so repeated runs yield the same record order.
        """
        all_documents: List[Dict] = []
        # glob() order is filesystem-dependent; sort for reproducible output.
        pdf_files = sorted(Path(directory).glob("*.pdf"))

        print(f"Found {len(pdf_files)} PDF files\n")

        for pdf_file in pdf_files:
            all_documents.extend(self.process_pdf(str(pdf_file)))
            print()  # Blank line between files

        return all_documents




In [None]:
if __name__ == "__main__":
    # Raw string: the Windows path contains "\P" and "\m", which are invalid
    # escape sequences in a normal literal and trigger a warning on compile.
    # NOTE(review): machine-specific absolute path; consider making it configurable.
    pdf_directory = r"D:\Projects\mmRAG-with-Vision-Language-Model\PDFs"

    processor = DocumentProcessor()
    documents = processor.process_directory(pdf_directory)

    # Tally records per type for the summary below.
    totals = {'text': 0, 'image': 0, 'table': 0}
    for record in documents:
        totals[record['type']] += 1

    print("="*50)
    print("TOTAL EXTRACTION SUMMARY")
    print("="*50)
    print(f"Total documents: {len(documents)}")
    print(f"Text chunks: {totals['text']}")
    print(f"Embedded images: {totals['image']}")
    print(f"Tables: {totals['table']}")
    # Report the directory the processor actually uses rather than a
    # hard-coded path that can disagree with its configuration.
    print(f"\nImages saved to: {processor.images_dir}")
    
# documents = processor.process_directory(r"D:\PDFs")


  documents = processor.process_directory("D:\Projects\mmRAG-with-Vision-Language-Model\PDFs")


Found 3 PDF files

Processing: D:\Projects\mmRAG-with-Vision-Language-Model\PDFs\Exxon_OIMS_framework.pdf
PDF extraction failed: tesseract is not installed or it's not in your PATH. See README file for more information.
Make sure the file is a valid PDF: D:\Projects\mmRAG-with-Vision-Language-Model\PDFs\Exxon_OIMS_framework.pdf
✓ Extracted 0 elements from Exxon_OIMS_framework
  - Text: 0
  - Images: 0
  - Tables: 0

Processing: D:\Projects\mmRAG-with-Vision-Language-Model\PDFs\Risk_Based_Inspection__RBI__580.pdf
