In [13]:
import fitz
import os
from pathlib import Path
from typing import List, Dict
from unstructured.partition.auto import partition
from unstructured.partition.pdf import partition_pdf
from pdf2image import convert_from_path
from PIL import Image
import uuid
from tqdm import tqdm
import json


In [15]:
class DocumentProcessor:
    def __init__(self, output_dir: str = "data/processed"):
        self.output_dir = Path(output_dir)
        self.images_dir = self.output_dir / "images"
        self.images_dir.mkdir(parents=True, exist_ok=True)
        
    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """
        Extract text and embedded images using PyMuPDF
        More reliable than unstructured + poppler
        """
        documents = []
        filename = Path(pdf_path).stem
        
        try:
            # Open PDF with PyMuPDF
            doc = fitz.open(pdf_path)
            
            for page_num in range(len(doc)):
                page = doc[page_num]
                
                # Extract text from page
                text = page.get_text()
                
                # Split text into reasonable chunks (by paragraphs)
                paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]
                
                for i, para in enumerate(paragraphs):
                    if len(para) > 50:  # Skip very short text
                        documents.append({
                            'type': 'text',
                            'content': para,
                            'metadata': {
                                'source': filename,
                                'page': page_num + 1,
                                'chunk_id': f"{filename}_p{page_num}_text_{i}",
                            }
                        })
                
                # Extract embedded images from page
                image_list = page.get_images(full=True)
                
                for img_index, img in enumerate(image_list):
                    try:
                        xref = img[0]
                        base_image = doc.extract_image(xref)
                        image_bytes = base_image["image"]
                        image_ext = base_image["ext"]
                        
                        # Save image
                        image_filename = f"{filename}_p{page_num}_img{img_index}.{image_ext}"
                        image_path = self.images_dir / image_filename
                        
                        with open(image_path, "wb") as img_file:
                            img_file.write(image_bytes)
                        
                        documents.append({
                            'type': 'image',
                            'content': str(image_path),
                            'metadata': {
                                'source': filename,
                                'page': page_num + 1,
                                'chunk_id': f"{filename}_p{page_num}_img{img_index}",
                            }
                        })
                    except Exception as e:
                        print(f"  Warning: Could not extract image {img_index} from page {page_num}: {e}")
                        continue
            
            doc.close()
            
        except Exception as e:
            print(f"ERROR processing {filename}: {e}")
            return []
        
        return documents
    
    def process_directory(self, directory: str = "data/raw", 
                         recursive: bool = False) -> List[Dict]:
        """Process all PDFs in a directory"""
        all_documents = []
        
        if recursive:
            pdf_files = list(Path(directory).rglob("*.pdf"))
        else:
            pdf_files = list(Path(directory).glob("*.pdf"))
        
        if not pdf_files:
            print(f"⚠️  No PDF files found in {directory}")
            return []
        
        print(f"Found {len(pdf_files)} PDF files\n")
        
        for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
            docs = self.process_pdf(str(pdf_file))
            
            # Show progress for each file
            text_count = sum(1 for d in docs if d['type'] == 'text')
            img_count = sum(1 for d in docs if d['type'] == 'image')
            print(f"  ✓ {pdf_file.name}: {text_count} text chunks, {img_count} images")
            
            all_documents.extend(docs)
        
        return all_documents
    
    def save_to_json(self, documents: List[Dict], 
                     output_file: str = "data/processed/documents.json"):
        """Save extracted documents to JSON"""
        output_path = Path(output_file)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        
        with open(output_file, 'w') as f:
            json.dump(documents, f, indent=2)
        print(f"\n✅ Saved {len(documents)} documents to {output_file}")

In [14]:
class DocumentProcessor:
    def __init__(self, output_dir: str = "data/processed"):
        self.output_dir = Path(output_dir)
        self.images_dir = self.output_dir / "images"
        self.images_dir.mkdir(parents=True, exist_ok=True)
        
    def process_pdf(self, pdf_path: str) -> List[Dict]:
        """Extract text AND embedded images from a single PDF"""
        
        documents = []
        filename = Path(pdf_path).stem
        
        try:
            elements = partition_pdf(
                filename=pdf_path,
                extract_images_in_pdf=True,
                image_output_dir_path=str(self.images_dir),
                infer_table_structure=True,
                chunking_strategy="by_title",
                max_characters=4000,
                new_after_n_chars=3800,
                combine_text_under_n_chars=2000,
            )
            
            for i, element in enumerate(elements):
                if element.category in ["Title", "NarrativeText", "Text", "ListItem"]:
                    if element.text.strip():
                        documents.append({
                            'type': 'text',
                            'content': element.text,
                            'metadata': {
                                'source': filename,
                                'chunk_id': f"{filename}_text_{i}",
                                'element_type': element.category
                            }
                        })
                
                elif element.category == "Image":
                    image_path = element.metadata.image_path
                    if image_path:
                        documents.append({
                            'type': 'image',
                            'content': image_path,
                            'metadata': {
                                'source': filename,
                                'chunk_id': f"{filename}_img_{i}",
                            }
                        })
                
                elif element.category == "Table":
                    documents.append({
                        'type': 'table',
                        'content': element.text,
                        'metadata': {
                            'source': filename,
                            'chunk_id': f"{filename}_table_{i}",
                        }
                    })
                    
        except Exception as e:
            print(f"ERROR processing {filename}: {e}")
            return []
        
        return documents
    
    def process_directory(self, directory: str = "data/raw", 
                         recursive: bool = False) -> List[Dict]:
        """
        Process all PDFs in a directory
        
        Args:
            directory: Path to folder with PDFs
            recursive: If True, searches subdirectories too
        """
        all_documents = []
        
        # Get PDF files
        if recursive:
            pdf_files = list(Path(directory).rglob("*.pdf"))  # Recursive
        else:
            pdf_files = list(Path(directory).glob("*.pdf"))   # Single folder
        
        print(f"Found {len(pdf_files)} PDF files\n")
        
        # Process each PDF with progress bar
        for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
            docs = self.process_pdf(str(pdf_file))
            all_documents.extend(docs)
        
        return all_documents
    
    def save_to_json(self, documents: List[Dict], 
                     output_file: str = "data/processed/documents.json"):
        """Save extracted documents to JSON"""
        with open(output_file, 'w') as f:
            json.dump(documents, f, indent=2)
        print(f"\n✅ Saved {len(documents)} documents to {output_file}")




In [None]:
if __name__ == "__main__":
    processor = DocumentProcessor()
    documents = processor.process_directory("data/raw")
    
    print("\n" + "="*60)
    print("EXTRACTION SUMMARY")
    print("="*60)
    print(f"Total documents: {len(documents)}")
    print(f"Text chunks: {sum(1 for d in documents if d['type'] == 'text')}")
    print(f"Embedded images: {sum(1 for d in documents if d['type'] == 'image')}")
    print(f"\nImages location: {processor.images_dir}")
    
    if documents:
        processor.save_to_json(documents)
    else:
        print("\n⚠️  No documents extracted. Check your PDF files.")

In [12]:
if __name__ == "__main__":
    processor = DocumentProcessor()
    #documents = processor.process_directory("data/raw")
    documents = processor.process_directory("D:/Projects/mmRAG-with-Vision-Language-Model/PDFs")
    #documents = processor.process_directory("PDFs")
    
    print("\n" + "="*60)
    print("EXTRACTION SUMMARY")
    print("="*60)
    print(f"Total documents: {len(documents)}")
    print(f"Text chunks: {sum(1 for d in documents if d['type'] == 'text')}")
    print(f"Embedded images: {sum(1 for d in documents if d['type'] == 'image')}")
    print(f"Tables: {sum(1 for d in documents if d['type'] == 'table')}")
    
    processor.save_to_json(documents)


Found 3 PDF files



Processing PDFs:   0%|          | 0/3 [00:00<?, ?it/s]

Processing PDFs:  33%|███▎      | 1/3 [00:36<01:12, 36.41s/it]

ERROR processing Exxon_OIMS_framework: Unable to get page count. Is poppler installed and in PATH?


Processing PDFs:  67%|██████▋   | 2/3 [01:05<00:32, 32.20s/it]

ERROR processing Risk_Based_Inspection__RBI__580: Unable to get page count. Is poppler installed and in PATH?


Processing PDFs: 100%|██████████| 3/3 [01:09<00:00, 23.16s/it]

ERROR processing Vertical turbine pump IOM: Unable to get page count. Is poppler installed and in PATH?

EXTRACTION SUMMARY
Total documents: 0
Text chunks: 0
Embedded images: 0
Tables: 0

✅ Saved 0 documents to data/processed/documents.json



