In [2]:
# 🛠 Required packages:
# pip install pytesseract pdf2image pillow tqdm

import pytesseract
from pdf2image import convert_from_path
from tqdm import tqdm
from pathlib import Path
import shutil
import gc  # For garbage collection

# 📁 Define folders
# Directory layout (all paths are relative to the current working directory).
input_folder = Path("proc_final")     # PDFs to be OCR'd are read from here
output_folder = Path("txt_final")     # resulting TXT files are written here
failed_folder = Path("failed_pdfs")   # PDFs that could not be processed are moved here

# Only the two writable directories are created up front; the input folder
# is expected to exist already and is never created by this script.
for _writable_dir in (output_folder, failed_folder):
    _writable_dir.mkdir(parents=True, exist_ok=True)

# 🧠 OCR Function
def ocr_pdf_to_txt(pdf_path, txt_path, batch_size=10):
    """OCR every page of a PDF and write the recognised text to a file.

    The PDF is rasterised ``batch_size`` pages at a time instead of all at
    once, so peak memory is bounded by a single batch and the progress bar
    starts moving as soon as the first batch has been rendered.

    Text is first written to ``<txt_path>.part`` and only renamed to
    ``txt_path`` after the last page has been processed, so an error or a
    ``KeyboardInterrupt`` never leaves a truncated transcript behind.

    Args:
        pdf_path: Path of the PDF to read.
        txt_path: Path of the UTF-8 text file to create (replaced if it
            already exists).
        batch_size: Number of pages rendered per pdf2image call. Values
            below 1 are treated as 1.

    Returns:
        None. Any ``Exception`` raised while processing is reported on
        stdout and the PDF is moved to ``failed_folder`` (best effort)
        instead of being re-raised.
    """
    # Imported here so the block is self-contained; pdf2image is already a
    # dependency of this file (convert_from_path comes from the same package).
    from pdf2image import pdfinfo_from_path

    pdf_path = Path(pdf_path)
    txt_path = Path(txt_path)
    batch_size = max(1, int(batch_size))
    partial_path = txt_path.with_name(txt_path.name + ".part")

    try:
        print(f"🔍 Processing {pdf_path.name}")
        # Ask poppler for the page count so pages can be rendered in batches.
        page_count = int(pdfinfo_from_path(str(pdf_path)).get("Pages", 0))

        if page_count < 1:
            print(f"⚠️ No pages found in {pdf_path.name}. Skipping.")
            return

        with open(partial_path, "w", encoding="utf-8") as out:
            with tqdm(
                total=page_count, desc=f"OCR: {pdf_path.name}", leave=False
            ) as progress:
                for first_page in range(1, page_count + 1, batch_size):
                    last_page = min(first_page + batch_size - 1, page_count)
                    # Render only this slice of the document to images.
                    pages = convert_from_path(
                        str(pdf_path),
                        first_page=first_page,
                        last_page=last_page,
                    )
                    for page_number, page in enumerate(pages, start=first_page):
                        text = pytesseract.image_to_string(page)
                        out.write(f"\n\n--- Page {page_number} ---\n\n")
                        out.write(text)
                        progress.update(1)
                    # Drop the whole batch before rendering the next one;
                    # deleting a single page while the list still holds it
                    # would not release any memory.
                    del pages
                    gc.collect()

        # Publish the transcript only once it is complete.
        partial_path.replace(txt_path)

    except Exception as e:
        print(f"❌ Failed to process {pdf_path.name}: {e}")
        try:
            shutil.move(str(pdf_path), str(failed_folder / pdf_path.name))
            print(f"📁 Moved to failed_pdfs: {pdf_path.name}")
        except Exception as move_error:
            print(f"⚠️ Could not move failed file: {move_error}")

    finally:
        # Runs on errors and on KeyboardInterrupt too: remove any incomplete
        # output. After a successful rename the .part file no longer exists.
        try:
            if partial_path.exists():
                partial_path.unlink()
        except OSError as cleanup_error:
            print(f"⚠️ Could not remove partial output: {cleanup_error}")

# 🔁 Loop through all PDFs
# Collect the input PDFs. The extension is compared case-insensitively so
# files named "*.PDF" are not silently skipped on case-sensitive filesystems,
# only regular files are kept, and the result is sorted so every run visits
# the files in the same order (directory listing order is arbitrary).
pdf_files = sorted(
    entry
    for entry in input_folder.glob("*")
    if entry.is_file() and entry.name.lower().endswith(".pdf")
)
print(f"📚 Found {len(pdf_files)} PDF(s) in {input_folder}")

# Each PDF is transcribed to <output_folder>/<pdf stem>.txt. Errors are
# caught inside ocr_pdf_to_txt, so one bad file does not abort the batch.
for pdf_file in tqdm(pdf_files, desc="📦 Processing PDFs"):
    txt_file = output_folder / f"{pdf_file.stem}.txt"
    ocr_pdf_to_txt(pdf_file, txt_file)

print("\n✅ All PDFs processed.")

📚 Found 572 PDF(s) in proc_final


📦 Processing PDFs:   0%|          | 0/572 [00:00<?, ?it/s]

🔍 Processing 2017-Dec-31 - Koninklijke Philips NV.pdf


📦 Processing PDFs:   0%|          | 0/572 [00:52<?, ?it/s]


KeyboardInterrupt: 