In [None]:
# 🛠 Required packages:
# pip install pytesseract pdf2image pillow tqdm

import pytesseract
from pdf2image import convert_from_path
from tqdm import tqdm
from pathlib import Path
import shutil
import gc  # For garbage collection

# 📁 Working directories (relative paths, resolved against the current working directory)
input_folder = Path("proc_final")      # PDFs are looked up here
output_folder = Path("txt_final")      # destination for the generated TXT files
failed_folder = Path("failed_pdfs")    # destination for PDFs that could not be processed

# Create both destination folders up front. The input folder is only read
# from, so it is deliberately not created here.
for _folder in (output_folder, failed_folder):
    _folder.mkdir(parents=True, exist_ok=True)

# 🧠 OCR Function
def ocr_pdf_to_txt(pdf_path, txt_path, chunk_size=10):
    """OCR every page of a PDF and write the recognised text to a UTF-8 file.

    Pages are rendered in chunks of ``chunk_size`` so that only a bounded
    number of page images is held in memory at once, instead of rasterising
    the whole document before OCR starts. Text is written to a sibling
    ``<txt_path>.part`` file that is renamed to ``txt_path`` only after the
    last page succeeded, so a failed run never leaves a truncated file at
    ``txt_path``.

    Args:
        pdf_path: Path of the PDF to process (``str`` or ``pathlib.Path``).
        txt_path: Path of the text file to create (``str`` or
            ``pathlib.Path``). An existing file is replaced on success.
        chunk_size: Maximum number of pages rendered per call to
            ``convert_from_path``. Values below 1 are treated as 1.

    Returns:
        None. Each page is preceded by a ``--- Page N ---`` marker in the
        output file.

    Any exception raised while processing is caught and reported with
    ``print``; the partial output is removed and the PDF is moved into
    ``failed_folder`` (best effort).
    """
    # Local import: the file-level import only brings in convert_from_path.
    from pdf2image import pdfinfo_from_path

    pdf_path = Path(pdf_path)
    txt_path = Path(txt_path)
    partial_path = txt_path.with_name(txt_path.name + ".part")
    # Guard against a zero/negative step in range() below.
    chunk_size = max(1, int(chunk_size))

    try:
        print(f"🔍 Processing {pdf_path.name}")
        # Ask poppler for the page count without rendering anything yet.
        page_count = int(pdfinfo_from_path(str(pdf_path)).get("Pages", 0))

        if page_count < 1:
            print(f"⚠️ No pages found in {pdf_path.name}. Skipping.")
            return

        with open(partial_path, "w", encoding="utf-8") as out, tqdm(
            total=page_count, desc=f"OCR: {pdf_path.name}", leave=False
        ) as progress:
            for first in range(1, page_count + 1, chunk_size):
                last = min(first + chunk_size - 1, page_count)
                # Render only this window of pages to images.
                pages = convert_from_path(
                    str(pdf_path), first_page=first, last_page=last
                )
                for page_number, page in enumerate(pages, start=first):
                    text = pytesseract.image_to_string(page)
                    out.write(f"\n\n--- Page {page_number} ---\n\n")
                    out.write(text)
                    # Release the pixel buffer now; the list still holds a
                    # reference, so `del page` alone would free nothing.
                    page.close()
                    progress.update(1)
                # Drop the whole chunk before rendering the next one.
                del pages
                gc.collect()

        # Publish the result only once every page has been written.
        partial_path.replace(txt_path)

    except Exception as e:
        print(f"❌ Failed to process {pdf_path.name}: {e}")
        try:
            # Never leave a truncated transcript behind.
            partial_path.unlink(missing_ok=True)
        except OSError as cleanup_error:
            print(f"⚠️ Could not remove partial output: {cleanup_error}")
        try:
            shutil.move(str(pdf_path), str(failed_folder / pdf_path.name))
            print(f"📁 Moved to failed_pdfs: {pdf_path.name}")
        except Exception as move_error:
            print(f"⚠️ Could not move failed file: {move_error}")

# 🔁 Collect the PDFs sitting directly in the input folder (non-recursive,
# matched by the lowercase ".pdf" extension) and OCR them one at a time.
pdf_files = [*input_folder.glob("*.pdf")]
print(f"📚 Found {len(pdf_files)} PDF(s) in {input_folder}")

# The outer progress bar counts whole documents.
for pdf_file in tqdm(pdf_files, desc="📦 Processing PDFs"):
    # Output keeps the PDF's base name and swaps the extension for ".txt".
    txt_file = output_folder / (pdf_file.stem + ".txt")
    ocr_pdf_to_txt(pdf_file, txt_file)

print("\n✅ All PDFs processed.")

📚 Found 572 PDF(s) in proc_final


📦 Processing PDFs:   0%|          | 0/572 [00:00<?, ?it/s]

🔍 Processing 2017-Dec-31 - Koninklijke Philips NV.pdf


📦 Processing PDFs:   0%|          | 1/572 [03:02<28:55:46, 182.39s/it]

🔍 Processing 2020 schibsted.pdf


📦 Processing PDFs:   0%|          | 2/572 [03:24<13:56:58, 88.10s/it] 

🔍 Processing 2012-Dec-31 - ArcelorMittal SA.pdf


📦 Processing PDFs:   1%|          | 3/572 [06:54<22:42:28, 143.67s/it]

🔍 Processing 2014-Sep-30 - Carl Zeiss Meditec AG (1).pdf


📦 Processing PDFs:   1%|          | 4/572 [08:22<19:14:21, 121.94s/it]

🔍 Processing 2022-Dec-31 - Raiffeisen Bank Internati....pdf


📦 Processing PDFs:   1%|          | 5/572 [09:37<16:30:52, 104.85s/it]

🔍 Processing 2014-Dec-31 - Exor NV.pdf


📦 Processing PDFs:   1%|          | 6/572 [13:00<21:44:48, 138.32s/it]

🔍 Processing 2012-Dec-31 - Repsol SA.pdf


📦 Processing PDFs:   1%|          | 7/572 [15:17<21:38:21, 137.88s/it]

🔍 Processing 2018-Dec-31 - Vivendi SE (1).pdf


📦 Processing PDFs:   1%|▏         | 8/572 [16:00<16:51:08, 107.57s/it]

🔍 Processing 2014-Dec-31 - Axel Springer SE.pdf


📦 Processing PDFs:   2%|▏         | 9/572 [18:41<19:26:12, 124.28s/it]

🔍 Processing 2015-Dec-31 - ams-OSRAM AG.pdf


📦 Processing PDFs:   2%|▏         | 10/572 [18:56<14:08:10, 90.55s/it]

🔍 Processing 2012-Dec-31 - Sanoma Oyj 1.pdf


📦 Processing PDFs:   2%|▏         | 11/572 [19:15<10:43:05, 68.78s/it]

🔍 Processing 2019-Dec-31 - Amadeus IT Group SA.pdf


📦 Processing PDFs:   2%|▏         | 12/572 [20:32<11:05:22, 71.29s/it]

🔍 Processing 2022 Banco de Sabadell.pdf


📦 Processing PDFs:   2%|▏         | 13/572 [23:35<16:17:58, 104.97s/it]

🔍 Processing 2012-Dec-31 - SAP SE.pdf


📦 Processing PDFs:   2%|▏         | 14/572 [29:06<26:50:45, 173.20s/it]

🔍 Processing 2021-Sep-30 - Siemens AG.pdf


📦 Processing PDFs:   3%|▎         | 15/572 [31:08<24:26:24, 157.96s/it]

🔍 Processing 2016-Dec-31 - Mediaset Espana Comunicac....pdf


📦 Processing PDFs:   3%|▎         | 16/572 [31:42<18:37:43, 120.62s/it]

🔍 Processing 2016-Dec-31 - Bertelsmann SE & Co KgaA.pdf


📦 Processing PDFs:   3%|▎         | 17/572 [33:22<17:37:29, 114.32s/it]

🔍 Processing 2013-Dec-31 - NOVABASE SGPS SA.pdf


📦 Processing PDFs:   3%|▎         | 18/572 [33:23<12:22:01, 80.36s/it] 

🔍 Processing 2018-Dec-31 - Beiersdorf AG.pdf


📦 Processing PDFs:   3%|▎         | 19/572 [34:55<12:52:09, 83.78s/it]

🔍 Processing 2013-Dec-31 - Bertelsmann SE & Co KgaA.pdf


📦 Processing PDFs:   3%|▎         | 20/572 [36:24<13:04:44, 85.30s/it]

🔍 Processing 2019-Dec-31 - Koninklijke Philips NV.pdf


📦 Processing PDFs:   4%|▎         | 21/572 [38:19<14:26:13, 94.33s/it]

🔍 Processing 2022-Dec-31 - ams-OSRAM AG.pdf


📦 Processing PDFs:   4%|▍         | 22/572 [39:16<12:40:17, 82.94s/it]

🔍 Processing 2015-Dec-31 - Amadeus IT Group SA.pdf


📦 Processing PDFs:   4%|▍         | 23/572 [41:01<13:41:53, 89.83s/it]

🔍 Processing 2013-Dec-31 - Mediaset Espana Comunicac....pdf


📦 Processing PDFs:   4%|▍         | 24/572 [41:27<10:45:14, 70.65s/it]

🔍 Processing 2018-Dec-31 - Exor NV.pdf


📦 Processing PDFs:   4%|▍         | 25/572 [43:20<12:39:02, 83.26s/it]

🔍 Processing 2015-Dec-31 - Schneider Electric SE 1.pdf


📦 Processing PDFs:   5%|▍         | 26/572 [43:42<9:49:29, 64.78s/it] 

🔍 Processing 2022-Sep-30 - Carl Zeiss Meditec AG.pdf


📦 Processing PDFs:   5%|▍         | 27/572 [45:06<10:40:21, 70.50s/it]

🔍 Processing 2014-Dec-31 - SEMAPA Sociedade de Inves....pdf


📦 Processing PDFs:   5%|▍         | 28/572 [46:31<11:20:24, 75.04s/it]

🔍 Processing 2015-Dec-31 - Stellantis NV.pdf


