In [1]:
from pathlib import Path

# Define folders
pdf_folder = Path("proc_final")
txt_folder = Path("txt_final")

# Get list of .txt base names
txt_basenames = {txt.stem for txt in txt_folder.glob("*.txt")}

# Loop over PDFs and delete if corresponding TXT exists
deleted_count = 0
for pdf in pdf_folder.glob("*.pdf"):
    if pdf.stem in txt_basenames:
        print(f"🗑 Deleting {pdf.name} (already processed)")
        pdf.unlink()
        deleted_count += 1

print(f"\n✅ Done. Deleted {deleted_count} file(s).")

🗑 Deleting 2017-Dec-31 - Koninklijke Philips NV.pdf (already processed)
🗑 Deleting 2020 schibsted.pdf (already processed)
🗑 Deleting 2012-Dec-31 - ArcelorMittal SA.pdf (already processed)
🗑 Deleting 2014-Sep-30 - Carl Zeiss Meditec AG (1).pdf (already processed)
🗑 Deleting 2022-Dec-31 - Raiffeisen Bank Internati....pdf (already processed)
🗑 Deleting 2014-Dec-31 - Exor NV.pdf (already processed)
🗑 Deleting 2012-Dec-31 - Repsol SA.pdf (already processed)
🗑 Deleting 2018-Dec-31 - Vivendi SE (1).pdf (already processed)
🗑 Deleting 2014-Dec-31 - Axel Springer SE.pdf (already processed)
🗑 Deleting 2015-Dec-31 - ams-OSRAM AG.pdf (already processed)
🗑 Deleting 2012-Dec-31 - Sanoma Oyj 1.pdf (already processed)
🗑 Deleting 2019-Dec-31 - Amadeus IT Group SA.pdf (already processed)
🗑 Deleting 2022 Banco de Sabadell.pdf (already processed)
🗑 Deleting 2012-Dec-31 - SAP SE.pdf (already processed)
🗑 Deleting 2021-Sep-30 - Siemens AG.pdf (already processed)
🗑 Deleting 2016-Dec-31 - Mediaset Espana Comun

In [1]:
# 🛠 Required packages:
# pip install pytesseract pdf2image pillow tqdm

import pytesseract
from pdf2image import convert_from_path
from tqdm import tqdm
from pathlib import Path
import shutil
import gc  # For garbage collection

# 📁 Define folders
input_folder = Path("proc_final")            # Folder with input PDFs
output_folder = Path("txt_final")            # Output folder for TXT files
failed_folder = Path("failed_pdfs")          # Optional: store failed PDFs

output_folder.mkdir(parents=True, exist_ok=True)
failed_folder.mkdir(parents=True, exist_ok=True)

# 🧠 OCR Function
def ocr_pdf_to_txt(pdf_path, txt_path):
    try:
        print(f"🔍 Processing {pdf_path.name}")
        # Convert PDF to list of page images
        pages = convert_from_path(pdf_path)

        if not pages:
            print(f"⚠️ No pages found in {pdf_path.name}. Skipping.")
            return

        with open(txt_path, "w", encoding="utf-8") as out:
            for page_number, page in enumerate(
                tqdm(pages, desc=f"OCR: {pdf_path.name}", leave=False), start=1
            ):
                text = pytesseract.image_to_string(page)
                out.write(f"\n\n--- Page {page_number} ---\n\n")
                out.write(text)
                del page
                gc.collect()

    except Exception as e:
        print(f"❌ Failed to process {pdf_path.name}: {e}")
        try:
            shutil.move(str(pdf_path), failed_folder / pdf_path.name)
            print(f"📁 Moved to failed_pdfs: {pdf_path.name}")
        except Exception as move_error:
            print(f"⚠️ Could not move failed file: {move_error}")

# 🔁 Loop through all PDFs
pdf_files = list(input_folder.glob("*.pdf"))
print(f"📚 Found {len(pdf_files)} PDF(s) in {input_folder}")

for pdf_file in tqdm(pdf_files, desc="📦 Processing PDFs"):
    txt_file = output_folder / f"{pdf_file.stem}.txt"
    ocr_pdf_to_txt(pdf_file, txt_file)

print("\n✅ All PDFs processed.")

📚 Found 572 PDF(s) in proc_final


📦 Processing PDFs:   0%|          | 0/572 [00:00<?, ?it/s]

🔍 Processing 2017-Dec-31 - Koninklijke Philips NV.pdf


📦 Processing PDFs:   0%|          | 0/572 [00:23<?, ?it/s]


KeyboardInterrupt: 