In [4]:
from pathlib import Path

# Define folders
pdf_folder = Path("failed_pdfs")
txt_folder = Path("txt_final")

# Base names of TXT outputs that actually contain text.
# A zero-byte .txt can be left behind by an OCR run that failed before writing
# any page, so it must not count as proof that the PDF was processed —
# otherwise the only copy of a still-unprocessed PDF would be deleted below.
txt_basenames = {
    txt.stem
    for txt in txt_folder.glob("*.txt")
    if txt.is_file() and txt.stat().st_size > 0
}

# Delete every PDF in the failed folder whose (non-empty) TXT already exists
deleted_count = 0
for pdf in pdf_folder.glob("*.pdf"):
    if pdf.stem in txt_basenames:
        print(f"🗑 Deleting {pdf.name} (already processed)")
        pdf.unlink()
        deleted_count += 1

print(f"\n✅ Done. Deleted {deleted_count} file(s).")


✅ Done. Deleted 0 file(s).


In [3]:
# 🛠 Required packages:
# pip install pytesseract pdf2image pillow tqdm

import pytesseract
from pdf2image import convert_from_path
from tqdm import tqdm
from pathlib import Path
import shutil
import gc  # For garbage collection

# 📁 Folder layout for this OCR pass
input_folder = Path("proc_final")    # PDFs to run OCR on
output_folder = Path("txt_final")    # where the extracted .txt files are written
failed_folder = Path("failed_pdfs")  # PDFs whose OCR raised are moved here

# Both destination folders must exist before anything is written or moved
for _folder in (output_folder, failed_folder):
    _folder.mkdir(parents=True, exist_ok=True)

# 🧠 OCR Function
def ocr_pdf_to_txt(pdf_path, txt_path):
    """OCR every page of a PDF and write the text to ``txt_path``.

    The PDF is rasterised with ``convert_from_path`` and each page image is
    passed to ``pytesseract.image_to_string``. Every page's text is preceded
    by a ``--- Page N ---`` marker.

    The text is first written to a sibling ``<name>.part`` file and renamed to
    ``txt_path`` only after the last page succeeded, so a crash mid-document
    never leaves a truncated .txt that looks like a finished result (and never
    clobbers a good .txt from an earlier run).

    Args:
        pdf_path: ``pathlib.Path`` of the PDF to process.
        txt_path: Destination of the text file (``Path`` or ``str``).

    Returns:
        None. Errors are not raised: on any exception the failure is printed,
        the partial output is removed and the PDF is moved to ``failed_folder``.
        A PDF that yields no pages is skipped without writing anything.
    """
    partial_path = None
    try:
        print(f"🔍 Processing {pdf_path.name}")
        # Convert PDF to list of page images
        pages = convert_from_path(pdf_path)

        if not pages:
            print(f"⚠️ No pages found in {pdf_path.name}. Skipping.")
            return

        txt_path = Path(txt_path)
        # "<name>.txt.part" is not matched by "*.txt" globs, so an unfinished
        # file is never mistaken for a completed result.
        partial_path = txt_path.with_name(txt_path.name + ".part")

        with open(partial_path, "w", encoding="utf-8") as out:
            for page_number, page in enumerate(
                tqdm(pages, desc=f"OCR: {pdf_path.name}", leave=False), start=1
            ):
                text = pytesseract.image_to_string(page)
                out.write(f"\n\n--- Page {page_number} ---\n\n")
                out.write(text)
                # `del page` alone frees nothing while the list still holds the
                # image; clear the list slot too so the page can be reclaimed.
                pages[page_number - 1] = None
                del page
                gc.collect()

        # Publish the result only once every page has been written.
        partial_path.replace(txt_path)

    except Exception as e:
        print(f"❌ Failed to process {pdf_path.name}: {e}")
        if partial_path is not None:
            try:
                partial_path.unlink()
            except OSError:
                # Nothing was written yet, or the file cannot be removed;
                # cleanup is best effort and must not mask the original error.
                pass
        try:
            shutil.move(str(pdf_path), failed_folder / pdf_path.name)
            print(f"📁 Moved to failed_pdfs: {pdf_path.name}")
        except Exception as move_error:
            print(f"⚠️ Could not move failed file: {move_error}")

# 🔁 Gather every PDF in the input folder, then OCR them one at a time
pdf_files = [pdf for pdf in input_folder.glob("*.pdf")]
print(f"📚 Found {len(pdf_files)} PDF(s) in {input_folder}")

for pdf_file in tqdm(pdf_files, desc="📦 Processing PDFs"):
    # The text file keeps the PDF's base name and lives in the output folder
    txt_file = output_folder.joinpath(f"{pdf_file.stem}.txt")
    ocr_pdf_to_txt(pdf_file, txt_file)

print("\n✅ All PDFs processed.")

📚 Found 0 PDF(s) in proc_final


📦 Processing PDFs: 0it [00:00, ?it/s]


✅ All PDFs processed.





Second OCR attempt: render the pages with PyMuPDF instead of pdf2image

In [6]:
# 🛠 Required packages:
# pip install pymupdf pytesseract pillow tqdm

import fitz  # PyMuPDF
import pytesseract
from PIL import Image
from tqdm import tqdm
from pathlib import Path
import shutil
import io
import gc

# 📁 Folder layout for the PyMuPDF pass
input_folder = Path("failed_pdfs2")   # PDFs to run OCR on in this pass
output_folder = Path("txt_final")     # where the extracted .txt files are written
failed_folder = Path("failed_pdfs3")  # PDFs that fail in this pass are moved here

# Both destination folders must exist before anything is written or moved
for _folder in (output_folder, failed_folder):
    _folder.mkdir(parents=True, exist_ok=True)

# 🧠 OCR with PyMuPDF
def ocr_with_fitz(pdf_path, txt_path):
    """OCR a PDF via PyMuPDF rendering and delete the PDF on success.

    Each page is rendered at 300 dpi, decoded into a PIL image and passed to
    ``pytesseract.image_to_string``. The page texts are concatenated (no
    separator) and written to ``txt_path``; only then is the source PDF deleted.

    A page that fails is reported and skipped (best effort). If *no* page
    could be OCRed the whole file is treated as failed, so that an empty .txt
    is never written and the source PDF is never deleted for nothing.

    Args:
        pdf_path: ``pathlib.Path`` of the PDF to process.
        txt_path: Destination of the text file.

    Returns:
        None. Errors are not raised: on any failure (unopenable, encrypted,
        zero pages, OCR failed on every page, write error) the error is
        printed, the PDF is moved to ``failed_folder`` and a line is appended
        to ``hard_failed_pdfs.txt`` in the current working directory.
    """
    try:
        print(f"🔍 Processing {pdf_path.name}")

        # Try to open the file
        try:
            doc = fitz.open(pdf_path)
        except Exception as open_error:
            raise RuntimeError(f"File could not be opened: {open_error}") from open_error

        page_texts = []
        try:
            # Check for encryption
            if doc.is_encrypted:
                raise RuntimeError("PDF is encrypted and cannot be processed")

            # Check for zero pages
            if doc.page_count == 0:
                raise RuntimeError("PDF has 0 pages")

            for page in tqdm(doc, desc=f"OCR: {pdf_path.name}", leave=False):
                try:
                    pix = page.get_pixmap(dpi=300)
                    img = Image.open(io.BytesIO(pix.tobytes()))
                    page_texts.append(pytesseract.image_to_string(img))
                    del img, pix, page
                    gc.collect()
                except Exception as page_error:
                    print(f"⚠️ Failed to process page: {page_error}")
        finally:
            # Release the file handle before the PDF is deleted or moved;
            # an open handle blocks both operations on Windows.
            doc.close()

        # page_count > 0 was checked above, so an empty list means every
        # single page raised — do not pretend this file was processed.
        if not page_texts:
            raise RuntimeError("OCR failed on every page")

        # Save text
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write("".join(page_texts))

        # Delete after success
        pdf_path.unlink()
        print(f"✅ Done and deleted: {pdf_path.name}")

    except Exception as e:
        print(f"❌ Failed to process {pdf_path.name}: {e}")
        try:
            shutil.move(str(pdf_path), failed_folder / pdf_path.name)
            # Report the folder actually used rather than a hard-coded name
            print(f"📁 Moved to {failed_folder}: {pdf_path.name}")
        except Exception as move_error:
            print(f"⚠️ Could not move failed file: {move_error}")
        with open("hard_failed_pdfs.txt", "a", encoding="utf-8") as log:
            log.write(f"{pdf_path.name}: {e}\n")

# 🔁 Gather every PDF in the input folder, then OCR them one at a time
pdf_files = [pdf for pdf in input_folder.glob("*.pdf")]
print(f"📚 Found {len(pdf_files)} PDF(s) in {input_folder}")

for pdf_file in tqdm(pdf_files, desc="📦 Processing PDFs"):
    # The text file keeps the PDF's base name and lives in the output folder
    txt_file = output_folder.joinpath(f"{pdf_file.stem}.txt")
    ocr_with_fitz(pdf_file, txt_file)

print("\n✅ All PDFs processed using PyMuPDF.")

📚 Found 27 PDF(s) in failed_pdfs2


📦 Processing PDFs: 100%|██████████| 27/27 [00:00<00:00, 247.14it/s]

🔍 Processing 2016-Dec-31 - Volvo AB.pdf
❌ Failed to process 2016-Dec-31 - Volvo AB.pdf: File could not be opened: Failed to open file 'failed_pdfs2/2016-Dec-31 - Volvo AB.pdf'.
📁 Moved to failed_pdfs: 2016-Dec-31 - Volvo AB.pdf
🔍 Processing 2017-Dec-31 - Henkel AG & Co KGaA.pdf
❌ Failed to process 2017-Dec-31 - Henkel AG & Co KGaA.pdf: File could not be opened: Failed to open file 'failed_pdfs2/2017-Dec-31 - Henkel AG & Co KGaA.pdf'.
📁 Moved to failed_pdfs: 2017-Dec-31 - Henkel AG & Co KGaA.pdf
🔍 Processing 2018-Dec-31 - Repsol SA.pdf
❌ Failed to process 2018-Dec-31 - Repsol SA.pdf: File could not be opened: Failed to open file 'failed_pdfs2/2018-Dec-31 - Repsol SA.pdf'.
📁 Moved to failed_pdfs: 2018-Dec-31 - Repsol SA.pdf
🔍 Processing 2018-Dec-31 - Nestle SA.pdf
❌ Failed to process 2018-Dec-31 - Nestle SA.pdf: File could not be opened: Failed to open file 'failed_pdfs2/2018-Dec-31 - Nestle SA.pdf'.
📁 Moved to failed_pdfs: 2018-Dec-31 - Nestle SA.pdf
🔍 Processing 2015-Aug-31 - Sodexo SA


