In [1]:
!pip install PyMuPDF opencv-python numpy tqdm pytesseract pillow



In [2]:
import os
import fitz  # PyMuPDF
import cv2
import numpy as np
from tqdm.notebook import tqdm
import json
import pandas as pd
import pytesseract
from datetime import datetime
from IPython.display import display, Image as IPImage
import errno

In [3]:
# Raw strings keep the Windows separators as literal backslashes. In a normal
# string, "\G", "\R" and "\D" are invalid escape sequences that newer Python
# versions warn about.
INPUT_PDF_DIR = r"D:\GERMANY\Research Lab 2025\Data"  # folder scanned for input PDFs
EXISTING_OUTPUT_DIR = r"D:\GERMANY\Research Lab 2025\Data_img"  # root folder for all outputs

# Point pytesseract at the Windows install of Tesseract, but only when that
# binary actually exists. Otherwise pytesseract's own default command is left
# untouched, so a Tesseract available on PATH keeps working on other machines.
TESSERACT_EXE = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
if os.path.isfile(TESSERACT_EXE):
    pytesseract.pytesseract.tesseract_cmd = TESSERACT_EXE

In [4]:
# --- Rendering / OCR settings ---
DPI = 300                             # render resolution; 150-600 is the recommended range
LANGUAGE = "deu"                      # Tesseract language code: German ("eng" for English)
TESSERACT_CONFIG = '--psm 6 --oem 3'  # extra Tesseract flags (page-segmentation and engine mode)

# --- Feature switches ---
SAVE_IMAGES = True          # False: do not write page images to disk
EXTRACT_TEXT = True         # False: skip text extraction / OCR entirely
PREVIEW_FIRST_PAGE = False  # True: display page 1 of each PDF inline
MAX_TEXT_PREVIEW = 500      # maximum number of characters printed in the text preview

In [5]:
# Every artefact of a run lives directly under EXISTING_OUTPUT_DIR.
OUTPUT_IMAGE_DIR = os.path.join(EXISTING_OUTPUT_DIR, "images")
OUTPUT_JSON_PATH, OUTPUT_CSV_PATH, OUTPUT_STATS_PATH = (
    os.path.join(EXISTING_OUTPUT_DIR, file_name)
    for file_name in ("results.json", "results.csv", "stats.json")
)


In [6]:
# Create the output folder (plus the image sub-folder when images are saved)
# up front, so a permissions problem surfaces before any PDF is processed.
try:
    os.makedirs(EXISTING_OUTPUT_DIR, exist_ok=True)
    if SAVE_IMAGES:
        os.makedirs(OUTPUT_IMAGE_DIR, exist_ok=True)
except PermissionError as exc:
    raise PermissionError(f"Cannot access output directory {EXISTING_OUTPUT_DIR}: {exc}")
else:
    print(f"Output will be saved to: {EXISTING_OUTPUT_DIR}")


Output will be saved to: D:\GERMANY\Research Lab 2025\Data_img


In [7]:
def preprocess_for_ocr(img):
    """Prepare a BGR page image for Tesseract.

    Pipeline: grayscale -> adaptive Gaussian threshold (binarise) ->
    non-local-means denoising -> 3x3 sharpening filter.

    Args:
        img: BGR image array (it is converted with ``COLOR_BGR2GRAY``).

    Returns:
        The single-channel, binarised, denoised and sharpened image.
    """
    # Classic 3x3 sharpening kernel (centre weight 5, 4-neighbours -1).
    sharpen_kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])

    grayscale = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Block size 11 and constant 2 for the local threshold computation.
    binary = cv2.adaptiveThreshold(
        grayscale, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
    )
    cleaned = cv2.fastNlMeansDenoising(binary, h=30)
    sharpened = cv2.filter2D(cleaned, -1, sharpen_kernel)
    return sharpened

def pdf_to_cv2(page, dpi=300):
    """Render a PDF page and return it as a BGR OpenCV image.

    Args:
        page: Page object exposing ``get_pixmap(dpi=...)``.
        dpi: Resolution passed to ``get_pixmap``.

    Returns:
        ``uint8`` array of shape ``(height, width, 3)`` in BGR channel order.
    """
    pixmap = page.get_pixmap(dpi=dpi)
    # The sample buffer is interpreted as 3 interleaved channels per pixel.
    target_shape = (pixmap.height, pixmap.width, 3)
    rgb = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(target_shape)
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    return bgr

def extract_text(page, img, min_text_length=50):
    """Return the text of a page, using OCR when the PDF has little embedded text.

    The embedded text layer is read first. If it is shorter than
    ``min_text_length`` characters, the rendered image is preprocessed and
    passed to Tesseract. The embedded text is kept whenever OCR fails or
    returns nothing, so a short but valid text layer is never discarded.

    Args:
        page: Page object exposing ``get_text("text")``.
        img: BGR image of the same page, used only for the OCR fallback.
        min_text_length: Embedded-text length below which OCR is attempted.

    Returns:
        The extracted text (stripped), or ``""`` if nothing could be extracted.
        Errors are printed, never raised.
    """
    try:
        text = page.get_text("text").strip()
    except Exception as e:
        print(f"Text extraction error: {str(e)}")
        text = ""  # still give OCR a chance below

    if len(text) >= min_text_length:
        return text

    try:
        processed_img = preprocess_for_ocr(img)
        ocr_text = pytesseract.image_to_string(
            processed_img,
            lang=LANGUAGE,
            config=TESSERACT_CONFIG
        ).strip()
    except Exception as e:
        print(f"Text extraction error: {str(e)}")
        return text  # keep whatever the text layer provided

    # An empty OCR result must not overwrite a non-empty text layer.
    return ocr_text or text

def safe_save(data, path):
    """Write ``data`` to ``path`` as JSON or CSV, chosen by the file extension.

    A direct write is attempted first. If that raises ``PermissionError``, the
    data is written to ``path + ".temp"`` and moved over ``path`` with
    ``os.replace``. A temp file left behind by a failed fallback is removed.

    Args:
        data: JSON-serialisable object for ``.json`` paths; an object with a
            pandas-style ``to_csv`` method for ``.csv`` paths.
        path: Destination path ending in ``.json`` or ``.csv``.

    Returns:
        True if the file was written. False if the extension is unsupported
        (nothing is written) or if the fallback write failed as well.

    Raises:
        Any exception other than ``PermissionError`` raised by the first,
        direct write attempt is propagated unchanged.
    """
    def _write(target):
        # The format always follows the final ``path``, so the temp file of
        # the fallback is written in the same format.
        if path.endswith('.json'):
            with open(target, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
        else:
            data.to_csv(target, index=False, encoding='utf-8')

    if not path.endswith(('.json', '.csv')):
        # Previously this case reported success without writing anything.
        print(f"Unsupported file type for {path}; nothing was written")
        return False

    try:
        _write(path)
        return True
    except PermissionError:
        print(f"Permission denied for {path}. Trying alternative...")

    temp_path = path + ".temp"
    try:
        _write(temp_path)
        os.replace(temp_path, path)  # Atomic swap of the finished temp file
        return True
    except Exception as e:
        print(f"Failed to save {path}: {str(e)}")
        # Do not leave a half-written temp file next to the target.
        try:
            os.remove(temp_path)
        except OSError:
            pass
        return False

In [8]:
def _write_png(img, path):
    """Encode ``img`` as PNG and write the bytes to ``path``; raise OSError on failure."""
    # cv2.imwrite signals failure through its return value rather than an
    # exception, and is reported to mishandle non-ASCII paths on Windows.
    # Encoding in memory and writing through Python's own file API makes
    # failures visible and leaves path handling to Python.
    ok, encoded = cv2.imencode('.png', img)
    if not ok:
        raise OSError("PNG encoding failed")
    with open(path, 'wb') as fh:
        fh.write(encoded.tobytes())


def process_pdf(pdf_path):
    """Render every page of one PDF, optionally save it as PNG and extract its text.

    Behaviour is controlled by the module-level settings ``DPI``,
    ``SAVE_IMAGES``, ``EXTRACT_TEXT``, ``PREVIEW_FIRST_PAGE`` and
    ``MAX_TEXT_PREVIEW``; images go to ``OUTPUT_IMAGE_DIR``.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        A list with one dict per page, with the keys ``pdf`` (file name
        without extension), ``page`` (1-based), ``image_path`` (None when the
        image was not saved), ``text``, ``text_length`` and ``timestamp``
        (ISO-8601 string).

    Raises:
        Whatever ``fitz.open`` or page rendering raises; the document is
        closed in every case.
    """
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    results = []

    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            img = pdf_to_cv2(page, DPI)

            # Save image
            img_path = None
            if SAVE_IMAGES:
                img_path = os.path.join(OUTPUT_IMAGE_DIR, f"{base_name}_p{page_num+1}.png")
                try:
                    _write_png(img, img_path)
                except Exception as e:
                    print(f"Couldn't save image {img_path}: {str(e)}")
                    img_path = None  # never record a path to a file that does not exist

            # Extract text
            text = extract_text(page, img) if EXTRACT_TEXT else ""

            # Store results
            results.append({
                "pdf": base_name,
                "page": page_num + 1,
                "image_path": img_path,
                "text": text,
                "text_length": len(text),
                "timestamp": datetime.now().isoformat()
            })

            # Preview first page
            if PREVIEW_FIRST_PAGE and page_num == 0:
                _, png_buffer = cv2.imencode('.png', img)
                display(IPImage(data=png_buffer.tobytes(), width=600))
                print(f"Preview: {base_name} (Page 1)")
                if EXTRACT_TEXT and text:
                    print(f"\nText length: {len(text)} characters")
                    print("\nText preview:\n" + text[:MAX_TEXT_PREVIEW] + ("..." if len(text) > MAX_TEXT_PREVIEW else ""))
    finally:
        # Release the file handle even when a page fails; the batch opens
        # hundreds of documents in one kernel session.
        doc.close()

    return results

In [9]:
def batch_process():
    """Process every PDF in ``INPUT_PDF_DIR`` and persist the combined results.

    Each PDF is handled by ``process_pdf``; a PDF that raises is reported and
    skipped. The page records are written as JSON and CSV, together with a
    small stats file. All three writes are always attempted, even if an
    earlier one fails.

    Returns:
        A DataFrame with one row per processed page, or None when no page
        produced a result.

    Raises:
        FileNotFoundError: If the input directory contains no ``.pdf`` files.
    """
    # Sorted so that processing order (and therefore row order in the output)
    # does not depend on the arbitrary order of os.listdir.
    pdf_files = sorted(f for f in os.listdir(INPUT_PDF_DIR) if f.lower().endswith('.pdf'))
    if not pdf_files:
        raise FileNotFoundError(f"No PDF files found in {INPUT_PDF_DIR}")

    all_results = []
    failed_files = []
    print(f"Processing {len(pdf_files)} PDF files...")

    for pdf_file in tqdm(pdf_files):
        pdf_path = os.path.join(INPUT_PDF_DIR, pdf_file)
        try:
            all_results.extend(process_pdf(pdf_path))
        except Exception as e:
            failed_files.append(pdf_file)
            print(f"\nError processing {pdf_file}: {str(e)}")

    if failed_files:
        print(f"\n{len(failed_files)} of {len(pdf_files)} PDF files could not be processed")

    if not all_results:
        print("\nNo results were generated")
        return None

    # Save results
    df = pd.DataFrame(all_results)
    stats = {
        # Cast to built-in types so the values are plain JSON numbers.
        "total_pdfs": int(df['pdf'].nunique()),
        "total_pages": len(df),
        "average_text_length": float(df['text_length'].mean()),
        "processing_time": datetime.now().isoformat()
    }

    # Run all three saves before combining the outcomes: chaining them with
    # `and` would skip the remaining files after the first failure.
    save_outcomes = [
        safe_save(all_results, OUTPUT_JSON_PATH),
        safe_save(df, OUTPUT_CSV_PATH),
        safe_save(stats, OUTPUT_STATS_PATH),
    ]

    if all(save_outcomes):
        print("\nResults saved successfully:")
        print(f"- JSON: {OUTPUT_JSON_PATH}")
        print(f"- CSV: {OUTPUT_CSV_PATH}")
        print(f"- Images: {OUTPUT_IMAGE_DIR}")
        print(f"- Stats: {OUTPUT_STATS_PATH}")
    else:
        print("\nSome files couldn't be saved due to permission issues")

    return df

# Run the whole batch. Inside a notebook kernel __name__ is "__main__", so the
# guard only matters if this code is ever imported as a module instead.
if __name__ == "__main__":
    final_results = batch_process()
    # batch_process() returns None when no page produced a result.
    if final_results is not None:
        print(f"\nProcessing complete! Results saved to {EXISTING_OUTPUT_DIR}")

Processing 272 PDF files...


  0%|          | 0/272 [00:00<?, ?it/s]


Results saved successfully:
- JSON: D:\GERMANY\Research Lab 2025\Data_img\results.json
- CSV: D:\GERMANY\Research Lab 2025\Data_img\results.csv
- Images: D:\GERMANY\Research Lab 2025\Data_img\images
- Stats: D:\GERMANY\Research Lab 2025\Data_img\stats.json

Processing complete! Results saved to D:\GERMANY\Research Lab 2025\Data_img
