# PDF to PNG Image Converter

A Python utility that batch converts PDF files into high-quality PNG images. Each page of every PDF is extracted and saved as a separate image file, organized by PDF name.

**Features:**
- 🔄 Batch process multiple PDFs
- 📁 Auto-organized output folders
- 📊 Configurable 200 DPI quality
- 📝 Detailed logging with progress
- ⚠️ Robust error handling

**Usage:** Place PDFs in `./PDF/` folder, run script, find images in `./media/{pdf_name}/`

**Setup:**
```bash
pip install pdf2image
```
---

In [None]:
import os
from pathlib import Path
from pdf2image import convert_from_path
import logging

# ------------------------------------------------------------
# LOGGING AND DIRECTORY SETUP
# ------------------------------------------------------------
# Timestamped, level-tagged records at INFO and above.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# PDFs are read from PDF_DIR; extracted page images go under MEDIA_ROOT.
PDF_DIR, MEDIA_ROOT = Path("./PDF"), Path("./media")

# Create the output root up front; an already existing folder is fine.
MEDIA_ROOT.mkdir(exist_ok=True, parents=True)

# ------------------------------------------------------------
# FUNCTION: Convert one PDF to PNG images
# ------------------------------------------------------------
def process_pdf_extract_images(pdf_path: Path, dpi: int = 200) -> bool:
    """Render every page of a PDF to a PNG file in its own output folder.

    Pages are written to ``MEDIA_ROOT/<pdf stem>/page_<n>.png`` with ``n``
    starting at 1. Any failure (including failing to create the output
    folder) is logged with its traceback and reported through the return
    value instead of being raised, so one bad PDF cannot abort a batch.

    Args:
        pdf_path: Path of the PDF file to convert.
        dpi: Rendering resolution handed to ``convert_from_path``. Defaults
            to 200, the value that used to be hard-coded here.

    Returns:
        True when all pages were saved, False when an error occurred.
    """
    logging.info(f"üìÑ Received file: {pdf_path.name}")

    fmt = "png"  # Used for both the file extension and the save format.

    try:
        # Folder to store output images (e.g., ./media/my_pdf/). Created
        # inside the try block so a filesystem error is logged and turned
        # into a False result like every other failure.
        pdf_output_folder = MEDIA_ROOT / pdf_path.stem
        pdf_output_folder.mkdir(parents=True, exist_ok=True)

        logging.info(f"‚öôÔ∏è Converting '{pdf_path.name}' to images...")
        images = convert_from_path(pdf_path, dpi=dpi)
        if not images:
            raise FileNotFoundError("‚ùå No images were created from the PDF.")

        for i, image in enumerate(images, start=1):
            image_path = pdf_output_folder / f"page_{i}.{fmt}"
            image.save(image_path, fmt.upper())
            logging.info(f"‚úÖ Saved image: {image_path}")

        logging.info(f"üéâ All {len(images)} pages extracted for '{pdf_path.name}'")
        return True

    except Exception as e:
        # Batch boundary: log the full traceback and keep going with the
        # next PDF rather than propagating.
        logging.error(f"‚ùå Error processing '{pdf_path.name}': {e}", exc_info=True)
        return False


# ------------------------------------------------------------
# MAIN LOOP: Process all PDFs in ./PDF/
# ------------------------------------------------------------
if __name__ == "__main__":
    # SystemExit is raised directly: the bare ``exit`` helper is injected by
    # the ``site`` module for interactive sessions and is not guaranteed to
    # exist in every runtime.
    if not PDF_DIR.exists():
        logging.error(f"‚ùå The folder '{PDF_DIR}' does not exist.")
        raise SystemExit(1)

    pdf_files = sorted(PDF_DIR.glob("*.pdf"))
    if not pdf_files:
        logging.warning(f"‚ö†Ô∏è No PDF files found in '{PDF_DIR}'")
        raise SystemExit(0)

    logging.info(f"üìÇ Found {len(pdf_files)} PDF file(s) to process...")

    # Names of PDFs whose conversion returned False, reported at the end.
    failed = []

    for idx, pdf_file in enumerate(pdf_files, start=1):
        logging.info(f"\n{'='*60}\nProcessing {idx}/{len(pdf_files)}: {pdf_file.name}")
        success = process_pdf_extract_images(pdf_file)

        if success:
            logging.info(f"‚úÖ Finished: {pdf_file.name}")
        else:
            logging.warning(f"‚ö†Ô∏è Failed: {pdf_file.name}")
            failed.append(pdf_file.name)

    logging.info("\nüèÅ All PDFs processed.")
    if failed:
        logging.warning(
            "%d of %d PDF(s) failed: %s",
            len(failed), len(pdf_files), ", ".join(failed),
        )


# Decree Image Processor

Processes extracted PDF images from official journals by removing headers, cropping white borders, and splitting pages into left/right columns for OCR and document analysis.

**Features:**
- 🎯 Automatic white border removal
- 📋 Configurable header removal (4.5% by default)
- 📖 Split two-column pages into individual images
- 🔄 Batch process entire image directories
- 📊 Preserves page numbering in filenames

**Input:** Images from previous PDF converter script (`./media/{pdf_name}/`)  
**Output:** Processed images in `./media/{pdf_name}_output/` with `_left` and `_right` suffixes

**Setup:**
```bash
pip install opencv-python numpy
```
---

In [None]:
import cv2
import numpy as np
import os
import glob

# === CONFIGURATION ===
# Folder holding the page images to process and folder receiving the split
# halves. The previous values were raw strings ending in a backslash
# (r"\media\"), which Python rejects as an unterminated string literal, so
# plain forward-slash paths are used instead. Point these at the folder of
# the PDF you want to process.
input_folder = "./media/"
output_folder = "./media/"
os.makedirs(output_folder, exist_ok=True)

# Fraction of the border-trimmed page height removed from the top
# (e.g., 0.03 = 3%, 0.05 = 5%).
HEADER_RATIO = 0.045


def trim_white_border(img):
    """Crop a BGR image to the bounding box of its non-white pixels.

    Pixels whose grayscale value is above 250 count as white. Returns the
    cropped view, or None when the image has no darker pixel at all.
    """
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 250, 255, cv2.THRESH_BINARY_INV)
    coords = cv2.findNonZero(thresh)
    if coords is None:
        return None

    x, y, w, h = cv2.boundingRect(coords)
    return img[y:y + h, x:x + w]


def split_columns(page, header_ratio=HEADER_RATIO):
    """Drop the top ``header_ratio`` of the rows and split the rest in two.

    The number of header rows is ``int(rows * header_ratio)``. The remaining
    rows are cut at ``width // 2``, so for an odd width the right half is
    one column wider.

    Returns:
        A ``(left_half, right_half)`` tuple of array views.
    """
    header_height = int(page.shape[0] * header_ratio)
    body = page[header_height:, :]
    mid = body.shape[1] // 2
    return body[:, :mid], body[:, mid:]


def process_images(src_folder, dst_folder, header_ratio=HEADER_RATIO):
    """Split every ``*.png`` in ``src_folder`` into left/right column images.

    Each page is border-trimmed, has its header removed and is written to
    ``dst_folder`` as ``<name>_left.png`` and ``<name>_right.png``. Progress
    and problems are reported with ``print``.
    """
    image_paths = sorted(glob.glob(os.path.join(src_folder, "*.png")))
    if not image_paths:
        print(f"‚ö†Ô∏è No image files found in folder: {src_folder}")
        return

    for file_path in image_paths:
        page_name = os.path.splitext(os.path.basename(file_path))[0]

        # Input and output folders may be the same; never re-split halves
        # produced by an earlier run.
        if page_name.endswith(("_left", "_right")):
            continue

        img = cv2.imread(file_path)
        if img is None:
            print(f"‚ùå Could not read image: {file_path}")
            continue

        cropped = trim_white_border(img)
        if cropped is None:
            print(f"‚ö†Ô∏è Skipping blank image: {file_path}")
            continue

        left_half, right_half = split_columns(cropped, header_ratio)
        if left_half.size == 0 or right_half.size == 0:
            # Writing an empty array would fail inside OpenCV.
            print(f"Skipping image too small to split: {file_path}")
            continue

        left_path = os.path.join(dst_folder, f"{page_name}_left.png")
        right_path = os.path.join(dst_folder, f"{page_name}_right.png")
        saved_left = cv2.imwrite(left_path, left_half)
        saved_right = cv2.imwrite(right_path, right_half)
        if not (saved_left and saved_right):
            print(f"Could not write output images for: {file_path}")
            continue

        print(f"‚úÖ Processed page {page_name}")

    print("\nüéâ All pages processed successfully!")


# === PROCESS EACH IMAGE ===
if __name__ == "__main__":
    process_images(input_folder, output_folder, HEADER_RATIO)


# Google Vision OCR Processor

Extracts text from processed decree images using Google Cloud Vision API with full support for Arabic right-to-left (RTL) text. Outputs both structured JSON and merged plain text files.

**Features:**
- 🧠 Google Cloud Vision API OCR
- 🌍 Full Arabic RTL support with directional markers
- 📄 Batch process entire image folders
- 💾 Dual output format (JSON + TXT)
- ⏱️ Rate-limited API calls (1 second delay)
- 📝 Comprehensive logging and error handling

**Input:** Processed images from decree processor (`./media/{pdf_name}/`)  
**Output:** 
- `{folder_name}.json` - Structured per-page OCR results
- `{folder_name}.txt` - Merged Arabic text (RTL formatted)

**Setup:**
```bash
pip install google-cloud-vision
# 1. Create Google Cloud project & enable Vision API
# 2. Download service account credentials as credentials.json
# 3. Place credentials.json in script directory
```

**Configuration:**
- Input folder: `./media/` by default; every subfolder is processed (modify the `media_root` variable to change it)
- API rate limit: 1 second per image (adjust `time.sleep()`)

In [None]:
import os
import io
import re
import json
import time
import logging
from google.cloud import vision
from pathlib import Path

# ------------------------------------------------------------
# LOGGING, CREDENTIALS AND API CLIENT
# ------------------------------------------------------------
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# The service-account key is looked up in the current working directory.
credentials_path = os.path.join(os.getcwd(), "credentials.json")
if not os.path.exists(credentials_path):
    raise FileNotFoundError("‚ùå credentials.json not found in current folder")

# Export the key location before the client is created.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

# Create the Vision client once; a failure is logged and then propagated.
try:
    client = vision.ImageAnnotatorClient()
except Exception:
    logging.exception("‚ùå Failed to initialize Vision client")
    raise
else:
    logging.info("‚úÖ Google Vision API client initialized")

# ------------------------------------------------------------
# MAIN OCR FUNCTION
# ------------------------------------------------------------
def _page_sort_key(image_path: Path):
    """Sort key: number found in ``page_<n>`` first, then the file name."""
    match = re.search(r'page_(\d+)', image_path.name)
    page_number = int(match.group(1)) if match else float('inf')
    # The name breaks ties so files sharing a page number (or lacking one)
    # come out in the same order on every run, independent of iterdir().
    return (page_number, image_path.name)


def process_folder(folder_path: Path, delay_seconds: float = 1.0):
    """Run Vision OCR on every image in a folder and save the results.

    Images (``.png``, ``.jpg``, ``.jpeg``) are processed in page-number
    order; files without a ``page_<n>`` number go last. The text of each
    image is wrapped in RLE/PDF directional markers and stored under the
    image's file name. Results are written into the same folder as
    ``<folder name>.json`` (per image) and ``<folder name>.txt`` (merged).
    Nothing is written when no image yields text.

    Args:
        folder_path: Folder containing the images to process.
        delay_seconds: Pause after each image attempt, used as a simple
            rate limit. Defaults to 1.0, the previously hard-coded value.

    Returns:
        None. Per-image and save errors are logged, not raised.
    """
    logging.info(f"\nüìÇ Processing folder: {folder_path.name}")

    # Find all images
    images = sorted(
        (img for img in folder_path.iterdir() if img.suffix.lower() in [".png", ".jpg", ".jpeg"]),
        key=_page_sort_key,
    )

    if not images:
        logging.warning(f"‚ö†Ô∏è No images found in {folder_path}")
        return

    ocr_results = {}

    for idx, image_path in enumerate(images, 1):
        logging.info(f"üñºÔ∏è ({idx}/{len(images)}) {image_path.name}")

        try:
            content = image_path.read_bytes()

            image = vision.Image(content=content)
            response = client.document_text_detection(image=image)

            if response.error.message:
                logging.error(f"‚ùå API Error for {image_path.name}: {response.error.message}")
                continue

            full_text = response.full_text_annotation.text.strip()
            if not full_text:
                logging.warning(f"‚ö†Ô∏è No text found in {image_path.name}")
                continue

            # Force Arabic right-to-left display
            rtl_marker = "\u202B"   # Right-to-left embedding (RLE)
            pop_marker = "\u202C"   # Pop directional formatting
            full_text_rtl = rtl_marker + full_text + pop_marker

            ocr_results[image_path.name] = full_text_rtl
            logging.info(f"‚úÖ Extracted {len(full_text)} characters from {image_path.name}")

        except Exception:
            # Keep going: one unreadable image must not stop the folder.
            logging.exception(f"‚ùå Error processing {image_path.name}")

        finally:
            # Throttle after every attempt. The error and empty-result
            # paths used to skip the pause, so failures were followed by an
            # immediate next request.
            time.sleep(delay_seconds)

    if not ocr_results:
        logging.warning(f"‚ö†Ô∏è No OCR results for {folder_path.name}")
        return

    # Save results
    json_path = folder_path / f"{folder_path.name}.json"
    txt_path = folder_path / f"{folder_path.name}.txt"

    try:
        # Save JSON (for structured analysis)
        with open(json_path, "w", encoding="utf-8") as jf:
            json.dump(ocr_results, jf, ensure_ascii=False, indent=4)
        logging.info(f"üíæ JSON saved: {json_path}")

        # Save merged text (Arabic right-to-left, no page headers), in the
        # same order the images were processed.
        with open(txt_path, "w", encoding="utf-8") as tf:
            for image_path in images:
                if image_path.name in ocr_results:
                    tf.write(ocr_results[image_path.name] + "\n\n")
        logging.info(f"üíæ Merged Arabic text saved: {txt_path}")

    except Exception:
        logging.exception(f"‚ùå Failed to save OCR results for {folder_path.name}")

# ------------------------------------------------------------
# MAIN LOOP — RUN OCR ON EVERY SUBFOLDER OF THE MEDIA ROOT
# ------------------------------------------------------------
media_root = Path("./media/")

# Without the media root there is nothing to scan, so fail loudly.
if not media_root.exists():
    raise FileNotFoundError("‚ùå The './media' folder does not exist")

# Only directories directly below the media root are processed.
subfolders = list(filter(Path.is_dir, media_root.iterdir()))
if subfolders:
    logging.info(f"üìÅ Found {len(subfolders)} folder(s) to process under './media'")
else:
    logging.warning("‚ö†Ô∏è No subfolders found in './media'. Nothing to process.")

# With no subfolders this loop simply does nothing.
for folder in subfolders:
    process_folder(folder)

logging.info("\nüèÅ OCR extraction complete ‚Äî Arabic text written right-to-left and saved.")

# Google Vision OCR Processor (Two-Column Optimized)

Extracts text from two-column decree images using Google Cloud Vision API with intelligent right-before-left column ordering. Optimized for official journal documents with Arabic RTL support.

**Features:**
- 🧠 Google Cloud Vision API OCR
- 📖 Intelligent two-column processing (right before left)
- 🌍 Full Arabic right-to-left (RTL) support
- 📄 Batch process entire image folders
- 💾 Dual output format (JSON + TXT)
- ⚡ Optimized rate-limiting (0.2s per image)
- 📝 Comprehensive logging and error handling

**Input:** Two-column processed images from decree processor (`./media/{folder}/page_*_left.png`, `page_*_right.png`)  
**Output:** 
- `{folder_name}.json` - Structured per-image OCR results
- `{folder_name}.txt` - Merged Arabic text (right→left column order)

**Setup:**
```bash
pip install google-cloud-vision
# 1. Create Google Cloud project & enable Vision API
# 2. Download service account credentials as credentials.json
# 3. Place credentials.json in script directory
```

**Key Configuration:**
- Input folder: `./media/` by default; every subfolder is processed (modify the `media_root` variable to change it)
- API rate limit: 0.2 seconds per image (adjust `time.sleep(0.2)`)
- Column order: Right pages processed before left pages

**Processing Order Example:**
```
Input files:           Processing order:
page_1_right.png  →   1. page_1_right.png
page_1_left.png   →   2. page_1_left.png
page_2_right.png  →   3. page_2_right.png
page_2_left.png   →   4. page_2_left.png
```
---


In [None]:
import os
import io
import re
import json
import time
import logging
from google.cloud import vision
from pathlib import Path

# ------------------------------------------------------------
# LOGGING, CREDENTIALS AND API CLIENT
# ------------------------------------------------------------
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    level=logging.INFO,
)

# The service-account key is looked up in the current working directory and
# its location is exported before the client is created.
credentials_path = os.path.join(os.getcwd(), "credentials.json")
if os.path.exists(credentials_path):
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path
else:
    raise FileNotFoundError("‚ùå credentials.json not found in current folder")

# Create the Vision client once; a failure is logged and then propagated.
try:
    client = vision.ImageAnnotatorClient()
except Exception:
    logging.exception("‚ùå Failed to initialize Vision client")
    raise
else:
    logging.info("‚úÖ Google Vision API client initialized")


# ------------------------------------------------------------
# MAIN OCR FUNCTION
# ------------------------------------------------------------
def process_folder(folder_path: Path, delay_seconds: float = 0.2):
    """Run Vision OCR on split column images, right column before left.

    Images (``.png``, ``.jpg``, ``.jpeg``) are ordered by the number in
    ``page_<n>``; within one page, files whose name contains "right" come
    before the others. The text of each image is wrapped in RLE/PDF
    directional markers and stored under the image's file name. Results are
    written into the same folder as ``<folder name>.json`` (per image) and
    ``<folder name>.txt`` (merged). Nothing is written when no image yields
    text.

    Args:
        folder_path: Folder containing the images to process.
        delay_seconds: Pause after each image attempt, used as a simple
            rate limit. Defaults to 0.2, the previously hard-coded value.

    Returns:
        None. Per-image and save errors are logged, not raised.
    """
    logging.info(f"\nüìÇ Processing folder: {folder_path.name}")

    # Find all images
    images = [img for img in folder_path.iterdir() if img.suffix.lower() in [".png", ".jpg", ".jpeg"]]
    if not images:
        logging.warning(f"‚ö†Ô∏è No images found in {folder_path}")
        return

    def sort_key(img):
        """Order by page number, then right-before-left, then file name."""
        match = re.search(r'page_(\d+)', img.name)
        page_num = int(match.group(1)) if match else float('inf')
        right_first = 0 if "right" in img.name.lower() else 1
        # The name is the final tie-breaker so files without a page number
        # (all keyed as infinity) keep a stable order across runs instead
        # of inheriting whatever order iterdir() produced.
        return (page_num, right_first, img.name)

    images.sort(key=sort_key)

    ocr_results = {}

    for idx, image_path in enumerate(images, 1):
        logging.info(f"üñºÔ∏è ({idx}/{len(images)}) {image_path.name}")

        try:
            content = image_path.read_bytes()

            image = vision.Image(content=content)
            response = client.document_text_detection(image=image)

            if response.error.message:
                logging.error(f"‚ùå API Error for {image_path.name}: {response.error.message}")
                continue

            full_text = response.full_text_annotation.text.strip()
            if not full_text:
                logging.warning(f"‚ö†Ô∏è No text found in {image_path.name}")
                continue

            # Force right-to-left display for Arabic
            rtl_marker = "\u202B"   # Right-to-left embedding
            pop_marker = "\u202C"   # Pop directional formatting
            full_text_rtl = rtl_marker + full_text + pop_marker

            ocr_results[image_path.name] = full_text_rtl
            logging.info(f"‚úÖ Extracted {len(full_text)} characters from {image_path.name}")

        except Exception:
            # Keep going: one unreadable image must not stop the folder.
            logging.exception(f"‚ùå Error processing {image_path.name}")

        finally:
            # Throttle after every attempt. The error and empty-result
            # paths used to skip the pause, so failures were followed by an
            # immediate next request. Pass delay_seconds=0 to disable.
            time.sleep(delay_seconds)

    if not ocr_results:
        logging.warning(f"‚ö†Ô∏è No OCR results for {folder_path.name}")
        return

    # ------------------------------------------------------------
    # SAVE RESULTS
    # ------------------------------------------------------------
    json_path = folder_path / f"{folder_path.name}.json"
    txt_path = folder_path / f"{folder_path.name}.txt"

    try:
        # Save structured data (JSON)
        with open(json_path, "w", encoding="utf-8") as jf:
            json.dump(ocr_results, jf, ensure_ascii=False, indent=4)
        logging.info(f"üíæ JSON saved: {json_path}")

        # Merge all text (no page headers) in processing order, i.e. right
        # column before left column for each page.
        with open(txt_path, "w", encoding="utf-8") as tf:
            for image_path in images:
                if image_path.name in ocr_results:
                    tf.write(ocr_results[image_path.name] + "\n\n")
        logging.info(f"üíæ Merged text saved: {txt_path}")

    except Exception:
        logging.exception(f"‚ùå Failed to save OCR results for {folder_path.name}")


# ------------------------------------------------------------
# MAIN LOOP — PROCESS ALL SUBFOLDERS
# ------------------------------------------------------------
media_root = Path("./media/")

# Messages are built from media_root so they always name the folder that is
# actually scanned; they previously said './media/laws' while the code
# scanned './media/'.
if not media_root.exists():
    raise FileNotFoundError(f"‚ùå The '{media_root}' folder does not exist")

# Sorted so folders are processed in the same order on every run.
subfolders = sorted(f for f in media_root.iterdir() if f.is_dir())
if not subfolders:
    logging.warning(f"‚ö†Ô∏è No subfolders found in '{media_root}'. Nothing to process.")
else:
    logging.info(f"üìÅ Found {len(subfolders)} folder(s) to process under '{media_root}'")

# With no subfolders this loop simply does nothing.
for folder in subfolders:
    process_folder(folder)

logging.info("\nüèÅ OCR extraction complete ‚Äî Arabic text saved right-to-left and read right-before-left.")
