<a href="https://colab.research.google.com/github/jaslark/crop-images/blob/main/paddleVL_mycustom.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Uninstall torch and any existing Paddle / LangChain packages before the
# installs in the following cells.
# NOTE(review): presumably done to avoid version conflicts with the pinned
# paddlepaddle-gpu build installed next — confirm.
!pip uninstall -y torch torchvision torchaudio
!pip uninstall -y paddleocr paddlepaddle paddlepaddle-gpu langchain langchain-community


In [None]:
# Install the GPU build of PaddlePaddle 3.2.2 from the CUDA 11.8 (cu118)
# package index.
!pip install "paddlepaddle-gpu==3.2.2" -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
# Install PaddleOCR together with its "doc-parser" extra.
# NOTE(review): presumably this extra is what provides the PaddleOCRVL
# pipeline imported in a later cell — confirm.
!pip install "paddleocr[doc-parser]"

----- Press "Restart runtime" before running the cells below -----

In [None]:
from paddleocr import PaddleOCR

# Create the PaddleOCR instance only once per runtime; re-running this cell
# reuses the object already stored in the notebook's globals.
if "ocr" not in globals():
    ocr = PaddleOCR(use_angle_cls=True, lang='vi')
else:
    # The dash was mis-encoded ("â€”") in the original message.
    print("OCR instance already exists — reusing it")

In [None]:
# Install a nightly safetensors wheel (0.6.2.dev0) from Paddle's cu118 wheel
# bucket. The wheel is tagged cp310-abi3 / linux_x86_64.
# NOTE(review): this runs after the cell that already imported paddleocr —
# confirm whether a runtime restart is needed for it to take effect.
!pip install https://paddle-whl.bj.bcebos.com/nightly/cu118/safetensors/safetensors-0.6.2.dev0-cp310-abi3-linux_x86_64.whl


In [None]:
from paddleocr import PaddleOCRVL

# Build the PaddleOCR-VL pipeline a single time per runtime: if a previous run
# of this cell already stored it in globals(), keep that instance.
if "paddleocr_vl_pipeline" in globals():
    print("Using existing PaddleOCR-VL pipeline (no re-init).")
else:
    # These flags become the pipeline defaults; adjust them as needed.
    paddleocr_vl_pipeline = PaddleOCRVL(
        device=None,                        # per the original note: None -> prefer GPU:0, else CPU
        use_chart_recognition=False,        # switch to True when charts matter
        use_layout_detection=True,          # full document parsing
        use_doc_unwarping=True,             # perspective correction
        use_doc_orientation_classify=True,  # automatic document rotation
    )
    print("PaddleOCR-VL pipeline created.")


In [None]:
# Update crop image
import os
import json
import shutil
import zipfile
from pathlib import Path

from PIL import Image


def run_paddleocr_vl_on_file(input_path: str, output_root: str) -> str:
    """
    Run PaddleOCR-VL on a single file (image or PDF) and zip the results.

    Creates a working folder:
        output_root/
            <base>_paddleocrvl/
                results/          one <base>_page<idx>.json per result
                cropped_images/   always created; filled only for image inputs

    Any previous working folder and zip for the same base name are removed
    first. Cropping failures are reported with a warning and do not abort
    the run.

    Args:
        input_path: Path to the input file; it is made absolute before use.
        output_root: Directory under which the working folder and the zip
            are created.

    Returns:
        Path of the zip archive (``<working folder>.zip``). It contains every
        file of the working folder except anything below a directory named
        "original".
    """
    input_path = os.path.abspath(str(input_path))

    base_name = Path(input_path).stem
    work_dir = Path(output_root) / f"{base_name}_paddleocrvl"
    results_dir = work_dir / "results"
    crops_dir = work_dir / "cropped_images"

    # 1) Start from a clean working folder for this file
    if work_dir.exists():
        shutil.rmtree(work_dir)
    results_dir.mkdir(parents=True, exist_ok=True)
    crops_dir.mkdir(parents=True, exist_ok=True)

    # 2) Run PaddleOCR-VL
    print("Running PaddleOCR-VL...")
    outputs = paddleocr_vl_pipeline.predict(
        input=input_path,
        # use_xxx=None -> fall back to instantiation settings
        use_doc_orientation_classify=None,
        use_doc_unwarping=None,
        use_layout_detection=True,   # IMPORTANT: ensure layout_det_res is populated
        use_chart_recognition=None,
        use_queues=False,
    )

    # 3) Save JSON & (for images) crops
    json_paths = []
    is_image_input = input_path.lower().endswith(
        (".jpg", ".jpeg", ".png", ".bmp", ".webp")
    )

    for idx, res in enumerate(outputs):
        # Save the structured result with a deterministic name
        json_path = results_dir / f"{base_name}_page{idx}.json"
        res.save_to_json(save_path=str(json_path))
        json_paths.append(json_path)

        # Crop layout regions if the input is an image
        if is_image_input:
            try:
                create_layout_crops_for_page(
                    image_path=input_path,
                    json_path=json_path,
                    crops_dir=crops_dir,
                    page_index=idx,
                )
            except Exception as e:
                # Best effort: a failed crop must not lose the JSON results.
                print(f"[WARN] Cropping failed for page {idx}: {e}")

    # 4) Zip everything except any "original" dir if it ever exists
    zip_path = str(work_dir) + ".zip"
    if os.path.exists(zip_path):
        os.remove(zip_path)

    with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
        for root, dirs, filenames in os.walk(work_dir):
            # Prune in place so os.walk never descends into "original";
            # merely skipping its files would still archive its subfolders.
            dirs[:] = [d for d in dirs if d != "original"]
            for name in filenames:
                full = Path(root) / name
                zf.write(full, arcname=str(full.relative_to(work_dir)))

    print(f"Done. Zip at: {zip_path}")
    return zip_path

def create_layout_crops_for_page(image_path: str, json_path: Path, crops_dir: Path, page_index: int):
    """
    Crop every layout box of a PaddleOCR-VL result JSON out of the page image.

    The boxes are read from ``layout_det_res['boxes']``. Each usable box is
    saved as ``page<page_index>_<box index>_<label>.png`` in ``crops_dir``;
    characters of the label other than alphanumerics, "-" and "_" are
    replaced by "_".

    Args:
        image_path: Path of the page image to crop from.
        json_path: Path of the saved result JSON for that page.
        crops_dir: Existing directory that receives the crop files.
        page_index: Page number used in the crop file names.

    Returns:
        None. Boxes whose coordinates are not four numbers, or that describe
        an empty rectangle, are reported with a warning and skipped. The
        image is only opened when at least one box is usable.
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Accept both {"res": {...}} and the bare result dict.
    # NOTE(review): the result object's save_to_json appears to write the
    # bare dict when given a ".json" path — confirm against the installed
    # paddleocr/paddlex version.
    res = data.get("res", data)
    layout_det_res = res.get("layout_det_res") or {}
    boxes = layout_det_res.get("boxes") or []

    print(f"[DEBUG] {json_path.name}: found {len(boxes)} layout boxes")

    if not boxes:
        print(f"[INFO] No layout boxes found in {json_path.name}")
        return

    # Validate first so one bad box cannot abort the remaining crops.
    usable = []
    for i, box in enumerate(boxes):
        coord = box.get("coordinate", [])
        try:
            # Unpacking also rejects anything that is not exactly 4 values.
            x1, y1, x2, y2 = (float(v) for v in coord)
        except (TypeError, ValueError):
            print(f"[WARN] Box {i} has invalid coordinate: {coord}")
            continue
        if x2 <= x1 or y2 <= y1:
            # An empty or inverted rectangle cannot be cropped and saved.
            print(f"[WARN] Box {i} has invalid coordinate: {coord}")
            continue
        usable.append((i, box.get("label", "region"), (x1, y1, x2, y2)))

    if not usable:
        return

    # convert() returns an independent image, so the file can be closed here.
    with Image.open(image_path) as source:
        img = source.convert("RGB")

    for i, label, rect in usable:
        crop = img.crop(rect)

        # file name example: page0_000_text.png
        safe_label = "".join(
            c if c.isalnum() or c in ("-", "_") else "_" for c in str(label)
        )
        crop_path = crops_dir / f"page{page_index}_{i:03d}_{safe_label}.png"
        crop.save(crop_path)
        print(f"[DEBUG] Saved crop: {crop_path}")


In [None]:
from google.colab import files

# Directory that receives the per-file working folders and zip archives.
OUTPUT_ROOT = "/content/paddleocrvl_outputs"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# Ask for the input files (JPG / PNG / PDF / etc.).
uploaded = files.upload()

# Process each uploaded file, remembering the archive produced for it.
# NOTE(review): uploads are looked up under /content — presumably the
# notebook's working directory; confirm.
zips_to_download = []
for filename in uploaded:
    input_path = os.path.join("/content", filename)
    print("=" * 80)
    print("Processing:", input_path)
    zip_path = run_paddleocr_vl_on_file(input_path, OUTPUT_ROOT)
    zips_to_download.append(zip_path)

print("\nAll files processed. Ready to download zips.")

# Hand every archive back to the browser, announcing each one first.
for z in zips_to_download:
    print("Downloading:", z)
    files.download(z)


In [None]:
import os
import cv2
import shutil
import zipfile
from pathlib import Path
from paddleocr import PPStructure, save_structure_res
from paddleocr.ppstructure.recovery.recovery_to_doc import sorted_layout_boxes

# --- 1. Build the layout-analysis engine (PPStructure) ---
# NOTE(review): the recorded output of this cell is
# "ImportError: cannot import name 'PPStructure' from 'paddleocr'", so the
# paddleocr installed above does not export this class. Presumably this cell
# targets an older PaddleOCR API — confirm and port before relying on it.
# Per the original notes: table=False is faster when detailed table
# reconstruction is not needed; use table=True to parse inside tables.
print("Initializing PP-Structure Layout Engine...")
engine = PPStructure(
    lang='vi',               # the original note lists 'vi', 'en', etc.
    layout=True,
    ocr=True,                # OCR enabled (original note: text inside the crops)
    table=False,             # True enables parsing of table cells
    image_orientation=True,
    show_log=True,
)

def run_structure_and_crop(input_path: str, output_root: str) -> str:
    """
    Run the module-level PPStructure ``engine`` on one image and zip the output.

    The results are written by ``save_structure_res`` into
    ``<output_root>/<stem>_structure`` (recreated from scratch on every call)
    and that folder is then archived next to it as ``<stem>_structure.zip``.
    Archive members keep the ``<stem>_structure/`` prefix because their paths
    are taken relative to ``output_root``.

    Args:
        input_path: Image to analyse; made absolute before use.
        output_root: Directory that receives the result folder and the zip.

    Returns:
        Path of the zip archive, or None when OpenCV cannot read the image
        (an error line is printed in that case).
    """
    source = os.path.abspath(str(input_path))
    stem = Path(source).stem
    out_dir = os.path.join(output_root, f"{stem}_structure")

    # Drop whatever an earlier run left behind, then recreate the folder.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir, exist_ok=True)

    # cv2.imread signals failure by returning None instead of raising.
    image = cv2.imread(source)
    if image is None:
        print(f"[ERROR] Could not read image: {source}")
        return None

    print(f"Processing layout for: {stem} ...")

    # Per the original notes the engine yields one entry per detected region
    # and save_structure_res also writes the cropped region images —
    # NOTE(review): confirm against the installed paddleocr version.
    regions = engine(image)
    save_structure_res(regions, out_dir, stem)

    print(f"[SUCCESS] Found {len(regions)} regions. Crops saved in {out_dir}")

    # Replace any stale archive with a fresh one.
    archive = out_dir + ".zip"
    if os.path.exists(archive):
        os.remove(archive)

    base = Path(output_root)
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as zf:
        for folder, _, names in os.walk(out_dir):
            for name in names:
                member = Path(folder) / name
                zf.write(member, arcname=str(member.relative_to(base)))

    return archive

# --- Main Execution Block ---
from google.colab import files

OUTPUT_ROOT = "/content/paddle_structure_outputs"
os.makedirs(OUTPUT_ROOT, exist_ok=True)

# Note: We reuse the file you already uploaded (e.g., page_1.png)
# If you need to upload again, uncomment the next line:
# uploaded = files.upload()

# Collect the images sitting directly in /content. The extension check is
# case-insensitive so that names such as "SCAN.PNG" or "photo.JPG" are found
# too, matching the check used for the PaddleOCR-VL inputs.
files_in_content = [
    f for f in os.listdir("/content")
    if f.lower().endswith(('.png', '.jpg', '.jpeg'))
]

if not files_in_content:
    print("No images found in /content. Please upload an image first.")
else:
    zips_to_download = []
    for filename in files_in_content:
        # Skip any image whose name contains "paddle".
        # NOTE(review): presumably meant to exclude generated outputs — confirm.
        if "paddle" in filename:
            continue

        input_path = os.path.join("/content", filename)

        print("=" * 80)
        zip_file = run_structure_and_crop(input_path, OUTPUT_ROOT)
        # run_structure_and_crop returns None for unreadable images.
        if zip_file:
            zips_to_download.append(zip_file)

    print("\nAll files processed. Downloading zips...")
    for z in zips_to_download:
        files.download(z)

ImportError: cannot import name 'PPStructure' from 'paddleocr' (/usr/local/lib/python3.12/dist-packages/paddleocr/__init__.py)