In [2]:
!pip install easyocr


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [5]:
from google.colab import drive
drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google.colab'

In [4]:
import json
from pathlib import Path
import numpy as np
import easyocr

In [None]:
# init reader 
reader = easyocr.Reader(['en'], gpu=True)

def serialize_easyocr_result(result):
    """
    Convert a single easyocr result tuple (bbox, text, conf)
    into JSON-serializable dict.
    result: (bbox, text, conf) where bbox may be np.array or nested lists.
    """
    bbox, text, conf = result
    # bbox might be np.ndarray or list of np.int32 => convert every coord to int
    try:
        bbox_list = [[int(x) for x in point] for point in bbox]
    except Exception:
        # safer fallback
        bbox_list = []
        for point in bbox:
            bbox_list.append([int(point[0]), int(point[1])])

    # conf may be numpy.float32 => convert to float
    conf_f = float(conf)

    return {"bbox": bbox_list, "text": str(text), "conf": conf_f}

def extract_text_serializable(image_path: str):
    """
    Runs easyocr and returns a JSON-serializable dict with:
      - lines: list of texts
      - full_text: string with newlines
      - raw: list of serialized result dicts [{bbox, text, conf}, ...]
    """
    raw = reader.readtext(image_path, detail=1)
    lines = [r[1] for r in raw]
    full_text = "\n".join(lines)
    serial = [serialize_easyocr_result(r) for r in raw]
    return {"lines": lines, "full_text": full_text, "raw": serial}

In [None]:
# Batch run  
PROCESSED_DIR = Path("../data/processed/SROIE2019")
OCR_OUTPUT_DIR = Path("../data/processed/ocr")
OCR_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

splits = ["train", "test"]

for split in splits:
    img_dir = PROCESSED_DIR / split
    out_dir = OCR_OUTPUT_DIR / split
    out_dir.mkdir(parents=True, exist_ok=True)

    print(f"Running OCR on {split} set...")

    for img_path in sorted(img_dir.glob("*")):
        out_file = out_dir / f"{img_path.stem}.json"

        result = extract_text_serializable(str(img_path))

        # write JSON safely
        with open(out_file, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

print("âœ” OCR extraction complete!")