## Automated Expense Extraction - Receipt Parsing Using YOLO and OCR
### OCR Image Text Extraction

In [1]:
!pip install easyocr


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
import json
import os
from pathlib import Path
import numpy as np
import easyocr
from tqdm import tqdm


In [4]:
# Choose the data root. The COLAB_GPU environment variable is used as the
# marker for a Google Colab runtime; anything else is treated as local.
if 'COLAB_GPU' not in os.environ:
    # Local run: data lives in a sibling folder of the notebook directory.
    DATA_PATH = Path('../data')
else:
    # Colab run: mount Google Drive first, then read the data from it.
    from google.colab import drive
    drive.mount('/content/drive')

    DATA_PATH = Path('/content/drive/MyDrive/data')

In [5]:
# Create the English EasyOCR reader once at module level (GPU requested);
# extract_text_serializable() reads this global for every image.
reader = easyocr.Reader(lang_list=['en'], gpu=True)

def serialize_easyocr_result(result):
    """
    Turn one EasyOCR detection into a plain, JSON-friendly dict.

    result: a 3-item sequence (bbox, text, conf). bbox is an iterable of
            points (list or np.ndarray); conf may be a numpy scalar.

    Returns {"bbox": [[int, ...], ...], "text": str, "conf": float}.
    """
    corners, label, confidence = result

    try:
        # Normal case: cast every coordinate of every point to a builtin int.
        int_corners = [list(map(int, corner)) for corner in corners]
    except Exception:
        # Fallback: keep only the first two entries (x, y) of each point.
        int_corners = [[int(corner[0]), int(corner[1])] for corner in corners]

    # numpy.float32 is not JSON-serializable, so convert to a builtin float.
    score = float(confidence)

    return {"bbox": int_corners, "text": str(label), "conf": score}

def extract_text_serializable(image_path: str):
    """
    OCR one image with the module-level `reader` and package the result
    as a JSON-serializable dict.

    Keys of the returned dict:
      - lines: the recognized text of each detection, in reader order
      - full_text: the same texts joined with newlines
      - raw: one serialized dict per detection ({bbox, text, conf})
    """
    detections = reader.readtext(image_path, detail=1)

    # With detail=1 each detection is (bbox, text, conf); index 1 is the text.
    texts = []
    for detection in detections:
        texts.append(detection[1])

    joined = "\n".join(texts)
    records = list(map(serialize_easyocr_result, detections))

    return {"lines": texts, "full_text": joined, "raw": records}

In [7]:
# Define directories (built with pathlib operators instead of string formatting)
PROCESSED_DIR = Path(DATA_PATH) / "processed" / "SROIE2019"
OCR_OUTPUT_DIR = Path(DATA_PATH) / "processed" / "ocr"
OCR_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# The recorded run of this cell took over eight hours, so by default an image
# whose JSON already exists is skipped and an interrupted run can be resumed.
# Set to True to force every image to be OCR'd again.
OVERWRITE_EXISTING = False

splits = ["train", "test"]
failed = []  # paths of inputs whose OCR raised, reported after the loop

for split in splits:
    img_dir = PROCESSED_DIR / split
    out_dir = OCR_OUTPUT_DIR / split
    out_dir.mkdir(parents=True, exist_ok=True)

    print(f"Running OCR on {split} set...")

    # glob("*") also yields sub-directories; only regular files can be OCR'd.
    img_paths = sorted(p for p in img_dir.glob("*") if p.is_file())
    if not img_paths:
        # A missing or empty input folder would otherwise pass silently.
        print(f"No input files found in {img_dir}")

    for img_path in tqdm(img_paths, desc=f"Processing {split}", unit="file"):
        out_file = out_dir / f"{img_path.stem}.json"

        if out_file.exists() and not OVERWRITE_EXISTING:
            continue

        # Best effort: one unreadable file must not abort a multi-hour run.
        try:
            result = extract_text_serializable(str(img_path))
        except Exception as e:
            print(f"Failed: {img_path.name} - {e}")
            failed.append(img_path)
            continue

        # Write JSON safely: dump to a temporary file, then rename it over the
        # target, so an interrupted run never leaves a truncated .json that the
        # skip-existing check above would mistake for a finished result.
        tmp_file = out_dir / f"{img_path.stem}.json.tmp"
        with open(tmp_file, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=2, ensure_ascii=False)
        tmp_file.replace(out_file)

if failed:
    print(f"{len(failed)} file(s) could not be processed.")
print("✔ OCR extraction complete!")

Running OCR on train set...


Processing train: 100%|██████████| 626/626 [1:37:00<00:00,  9.30s/file]


Running OCR on test set...


Processing test: 100%|██████████| 347/347 [6:45:02<00:00, 70.04s/file]

✔ OCR extraction complete!



