# Multilingual Invoice OCR & PII Redaction
Notebook implementing an invoice scanning, OCR, and PII redaction pipeline.

## 01_setup

In [None]:
import os
import re
import json
import hashlib
from pathlib import Path
from typing import List, Dict, Tuple

import numpy as np
import cv2
from PIL import Image, ImageDraw
from pdf2image import convert_from_path
from langdetect import detect, DetectorFactory

# Pin langdetect's seed; the library documents this as the way to get
# reproducible detection results across runs.
DetectorFactory.seed = 42

# Recursive glob for the input documents (placeholder path; edit before running).
INPUT_GLOB = 'path/to/folder/**/*'
# Lower-cased file suffixes that discover_files() accepts.
SUPPORTED_EXTS = {'.pdf', '.jpg', '.jpeg', '.png', '.tiff', '.tif'}
# Rasterisation resolution passed to pdf2image when converting PDFs.
DPI = 300
# Upper bound on the number of PDF pages converted per file.
MAX_PAGES = 200

# Output locations. OUT_PDF_DIR is created here but no code visible in this
# notebook writes to it.
OUT_IMG_DIR = Path('out/redacted_images')
OUT_PDF_DIR = Path('out/redacted_pdf')
REPORT_PATH = Path('out/pii_report.jsonl')
# Create every output directory up front so later cells can write without checks.
for p in [OUT_IMG_DIR, OUT_PDF_DIR, REPORT_PATH.parent]:
    p.mkdir(parents=True, exist_ok=True)


## 02_io

In [None]:
import glob


def discover_files(pattern: str) -> List[Path]:
    """Return the supported input files matching a recursive glob pattern.

    Args:
        pattern: Glob expression; ``**`` is expanded recursively.

    Returns:
        Sorted list of paths whose lower-cased suffix is in ``SUPPORTED_EXTS``
        and that point at regular files.
    """
    candidates = (Path(match) for match in glob.glob(pattern, recursive=True))
    # glob yields results in arbitrary order and also returns directories, so
    # sort for a reproducible batch order and drop anything that is not a file
    # (e.g. a folder named "scans.pdf").
    return sorted(
        path
        for path in candidates
        if path.suffix.lower() in SUPPORTED_EXTS and path.is_file()
    )


def pdf_to_images(pdf_path: Path, dpi: int = DPI, max_pages: int = MAX_PAGES):
    """Rasterise the leading pages of a PDF via pdf2image.

    Args:
        pdf_path: Location of the PDF document.
        dpi: Rendering resolution forwarded to ``convert_from_path``.
        max_pages: Number of the last page to render (rendering starts at page 1).

    Returns:
        Whatever ``convert_from_path`` returns for pages ``1..max_pages``.
    """
    render_options = {'dpi': dpi, 'first_page': 1, 'last_page': max_pages}
    return convert_from_path(str(pdf_path), **render_options)


def load_images(path: Path, dpi: int = DPI, max_pages: int = MAX_PAGES) -> List[Image.Image]:
    """Load a document as a list of RGB page images.

    PDFs are rasterised through ``pdf_to_images``. Any other file is opened
    with Pillow and every frame it contains (for example each page of a
    multi-page TIFF) becomes one RGB image, up to ``max_pages`` frames.

    Args:
        path: Input file.
        dpi: Rendering resolution, used for PDFs only.
        max_pages: Maximum number of pages/frames to return.

    Returns:
        One image per page, in document order.
    """
    if path.suffix.lower() == '.pdf':
        return pdf_to_images(path, dpi=dpi, max_pages=max_pages)

    # Imported here because only the raster branch needs it.
    from PIL import ImageSequence

    pages: List[Image.Image] = []
    # The context manager releases the file handle; convert() hands back
    # independent copies, so the pages stay usable after the file is closed.
    with Image.open(path) as source:
        for frame in ImageSequence.Iterator(source):
            if len(pages) >= max_pages:
                break
            pages.append(frame.convert('RGB'))
    return pages


## 03_preprocess

In [None]:
def preprocess_image(img: Image.Image) -> Image.Image:
    """Produce a denoised, lightly blurred grayscale copy of a page for OCR.

    Args:
        img: Page image in any Pillow mode.

    Returns:
        A new single-channel image with the same pixel dimensions as ``img``.
    """
    # Normalise to three channels first so grayscale, palette or RGBA inputs
    # do not break the colour conversion below.
    rgb = np.array(img.convert('RGB'))
    # Pillow arrays are ordered R, G, B; using the BGR conversion code would
    # swap the red and blue luminance weights.
    gray = cv2.cvtColor(rgb, cv2.COLOR_RGB2GRAY)
    denoised = cv2.fastNlMeansDenoising(gray, h=10)
    blurred = cv2.GaussianBlur(denoised, (3, 3), 0)
    return Image.fromarray(blurred)


## 04_layout_ocr

In [None]:
from paddleocr import PaddleOCR

# GPU inference is enabled unless the USE_GPU environment variable is set to
# something other than 'true' (compared case-insensitively); default is 'true'.
USE_GPU = os.environ.get('USE_GPU', 'true').lower() == 'true'
# Single OCR engine shared by run_ocr().
# NOTE(review): lang is fixed to 'en' although the notebook is titled
# "Multilingual" and LANG_HINTS lists other languages — confirm this is intended.
paddle = PaddleOCR(lang='en', use_gpu=USE_GPU, show_log=False)


def run_ocr(img: Image.Image) -> List[Dict[str, object]]:
    """Run the shared PaddleOCR engine on one page image.

    Args:
        img: Page image; it is converted to a NumPy array before recognition.

    Returns:
        One dict per recognised text line with keys ``bbox_quad`` (the
        detector's corner points), ``text`` and ``conf`` (as ``float``).
        An empty list when nothing was recognised.
    """
    result = paddle.ocr(np.array(img), cls=True) or []
    # The engine returns one entry per input image. For a page without any
    # detected text that entry can be None (observed in PaddleOCR 2.x), which
    # must not be iterated.
    page = result[0] if result else None
    if not page:
        return []
    return [
        {'bbox_quad': quad, 'text': text, 'conf': float(conf)}
        for quad, (text, conf) in page
    ]


## 05_language_detect

In [None]:
# Language codes of interest. Not referenced by any code visible in this
# notebook; detect_language() does not restrict its result to this list.
LANG_HINTS = ['en', 'hi', 'mr', 'ta', 'ru', 'pl']


def detect_language(text: str) -> str:
    """Guess the language code of ``text``, falling back to English.

    Args:
        text: OCR text of a page.

    Returns:
        The code reported by ``langdetect.detect``, or ``'en'`` when the text
        is blank or detection raises.
    """
    fallback = 'en'
    if not text.strip():
        return fallback
    try:
        language = detect(text)
    except Exception:
        # Best effort: any detection failure maps to the fallback.
        language = fallback
    return language


## 06_pii_detect

In [None]:
# Case-insensitive e-mail address pattern.
EMAIL_REGEX = re.compile(r'(?i)[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}')
# Ten digits starting with 6-9, optionally preceded by a 91 / +91 prefix.
PHONE_REGEX = re.compile(r'(?:\+?91[-\s]?)?[6-9]\d{9}')


def find_regex_pii(text: str) -> List[Dict[str, object]]:
    """Scan ``text`` for e-mail addresses and phone numbers.

    Args:
        text: A single string to search.

    Returns:
        One dict per match with keys ``type`` (``'EMAIL'`` or ``'PHONE'``),
        ``text`` (the matched substring) and ``span`` (start/end offsets).
        All e-mail matches come first, followed by all phone matches.
    """
    labelled_patterns = (('EMAIL', EMAIL_REGEX), ('PHONE', PHONE_REGEX))
    return [
        {'type': label, 'text': found.group(), 'span': found.span()}
        for label, pattern in labelled_patterns
        for found in pattern.finditer(text)
    ]


def bbox_from_quad(quad: List[List[float]]) -> Tuple[int, int, int, int]:
    """Reduce a polygon to its axis-aligned bounding box.

    Args:
        quad: Sequence of points; index 0 of each point is x, index 1 is y.

    Returns:
        ``(x_min, y_min, x_max, y_max)`` with every value passed through ``int``.
    """
    x_coords = tuple(point[0] for point in quad)
    y_coords = tuple(point[1] for point in quad)
    left, right = min(x_coords), max(x_coords)
    top, bottom = min(y_coords), max(y_coords)
    return int(left), int(top), int(right), int(bottom)


def collect_pii(lines: List[Dict[str, object]]):
    """Gather PII detections and the regions to mask from OCR lines.

    Each OCR line containing at least one regex hit contributes its whole
    bounding box once to the mask list, plus one detection record per hit.

    Args:
        lines: OCR line dicts with ``text``, ``bbox_quad`` and ``conf`` keys.

    Returns:
        ``(detections, boxes)``: the detection records and the
        ``(x1, y1, x2, y2)`` boxes to mask.

    NOTE(review): ``text_sample`` carries the matched text verbatim, so the
    records themselves contain the PII — confirm that is acceptable wherever
    they are stored.
    """
    detections = []
    boxes = []
    for entry in lines:
        matches = find_regex_pii(entry['text'])
        if matches:
            region = bbox_from_quad(entry['bbox_quad'])
            boxes.append(region)
            detections.extend(
                {
                    'type': match['type'],
                    'text_sample': match['text'],
                    'bbox_xyxy': list(region),
                    'confidence': entry['conf'],
                    'mask_applied': True,
                }
                for match in matches
            )
    return detections, boxes


## 07_redact

In [None]:
def apply_redactions(img: Image.Image, boxes, color=(0, 0, 0)):
    """Paint a filled rectangle over every box.

    Args:
        img: Image to mask; it is modified in place.
        boxes: Iterable of ``(x1, y1, x2, y2)`` pixel boxes.
        color: Fill colour, by default an RGB tuple for black.

    Returns:
        The same ``img`` object, after drawing.
    """
    fill = color
    # Pillow raises TypeError for a multi-element colour tuple on single-band
    # images, so translate an RGB(A) tuple into the image's own mode by
    # converting a one-pixel swatch.
    if img.mode in ('1', 'L') and isinstance(color, (tuple, list)) and len(color) >= 3:
        swatch = Image.new('RGB', (1, 1), tuple(color[:3]))
        fill = swatch.convert(img.mode).getpixel((0, 0))
    draw = ImageDraw.Draw(img)
    for (x1, y1, x2, y2) in boxes:
        draw.rectangle([x1, y1, x2, y2], fill=fill)
    return img


## 08_export

In [None]:
def file_sha256(path: Path) -> str:
    """Compute the SHA-256 hex digest of a file's contents.

    Args:
        path: File to hash; it is read in binary mode in 8192-byte pieces.

    Returns:
        The lowercase hexadecimal digest.
    """
    digest = hashlib.sha256()
    with path.open('rb') as stream:
        while True:
            block = stream.read(8192)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()


def save_page(img: Image.Image, base: Path, page_num: int):
    """Write one page image into ``OUT_IMG_DIR`` as a PNG.

    Args:
        img: Image to save.
        base: Source document path; only its stem is used for the file name.
        page_num: Page number, zero-padded to three digits in the file name.

    Returns:
        Path of the written file.

    NOTE(review): the name depends only on the stem and page number, so two
    inputs sharing a stem map to the same output file.
    """
    filename = f'{base.stem}_{page_num:03d}.png'
    destination = OUT_IMG_DIR / filename
    img.save(destination)
    return destination


def append_report(entry: dict):
    """Append one record to the JSON-lines report at ``REPORT_PATH``.

    Args:
        entry: JSON-serialisable mapping; non-ASCII characters are written
            as-is rather than escaped.
    """
    with REPORT_PATH.open('a', encoding='utf8') as sink:
        serialized = json.dumps(entry, ensure_ascii=False)
        sink.write(serialized + '\n')


## 09_qc_dashboard

In [None]:
import matplotlib.pyplot as plt


def show_side_by_side(original: Image.Image, redacted: Image.Image):
    """Display the original and redacted page next to each other.

    Args:
        original: Image shown on the left under the title 'Original'.
        redacted: Image shown on the right under the title 'Redacted'.
    """
    _, (left_ax, right_ax) = plt.subplots(1, 2, figsize=(12, 6))
    panels = (
        (left_ax, original, 'Original'),
        (right_ax, redacted, 'Redacted'),
    )
    for axis, picture, caption in panels:
        axis.imshow(picture)
        axis.set_title(caption)
        axis.axis('off')
    plt.show()


## 10_batch_runner

In [None]:
from tqdm.auto import tqdm


def process_all(pattern=INPUT_GLOB, dpi: int = DPI, max_pages: int = MAX_PAGES):
    """Run the full OCR and redaction pipeline over every matching document.

    For each page: preprocess, OCR, detect the language, find PII, mask the
    affected regions, save the masked page and append a report record.

    Args:
        pattern: Glob expression selecting the input files.
        dpi: Rendering resolution forwarded to ``load_images``.
        max_pages: Per-file page limit forwarded to ``load_images``.

    Returns:
        None. Prints a message and returns early when no file matches.
    """
    files = discover_files(pattern)
    if not files:
        print(f'No files found for pattern: {pattern}')
        return
    for file in tqdm(files):
        images = load_images(file, dpi=dpi, max_pages=max_pages)
        file_hash = file_sha256(file)
        # Pages are numbered from 1 in file names and report records.
        for i, img in enumerate(images, 1):
            pimg = preprocess_image(img)
            lines = run_ocr(pimg)
            text = '\n'.join(line['text'] for line in lines)
            lang = detect_language(text)
            detections, boxes = collect_pii(lines)
            # Mask an RGB copy of the original page rather than the OCR input:
            # preprocess_image yields a blurred single-channel image meant only
            # for recognition, while the saved output should look like the
            # source and an RGB canvas accepts the RGB fill colour directly.
            # Preprocessing never resizes, so the OCR boxes still line up.
            redacted = apply_redactions(img.convert('RGB'), boxes)
            save_page(redacted, file, i)
            append_report({
                'file': str(file),
                'file_hash': file_hash,
                'page': i,
                'language': lang,
                'detections': detections,
            })


## 11_benchmarks

In [None]:
# TODO: implement benchmarking on synthetic samples
