In [None]:
import os, re, math
import numpy as np
import cv2
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

# -----------------------------
# CONFIG
# -----------------------------
# Input PDF to OCR (placeholder path; edit before running).
pdf_path      = r"C:\path\to\your\scan.pdf"
# Folder holding the Poppler binaries; passed to convert_from_path as poppler_path.
poppler_bin   = r"C:\poppler\Library\bin"
# Full path of the Tesseract executable that pytesseract should launch.
tesseract_exe = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Rasterization resolution handed to convert_from_path.
DPI = 500  # scanned CMS1500: 400-600 works best
# Point pytesseract at the configured executable (module-level side effect).
pytesseract.pytesseract.tesseract_cmd = tesseract_exe

# -----------------------------
# Helpers
# -----------------------------
def pil_to_bgr(img_pil: Image.Image) -> np.ndarray:
    """Return *img_pil* as a NumPy array with its channels swapped RGB -> BGR.

    Assumes the PIL image is in RGB mode (as pdf2image pages normally are) --
    TODO confirm for other callers.
    """
    return cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

def bgr_to_pil(img_bgr: np.ndarray) -> Image.Image:
    """Return a PIL image built from the BGR array *img_bgr* (channels swapped to RGB)."""
    rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb)

def deskew(bgr: np.ndarray) -> np.ndarray:
    """Rotate *bgr* so that its dark (ink) pixels are axis-aligned.

    The skew is estimated from the minimum-area rectangle around every pixel
    that Otsu thresholding classifies as foreground.

    Args:
        bgr: Page image in BGR channel order.

    Returns:
        The input unchanged when no foreground is found or the estimated skew
        is below 0.3 degrees; otherwise a rotated copy of the same size with
        replicated borders.
    """
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    thr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    coords = np.column_stack(np.where(thr > 0))
    if coords.size == 0:
        return bgr

    # minAreaRect reports its angle in [-90, 0) on older OpenCV builds and in
    # (0, 90] on newer ones. Folding modulo 90 into [-45, 45) gives the same
    # small correction under both conventions; without it an upright page that
    # comes back as ~90 degrees would be rotated by a quarter turn.
    skew = cv2.minAreaRect(coords)[-1] % 90.0
    if skew >= 45.0:
        skew -= 90.0
    angle = -skew

    if abs(angle) < 0.3:
        return bgr
    (h, w) = bgr.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
    return cv2.warpAffine(bgr, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

def preprocess_for_text(roi_bgr: np.ndarray, upscale=2.3) -> np.ndarray:
    """Binarize a BGR region for free-text OCR.

    Pipeline: grayscale -> non-local-means denoise -> CLAHE contrast boost ->
    Gaussian adaptive threshold -> cubic upscale by *upscale* -> 2x2
    morphological close.

    Returns:
        Single-channel uint8 image, enlarged by *upscale* in both axes.
    """
    denoised = cv2.fastNlMeansDenoising(
        cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2GRAY),
        None,
        h=18,
        templateWindowSize=7,
        searchWindowSize=21,
    )
    equalized = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8)).apply(denoised)
    binary = cv2.adaptiveThreshold(
        equalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11
    )
    enlarged = cv2.resize(binary, None, fx=upscale, fy=upscale, interpolation=cv2.INTER_CUBIC)
    kernel = np.ones((2, 2), np.uint8)
    return cv2.morphologyEx(enlarged, cv2.MORPH_CLOSE, kernel, iterations=1)

def preprocess_for_digits(roi_bgr: np.ndarray, upscale=3.0) -> np.ndarray:
    """Binarize a BGR region for digit OCR.

    Same pipeline as the text variant but without CLAHE and with a 31/10
    adaptive-threshold window: grayscale -> denoise -> adaptive threshold ->
    cubic upscale by *upscale* -> 2x2 morphological close.

    Returns:
        Single-channel uint8 image, enlarged by *upscale* in both axes.
    """
    denoised = cv2.fastNlMeansDenoising(
        cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2GRAY),
        None,
        h=18,
        templateWindowSize=7,
        searchWindowSize=21,
    )
    binary = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 10
    )
    enlarged = cv2.resize(binary, None, fx=upscale, fy=upscale, interpolation=cv2.INTER_CUBIC)
    kernel = np.ones((2, 2), np.uint8)
    return cv2.morphologyEx(enlarged, cv2.MORPH_CLOSE, kernel, iterations=1)

def ocr_text(img_bin: np.ndarray) -> str:
    """OCR a single-channel image with several page-segmentation modes.

    Tesseract is run with PSM 6, 4 and 11; each result is normalized by
    stripping every line and dropping blank ones.

    Returns:
        The longest normalized result (the earliest one wins ties), or "" when
        every run comes back empty.
    """
    # The PIL conversion does not depend on the config, so do it once instead
    # of once per PSM mode.
    pil_img = bgr_to_pil(cv2.cvtColor(img_bin, cv2.COLOR_GRAY2BGR))

    best = ""
    for cfg in ("--oem 3 --psm 6", "--oem 3 --psm 4", "--oem 3 --psm 11"):
        raw = pytesseract.image_to_string(pil_img, lang="eng", config=cfg).strip()
        txt = "\n".join(ln.strip() for ln in raw.splitlines() if ln.strip())
        if len(txt) > len(best):
            best = txt
    return best

def ocr_digits(img_bin: np.ndarray) -> str:
    """OCR a single-channel image as one text line (PSM 7) restricted to 0-9.

    Returns:
        Only the digit characters of Tesseract's output, concatenated.
    """
    pil_img = bgr_to_pil(cv2.cvtColor(img_bin, cv2.COLOR_GRAY2BGR))
    raw = pytesseract.image_to_string(
        pil_img,
        lang="eng",
        config='--oem 3 --psm 7 -c tessedit_char_whitelist="0123456789"',
    )
    # Drop whatever still slips past the whitelist (newlines, form feed, ...).
    return re.sub(r"\D", "", raw)

def find_box32_anchor(bgr: np.ndarray):
    """
    Find the '32' / '32.' / '32:' label using image_to_data.

    The whole page is OCR'd on a 1.6x upscaled grayscale copy. Among the words
    matching the label, the one with the highest confidence wins; ties go to
    the larger word box, then to the earliest word.

    Returns (x, y, w, h) in original image coords, or None.
    """
    scale = 1.6  # detection on mild upscale helps; also used to map boxes back

    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    det = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)

    data = pytesseract.image_to_data(
        bgr_to_pil(cv2.cvtColor(det, cv2.COLOR_GRAY2BGR)),
        lang="eng",
        config="--oem 3 --psm 6",
        output_type=pytesseract.Output.DICT
    )

    cands = []
    for i, word in enumerate(data["text"]):
        if not word:
            continue
        if not re.fullmatch(r"32[.:]?", word.strip().lower()):
            continue
        # conf may arrive as str, int or float depending on the pytesseract
        # version, so a string compare against "-1" is unreliable. Treat every
        # negative or unparsable confidence as 0.
        try:
            conf = max(0.0, float(data["conf"][i]))
        except (TypeError, ValueError):
            conf = 0.0
        x, y, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
        cands.append((conf, x, y, w, h))

    if not cands:
        return None

    _, x, y, w, h = max(cands, key=lambda z: (z[0], z[3] * z[4]))
    return (int(x / scale), int(y / scale), int(w / scale), int(h / scale))

def crop_box32_and_32a(bgr: np.ndarray):
    """
    Dynamic crop using anchor '32' label.
    If the anchor is not found, or the anchor-based window for either crop
    ends up empty (label detected too close to the page edge), falls back to
    an approximate bottom-middle region.

    Returns (box32, box32a, debug): two views into *bgr* and a dict with
    "mode" ("anchor" or "fallback") and the xyxy rectangles used. When a
    detected anchor was discarded, the dict also carries "rejected_anchor".
    """
    H, W = bgr.shape[:2]
    anchor = find_box32_anchor(bgr)

    if anchor:
        ax, ay, aw, ah = anchor

        # Box 32 is to the right of the "32." label and spans a medium-large rectangle.
        # The window is positioned from the label, sized as a fraction of the page.
        x1 = max(0, int(ax - 0.01 * W))
        y1 = max(0, int(ay - 0.01 * H))
        x2 = min(W, int(x1 + 0.33 * W))    # width of box 32 region
        y2 = min(H, int(y1 + 0.10 * H))    # height of box 32 region

        box32 = bgr[y1:y2, x1:x2]

        # 32a is the bottom strip where NPI is printed (often just under the address lines)
        npi_y1 = max(0, int(y1 + 0.075 * H))
        npi_y2 = min(H, int(y1 + 0.105 * H))
        npi_x1 = x1
        npi_x2 = min(W, int(x1 + 0.18 * W))
        box32a = bgr[npi_y1:npi_y2, npi_x1:npi_x2]

        # A "32" matched near the bottom edge pushes the NPI strip off the
        # page, giving an empty slice that the OpenCV calls downstream cannot
        # handle. Only trust the anchor when both crops contain pixels.
        if box32.size and box32a.size:
            return box32, box32a, {"mode":"anchor", "anchor":anchor, "box32_xyxy":(x1,y1,x2,y2), "box32a_xyxy":(npi_x1,npi_y1,npi_x2,npi_y2)}

    # fallback: bottom-middle area (still works surprisingly often)
    x1 = int(0.33 * W); x2 = int(0.66 * W)
    y1 = int(0.84 * H); y2 = int(0.93 * H)
    box32 = bgr[y1:y2, x1:x2]

    npi_y1 = int(0.905 * H); npi_y2 = int(0.935 * H)
    npi_x1 = int(0.33 * W);  npi_x2 = int(0.50 * W)
    box32a = bgr[npi_y1:npi_y2, npi_x1:npi_x2]

    dbg = {"mode":"fallback", "box32_xyxy":(x1,y1,x2,y2), "box32a_xyxy":(npi_x1,npi_y1,npi_x2,npi_y2)}
    if anchor:
        dbg["rejected_anchor"] = anchor
    return box32, box32a, dbg

def extract_box32(page_bgr: np.ndarray):
    """Deskew the page, crop box 32 / 32a and OCR both regions.

    Returns:
        dict with "32_text" (free text of box 32), "32a_npi" (at most the
        first 10 recognized digits) and "debug" (crop metadata).
    """
    straight = deskew(page_bgr)
    roi32, roi32a, dbg = crop_box32_and_32a(straight)

    # small padding improves capture when borders cut into text
    pad = 8
    roi32 = cv2.copyMakeBorder(roi32, pad, pad, pad, pad, cv2.BORDER_REPLICATE)
    roi32a = cv2.copyMakeBorder(roi32a, pad, pad, pad, pad, cv2.BORDER_REPLICATE)

    bin32 = preprocess_for_text(roi32, upscale=2.4)
    bin32a = preprocess_for_digits(roi32a, upscale=3.2)

    text32 = ocr_text(bin32)
    # Slicing already leaves shorter strings untouched.
    npi = ocr_digits(bin32a)[:10]

    return {"32_text": text32, "32a_npi": npi, "debug": dbg}

# -----------------------------
# RUN (page 1)
# -----------------------------
# Rasterize only the first page, extract box 32 / 32a and print the result.
pages = convert_from_path(
    pdf_path,
    dpi=DPI,
    first_page=1,
    last_page=1,
    poppler_path=poppler_bin,
)
page_bgr = pil_to_bgr(pages[0])

res = extract_box32(page_bgr)
for heading, key in (
    ("=== BOX 32 TEXT ===", "32_text"),
    ("\n=== BOX 32a NPI ===", "32a_npi"),
    ("\n=== DEBUG ===", "debug"),
):
    print(heading)
    print(res[key])


In [None]:
import os, re, json
import numpy as np
import cv2
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

# -----------------------------
# CONFIG
# -----------------------------
# Input PDF to OCR (placeholder path; edit before running).
pdf_path      = r"C:\path\to\your\scan.pdf"
# Folder that receives the per-page .txt files, the combined .txt and the JSON.
out_dir       = r"C:\path\to\output_text"
# Folder holding the Poppler binaries; passed to convert_from_path as poppler_path.
poppler_bin   = r"C:\poppler\Library\bin"
# Full path of the Tesseract executable that pytesseract should launch.
tesseract_exe = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Rasterization resolution handed to convert_from_path.
DPI = 500
# Output size (pixels) of the template warp / resize fallback.
# NOTE(review): 2550x3300 looks like US Letter at 300 DPI -- confirm.
TARGET_W, TARGET_H = 2550, 3300

# Module-level side effects: create the output folder and point pytesseract
# at the configured executable.
os.makedirs(out_dir, exist_ok=True)
pytesseract.pytesseract.tesseract_cmd = tesseract_exe

# -----------------------------
# Basic helpers
# -----------------------------
def pil_to_bgr(img_pil: Image.Image) -> np.ndarray:
    """Return *img_pil* as a NumPy array with its channels swapped RGB -> BGR.

    Assumes the PIL image is in RGB mode (as pdf2image pages normally are) --
    TODO confirm for other callers.
    """
    rgb = np.array(img_pil)
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)

def bgr_to_pil(img_bgr: np.ndarray) -> Image.Image:
    """Return a PIL image built from the BGR array *img_bgr* (channels swapped to RGB)."""
    rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb)

def deskew(bgr: np.ndarray) -> np.ndarray:
    """Rotate *bgr* so that its dark (ink) pixels are axis-aligned.

    The skew is estimated from the minimum-area rectangle around every pixel
    that Otsu thresholding classifies as foreground.

    Args:
        bgr: Page image in BGR channel order.

    Returns:
        The input unchanged when no foreground is found or the estimated skew
        is below 0.3 degrees; otherwise a rotated copy of the same size with
        replicated borders.
    """
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    thr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    coords = np.column_stack(np.where(thr > 0))
    if coords.size == 0:
        return bgr

    # minAreaRect reports its angle in [-90, 0) on older OpenCV builds and in
    # (0, 90] on newer ones. Folding modulo 90 into [-45, 45) gives the same
    # small correction under both conventions; without it an upright page that
    # comes back as ~90 degrees would be rotated by a quarter turn.
    skew = cv2.minAreaRect(coords)[-1] % 90.0
    if skew >= 45.0:
        skew -= 90.0
    angle = -skew

    if abs(angle) < 0.3:
        return bgr
    h, w = bgr.shape[:2]
    M = cv2.getRotationMatrix2D((w//2, h//2), angle, 1.0)
    return cv2.warpAffine(bgr, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

def preprocess_for_text(roi_bgr: np.ndarray, upscale=2.2) -> np.ndarray:
    """Binarize a BGR region for free-text OCR.

    Pipeline: grayscale -> non-local-means denoise -> CLAHE contrast boost ->
    Gaussian adaptive threshold -> cubic upscale by *upscale* -> 2x2
    morphological close.

    Returns:
        Single-channel uint8 image, enlarged by *upscale* in both axes.
    """
    denoised = cv2.fastNlMeansDenoising(
        cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2GRAY),
        None,
        h=18,
        templateWindowSize=7,
        searchWindowSize=21,
    )
    equalized = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8, 8)).apply(denoised)
    binary = cv2.adaptiveThreshold(
        equalized, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11
    )
    enlarged = cv2.resize(binary, None, fx=upscale, fy=upscale, interpolation=cv2.INTER_CUBIC)
    kernel = np.ones((2, 2), np.uint8)
    return cv2.morphologyEx(enlarged, cv2.MORPH_CLOSE, kernel, iterations=1)

def preprocess_for_digits(roi_bgr: np.ndarray, upscale=3.0) -> np.ndarray:
    """Binarize a BGR region for digit OCR.

    Same pipeline as the text variant but without CLAHE and with a 31/10
    adaptive-threshold window: grayscale -> denoise -> adaptive threshold ->
    cubic upscale by *upscale* -> 2x2 morphological close.

    Returns:
        Single-channel uint8 image, enlarged by *upscale* in both axes.
    """
    denoised = cv2.fastNlMeansDenoising(
        cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2GRAY),
        None,
        h=18,
        templateWindowSize=7,
        searchWindowSize=21,
    )
    binary = cv2.adaptiveThreshold(
        denoised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 10
    )
    enlarged = cv2.resize(binary, None, fx=upscale, fy=upscale, interpolation=cv2.INTER_CUBIC)
    kernel = np.ones((2, 2), np.uint8)
    return cv2.morphologyEx(enlarged, cv2.MORPH_CLOSE, kernel, iterations=1)

def ocr_text(img_bin: np.ndarray) -> str:
    """OCR a single-channel image with several page-segmentation modes.

    Tesseract is run with PSM 6, 4 and 11; each result is normalized by
    stripping every line and dropping blank ones.

    Returns:
        The longest normalized result (the earliest one wins ties), or "" when
        every run comes back empty.
    """
    # The PIL conversion does not depend on the config, so do it once instead
    # of once per PSM mode.
    pil_img = bgr_to_pil(cv2.cvtColor(img_bin, cv2.COLOR_GRAY2BGR))

    best = ""
    for cfg in ("--oem 3 --psm 6", "--oem 3 --psm 4", "--oem 3 --psm 11"):
        raw = pytesseract.image_to_string(pil_img, lang="eng", config=cfg).strip()
        txt = "\n".join(ln.strip() for ln in raw.splitlines() if ln.strip())
        if len(txt) > len(best):
            best = txt
    return best

def ocr_digits(img_bin: np.ndarray, max_len=None) -> str:
    """OCR a single-channel image as one text line (PSM 7) restricted to 0-9.

    Args:
        img_bin: Single-channel image to recognize.
        max_len: When truthy, keep at most this many leading digits.

    Returns:
        Only the digit characters of Tesseract's output, concatenated.
    """
    pil_img = bgr_to_pil(cv2.cvtColor(img_bin, cv2.COLOR_GRAY2BGR))
    raw = pytesseract.image_to_string(
        pil_img,
        lang="eng",
        config='--oem 3 --psm 7 -c tessedit_char_whitelist="0123456789"',
    )
    digits = re.sub(r"\D", "", raw)
    return digits[:max_len] if max_len else digits

def image_to_data_words(gray, scale=1.6):
    """Run word-level Tesseract detection on an upscaled copy of *gray*.

    Returns:
        (data, scale): pytesseract's image_to_data DICT output, whose box
        coordinates are in the upscaled image's space, and the scale factor
        needed to map them back.
    """
    enlarged = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
    pil_img = bgr_to_pil(cv2.cvtColor(enlarged, cv2.COLOR_GRAY2BGR))
    words = pytesseract.image_to_data(
        pil_img,
        lang="eng",
        config="--oem 3 --psm 6",
        output_type=pytesseract.Output.DICT,
    )
    return words, scale

def find_anchor(gray, pattern, region=None):
    """
    Find anchor token by regex on OCR word boxes.

    gray:    single-channel page image
    pattern: regex that must fully match the stripped, lower-cased OCR word
    region:  optional (x1,y1,x2,y2) crop on gray to search in

    The match with the highest confidence wins; ties go to the larger box,
    then to the earliest word.

    Returns (x,y,w,h) in ORIGINAL gray coords, or None when nothing matches
    or the search region is empty.
    """
    if region:
        x1, y1, x2, y2 = region
        search = gray[y1:y2, x1:x2]
        offset_x, offset_y = x1, y1
    else:
        search = gray
        offset_x, offset_y = 0, 0

    # An empty crop cannot be resized/OCR'd; report "not found" instead.
    if search.size == 0:
        return None

    data, scale = image_to_data_words(search, scale=1.7)

    best_box = None
    best_rank = None
    for i, word in enumerate(data["text"]):
        if not word:
            continue
        if not re.fullmatch(pattern, word.strip().lower()):
            continue
        # conf may arrive as str, int or float depending on the pytesseract
        # version, so a string compare against "-1" is unreliable. Treat every
        # negative or unparsable confidence as 0.
        try:
            conf = max(0.0, float(data["conf"][i]))
        except (TypeError, ValueError):
            conf = 0.0
        # map back from scaled space + add crop offset
        x0 = int(data["left"][i] / scale) + offset_x
        y0 = int(data["top"][i] / scale) + offset_y
        w0 = int(data["width"][i] / scale)
        h0 = int(data["height"][i] / scale)
        rank = (conf, w0 * h0)
        if best_rank is None or rank > best_rank:
            best_rank = rank
            best_box = (x0, y0, w0, h0)

    return best_box

# -----------------------------
# Template warp fallback (if anchors fail)
# -----------------------------
def order_points(pts):
    """Order four (x, y) points as top-left, top-right, bottom-right, bottom-left.

    Top-left / bottom-right are the points with the smallest / largest x + y;
    top-right / bottom-left are those with the smallest / largest y - x.

    Returns:
        float32 array of shape (4, 2).
    """
    corners = np.zeros((4, 2), dtype="float32")
    coord_sum = pts.sum(axis=1)
    coord_diff = np.diff(pts, axis=1)  # y - x for every point
    corners[[0, 2]] = pts[[np.argmin(coord_sum), np.argmax(coord_sum)]]
    corners[[1, 3]] = pts[[np.argmin(coord_diff), np.argmax(coord_diff)]]
    return corners

def warp_to_template(bgr, min_area_frac=0.2):
    """Warp the page's red form outline onto a TARGET_W x TARGET_H canvas.

    Red-hued pixels are masked in HSV, the largest external contour is reduced
    to a quadrilateral (polygon approximation, or its min-area rectangle when
    the approximation does not have four corners) and perspective-mapped onto
    the full target canvas.

    Args:
        bgr: Page image in BGR channel order.
        min_area_frac: Smallest share of the page area the largest red contour
            must enclose to be trusted as the form outline.

    Returns:
        The warped page. When no red contour is found, or the largest one is
        too small to be the form (a stray speck would otherwise be stretched
        over the whole canvas), a plain resize of the page to the target size.
    """
    hsv = cv2.cvtColor(bgr, cv2.COLOR_BGR2HSV)
    # Red wraps around the hue axis, so both ends are masked and combined.
    lower1 = np.array([0, 40, 40]); upper1 = np.array([10, 255, 255])
    lower2 = np.array([170,40, 40]); upper2 = np.array([180,255,255])
    mask = cv2.inRange(hsv, lower1, upper1) | cv2.inRange(hsv, lower2, upper2)
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, np.ones((9,9), np.uint8), iterations=2)
    mask = cv2.dilate(mask, np.ones((5,5), np.uint8), iterations=1)

    # [-2] picks the contour list whether findContours returns two or three values.
    cnts = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]

    h, w = bgr.shape[:2]
    c = max(cnts, key=cv2.contourArea) if len(cnts) else None
    if c is None or cv2.contourArea(c) < min_area_frac * h * w:
        return cv2.resize(bgr, (TARGET_W, TARGET_H), interpolation=cv2.INTER_CUBIC)

    peri = cv2.arcLength(c, True)
    approx = cv2.approxPolyDP(c, 0.02*peri, True)

    if len(approx) != 4:
        rect = cv2.minAreaRect(c)
        approx = cv2.boxPoints(rect).astype(np.int32)

    pts = order_points(approx.reshape(-1,2).astype("float32"))
    dst = np.array([[0,0],[TARGET_W-1,0],[TARGET_W-1,TARGET_H-1],[0,TARGET_H-1]], dtype="float32")
    M = cv2.getPerspectiveTransform(pts, dst)
    return cv2.warpPerspective(bgr, M, (TARGET_W, TARGET_H))

def crop_norm(img, rect):
    """Crop *img* to a rectangle given in normalized coordinates.

    Args:
        img: Array whose first two axes are (height, width).
        rect: (x1, y1, x2, y2) as fractions of the width / height.

    Returns:
        View of *img* covering the rectangle, clamped to the image bounds.
    """
    height, width = img.shape[:2]
    left_f, top_f, right_f, bottom_f = rect
    rows = slice(max(0, int(top_f * height)), min(height, int(bottom_f * height)))
    cols = slice(max(0, int(left_f * width)), min(width, int(right_f * width)))
    return img[rows, cols]

# Minimal template crops (fallback).
# Each value is a normalized (x1, y1, x2, y2) rectangle -- fractions of the
# image width / height -- in the form consumed by crop_norm().
BOXES_TEMPLATE = {
    "32":  (0.33, 0.84, 0.66, 0.93),
    "32a": (0.33, 0.905, 0.50, 0.935),
}

# -----------------------------
# Anchor-based crop definitions (relative to each box label)
# These are robust to scan shifts because they use the label position.
# -----------------------------
def crop_box32_anchor_based(bgr):
    """Crop box 32 and its 32a strip relative to the OCR-detected "32" label.

    The label is searched only in the lower part of the page. The crop windows
    are positioned from the label and sized as fractions of the page.

    Returns:
        (box32, box32a, debug). On success both crops are views into *bgr*
        and debug["mode"] is "anchor". When the label is not found the crops
        are None and the mode is "anchor_not_found". When a label was found
        but either window falls off the page (empty crop) the crops are None
        and the mode is "anchor_out_of_bounds", so the caller can fall back
        instead of failing on an empty image.
    """
    H, W = bgr.shape[:2]
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)

    # search bottom half to avoid false "32" in other areas
    region = (0, int(0.55*H), W, H)
    a32 = find_anchor(gray, r"32[.:]?", region=region)

    if not a32:
        return None, None, {"mode":"anchor_not_found"}

    ax, ay, aw, ah = a32
    # relative window sized from page dims; anchored at label
    x1 = max(0, int(ax - 0.01*W))
    y1 = max(0, int(ay - 0.01*H))
    x2 = min(W, int(x1 + 0.33*W))
    y2 = min(H, int(y1 + 0.10*H))
    box32 = bgr[y1:y2, x1:x2]

    npi_y1 = max(0, int(y1 + 0.075*H))
    npi_y2 = min(H, int(y1 + 0.105*H))
    npi_x1 = x1
    npi_x2 = min(W, int(x1 + 0.18*W))
    box32a = bgr[npi_y1:npi_y2, npi_x1:npi_x2]

    # A "32" matched near the bottom edge pushes the NPI strip off the page,
    # which yields an empty slice that OpenCV cannot pad or threshold.
    if box32.size == 0 or box32a.size == 0:
        return None, None, {"mode":"anchor_out_of_bounds", "anchor32":a32}

    return box32, box32a, {"mode":"anchor", "anchor32":a32, "box32_xyxy":(x1,y1,x2,y2), "box32a_xyxy":(npi_x1,npi_y1,npi_x2,npi_y2)}

# -----------------------------
# Main OCR for "all boxes"
# Approach:
# 1) OCR whole page text (for completeness)
# 2) OCR known boxes via anchor-based; if fails, fallback to template warp
# 3) Save per-page .txt
# -----------------------------
def ocr_page_all(page_bgr, page_num):
    """OCR one page: the whole page as raw text plus box 32 and box 32a.

    Box crops come from the anchor-based locator; when it reports no crop the
    page is warped to the template and the fixed BOXES_TEMPLATE rectangles are
    used instead (debug mode is then set to "template_fallback").

    Returns:
        dict with "page", "whole_page_text", "boxes" ({"32": str,
        "32a": str}) and "debug" ({"box32": crop metadata}).
    """
    page_bgr = deskew(page_bgr)

    # (A) Whole page OCR (kept as a raw reference)
    whole_text = ocr_text(preprocess_for_text(page_bgr, upscale=1.4))

    # (B) Locate box 32 + 32a: anchor-based first, warp+template crop second
    roi32, roi32a, dbg32 = crop_box32_anchor_based(page_bgr)
    used_template = roi32 is None
    if used_template:
        warped = warp_to_template(page_bgr)
        roi32 = crop_norm(warped, BOXES_TEMPLATE["32"])
        roi32a = crop_norm(warped, BOXES_TEMPLATE["32a"])

    # Both paths share the same padding and OCR settings.
    pad = 8
    roi32 = cv2.copyMakeBorder(roi32, pad, pad, pad, pad, cv2.BORDER_REPLICATE)
    roi32a = cv2.copyMakeBorder(roi32a, pad, pad, pad, pad, cv2.BORDER_REPLICATE)

    text32 = ocr_text(preprocess_for_text(roi32, upscale=2.4))
    npi32a = ocr_digits(preprocess_for_digits(roi32a, upscale=3.2), max_len=10)

    if used_template:
        dbg32["mode"] = "template_fallback"

    # TODO: Add other boxes using same pattern (anchor-based first; template fallback second).
    # You can replicate crop_box32_anchor_based() into crop_box33_anchor_based(), etc.

    return {
        "page": page_num,
        "whole_page_text": whole_text,
        "boxes": {"32": text32, "32a": npi32a},
        "debug": {"box32": dbg32},
    }

# -----------------------------
# RUN + SAVE
# -----------------------------
# Rasterize every page, OCR each one, and write three kinds of output:
# one text file per page, one combined text file, and one JSON dump.
pages = convert_from_path(pdf_path, dpi=DPI, poppler_path=poppler_bin)

combined_txt_lines = []
all_pages_results = []

for i, p in enumerate(pages, start=1):
    res = ocr_page_all(pil_to_bgr(p), i)
    all_pages_results.append(res)

    # Shorter box names first, then alphabetical ("32" before "32a").
    box_names = sorted(res["boxes"], key=lambda x: (len(x), x))
    rule = "=" * 80

    # Write per-page text file
    page_txt_path = os.path.join(out_dir, f"page_{i:03d}_ocr.txt")
    with open(page_txt_path, "w", encoding="utf-8") as f:
        chunks = [f"PAGE {i}\n", rule + "\n\n", "[BOXES]\n"]
        for k in box_names:
            chunks.append(f"\n--- BOX {k} ---\n")
            chunks.append(res["boxes"][k] + "\n")
        chunks += [
            "\n" + rule + "\n",
            "[WHOLE_PAGE_TEXT]\n",
            res["whole_page_text"] + "\n",
            "\n" + rule + "\n",
            "[DEBUG]\n",
            json.dumps(res["debug"], indent=2) + "\n",
        ]
        f.writelines(chunks)

    # Same content (minus debug) queued for the combined file.
    combined_txt_lines.append(f"\n\n##### PAGE {i} #####\n")
    combined_txt_lines.append("[BOXES]\n")
    combined_txt_lines.extend(
        [f"\n--- BOX {k} ---\n{res['boxes'][k]}\n" for k in box_names]
    )
    combined_txt_lines.append("\n[WHOLE_PAGE_TEXT]\n")
    combined_txt_lines.append(res["whole_page_text"])

# combined file
combined_path = os.path.join(out_dir, "combined_ocr_all_pages.txt")
with open(combined_path, "w", encoding="utf-8") as f:
    f.write("\n".join(combined_txt_lines))

# optional: save structured json for downstream parsing
json_path = os.path.join(out_dir, "ocr_all_pages.json")
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(all_pages_results, f, indent=2)

print(f"Saved per-page OCR text files to: {out_dir}")
print(f"Saved combined OCR to: {combined_path}")
print(f"Saved JSON results to: {json_path}")

first_page_boxes = all_pages_results[0]["boxes"]
print("\nBox 32 (page 1):\n", first_page_boxes.get("32", ""))
print("\nBox 32a (page 1):\n", first_page_boxes.get("32a", ""))
