In [None]:
import os, re, math
import numpy as np
import cv2
from pdf2image import convert_from_path
from PIL import Image
import pytesseract

# -----------------------------
# CONFIG
# -----------------------------
# Scanned PDF to read (placeholder value -- point it at a real file).
pdf_path      = r"C:\path\to\your\scan.pdf"
# Poppler "bin" directory; handed to pdf2image as poppler_path when rendering.
poppler_bin   = r"C:\poppler\Library\bin"
# Tesseract executable that pytesseract is configured to launch (see below).
tesseract_exe = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Resolution used when rasterising the PDF page.
DPI = 500  # scanned CMS1500: 400-600 works best
# Point pytesseract at the Tesseract binary configured above.
pytesseract.pytesseract.tesseract_cmd = tesseract_exe

# -----------------------------
# Helpers
# -----------------------------
def pil_to_bgr(img_pil: Image.Image) -> np.ndarray:
    """Convert a PIL image into an OpenCV-style BGR array.

    The image is normalised to RGB mode first, so grayscale, palette or
    CMYK inputs produce a well-formed 3-channel array instead of failing
    (or being misinterpreted) in the RGB->BGR channel swap.

    Args:
        img_pil: Source image in any PIL mode that can be converted to RGB.

    Returns:
        The pixel data as a BGR ``numpy`` array.
    """
    if img_pil.mode != "RGB":
        img_pil = img_pil.convert("RGB")
    return cv2.cvtColor(np.array(img_pil), cv2.COLOR_RGB2BGR)

def bgr_to_pil(img_bgr: np.ndarray) -> Image.Image:
    """Wrap an OpenCV BGR array as a PIL image, reordering channels to RGB."""
    rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    return Image.fromarray(rgb)

def deskew(bgr: np.ndarray) -> np.ndarray:
    """Lightly deskew a page image using its dark (text) pixels.

    The page is binarised with an inverted Otsu threshold, the minimum-area
    rectangle around all foreground pixels is computed, and the page is
    rotated about its centre by the correction angle derived from it.

    Args:
        bgr: Page image in BGR channel order.

    Returns:
        The rotated image (same size as the input), or the input itself when
        there are no foreground pixels or the correction is below 0.3 degrees.
    """
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    # Inverted Otsu: dark ink becomes the non-zero foreground.
    thr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    # np.where yields (row, col) pairs; that ordering is kept on purpose,
    # the angle handling below was written for it.
    coords = np.column_stack(np.where(thr > 0))
    if coords.size == 0:
        return bgr

    raw_angle = cv2.minAreaRect(coords)[-1]
    # A rectangle's orientation is only defined modulo 90 degrees, and the
    # range minAreaRect reports differs between OpenCV releases ([-90, 0) in
    # older builds, (0, 90] in newer ones -- reportedly since 4.5.1).
    # Folding into [0, 90) first gives the same correction either way; the
    # previous "< -45" test turned an upright page into a 90 degree rotation
    # under the newer convention.
    folded = raw_angle % 90.0
    angle = -folded if folded < 45.0 else 90.0 - folded
    if abs(angle) < 0.3:
        # Not worth resampling the whole page for a negligible skew.
        return bgr

    (h, w) = bgr.shape[:2]
    M = cv2.getRotationMatrix2D((w // 2, h // 2), angle, 1.0)
    # Replicate edge pixels so the rotation does not introduce black corners.
    return cv2.warpAffine(bgr, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)

def preprocess_for_text(roi_bgr: np.ndarray, upscale=2.3) -> np.ndarray:
    """Prepare a colour ROI for free-text OCR.

    Pipeline: grayscale -> non-local-means denoise -> CLAHE contrast
    equalisation -> Gaussian adaptive threshold -> cubic upscale by
    ``upscale`` -> 2x2 morphological close.

    Returns the resulting single-channel image.
    """
    denoised = cv2.fastNlMeansDenoising(
        cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2GRAY),
        None,
        h=18,
        templateWindowSize=7,
        searchWindowSize=21,
    )
    equalised = cv2.createCLAHE(clipLimit=2.5, tileGridSize=(8,8)).apply(denoised)

    binary = cv2.adaptiveThreshold(
        equalised, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 35, 11
    )
    enlarged = cv2.resize(binary, None, fx=upscale, fy=upscale, interpolation=cv2.INTER_CUBIC)

    close_kernel = np.ones((2,2), np.uint8)
    return cv2.morphologyEx(enlarged, cv2.MORPH_CLOSE, close_kernel, iterations=1)

def preprocess_for_digits(roi_bgr: np.ndarray, upscale=3.0) -> np.ndarray:
    """Prepare a colour ROI for digit-only OCR.

    Pipeline: grayscale -> non-local-means denoise -> Gaussian adaptive
    threshold -> cubic upscale by ``upscale`` -> 2x2 morphological close.
    Unlike the text variant, no CLAHE step is applied.

    Returns the resulting single-channel image.
    """
    grey = cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2GRAY)
    grey = cv2.fastNlMeansDenoising(grey, None, h=18, templateWindowSize=7, searchWindowSize=21)

    mask = cv2.adaptiveThreshold(
        grey, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 10
    )
    mask = cv2.resize(mask, None, fx=upscale, fy=upscale, interpolation=cv2.INTER_CUBIC)

    close_kernel = np.ones((2,2), np.uint8)
    return cv2.morphologyEx(mask, cv2.MORPH_CLOSE, close_kernel, iterations=1)

def ocr_text(img_bin: np.ndarray) -> str:
    """OCR a single-channel image as English text, trying several layouts.

    Tesseract is run with page-segmentation modes 6, 4 and 11; each result
    has blank lines removed and remaining lines stripped. The longest
    cleaned result wins (the earliest mode wins ties).

    Args:
        img_bin: Single-channel (grayscale/binary) image.

    Returns:
        The best cleaned text, or ``""`` if every attempt came back empty.
    """
    # The image handed to Tesseract is the same for every config, so build
    # it once instead of re-converting inside the loop.
    pil_img = bgr_to_pil(cv2.cvtColor(img_bin, cv2.COLOR_GRAY2BGR))

    best = ""
    for cfg in ("--oem 3 --psm 6", "--oem 3 --psm 4", "--oem 3 --psm 11"):
        raw = pytesseract.image_to_string(pil_img, lang="eng", config=cfg)
        stripped_lines = (ln.strip() for ln in raw.splitlines())
        txt = "\n".join(ln for ln in stripped_lines if ln)
        if len(txt) > len(best):
            best = txt
    return best

def ocr_digits(img_bin: np.ndarray) -> str:
    """OCR a single-channel image as one line of digits.

    Tesseract runs in single-line mode (psm 7) with a 0-9 whitelist; any
    non-digit characters still present in its output are removed.

    Returns the digits found, possibly an empty string.
    """
    as_bgr = cv2.cvtColor(img_bin, cv2.COLOR_GRAY2BGR)
    raw = pytesseract.image_to_string(
        bgr_to_pil(as_bgr),
        lang="eng",
        config='--oem 3 --psm 7 -c tessedit_char_whitelist="0123456789"',
    )
    digits_only = re.sub(r"\D", "", raw)
    return digits_only

def find_box32_anchor(bgr: np.ndarray):
    """
    Find the '32' / '32.' label using image_to_data.
    Returns (x, y, w, h) in original image coords, or None.

    Words matching ``32``, ``32.`` or ``32:`` are collected from a word-level
    OCR pass over the whole image. The candidate with the highest confidence
    wins, with bounding-box area as the tie-breaker.
    """
    # Detection runs on a mildly upscaled copy (helps small labels); the same
    # factor is used to map the winning box back to original coordinates.
    scale = 1.6
    gray = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    det = cv2.resize(gray, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)

    data = pytesseract.image_to_data(
        bgr_to_pil(cv2.cvtColor(det, cv2.COLOR_GRAY2BGR)),
        lang="eng",
        config="--oem 3 --psm 6",
        output_type=pytesseract.Output.DICT
    )

    cands = []
    for i, word in enumerate(data["text"]):
        if not word:
            continue
        if not re.fullmatch(r"32[.:]?", word.strip().lower()):
            continue
        # The confidence may arrive as a number rather than the string "-1",
        # so parse it and clamp numerically: "no confidence" (negative) and
        # unparsable values both count as 0.
        try:
            conf = float(data["conf"][i])
        except (TypeError, ValueError):
            conf = 0.0
        conf = max(conf, 0.0)
        x, y, w, h = data["left"][i], data["top"][i], data["width"][i], data["height"][i]
        cands.append((conf, x, y, w, h))

    if not cands:
        return None

    # Highest confidence first, larger box as tie-breaker; on a full tie the
    # earliest candidate is kept.
    _, x, y, w, h = max(cands, key=lambda z: (z[0], z[3]*z[4]))

    return (int(x/scale), int(y/scale), int(w/scale), int(h/scale))

def crop_box32_and_32a(bgr: np.ndarray):
    """
    Dynamic crop using anchor '32' label.
    If anchor not found, falls back to approximate bottom-middle region.
    The same fallback is used when the anchor sits so close to the page edge
    that an anchored crop would be empty.

    Returns (box32, box32a, debug): the two cropped BGR regions and a dict
    with the crop "mode" ("anchor" or "fallback") and the pixel rectangles
    used ("box32_xyxy", "box32a_xyxy"). In anchor mode the dict also carries
    "anchor"; when an anchor was found but discarded, it is reported under
    "rejected_anchor".
    """
    H, W = bgr.shape[:2]
    anchor = find_box32_anchor(bgr)
    rejected_anchor = None

    if anchor is not None:
        ax, ay = anchor[0], anchor[1]

        # The crop starts slightly up/left of the label and spans fixed
        # fractions of the page size, so it follows the label when the scan
        # is shifted.
        x1 = max(0, int(ax - 0.01 * W))
        y1 = max(0, int(ay - 0.01 * H))
        x2 = min(W, int(x1 + 0.33 * W))    # width of box 32 region
        y2 = min(H, int(y1 + 0.10 * H))    # height of box 32 region

        box32 = bgr[y1:y2, x1:x2]

        # 32a is the bottom strip where NPI is printed (often just under the address lines)
        npi_y1 = max(0, int(y1 + 0.075 * H))
        npi_y2 = min(H, int(y1 + 0.105 * H))
        npi_x1 = x1
        npi_x2 = min(W, int(x1 + 0.18 * W))
        box32a = bgr[npi_y1:npi_y2, npi_x1:npi_x2]

        if box32.size and box32a.size:
            return box32, box32a, {"mode":"anchor", "anchor":anchor, "box32_xyxy":(x1,y1,x2,y2), "box32a_xyxy":(npi_x1,npi_y1,npi_x2,npi_y2)}

        # An anchor this close to the page edge leaves no room for the strip
        # below it (the slice is empty); treat it as unusable rather than
        # returning an empty image.
        rejected_anchor = anchor

    # fallback: bottom-middle area (still works surprisingly often)
    x1 = int(0.33 * W); x2 = int(0.66 * W)
    y1 = int(0.84 * H); y2 = int(0.93 * H)
    box32 = bgr[y1:y2, x1:x2]

    npi_y1 = int(0.905 * H); npi_y2 = int(0.935 * H)
    npi_x1 = int(0.33 * W);  npi_x2 = int(0.50 * W)
    box32a = bgr[npi_y1:npi_y2, npi_x1:npi_x2]

    dbg = {"mode":"fallback", "box32_xyxy":(x1,y1,x2,y2), "box32a_xyxy":(npi_x1,npi_y1,npi_x2,npi_y2)}
    if rejected_anchor is not None:
        dbg["rejected_anchor"] = rejected_anchor
    return box32, box32a, dbg

def extract_box32(page_bgr: np.ndarray):
    """Extract the box 32 text and box 32a NPI digits from a page image.

    The page is deskewed, both regions are cropped, padded, binarised and
    OCR'd. A region whose crop is empty is skipped and reported as ``""``
    instead of being passed to OpenCV.

    Args:
        page_bgr: Full page image in BGR channel order.

    Returns:
        Dict with "32_text" (cleaned OCR text), "32a_npi" (at most ten
        digits) and "debug" (crop details from ``crop_box32_and_32a``).
    """
    page_bgr = deskew(page_bgr)
    box32_roi, box32a_roi, dbg = crop_box32_and_32a(page_bgr)

    text32 = ""
    if box32_roi.size:
        # small padding improves capture when borders cut into text
        padded = cv2.copyMakeBorder(box32_roi, 8, 8, 8, 8, cv2.BORDER_REPLICATE)
        text32 = ocr_text(preprocess_for_text(padded, upscale=2.4))

    npi32a = ""
    if box32a_roi.size:
        padded = cv2.copyMakeBorder(box32a_roi, 8, 8, 8, 8, cv2.BORDER_REPLICATE)
        # Keep at most the first ten digits; slicing a shorter string returns
        # it unchanged, so no length check is needed.
        npi32a = ocr_digits(preprocess_for_digits(padded, upscale=3.2))[:10]

    return {"32_text": text32, "32a_npi": npi32a, "debug": dbg}

# -----------------------------
# RUN (page 1)
# -----------------------------
# Render only the first page of the PDF at the configured resolution.
pages = convert_from_path(
    pdf_path,
    dpi=DPI,
    first_page=1,
    last_page=1,
    poppler_path=poppler_bin,
)
page_bgr = pil_to_bgr(pages[0])

res = extract_box32(page_bgr)

# Print each result section under its heading, in a fixed order.
for heading, key in (
    ("=== BOX 32 TEXT ===", "32_text"),
    ("\n=== BOX 32a NPI ===", "32a_npi"),
    ("\n=== DEBUG ===", "debug"),
):
    print(heading)
    print(res[key])
