In [7]:
import argparse
import json
import re
from pathlib import Path
from statistics import median
from typing import Dict, List, Optional

import cv2
import easyocr
import numpy as np


def _word_entry(detection) -> Dict[str, float]:
    points, text, confidence = detection
    xs = [p[0] for p in points]
    ys = [p[1] for p in points]

    left = float(min(xs))
    right = float(max(xs))
    top = float(min(ys))
    bottom = float(max(ys))
    height = max(1.0, bottom - top)

    return {
        "text": text.strip(),
        "confidence": float(confidence),
        "left": left,
        "right": right,
        "top": top,
        "bottom": bottom,
        "height": height,
        "center_y": (top + bottom) / 2.0,
    }


def _load_image_variants(image_path: Path) -> List[np.ndarray]:
    image = cv2.imread(str(image_path))
    if image is None:
        raise ValueError(f"Could not read image: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    upscaled = cv2.resize(gray, None, fx=2.0, fy=2.0, interpolation=cv2.INTER_CUBIC)

    denoised = cv2.fastNlMeansDenoising(upscaled, None, h=18, templateWindowSize=7, searchWindowSize=21)
    adaptive = cv2.adaptiveThreshold(
        denoised,
        255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        31,
        11,
    )

    otsu = cv2.threshold(denoised, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    return [gray, upscaled, denoised, adaptive, otsu]


def _read_detections(reader: easyocr.Reader, image: np.ndarray, numeric_only: bool) -> List:
    kwargs = {
        "detail": 1,
        "paragraph": False,
        "contrast_ths": 0.05,
        "adjust_contrast": 0.7,
        "text_threshold": 0.45,
        "low_text": 0.2,
        "width_ths": 0.5,
        "height_ths": 0.5,
    }
    if numeric_only:
        kwargs["allowlist"] = "0123456789-[], "

    return reader.readtext(image, **kwargs)


def _best_detections(reader: easyocr.Reader, variants: List[np.ndarray], numeric_only: bool) -> List:
    best = []
    best_score = -1.0
    for variant in variants:
        detections = _read_detections(reader, variant, numeric_only=numeric_only)
        if not detections:
            continue
        confidence_sum = sum(float(item[2]) for item in detections)
        score = confidence_sum / max(1, len(detections))
        if score > best_score:
            best_score = score
            best = detections
    return best


def _vertical_overlap_ratio(word: Dict[str, float], line: Dict[str, float]) -> float:
    overlap = max(0.0, min(word["bottom"], line["bottom"]) - max(word["top"], line["top"]))
    denom = min(word["height"], max(1.0, line["median_height"]))
    return overlap / denom


def _cluster_lines(words: List[Dict[str, float]]) -> List[List[Dict[str, float]]]:
    if not words:
        return []

    heights = [word["height"] for word in words]
    median_height = median(heights)
    center_tolerance = max(4.0, 0.35 * median_height)
    min_overlap_ratio = 0.45

    sorted_words = sorted(words, key=lambda word: (word["center_y"], word["left"]))
    line_clusters: List[Dict[str, object]] = []

    for word in sorted_words:
        best_index = -1
        best_score = -1.0

        for index, line in enumerate(line_clusters):
            overlap_ratio = _vertical_overlap_ratio(word, line)
            center_distance = abs(word["center_y"] - line["center_y"])
            if overlap_ratio >= min_overlap_ratio or center_distance <= center_tolerance:
                score = overlap_ratio - (center_distance / max(center_tolerance, 1.0)) * 0.1
                if score > best_score:
                    best_score = score
                    best_index = index

        if best_index == -1:
            line_clusters.append(
                {
                    "items": [word],
                    "top": word["top"],
                    "bottom": word["bottom"],
                    "center_y": word["center_y"],
                    "median_height": word["height"],
                }
            )
            continue

        selected = line_clusters[best_index]
        selected_items = selected["items"]
        selected_items.append(word)
        selected["top"] = min(selected["top"], word["top"])
        selected["bottom"] = max(selected["bottom"], word["bottom"])
        selected["center_y"] = sum(item["center_y"] for item in selected_items) / len(selected_items)
        selected["median_height"] = median(item["height"] for item in selected_items)

    ordered_lines = sorted(line_clusters, key=lambda line: line["center_y"])
    return [sorted(line["items"], key=lambda item: item["left"]) for line in ordered_lines]


def _normalize_numeric_text(text: str) -> str:
    replacements = str.maketrans(
        {
            "O": "0",
            "o": "0",
            "Q": "0",
            "I": "1",
            "l": "1",
            "|": "1",
            "S": "5",
            "s": "5",
            "B": "8",
            "G": "6",
            "g": "9",
        }
    )
    cleaned = text.translate(replacements)
    return re.sub(r"[^0-9,\-\s\[\]]", " ", cleaned)


def _number_tokens(detection) -> List[Dict[str, float]]:
    points, raw_text, confidence = detection
    text = _normalize_numeric_text(raw_text)
    numbers = re.findall(r"-?\d+", text)
    if not numbers:
        return []

    xs = [p[0] for p in points]
    ys = [p[1] for p in points]
    left = float(min(xs))
    right = float(max(xs))
    top = float(min(ys))
    bottom = float(max(ys))
    height = max(1.0, bottom - top)
    width = max(1.0, right - left)

    tokens: List[Dict[str, float]] = []
    segment_width = width / max(1, len(numbers))
    for idx, token in enumerate(numbers):
        token_left = left + idx * segment_width
        token_right = token_left + segment_width
        tokens.append(
            {
                "text": token,
                "value": int(token),
                "confidence": float(confidence),
                "left": token_left,
                "right": token_right,
                "top": top,
                "bottom": bottom,
                "height": height,
                "center_y": (top + bottom) / 2.0,
                "center_x": (token_left + token_right) / 2.0,
            }
        )
    return tokens


def _infer_column_count(rows: List[List[Dict[str, float]]], expected_cols: Optional[int]) -> int:
    if expected_cols and expected_cols > 0:
        return expected_cols

    lengths = [len(row) for row in rows if row]
    if not lengths:
        return 0

    freq: Dict[int, int] = {}
    for count in lengths:
        freq[count] = freq.get(count, 0) + 1
    mode_count = max(freq.items(), key=lambda item: (item[1], -item[0]))[0]
    if mode_count <= 1:
        return int(median(lengths))
    return mode_count


def _column_centers(rows: List[List[Dict[str, float]]], columns: int) -> List[float]:
    if columns <= 0:
        return []

    buckets: List[List[float]] = [[] for _ in range(columns)]
    for row in rows:
        ordered = sorted(row, key=lambda item: item["center_x"])
        if len(ordered) < max(2, int(0.8 * columns)):
            continue
        if len(ordered) == columns:
            sampled = ordered
        else:
            sampled = []
            for index in range(columns):
                source_index = round(index * (len(ordered) - 1) / max(1, columns - 1))
                sampled.append(ordered[source_index])

        for idx, token in enumerate(sampled):
            buckets[idx].append(token["center_x"])

    if any(buckets):
        fallback = []
        for bucket in buckets:
            if bucket:
                fallback.append(float(median(bucket)))
            elif fallback:
                fallback.append(fallback[-1] + 20.0)
            else:
                fallback.append(0.0)
        return fallback

    widest_row = max(rows, key=lambda row: len(row), default=[])
    widest = sorted(widest_row, key=lambda item: item["center_x"])
    if not widest:
        return []
    if len(widest) == columns:
        return [item["center_x"] for item in widest]

    min_x = min(item["center_x"] for item in widest)
    max_x = max(item["center_x"] for item in widest)
    if columns == 1:
        return [min_x]
    step = (max_x - min_x) / max(1, columns - 1)
    return [min_x + idx * step for idx in range(columns)]


def _assign_row_to_columns(
    row_tokens: List[Dict[str, float]],
    centers: List[float],
) -> List[Optional[int]]:
    rebuilt: List[Optional[int]] = [None] * len(centers)
    if not centers:
        return rebuilt

    for token in sorted(row_tokens, key=lambda item: item["center_x"]):
        candidate_order = sorted(
            range(len(centers)),
            key=lambda index: abs(token["center_x"] - centers[index]),
        )
        for index in candidate_order:
            if rebuilt[index] is None:
                rebuilt[index] = token["value"]
                break

    return rebuilt


def _filter_numeric_tokens(tokens: List[Dict[str, float]]) -> List[Dict[str, float]]:
    if not tokens:
        return []

    confident = [token for token in tokens if token["confidence"] >= 0.25]
    if not confident:
        confident = tokens

    lengths = [len(token["text"].lstrip("-")) for token in confident if token["text"].lstrip("-").isdigit()]
    if lengths:
        freq: Dict[int, int] = {}
        for length in lengths:
            freq[length] = freq.get(length, 0) + 1
        dominant_length = max(freq.items(), key=lambda item: (item[1], item[0]))[0]
        confident = [
            token
            for token in confident
            if abs(len(token["text"].lstrip("-")) - dominant_length) <= 1
            and len(token["text"].lstrip("-")) >= 2
        ]

    row_clusters = _cluster_lines(confident)
    if not row_clusters:
        return confident

    row_sizes = [len(row) for row in row_clusters]
    typical_row_size = max(2, int(median(row_sizes)))
    kept_rows = [row for row in row_clusters if len(row) >= max(2, int(0.5 * typical_row_size))]

    filtered: List[Dict[str, float]] = []
    for row in kept_rows:
        filtered.extend(row)
    return filtered


def extract_lines(image_path: Path) -> List[str]:
    if not image_path.exists():
        raise FileNotFoundError(f"Image not found: {image_path}")

    reader = easyocr.Reader(["en"], gpu=False)
    variants = _load_image_variants(image_path)
    detections = _best_detections(reader, variants, numeric_only=False)

    words = [_word_entry(detection) for detection in detections if detection[1].strip()]
    grouped_lines = _cluster_lines(words)

    line_array: List[str] = []
    for line_words in grouped_lines:
        line_text = " ".join(item["text"] for item in line_words if item["text"])
        if line_text:
            line_array.append(line_text)

    return line_array


def extract_numeric_array(image_path: Path, expected_cols: Optional[int] = None) -> List[List[Optional[int]]]:
    if not image_path.exists():
        raise FileNotFoundError(f"Image not found: {image_path}")

    reader = easyocr.Reader(["en"], gpu=False)
    variants = _load_image_variants(image_path)
    detections = _best_detections(reader, variants, numeric_only=True)

    tokens: List[Dict[str, float]] = []
    for detection in detections:
        tokens.extend(_number_tokens(detection))

    tokens = _filter_numeric_tokens(tokens)

    row_clusters = _cluster_lines(tokens)
    row_clusters = [sorted(row, key=lambda item: item["center_x"]) for row in row_clusters if row]
    if not row_clusters:
        return []

    columns = _infer_column_count(row_clusters, expected_cols=expected_cols)
    centers = _column_centers(row_clusters, columns)

    rebuilt = [_assign_row_to_columns(row, centers) for row in row_clusters]
    return rebuilt


def main() -> None:
    parser = argparse.ArgumentParser(description="OCR line and array extraction")
    parser.add_argument("--image", default="pic.jpeg", help="Path to image file")
    parser.add_argument("--mode", choices=["lines", "array"], default="lines")
    parser.add_argument("--cols", type=int, default=None, help="Expected number of array columns")
    args = parser.parse_args()

    image_path = Path(args.image)
    if args.mode == "array":
        array_data = extract_numeric_array(image_path, expected_cols=args.cols)
        print(json.dumps(array_data, ensure_ascii=False, indent=2))
        return

    lines = extract_lines(image_path)
    print(json.dumps(lines, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    pass #main()

In [10]:
parser = argparse.ArgumentParser(description="OCR line and array extraction")
parser.add_argument("--image", default="pic.jpeg", help="Path to image file")
parser.add_argument("--mode", choices=["lines", "array"], default="lines")
parser.add_argument("--cols", type=int, default=None, help="Expected number of array columns")
args = parser.parse_args()

image_path = Path(args.image)
if args.mode == "array":
    array_data = extract_numeric_array(image_path, expected_cols=args.cols)
    print(json.dumps(array_data, ensure_ascii=False, indent=2))

lines = extract_lines(image_path)

usage: ipykernel_launcher.py [-h] [--image IMAGE] [--mode {lines,array}]
                             [--cols COLS]
ipykernel_launcher.py: error: unrecognized arguments: --f=c:\Users\khari\AppData\Roaming\jupyter\runtime\kernel-v379292039e63fcb933b13dc9a41a8a6159b7933ff.json


SystemExit: 2

In [6]:
lines

["B#Var Vicw 'Csc_ -CPinBikn_Psd_AdcRsult",
 'UI 0Adcvaiodac" 0 0 (13083 _ 1 8 8 8 0 8 8 G # & \'7678; 228# #5, % #2,55221%#3,1 10, # 16334 1821_ 16014 16112 16184 16340 , 12892 14858_ 14668 15878, 14668 14669 14668 "8668418 7a, # \' E, #3 1393 12873 333 13097 13026 i4866 54666 14666 414 68 ,14669 6 1 6 (16454 5 % 16126 14669 _ 14669 1o?_ 12889 12964 183 _ 13184 13113 14668 14669 14668 13283 14667 14668 E 14668 14565 , 15928 _ 16022 16118, 14665, 14667 14666 14 14665 14667 12995 _ 12948, 14688 , 14666 i4660; 13383 12954 14665 14666 14667 14667 13273 13164 13098 6 15838; 55435 "16315 55515 , 55888 14663 _ 15983 , 14667 14668 14670 14668 14668 1ozz 13025 1468}_ 1466} 13348 13248 , 13173 13010 . 14663, 14663 , 14664 14663 14664 54877 , 54214, 54664 58658; 5892, 56345 55308; 55537 56362 55537 55537 55538 , 55538 55537 56182 56233 56272 55662 51 5553 _ 551 55531 , 55531 55532 55532 , 55531 , 55512 , 55512 55512 55513 55513 , 1556; 14666; i4665 _ 12926, 14665 13099 13025 , 14665 14665 14666 