In [63]:
import os
import sys
import numpy as np
import torch
import easyocr
from PIL import Image
from transformers import AutoModelForCausalLM
import cv2


sys.path.append("..")
# from kiebids.parser import read_xml, get_ground_truth_text
from kiebids.utils import crop_image, get_ground_truth_data
from kiebids import pipeline_config

In [64]:
# Configuration subtree for the text recognition step; the EasyOcr and Moondream
# classes in this notebook read their settings from it.
module_config = pipeline_config["text_recognition"]

# Compare text recognition models

In [215]:
class TextRecognizer:
    """
    Selects a text recognition backend by name and applies it to image regions.

    Recognised names are "easyocr" and "moondream". Any other value prints a
    notice and falls back to EasyOcr.
    """

    def __init__(self, model):
        # Pick the backend class first, then instantiate it once at the end.
        if model == "easyocr":
            backend_cls = EasyOcr
        elif model == "moondream":
            backend_cls = Moondream
        else:
            print(f"Model {model} not found. Using EasyOcr as default.")
            backend_cls = EasyOcr
        self.model = backend_cls()

    # @task(name=module)
    # @debug_writer(debug_path, module=module)
    # @evaluator(module=module)
    def run(self, image: np.array, bounding_boxes: list, **kwargs):
        """
        Recognise the text inside every bounding box of an image.

        Parameters:
            image: np.array
            bounding_boxes: list of bounding box coordinates of form [x_min,y_min,width,height]
            **kwargs: accepted but not used by this method

        Returns:
            list with one dictionary per bounding box, holding the box under
            "bbox" and the recognised text under "text"
        """
        # Each box is cropped out of the full image and handed to the backend on its own.
        return [
            {"bbox": box, "text": self.model.get_text(image=crop_image(image, box))}
            for box in bounding_boxes
        ]


class EasyOcr:
    """
    Text recognition backend built on an `easyocr.Reader`.

    The reader language, decoder and text threshold are read from
    `module_config.easyocr`.
    """

    def __init__(self):
        # Use the GPU only when CUDA is available.
        gpu = torch.cuda.is_available()
        self.model = easyocr.Reader([module_config.easyocr.language], gpu=gpu)

    def get_text(self, image: np.ndarray) -> str:
        """
        Returns text from image.

        Parameters:
            image: image (or image crop) passed straight to `Reader.readtext`

        Returns:
            the recognised text fragments joined by newlines, or an empty
            string when nothing was recognised
        """
        texts = self.model.readtext(
            image,
            decoder=module_config.easyocr.decoder,
            text_threshold=module_config.easyocr.text_threshold,
            paragraph=False,
            # detail=0 requests the simple output form; the result is joined as strings below.
            detail=0,
            # NOTE(review): the EasyOCR docs describe y_ths as a merge distance used
            # when paragraph=True, so it is presumably inert here - TODO confirm.
            y_ths=0.3,
        )
        # No per-call debug print: this method runs once per bounding box and
        # would otherwise flood the notebook output.
        return "\n".join(texts) if texts else ""


class Moondream:
    """
    Moondream 1.9B 2025-01-09 Release
    Huggingface: https://huggingface.co/vikhyatk/moondream2
    Documentation: https://docs.moondream.ai/
    Blog post: https://moondream.ai/blog/introducing-a-new-moondream-1-9b-and-gpu-support

    Model name, revision, trust_remote_code flag and the prompt are read from
    `module_config.moondream`.
    """

    def __init__(self):
        # Place the whole model on CUDA when available; otherwise pass no device map.
        gpu = torch.cuda.is_available()
        self.model = AutoModelForCausalLM.from_pretrained(
            module_config.moondream.name,
            revision=module_config.moondream.revision,
            trust_remote_code=module_config.moondream.trust_remote_code,
            device_map={"": "cuda"} if gpu else None,
        )
        self.prompt = module_config.moondream.prompt

    def get_text(self, image: np.ndarray) -> str:
        """
        Queries the model with the configured prompt and returns the cleaned answer.

        Parameters:
            image: image array, converted to a PIL image before the query

        Returns:
            the "answer" field of the model response after `clean_text`
        """
        # NOTE(review): Image.fromarray does not reorder channels, and this notebook
        # loads images with cv2.imread - confirm callers pass the channel order the
        # model expects.
        pil_image = Image.fromarray(image)
        text = self.model.query(pil_image, self.prompt)["answer"]
        return self.clean_text(text)

    def clean_text(self, text):
        """
        Moondream specific text cleaning.

        Collapses every run of consecutive newlines into a single newline and
        strips leading/trailing whitespace. A single `replace("\\n\\n", "\\n")`
        pass would leave blank lines behind for runs of three or more newlines.
        """
        # Dropping the empty segments between "\n" separators removes all blank lines.
        return "\n".join(line for line in text.split("\n") if line).strip()

In [216]:
# Path to folder with cropped images (after layout analysis step).
# Set the KIEBIDS_IMAGE_PATH environment variable to use a different folder without
# editing the notebook; when it is unset or empty the original location is used.
image_path = os.environ.get("KIEBIDS_IMAGE_PATH") or (
    "/home/jupyter-lova/app-kiebids-2/data/debug/layout_analysis/20250116-135623_easyocr_test"
)

In [217]:
# One recognizer per backend for the model comparison.
# Constructing it builds an easyocr.Reader (see EasyOcr.__init__).
easyocr_model = TextRecognizer("easyocr")

# Constructing it loads the Moondream model via AutoModelForCausalLM.from_pretrained.
moondream_model = TextRecognizer("moondream")

In [None]:
# os.listdir returns entries in arbitrary order; sort them so that a given
# image_number selects the same file on every run and machine.
images = sorted(os.listdir(image_path))

image_number = 9

image_file = os.path.join(image_path, images[image_number])
image = cv2.imread(image_file)
if image is None:
    # cv2.imread reports an unreadable or non-image file by returning None, not by raising.
    raise ValueError(f"cv2.imread could not read an image from {image_file}")

# get the image name to get the ground truth data
# splitext keeps dots that are part of the file stem (split(".")[0] would cut the
# stem at the first dot). The last two characters of the stem are then dropped -
# presumably a per-region suffix added by the layout analysis step; TODO confirm.
image_name = os.path.splitext(images[image_number])[0][:-2] + ".jpg"
print(image_name)
ground_truth_data = get_ground_truth_data(image_name)

# Always bind `texts`, so later cells never pick up a stale value from a previously
# selected image (or hit a NameError) when no ground truth is found.
texts = (
    [region["text"] for region in ground_truth_data["text_regions"]]
    if ground_truth_data
    else []
)

In [None]:
# cv2.imread returns color images in BGR channel order, while PIL treats a
# 3-channel array as RGB; convert first so the preview does not swap red and blue.
display(Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)))

In [None]:
# Normalise Windows line endings ("\r\n") in the ground truth texts to plain "\n".
text_redone = list(
    map(lambda region_text: "\n".join(region_text.split("\r\n")), texts)
)
# Show one normalised ground truth text as the cell output.
text_redone[3]