In [None]:
import pandas as pd
import os
import json
import cv2
from PIL import Image
import sys
from lxml import etree
from tensorflow.python.summary.summary_iterator import summary_iterator

sys.path.append("..")
from kiebids.utils import crop_image

# Compare Results from evaluation runs

In [51]:
def read_xml(file_path: str) -> dict:  # noqa
    """
    Parses an XML file and extracts information about pages, text regions, and text lines.

    The element names queried (Page, TextRegion, TextLine, Coords, Baseline,
    TextEquiv, Unicode) are looked up in the document's default namespace.

    Args:
        file_path (str): The path to the XML file to be parsed.

    Returns:
        dict: A dictionary containing the extracted information with the following structure:
            {
                "image_filename": str,  # The filename of the image associated with the page
                "image_width": str,     # The width of the image (attribute value, not converted)
                "image_height": str,    # The height of the image (attribute value, not converted)
                "text_regions": [       # A list of text regions
                    {
                        "id": str,           # The ID of the text region
                        "orientation": str,  # The orientation of the text region (None if absent)
                        "coords": str,       # The coordinates of the text region
                        "text": str,         # The text content of the whole text region ("" if absent)
                        "text_lines": [      # A list of text lines within the text region
                            {
                                "id": str,        # The ID of the text line
                                "coords": str,    # The coordinates of the text line
                                "baseline": str,  # The baseline coordinates (None if the line has no Baseline)
                                "text": str       # The text content of the text line ("" if absent)
                            }
                        ]
                    }
                ]
            }

    Raises:
        AttributeError: If the document has no Page element, or a region/line has no Coords element.
    """

    tree = etree.parse(file_path)  # noqa: S320  # Using `lxml` to parse untrusted data is known to be vulnerable to XML attacks
    # Bind the document's default namespace to the "ns" prefix used by the queries below.
    ns = {"ns": tree.getroot().nsmap.get(None, "")}

    def unicode_text(text_equiv) -> str:
        """Return the Unicode text below a TextEquiv element, or "" when anything is missing."""
        if text_equiv is None:
            return ""
        unicode_element = text_equiv.find(".//ns:Unicode", namespaces=ns)
        if unicode_element is None:
            return ""
        return unicode_element.text or ""

    page = tree.find(".//ns:Page", namespaces=ns)
    output = {
        "image_filename": page.get("imageFilename"),
        "image_width": page.get("imageWidth"),
        "image_height": page.get("imageHeight"),
        "text_regions": [],
    }

    for region in page.findall(".//ns:TextRegion", namespaces=ns):
        # The last TextEquiv in document order is taken as the region-level text;
        # a region without any TextEquiv yields "" instead of raising IndexError.
        text_equivs = region.findall(".//ns:TextEquiv", namespaces=ns)
        text_region = {
            "id": region.get("id"),
            "orientation": region.get("orientation"),
            "coords": region.find(".//ns:Coords", namespaces=ns).get("points"),
            "text": unicode_text(text_equivs[-1] if text_equivs else None),
            "text_lines": [],
        }

        # Fill the per-line entries that the documented structure promises.
        for line in region.findall(".//ns:TextLine", namespaces=ns):
            baseline = line.find(".//ns:Baseline", namespaces=ns)
            text_region["text_lines"].append(
                {
                    "id": line.get("id"),
                    "coords": line.find(".//ns:Coords", namespaces=ns).get("points"),
                    "baseline": None if baseline is None else baseline.get("points"),
                    "text": unicode_text(line.find(".//ns:TextEquiv", namespaces=ns)),
                }
            )

        output["text_regions"].append(text_region)

    return output

In [7]:
# Root directories of the pipeline's debug output; Evaluation appends the run
# id to each of them.
preprocessed_path = "../data/debug/preprocessing"
layout_analysis_path = "../data/debug/layout_analysis"
text_recognition_path = "../data/debug/text_recognition"

# Root directory that is searched for TensorBoard event files of a run.
evaluation_path = "../data/evaluation/tensorboard"


class Evaluation:
    """Collect the stored outputs of one evaluation run for comparison.

    All directories are derived by appending ``run_id`` to the module-level
    root paths. On construction the run's text-recognition JSON files and its
    TensorBoard event files are read eagerly.

    Attributes:
        run_id: Identifier (directory name) of the run.
        image_mapping: ``{image_index: image name}``, see ``map_image_to_index``.
        df: One row per ``image_index`` with one column per remaining scalar tag.
        cer_df: Events tagged ``Text_recognition/_CER`` (value column ``CER``).
        iou_df: Events tagged ``Layout_analysis/_ious`` (value column ``IOU``).
    """

    def __init__(self, run_id):
        self.run_id = run_id
        self.evaluation_path = f"{evaluation_path}/{run_id}"
        self.text_recognition_path = f"{text_recognition_path}/{run_id}"
        self.preprocessed_path = f"{preprocessed_path}/{run_id}"
        self.layout_analysis_path = f"{layout_analysis_path}/{run_id}"
        self.image_mapping = self.map_image_to_index()
        self.df, self.cer_df, self.iou_df = self.convert_tb_data()

    def map_image_to_index(self):
        """Map every ``image_index`` found in the run's JSON results to its image name.

        Only ``*.json`` files in ``self.text_recognition_path`` are read. Files
        whose content is not a dict with an ``"image_index"`` key are skipped.

        Returns:
            dict: ``{image_index: file name without the ".json" extension}``.
        """
        mapping = {}
        # Sorted so the result (and its order) does not depend on the
        # arbitrary order in which the OS lists the directory.
        for file in sorted(os.listdir(self.text_recognition_path)):
            # Skip non-JSON files up front; otherwise the check below would
            # run against stale (or unbound) data from a previous iteration.
            if not file.endswith(".json"):
                continue
            with open(
                os.path.join(self.text_recognition_path, file), "r", encoding="utf-8"
            ) as f:
                data = json.load(f)

            if isinstance(data, dict) and "image_index" in data:
                # splitext keeps dots inside the name ("scan.v2.json" -> "scan.v2"),
                # so the stem can be turned back into a file name later.
                mapping[data["image_index"]] = os.path.splitext(file)[0]
        return mapping

    def convert_tb_data(self):
        """Read the run's TensorBoard event files into pandas DataFrames.

        Walks ``self.evaluation_path`` recursively and parses every file whose
        name contains ``events.out.tfevents``. For each event that carries at
        least one summary value, only the first value is kept.

        *Note* that all matching files are loaded completely. Depending on the
        data size this might take a while.

        Returns:
            tuple: ``(pivoted_df, cer_df, iou_df)`` where

            * ``pivoted_df`` has one row per ``image_index`` (the event step)
              and one column per tag other than ``Text_recognition/_CER`` and
              ``Layout_analysis/_ious``,
            * ``cer_df`` holds the ``Text_recognition/_CER`` events with the
              columns ``bbox_index``, ``wall_time``, ``image_index``, ``CER``,
            * ``iou_df`` holds the ``Layout_analysis/_ious`` events with the
              columns ``bbox_index``, ``wall_time``, ``image_index``, ``IOU``.

            ``bbox_index`` is the row position of the event among the parsed
            events of its file.

        Raises:
            ValueError: If no event with a summary value is found for the run.
        """

        def convert_tfevent(filepath):
            return pd.DataFrame(
                [
                    parse_tfevent(e)
                    for e in summary_iterator(filepath)
                    if len(e.summary.value)
                ]
            )

        def parse_tfevent(tfevent):
            return dict(
                wall_time=tfevent.wall_time,
                name=tfevent.summary.value[0].tag,
                step=tfevent.step,
                value=float(tfevent.summary.value[0].simple_value),
            )

        columns_order = ["wall_time", "name", "step", "value"]

        out = []
        for root, _, filenames in os.walk(self.evaluation_path):
            for filename in filenames:
                if "events.out.tfevents" not in filename:
                    continue
                file_full_path = os.path.join(root, filename)
                frame = convert_tfevent(file_full_path)
                # A file without summary events yields a frame without columns;
                # leave it out so the column selection below cannot fail.
                if not frame.empty:
                    out.append(frame)

        if not out:
            raise ValueError(
                f"No TensorBoard summary events found under {self.evaluation_path}"
            )

        # Concatenate all partial individual dataframes
        all_df = pd.concat(out)[columns_order]

        cer_df = all_df[all_df["name"] == "Text_recognition/_CER"]
        iou_df = all_df[all_df["name"] == "Layout_analysis/_ious"]

        all_df = all_df[
            (all_df["name"] != "Text_recognition/_CER")
            & (all_df["name"] != "Layout_analysis/_ious")
        ]

        cer_df = (
            cer_df.reset_index()
            .rename(
                columns={"value": "CER", "step": "image_index", "index": "bbox_index"}
            )
            .drop(columns=["name"])
        )
        iou_df = (
            iou_df.reset_index()
            .rename(
                columns={"value": "IOU", "step": "image_index", "index": "bbox_index"}
            )
            .drop(columns=["name"])
        )

        # One row per step, one column per tag; the "value" level that
        # pivot_table adds on top of the tag names is dropped again.
        pivoted_df = (
            pd.pivot_table(all_df, columns="name", values=["value"], index=["step"])
            .droplevel(0, axis=1)
            .reset_index()
            .rename_axis(None)
            .rename(columns={"step": "image_index"})
        )
        pivoted_df.columns.name = None
        return pivoted_df, cer_df, iou_df

    def display_image(self, image_name):
        """Show the run's preprocessed ``<image_name>.jpg`` in the notebook."""
        image = Image.open(os.path.join(self.preprocessed_path, image_name + ".jpg"))
        display(image)

    def text_recognition_result(self, image_name):
        """Return the parsed content of the run's ``<image_name>.json`` result file."""
        text_result = os.path.join(self.text_recognition_path, image_name + ".json")
        with open(text_result, "r", encoding="utf-8") as f:
            text_data = json.load(f)
        return text_data

    def read_image(self, image_name):
        """Load the run's preprocessed ``<image_name>.jpg`` with OpenCV.

        Returns:
            The image array as returned by ``cv2.imread`` (BGR channel order).

        Raises:
            FileNotFoundError: If the file is missing or cannot be decoded.
        """
        image_path = os.path.join(self.preprocessed_path, image_name + ".jpg")
        image = cv2.imread(image_path)
        # cv2.imread signals failure by returning None instead of raising.
        if image is None:
            raise FileNotFoundError(f"Could not read image: {image_path}")
        return image

In [None]:
# The run IDs of the experiments to compare. Constructing an Evaluation reads
# the run's text-recognition JSON files and TensorBoard event files right away.
run_1 = Evaluation(run_id="20250115-174008_moondream")
run_2 = Evaluation(run_id="20250116-130831_easyocr_test")

In [None]:
# Preview the first rows of run_1's three tables: per-image metrics, CER events, IoU events.
print(run_1.run_id)
run_1.df.head(), run_1.cer_df.head(), run_1.iou_df.head()

In [None]:
# Preview the first rows of run_2's three tables: per-image metrics, CER events, IoU events.
print(run_2.run_id)
run_2.df.head(), run_2.cer_df.head(), run_2.iou_df.head()

In [None]:
# For each run: number of images that have an average CER value, out of all
# images in the run's metrics table. One line is printed per run.
print(
    *(
        f"Data with num ground_truth == num text predictions ({run.run_id}): {len(run.df.dropna(subset='Text_recognition/_average_CER'))} / {len(run.df)}"
        for run in (run_1, run_2)
    ),
    sep="\n",
)

# Average IoUs

In [None]:
# Distribution of the per-image average IoU for run_1.
run_1.df.plot.hist(column="Layout_analysis/_average_ious", bins=50)

In [None]:
# Distribution of the per-image average IoU for run_2.
run_2.df.plot.hist(column="Layout_analysis/_average_ious", bins=50)

# Text Evaluation

In [None]:
# Distribution of the per-image average CER, one histogram per run.
# The trailing comments keep an optional `range=(0, 1)` argument at hand.
run_1.df.plot.hist(column="Text_recognition/_average_CER", bins=100)  # , range=(0, 1))
run_2.df.plot.hist(column="Text_recognition/_average_CER", bins=100)  # , range=(0, 1))

# Compare per image

In [120]:
# List of image names (JSON file stems) known to run_1.
# The trailing comment holds the equivalent expression for run_2.
images = list(run_1.image_mapping.values())  # list(run_2.image_mapping.values())

In [121]:
# Image number: a position in the `images` list above, not the image_index
# stored in the result files.
i = 1
image_name = images[i]

In [None]:
# Show the preprocessed image that run_2 stored under the selected name.
run_2.display_image(image_name)

In [123]:
# Load the text recognition results (parsed JSON) of both runs for the selected image.

text_result_1 = run_1.text_recognition_result(image_name)
text_result_2 = run_2.text_recognition_result(image_name)

# Disabled: would load the XML for the same image via read_xml.
# NOTE(review): `results_path` is not defined in the visible cells.
# xml = read_xml(os.path.join(results_path, images[i] + ".xml"))

In [None]:
# Average CER of the selected image in run_1.
# The row is looked up by image_index instead of `df.iloc[i]`: `i` is a
# position in `images`, whose order is unrelated to the row order of `run_1.df`.
# NOTE(review): assumes the "image_index" stored in the JSON results is the
# TensorBoard step of the same image - confirm against the pipeline.
image_index = next(
    index for index, name in run_1.image_mapping.items() if name == image_name
)
run_1.df.loc[run_1.df["image_index"] == image_index, "Text_recognition/_average_CER"]

In [None]:
# Show every region run_1 found in the selected image together with the text
# both runs recognised for it.
# cv2 loads images in BGR order while PIL expects RGB, so convert once here.
# NOTE(review): assumes crop_image only crops and leaves the channels untouched.
image = cv2.cvtColor(run_1.read_image(image_name), cv2.COLOR_BGR2RGB)
regions_2 = text_result_2["regions"]

# The loop variable is deliberately not named `i`, so the image number chosen
# in the earlier cell is not overwritten.
for region_idx, region in enumerate(text_result_1["regions"]):
    cropped_image = crop_image(image, region["bbox"])
    display(Image.fromarray(cropped_image))
    print(run_1.run_id)
    print(region["text"])
    print("------------------------")
    print(run_2.run_id)
    # Regions are paired by position; run_2 may contain fewer regions.
    if region_idx < len(regions_2):
        print(regions_2[region_idx]["text"])
    else:
        print("<no region at this position>")

In [54]:
# ruff: noqa
def read_xml(file_path: str) -> dict:
    """
    Parses an XML file and extracts information about pages, text regions, and text lines.

    The element names queried (Page, TextRegion, TextLine, Coords, Baseline,
    TextEquiv, Unicode) are looked up in the document's default namespace.

    Args:
        file_path (str): The path to the XML file to be parsed.

    Returns:
        dict: A dictionary containing the extracted information with the following structure:
            {
                "image_filename": str,  # The filename of the image associated with the page
                "image_width": str,     # The width of the image (attribute value, not converted)
                "image_height": str,    # The height of the image (attribute value, not converted)
                "text_regions": [       # A list of text regions
                    {
                        "id": str,           # The ID of the text region
                        "orientation": str,  # The orientation of the text region (None if absent)
                        "coords": str,       # The coordinates of the text region
                        "text": str,         # The text content of the whole text region ("" if absent)
                        "text_lines": [      # A list of text lines within the text region
                            {
                                "id": str,        # The ID of the text line
                                "coords": str,    # The coordinates of the text line
                                "baseline": str,  # The baseline coordinates (None if the line has no Baseline)
                                "text": str       # The text content of the text line ("" if absent)
                            }
                        ]
                    }
                ]
            }

    Raises:
        AttributeError: If the document has no Page element, or a region/line has no Coords element.
    """

    tree = etree.parse(file_path)  # noqa: S320  # Using `lxml` to parse untrusted data is known to be vulnerable to XML attacks
    # Bind the document's default namespace to the "ns" prefix used by the queries below.
    ns = {"ns": tree.getroot().nsmap.get(None, "")}

    def unicode_text(text_equiv) -> str:
        """Return the Unicode text below a TextEquiv element, or "" when anything is missing."""
        if text_equiv is None:
            return ""
        unicode_element = text_equiv.find(".//ns:Unicode", namespaces=ns)
        if unicode_element is None:
            return ""
        return unicode_element.text or ""

    page = tree.find(".//ns:Page", namespaces=ns)
    output = {
        "image_filename": page.get("imageFilename"),
        "image_width": page.get("imageWidth"),
        "image_height": page.get("imageHeight"),
        "text_regions": [],
    }

    for region in page.findall(".//ns:TextRegion", namespaces=ns):
        # The last TextEquiv in document order is taken as the region-level text.
        # Its Unicode string is stored (not the element itself); a region
        # without any TextEquiv yields "".
        text_equivs = region.findall(".//ns:TextEquiv", namespaces=ns)
        text_region = {
            "id": region.get("id"),
            "orientation": region.get("orientation"),
            "coords": region.find(".//ns:Coords", namespaces=ns).get("points"),
            "text": unicode_text(text_equivs[-1] if text_equivs else None),
            "text_lines": [],
        }

        for line in region.findall(".//ns:TextLine", namespaces=ns):
            # A line may lack a Baseline element; report None rather than failing.
            baseline = line.find(".//ns:Baseline", namespaces=ns)
            text_region["text_lines"].append(
                {
                    "id": line.get("id"),
                    "coords": line.find(".//ns:Coords", namespaces=ns).get("points"),
                    "baseline": None if baseline is None else baseline.get("points"),
                    "text": unicode_text(line.find(".//ns:TextEquiv", namespaces=ns)),
                }
            )

        output["text_regions"].append(text_region)

    return output