# Generate images to investigate the predictions from each approach, without considering evaluation
The idea is to produce a first, easier set of images before spending more development time on detailed visualizations that selectively re-run the evaluation.

We will try to compare predictions between methods and against the ground truth, for each task.

We also want to be able to visualize:

- best results (different for every method **OR, better, samples easy for all methods**)
- worst results (different for every method **OR, better, samples hard for all methods**)
- random results, with fixed seed (same input for every method, for comparison)

For the RUMSEY subset, loading the predictions can be challenging because of the size of some files, so the code developed for the IGN subset must be adapted.

Quickly check we have all images from test sets available.

Note: generation of figures with images may be extracted to another notebook for clarity and to avoid the need to download image test sets for basic graph generation.

In [None]:
from icdar_maptext_analysis.loaders import check_for_missing_images, VALID_SUBSETS
# Sanity check: abort early if any test-set image of a subset is missing.
print("Valid subsets:", VALID_SUBSETS)
for subset in VALID_SUBSETS:
    missing_image = check_for_missing_images(subset)
    if len(missing_image) == 0:
        # nothing missing for this subset, check the next one
        continue
    # report what is missing, then stop the notebook
    print(f"Missing images in {subset}: {missing_image}")
    raise RuntimeError(f"Missing images in {subset}")

## Select some random images for all tasks

In [None]:
from icdar_maptext_analysis.loaders import list_gt_images, TypeDatasetName

def select_random_images(subset: TypeDatasetName, n: int, seed: int = 42) -> list[str]:
    """Return ``n`` ground-truth images of ``subset``, drawn reproducibly.

    The images listed by ``list_gt_images(subset)`` are shuffled with a
    generator seeded with ``seed`` and the first ``n`` entries are returned
    (fewer if the subset contains fewer than ``n`` images).

    Parameters
    ----------
    subset : name of the dataset subset, forwarded to ``list_gt_images``.
    n : maximum number of images to return.
    seed : seed of the shuffle; the same seed always gives the same selection.
    """
    import random

    # Work on a copy so the list returned by the loader is never reordered in place.
    images_list = list(list_gt_images(subset))
    # A dedicated generator honours `seed` and leaves the global `random` state untouched.
    random.Random(seed).shuffle(images_list)
    return images_list[:n]

## Load the list of valid submissions

In [None]:
from icdar_maptext_analysis.loaders import load_valid_submissions_metadata

# Load the metadata of valid submissions once, as a module-level table
# (it is read by `select_valid_submissions` in the next cell).
valid_submissions_metadata = load_valid_submissions_metadata()
# Show a reproducible 5-row sample as the cell output (quick visual check).
valid_submissions_metadata.sample(5, random_state=42)

In [None]:
from icdar_maptext_analysis.loaders import TypeTaskId
import pandas as pd
def select_valid_submissions(task_id: TypeTaskId, subset: TypeDatasetName) -> pd.DataFrame:
    """Return the rows of ``valid_submissions_metadata`` for one task and one subset.

    A row is kept when its ``task`` column equals ``task_id`` and its
    ``subset`` column equals ``subset``.
    """
    metadata = valid_submissions_metadata
    matches_task = metadata.task == task_id
    matches_subset = metadata.subset == subset
    return metadata[matches_task & matches_subset]

In [None]:
# Example: metadata rows of the valid submissions for task 1 on the "rumsey" subset (shown as the cell output).
select_valid_submissions(1, "rumsey")

## Display raw predictions for task 1

In [None]:
# a submission has the following format:
# [ # Begin a list of images
#     {
#      "image": "IMAGE_NAME1",
#      "groups": [ # Begin a list of phrase groups for the image
#         [ # Begin a list of words for the phrase
#           {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT1"},
#           ...,
#           {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT2"}
#        ],
#        ...
#        [ {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT3"}, ... ]
#     ] },
#     {
#      "image": "IMAGE_NAME2",
#      "groups": [
#         [
#           {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT4"},
#           ...,
#           {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT5"}
#         ],
#         ...
#         [ {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT6"}, ... ] 
#     ] },
#     ...
# ]

In [None]:
# the ground truth has the following format:
# [ # Begin a list of images
#     {
#      "image": "IMAGE_NAME1",
#      "groups": [ # Begin a list of phrase groups for the image
#          [  # Begin a list of words for the phrase
#            {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT1", "illegible": False, "truncated": False},
#            ...,
#            {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT2", "illegible": True, "truncated": False}
#          ],
#           ...
#          [ {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT3", "illegible": False, "truncated": True}, ... ]
#      ] },
#     {
#      "image": "IMAGE_NAME2",
#      "groups": [
#          [
#            {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT4", "illegible": False, "truncated": False},
#            ...,
#            {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT5", "illegible": False, "truncated": False}],
#           ...
#          [ {"vertices": [[x1, y1], [x2, y2], ..., [xN, yN]], "text": "TEXT6", "illegible": False, "truncated": False}, ... ] 
#      ] },
#      ...
# ]

In [None]:
def extract_detection_for_image(submission: list[dict], image_id: str) -> "list[list] | None":
    """Extract the groups of a given image from a submission or from the ground truth.

    Parameters
    ----------
    submission : list of per-image entries, each a dict with an ``"image"``
        name and a ``"groups"`` list.
    image_id : image name to look for (compared for equality with ``"image"``).

    Returns
    -------
    The ``"groups"`` value of the first entry whose ``"image"`` equals
    ``image_id``, i.e. ``[ [GROUP1], [GROUP2], ... ]``, or ``None`` when no
    entry matches. Callers must handle the ``None`` case.
    """
    matching_groups = (
        image_data["groups"]
        for image_data in submission
        if image_data["image"] == image_id
    )
    # first match wins; None signals "image not present"
    return next(matching_groups, None)

In [None]:
from icdar_maptext_analysis.loaders import open_image, load_gt, load_submission

In [None]:
from PIL import Image
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import numpy as np

In [None]:
def display_image_comparison_detection(gt_data_image: list[list[dict]], submission_data_image: list[list[dict]], image_id: str, submission_name=None, ax=None):
    """Overlay ground-truth and predicted word polygons on an image (task 1).

    Parameters
    ----------
    gt_data_image : groups of ground-truth words for the image; ``None`` draws
        no ground truth.
    submission_data_image : groups of predicted words for the image; ``None``
        draws no prediction.
    image_id : image identifier, opened with ``open_image``.
    submission_name : used in the title ("<name> vs GT"); the title falls back
        to "Predictions vs GT" when empty.
    ax : matplotlib axes to draw on; a new figure is created when ``None``.

    Ground-truth polygons are green, or red when the word is flagged
    ``illegible`` or ``truncated``; predicted polygons are blue.
    """
    # TODO load the image in the caller
    image = np.array(open_image(image_id))

    # create a figure and axis when the caller did not provide one
    if ax is None:
        _, ax = plt.subplots()
    ax.imshow(image)

    # plot the ground truth
    for group in gt_data_image or []:
        for word in group:
            # .get(): a word without the flags is treated as legible and complete
            flagged = word.get("illegible") or word.get("truncated")
            ax.add_patch(patches.Polygon(
                np.array(word["vertices"]),
                edgecolor="red" if flagged else "green",
                facecolor="none", linewidth=1, alpha=0.5))

    # plot the submission
    for group in submission_data_image or []:
        for word in group:
            ax.add_patch(patches.Polygon(
                np.array(word["vertices"]),
                edgecolor="blue", facecolor="none", linewidth=1, alpha=0.5))

    # ensure the aspect ratio is correct
    ax.set_aspect("equal")
    # ensure display boundaries match the image size (y axis pointing down)
    ax.set_xlim(0, image.shape[1])
    ax.set_ylim(image.shape[0], 0)

    # set the title
    title = f"{submission_name}" if submission_name else "Predictions"
    ax.set_title(title + " vs GT")

In [None]:
from icdar_maptext_analysis.submissions_metadata import shorten_title

def compare_submissions_to_gt_task1(subset: str, image_id: str, filename: str = None, use_two_rows: bool = False):
    """Draw, for one image, one task 1 "prediction vs GT" panel per valid submission.

    Parameters
    ----------
    subset : dataset subset, used to load the ground truth and the submissions.
    image_id : image to display.
    filename : when not ``None``, the figure is saved to this path.
    use_two_rows : lay the panels out on two rows (with larger titles) instead
        of a single row.

    Raises
    ------
    ValueError
        If there is no valid submission for task 1 on ``subset`` (an empty
        grid of panels cannot be created).

    The figure is always closed before returning, even if drawing fails.
    """
    # load gt data
    gt_data_image = extract_detection_for_image(load_gt(subset), image_id)

    # load submissions metadata for the task and subset
    valid_submissions = select_valid_submissions(1, subset)
    num_submissions = len(valid_submissions)
    if num_submissions == 0:
        raise ValueError(f"No valid submission for task 1 on subset {subset!r}: nothing to plot")

    # grid layout: a single row, or two rows of ceil(n/2) columns
    if use_two_rows:
        num_rows, num_cols = 2, (num_submissions + 1) // 2
    else:
        num_rows, num_cols = 1, num_submissions
    # squeeze=False always returns a 2-D array of axes, whatever the grid size;
    # row-major flattening maps panel i to (i // num_cols, i % num_cols)
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(5 * num_cols, 5 * num_rows), squeeze=False)
    flat_axes = axs.ravel()

    try:
        # one panel per submission: ground truth and predictions overlaid on the image
        for plot_ax, (_row_id, row) in zip(flat_axes, valid_submissions.iterrows()):
            submission_data = load_submission(1, subset, row["submission_id"])
            submission_data_image = extract_detection_for_image(submission_data, image_id)
            submission_name = shorten_title(row["method_name"])
            display_image_comparison_detection(gt_data_image, submission_data_image, image_id, submission_name=submission_name, ax=plot_ax)
            # remove the submission data from memory
            del submission_data
            del submission_data_image

            # enlarge text if we use two rows
            if use_two_rows:
                plot_ax.title.set_fontsize(20)

        # add some extra vertical space between the two rows
        # NOTE(review): tight_layout() below recomputes the subplot spacing, so this
        # hspace may have no visible effect — confirm the intended row gap.
        if use_two_rows:
            fig.subplots_adjust(hspace=0.5)

        # remove axis labels (including the unused panel of an odd two-row grid)
        for plot_ax in flat_axes:
            plot_ax.axis("off")

        # adjust layout
        fig.tight_layout()

        # save the figure
        if filename is not None:
            fig.savefig(filename, bbox_inches="tight")
    finally:
        # release the figure to avoid keeping it in memory, even on failure
        plt.close(fig)

In [None]:
# compute the average quality for each image over the submissions for a given task and subset
from icdar_maptext_analysis.loaders import load_evaluations
def compute_average_quality(task_id: int, subset: str) -> pd.Series:
    """Rank the images of a subset by their mean "hmean" for a task.

    Per-image evaluations are loaded with ``load_evaluations``, restricted by
    a filter that accepts values whose ``int()`` is a valid submission id for
    the task and subset. The "hmean" column is averaged per ``image_id`` and
    returned sorted in descending order (highest mean first).
    """
    valid_submission_ids = select_valid_submissions(task_id, subset).submission_id.to_list()

    def is_valid_submission(x) -> bool:
        # presumably `x` is a submission identifier given by the loader — verify against load_evaluations
        return int(x) in valid_submission_ids

    _results_global, results_per_image = load_evaluations(task_id, subset, filter_fn=is_valid_submission)
    per_image_scores = results_per_image["hmean"]
    mean_per_image = per_image_scores.groupby(by="image_id").mean()
    return mean_per_image.sort_values(ascending=False)
    

In [None]:
from typing import Callable
import os

def _plot_image_selection(taskid: int, subset: str, image_ids: list, label: str, out_dir: str, display_function: Callable) -> None:
    """Call ``display_function`` for each image of ``image_ids``, saving one PDF per image into ``out_dir``."""
    # create the directory once per selection rather than once per image
    os.makedirs(out_dir, exist_ok=True)
    for ii, image_id in enumerate(image_ids):
        print(f"Generating plot for task {taskid}/{subset} {image_id} ({label}#{ii})")
        image_name = os.path.splitext(os.path.basename(image_id))[0]
        display_function(subset, image_id, filename=f"{out_dir}/{image_name}.pdf")


def generate_plot_raw_predictions(taskid: int, output_dir:str, display_function: Callable, num_images: int=10):
    """Generate comparison figures for random, easy and hard images of every subset.

    For each subset of ``VALID_SUBSETS`` (visited in reverse order), figures
    are written to ``{output_dir}/task{taskid}/{subset}/{random|easy|hard}/``
    as ``<image name>.pdf``:

    - ``random``: images chosen by ``select_random_images``;
    - ``easy``: the first ``num_images`` images of ``compute_average_quality``
      (highest average score);
    - ``hard``: the last ``num_images`` images of that ranking (lowest average
      score). The two selections overlap when the ranking holds fewer than
      ``2 * num_images`` images.

    Parameters
    ----------
    taskid : task number, used for the output path and for the ranking.
    output_dir : base output directory.
    display_function : (subset, image_id, filename) -> None
    num_images : number of images per selection.
    """
    output_dir = f"{output_dir}/task{taskid}"

    # random images
    for subset in reversed(VALID_SUBSETS):
        random_images_subset = select_random_images(subset, num_images)
        _plot_image_selection(taskid, subset, random_images_subset, "random", f"{output_dir}/{subset}/random", display_function)

    # easy and hard images for current task
    for subset in reversed(VALID_SUBSETS):
        sample_sorted_values = compute_average_quality(taskid, subset)
        selections = [
            ("easy", sample_sorted_values.head(num_images).index.to_list()),
            ("hard", sample_sorted_values.tail(num_images).index.to_list()),
        ]
        # generate comparison plots for the easy and hard images
        for difficulty_name, difficulty_group in selections:
            _plot_image_selection(taskid, subset, difficulty_group, difficulty_name, f"{output_dir}/{subset}/{difficulty_name}", display_function)

In [None]:
# Base output directory for all generated figures.
# It is only defined here; directories are created with os.makedirs when figures are written.
OUTPUT_DIR_BASE = "data/20-raw-predictions"

In [None]:
# generate results for task 1: random, easy and hard images of every subset,
# one PDF per image, each comparing all valid submissions to the ground truth
generate_plot_raw_predictions(1, OUTPUT_DIR_BASE, compare_submissions_to_gt_task1)

## Display raw predictions for task 2

In [None]:
import matplotlib.pyplot as plt
from typing import List, Tuple, Optional

# Optional reference of qualitative maps
# Paired: 12, Dark2: 8, Set1: 9, tab10: 10, tab20: 20
# _SUPPORTED_QUALITATIVE = {"Paired", "Dark2", "Set1", "tab10", "tab20"}

# Okabe–Ito colorblind-safe palette (RGB in [0,1])
# Returned (truncated to the requested length) by generate_polygon_colors(..., colorblind_safe=True).
# NOTE(review): these values look like approximations of the published Okabe–Ito colors —
# confirm against the reference palette if exact colors matter.
_OKABE_ITO = [
    (0.0, 0.45, 0.70),   # blue
    (0.87, 0.80, 0.10),  # yellow
    (0.0, 0.62, 0.45),   # green
    (0.94, 0.40, 0.0),   # vermilion
    (0.80, 0.47, 0.74),  # purple
    (0.35, 0.70, 0.90),  # sky blue
    (0.0, 0.0, 0.0),     # black
    (0.90, 0.60, 0.0),   # orange
]

def generate_polygon_colors(
    k: int,
    *,
    cmap: Optional[str] = None,
    colorblind_safe: bool = False
) -> List[Tuple[float, float, float]]:
    """
    Build a stable list of at most 20 distinct RGB colors for drawing polygons.

    Parameters
    ----------
    k : int
        Number of colors wanted (polygons shown at the same time).
        A value <= 0 gives an empty list.
    cmap : str, optional
        Name of the matplotlib colormap to use; ignored when
        ``colorblind_safe`` is True. When omitted, "Paired" is used for
        k <= 12 and "tab20" for larger k.
    colorblind_safe : bool, default=False
        When True, the first ``min(k, 8)`` colors of the module-level
        Okabe-Ito palette are returned and ``cmap`` is not consulted.

    Returns
    -------
    List[Tuple[float, float, float]]
        RGB tuples in [0, 1], in colormap order, without duplicates.
        The length is at most ``min(k, 20)`` and never exceeds the number of
        distinct colors the colormap provides.

    Notes
    -----
    - The order is deterministic: nothing is shuffled.
    - Colormaps exposing a ``colors`` list are used as-is; other colormaps are
      sampled at evenly spaced positions.
    """
    if k <= 0:
        return []

    # The colorblind-safe palette takes precedence over any colormap choice
    if colorblind_safe:
        return _OKABE_ITO[:min(k, len(_OKABE_ITO))]

    # Default colormap depends on how many colors are requested
    if cmap is None:
        cmap = "tab20" if k > 12 else "Paired"
    colormap = plt.get_cmap(cmap)

    if not hasattr(colormap, "colors"):
        # Continuous map: sample evenly spaced positions (the middle for a single color)
        n_samples = min(k, 20)
        if n_samples == 1:
            positions = [0.5]
        else:
            positions = [idx / (n_samples - 1) for idx in range(n_samples)]
        candidates = [tuple(colormap(pos)[:3]) for pos in positions]
    else:
        # Qualitative map: take its discrete colors, dropping alpha if present
        candidates = [tuple(entry[:3]) for entry in colormap.colors]

    # Drop duplicates while keeping the first occurrence order
    distinct = list(dict.fromkeys(candidates))

    return distinct[:min(k, 20)]



# Example:
# K = len(polygons_visible_now)
# COLORS = generate_polygon_colors(K)            # auto-selects Paired/tab20
# COLORS = generate_polygon_colors(K, cmap="Set1")  # force a specific palette

import matplotlib.patches as patches

# Simulate number of polygons visible now
# (28 is above the 20-color cap of generate_polygon_colors, so the capped palette is previewed)
K = 28
COLORS = generate_polygon_colors(K)
# NOTE: K and COLORS are module-level names; any later cell that reads COLORS gets this palette.

# Plot preview: one unit-wide swatch per color, left to right in palette order
fig, ax = plt.subplots(figsize=(10, 1))
for i, color in enumerate(COLORS):
    ax.add_patch(patches.Rectangle((i, 0), 1, 1, facecolor=color, edgecolor="none"))
ax.set_xlim(0, len(COLORS))
ax.set_ylim(0, 1)
ax.axis("off")
plt.show()


In [None]:
def display_image_groups(det_image: list[list], image: np.ndarray, submission_name=None, ax=None):
    """Draw word groups on an image: one color per group, words linked in order.

    Parameters
    ----------
    det_image : groups of words for the image (each word has ``"vertices"``);
        ``None`` or an empty list draws the image only.
    image : image pixels, displayed with ``imshow``.
    submission_name : title of the panel ("Predictions" when empty).
    ax : matplotlib axes to draw on; a new figure is created when ``None``.

    Word polygons use the color of their group, or gray when the word is
    flagged ``illegible`` or ``truncated``. Consecutive words of a group are
    joined by a line between the means of their vertices. Colors are reused
    cyclically when there are more groups than available colors.
    """
    # create a figure and axis when the caller did not provide one
    if ax is None:
        _, ax = plt.subplots()
    ax.imshow(image)

    # a submission may have no entry for this image: draw the bare image then
    groups = [] if det_image is None else det_image
    palette = generate_polygon_colors(len(groups))

    # plot the groups
    for ii, group in enumerate(groups):
        # pick a color for the group
        color = palette[ii % len(palette)]
        centers = []
        # draw boxes for each word in the group
        for word in group:
            vertices = np.array(word["vertices"])
            flagged = word.get("illegible") or word.get("truncated")
            ax.add_patch(patches.Polygon(vertices, edgecolor="gray" if flagged else color, facecolor="none", linewidth=1, alpha=0.5))
            centers.append(np.mean(vertices, axis=0))
        # draw lines between consecutive words in the group
        for start, end in zip(centers, centers[1:]):
            ax.plot([start[0], end[0]], [start[1], end[1]], color=color, linewidth=2, alpha=0.8)

    # ensure the aspect ratio is correct
    ax.set_aspect("equal")
    # ensure display boundaries match the image size (y axis pointing down)
    ax.set_xlim(0, image.shape[1])
    ax.set_ylim(image.shape[0], 0)

    # set the title
    ax.set_title(f"{submission_name}" if submission_name else "Predictions")

In [None]:
def plot_gouping_comparison_task2(subset: str, image_id: str, filename:str =None):
    """Draw, for one image, the task 2 word groups of the ground truth and of each valid submission.

    The figure has a single row: the ground truth in the first panel, then one
    panel per valid submission, all rendered with ``display_image_groups``.

    Parameters
    ----------
    subset : dataset subset, used to load the ground truth and the submissions.
    image_id : image to display (opened once with ``open_image``).
    filename : when not ``None``, the figure is saved to this path.

    The figure is always closed before returning, even if drawing fails.
    """
    TASK_ID = 2

    # Load the image once for all
    image = np.array(open_image(image_id))

    # load gt data
    gt_data_image = extract_detection_for_image(load_gt(subset), image_id)

    # load submissions metadata for the task and subset
    valid_submissions = select_valid_submissions(TASK_ID, subset)

    # one panel for the ground truth plus one per submission;
    # squeeze=False keeps an indexable array of axes even for a single panel
    num_panels = len(valid_submissions) + 1
    fig, axs = plt.subplots(1, num_panels, figsize=(5 * num_panels, 5), squeeze=False)
    axs = axs.ravel()

    try:
        # Display the ground truth
        display_image_groups(gt_data_image, image, submission_name="-- Ground Truth --", ax=axs[0])

        # display the groups of each submission in its own panel
        for plot_ax, (_row_id, row) in zip(axs[1:], valid_submissions.iterrows()):
            submission_data = load_submission(TASK_ID, subset, row["submission_id"])
            submission_data_image = extract_detection_for_image(submission_data, image_id)
            submission_name = shorten_title(row["method_name"])
            display_image_groups(submission_data_image, image, submission_name=submission_name, ax=plot_ax)
            # remove the submission data from memory
            del submission_data
            del submission_data_image

        # remove axis labels
        for plot_ax in axs:
            plot_ax.axis("off")

        # adjust layout
        fig.tight_layout()

        # save the figure
        if filename is not None:
            fig.savefig(filename, bbox_inches="tight")
    finally:
        # release the figure to avoid keeping it in memory, even on failure
        plt.close(fig)

In [None]:
# generate results for task 2: random, easy and hard images of every subset, one PDF per image
generate_plot_raw_predictions(2, OUTPUT_DIR_BASE, plot_gouping_comparison_task2)

## Task 3

In [None]:
# For Asian characters, we need to have the Noto CJK fonts installed:
# This can be done on Ubuntu with:
# $ sudo apt install fonts-noto-cjk

from matplotlib import font_manager

def get_font_with_cjk_fallback():
    """
    Pick the first installed font from a preference list of CJK-capable fonts.

    Returns a ``FontProperties`` object for that font family. Noto Sans CJK
    variants are tried first, then common Linux and Windows alternatives.

    Raises ``RuntimeError`` when none of the candidates is known to
    matplotlib's font manager.
    """
    # Candidates, most preferred first
    candidates = (
        "Noto Sans CJK SC",    # Simplified Chinese
        "Noto Sans CJK TC",    # Traditional Chinese
        "Noto Sans CJK JP",    # Japanese (also supports Chinese)
        "WenQuanYi Zen Hei",   # Common in some Linux distros
        "AR PL UMing CN",      # Another common CJK font
        "SimHei",              # Windows
        "Microsoft YaHei",     # Windows
    )

    # Names of the fonts registered in matplotlib's font manager
    installed = {font.name for font in font_manager.fontManager.ttflist}

    chosen = next((name for name in candidates if name in installed), None)
    if chosen is None:
        raise RuntimeError(
            "No suitable CJK font found. Please install 'fonts-noto-cjk' or another CJK-capable font."
        )
    return font_manager.FontProperties(family=chosen)

# Pick font with fallback
# (runs when the cell is executed: raises RuntimeError if no candidate font is installed;
#  the result is the font used for the transcription labels drawn below)
cjk_font = get_font_with_cjk_fallback()


def display_transcriptions_isolated(det_image: list[list], image: np.ndarray, submission_name=None, ax=None):
    """Draw every word polygon with its transcription, ignoring the grouping (task 3).

    Parameters
    ----------
    det_image : groups of words for the image (each word has ``"vertices"``
        and ``"text"``); ``None`` or an empty list draws the image only.
    image : image pixels, displayed with ``imshow``.
    submission_name : title of the panel ("Predictions" when empty).
    ax : matplotlib axes to draw on; a new figure is created when ``None``.

    Successive words get successive palette colors (cycling through the
    palette); words flagged ``illegible`` or ``truncated`` are outlined in
    gray. The text is written in black at the mean of the polygon vertices,
    using the module-level ``cjk_font``.
    """
    # create a figure and axis when the caller did not provide one
    if ax is None:
        _, ax = plt.subplots()
    ax.imshow(image)

    # 20 colors (the "tab20" auto-selection of generate_polygon_colors), requested
    # here so the function does not depend on a module-level COLORS variable
    palette = generate_polygon_colors(20)

    # Walk the words in reading order of the data, whatever their group. A running
    # index gives a different color to consecutive words (we don't overlay GT
    # because 2 transcriptions for each word would be a mess).
    all_words = (word for group in (det_image or []) for word in group)
    for word_index, word in enumerate(all_words):
        color = palette[word_index % len(palette)]
        vertices = np.array(word["vertices"])
        flagged = word.get("illegible") or word.get("truncated")
        ax.add_patch(patches.Polygon(vertices, edgecolor="gray" if flagged else color, facecolor="none", linewidth=1, alpha=0.5))
        # add the text
        center = np.mean(vertices, axis=0)
        ax.text(center[0], center[1], word["text"], color="black", fontsize=10, ha="center", va="center", fontproperties=cjk_font)

    # ensure the aspect ratio is correct
    ax.set_aspect("equal")
    # ensure display boundaries match the image size (y axis pointing down)
    ax.set_xlim(0, image.shape[1])
    ax.set_ylim(image.shape[0], 0)

    # set the title
    ax.set_title(f"{submission_name}" if submission_name else "Predictions")


def plot_transcription_comparison_task3(subset: str, image_id: str, filename:str =None):
    """Draw, for one image, the task 3 transcriptions of the ground truth and of each valid submission.

    The figure has a single row: the ground truth in the first panel, then one
    panel per valid submission, all rendered with
    ``display_transcriptions_isolated``.

    Parameters
    ----------
    subset : dataset subset, used to load the ground truth and the submissions.
    image_id : image to display (opened once with ``open_image``).
    filename : when not ``None``, the figure is saved to this path.

    The figure is always closed before returning, even if drawing fails.
    """
    TASK_ID = 3

    # Load the image once for all
    image = np.array(open_image(image_id))

    # load gt data
    gt_data_image = extract_detection_for_image(load_gt(subset), image_id)

    # load submissions metadata for the task and subset
    valid_submissions = select_valid_submissions(TASK_ID, subset)

    # one panel for the ground truth plus one per submission;
    # squeeze=False keeps an indexable array of axes even when there is no
    # valid submission (single panel)
    num_panels = len(valid_submissions) + 1
    fig, axs = plt.subplots(1, num_panels, figsize=(5 * num_panels, 5), squeeze=False)
    axs = axs.ravel()

    try:
        # Display the ground truth
        display_transcriptions_isolated(gt_data_image, image, submission_name="-- Ground Truth --", ax=axs[0])

        # display the transcriptions of each submission in its own panel
        for plot_ax, (_row_id, row) in zip(axs[1:], valid_submissions.iterrows()):
            submission_data = load_submission(TASK_ID, subset, row["submission_id"])
            submission_data_image = extract_detection_for_image(submission_data, image_id)
            submission_name = shorten_title(row["method_name"])
            display_transcriptions_isolated(submission_data_image, image, submission_name=submission_name, ax=plot_ax)
            # remove the submission data from memory
            del submission_data
            del submission_data_image

        # remove axis labels
        for plot_ax in axs:
            plot_ax.axis("off")

        # adjust layout
        fig.tight_layout()

        # save the figure
        if filename is not None:
            fig.savefig(filename, bbox_inches="tight")
    finally:
        # release the figure to avoid keeping it in memory, even on failure
        plt.close(fig)

In [None]:
# generate results for task 3: random, easy and hard images of every subset, one PDF per image
generate_plot_raw_predictions(3, OUTPUT_DIR_BASE, plot_transcription_comparison_task3)

## Task 4

In [None]:
def display_transcriptions_grouped(det_image: list[list], image: np.ndarray, submission_name=None, ax=None):
    """Draw word groups with their transcriptions: one color per group (task 4).

    Parameters
    ----------
    det_image : groups of words for the image (each word has ``"vertices"``
        and ``"text"``); ``None`` or an empty list draws the image only.
    image : image pixels, displayed with ``imshow``.
    submission_name : title of the panel ("Predictions" when empty).
    ax : matplotlib axes to draw on; a new figure is created when ``None``.

    Consecutive words of a group are joined by a line between the means of
    their vertices. Word polygons use the color of their group (cycling
    through the palette), or gray when the word is flagged ``illegible`` or
    ``truncated``. The text is written in black at the mean of the polygon
    vertices, using the module-level ``cjk_font``.
    """
    # create a figure and axis when the caller did not provide one
    if ax is None:
        _, ax = plt.subplots()
    ax.imshow(image)

    # 20 colors (the "tab20" auto-selection of generate_polygon_colors), requested
    # here so the function does not depend on a module-level COLORS variable
    palette = generate_polygon_colors(20)

    # plot the groups
    for ii, group in enumerate(det_image or []):
        # pick a color for the group
        color = palette[ii % len(palette)]
        word_shapes = [np.array(word["vertices"]) for word in group]
        centers = [np.mean(vertices, axis=0) for vertices in word_shapes]

        # draw lines between consecutive words in the group
        for start, end in zip(centers, centers[1:]):
            ax.plot([start[0], end[0]], [start[1], end[1]], color=color, linewidth=2, alpha=0.8)

        # draw the box and the transcription of each word in the group
        for word, vertices, center in zip(group, word_shapes, centers):
            flagged = word.get("illegible") or word.get("truncated")
            ax.add_patch(patches.Polygon(vertices, edgecolor="gray" if flagged else color, facecolor="none", linewidth=1, alpha=0.5))
            ax.text(center[0], center[1], word["text"], color="black", fontsize=10, ha="center", va="center", fontproperties=cjk_font)

    # ensure the aspect ratio is correct
    ax.set_aspect("equal")
    # ensure display boundaries match the image size (y axis pointing down)
    ax.set_xlim(0, image.shape[1])
    ax.set_ylim(image.shape[0], 0)

    # set the title
    ax.set_title(f"{submission_name}" if submission_name else "Predictions")

def plot_transcription_comparison_task4(subset: str, image_id: str, filename:str =None):
    """Draw, for one image, the task 4 grouped transcriptions of the ground truth and of each valid submission.

    The figure has a single row: the ground truth in the first panel, then one
    panel per valid submission, all rendered with
    ``display_transcriptions_grouped``.

    Parameters
    ----------
    subset : dataset subset, used to load the ground truth and the submissions.
    image_id : image to display (opened once with ``open_image``).
    filename : when not ``None``, the figure is saved to this path.

    The figure is always closed before returning, even if drawing fails.
    """
    TASK_ID = 4

    # Load the image once for all
    image = np.array(open_image(image_id))

    # load gt data
    gt_data_image = extract_detection_for_image(load_gt(subset), image_id)

    # load submissions metadata for the task and subset
    valid_submissions = select_valid_submissions(TASK_ID, subset)

    # one panel for the ground truth plus one per submission;
    # squeeze=False keeps an indexable array of axes even for a single panel
    num_panels = len(valid_submissions) + 1
    fig, axs = plt.subplots(1, num_panels, figsize=(5 * num_panels, 5), squeeze=False)
    axs = axs.ravel()

    try:
        # Display the ground truth
        display_transcriptions_grouped(gt_data_image, image, submission_name="-- Ground Truth --", ax=axs[0])

        # display the grouped transcriptions of each submission in its own panel
        for plot_ax, (_row_id, row) in zip(axs[1:], valid_submissions.iterrows()):
            submission_data = load_submission(TASK_ID, subset, row["submission_id"])
            submission_data_image = extract_detection_for_image(submission_data, image_id)
            submission_name = shorten_title(row["method_name"])
            display_transcriptions_grouped(submission_data_image, image, submission_name=submission_name, ax=plot_ax)
            # remove the submission data from memory
            del submission_data
            del submission_data_image

        # remove axis labels
        for plot_ax in axs:
            plot_ax.axis("off")

        # adjust layout
        fig.tight_layout()

        # save the figure
        if filename is not None:
            fig.savefig(filename, bbox_inches="tight")
    finally:
        # release the figure to avoid keeping it in memory, even on failure
        plt.close(fig)

In [None]:
# generate results for task 4: random, easy and hard images of every subset, one PDF per image
generate_plot_raw_predictions(4, OUTPUT_DIR_BASE, plot_transcription_comparison_task4)

## Extra figure for the paper:
Two-row task 1 figures for `rumsey/test/11792030_h3_w5.png` and `rumsey/test/3287004_h2_w6.png`.

In [None]:
# Two-row task 1 comparison figures for the paper, saved as PDFs in the "extrafigs" sub-directory.
outdir_extra_paper = f"{OUTPUT_DIR_BASE}/extrafigs"
os.makedirs(outdir_extra_paper, exist_ok=True)
extra_paper_images = (
    "rumsey/test/11792030_h3_w5.png",
    "rumsey/test/3287004_h2_w6.png",
)
for image_id in extra_paper_images:
    # file name without directory nor extension
    basename, _ext = os.path.splitext(os.path.basename(image_id))
    compare_submissions_to_gt_task1("rumsey", image_id, filename=f"{outdir_extra_paper}/{basename}.pdf", use_two_rows=True)

Also generate task 2 figures, as a control, for the following images:
- 9016007_h13_w9
- 9103002_h2_w6
- 9309000_h8_w11

In [None]:
# outdir_extra_check = f"{OUTPUT_DIR_BASE}/extracheck"
# os.makedirs(outdir_extra_check, exist_ok=True)
# manual_selection_t2 = [
#     "rumsey/test/9016007_h13_w9.png", 
#     "rumsey/test/9103002_h2_w6.png", 
#     "rumsey/test/9309000_h8_w11.png"]
# for image_id in manual_selection_t2:
#     basename = os.path.splitext(os.path.basename(image_id))[0]
#     plot_gouping_comparison_task2("rumsey", image_id, filename=f"{outdir_extra_check}/{basename}.pdf")

In [None]:
# below is an attempt to generate a plot task 3, sampling word images and displaying their transcriptions (cropping the word image)

In [None]:
# # display a random selection of words (from random groups) from a submission
# # show a crop of the image with the detected word and its transcription below
# def display_word_sample(submission_indexed: dict[list[list]], image_id: str, word_count: int, submission_name=None, ax=None):
#     # load the image
#     image = open_image(image_id)
#     image = np.array(image)

#     # randomly select word_count words from all words from all groups in the submission
#     rng = np.random.default_rng(seed=42)
#     words = []
#     for group in submission_indexed[image_id]:
#         words.extend(group)
#     word_count_ = min(word_count, len(words))
#     words = rng.choice(words, size=word_count_, replace=False)

#     # create a figure and axis
#     # for each word, plot the word and the transcription below
#     if ax is None:
#         fig, ax = plt.subplots(word_count, 1, figsize=(10, 10))
#     for i, word in enumerate(words):
#         vertices = np.array(word["vertices"])
#         x_min, y_min = np.min(vertices, axis=0).astype(int)
#         x_max, y_max = np.max(vertices, axis=0).astype(int)
#         # margin = 10
#         # x_min = max(0, x_min - margin)
#         # y_min = max(0, y_min - margin)
#         # x_max = min(image.shape[1], x_max + margin)
#         # y_max = min(image.shape[0], y_max + margin)
#         # resize the image to a fixed size, respecting the aspect ratio
#         image_crop = image[y_min:y_max, x_min:x_max]
#         image_crop = Image.fromarray(image_crop)
#         image_crop.thumbnail((128, 128))
#         ax[i].imshow(image_crop)
#         ax[i].axis("off")
#         ax[i].set_title(word["text"])
#         # increase title size
#         ax[i].title.set_size(10)
#     # if we have unused axes, hide them
#     for i in range(word_count_, word_count):
#         ax[i].axis("off")

In [None]:
# display_image_comparison_detection(sample_submission_indexed, sample_gt_indexed, random_image_ids[0])

In [None]:
# def compare_submissions_to_gt(task_id: int, subset: str, image_id: str):
#     # load gt data
#     gt = load_gt(subset)

#     # list available submissions
#     submissions_ids = list_valid_submissions(task_id, subset)
#     # print(f"Found {len(submissions_ids)} submissions")
#     # load all submissions and keep their ids
#     submissions = {submission_id: load_submission(task_id, subset, submission_id) for submission_id in submissions_ids}
#     # print(f"Loaded {len(submissions)} submissions")
#     # index the ground truth
#     gt_indexed = extract_detection_for_image(gt, image_id)
#     # index the submissions
#     submissions_indexed = {submission_id: index_detection_by_image(submission) for submission_id, submission in submissions.items()}
    
#     match task_id:
#         case 3:
#             # create a subplot with as many lines as there are submissions, plus one for the GT
#             num_words = 10
#             fig, axs = plt.subplots(len(submissions)+1, num_words, figsize=(num_words*5, (len(submissions)+1)*5))
#             # display the ground truth
#             display_word_sample(gt_indexed, image_id, num_words, submission_name="GT", ax=axs[0])
#             # display the submission title on the left of the first column
#             axs[0, 0].set_ylabel("GT")  # FIXME not displayed
#             # for each submission, display a random selection of words
#             for plot_id, (submission_id, submission_indexed) in enumerate(submissions_indexed.items()):
#                 # retreive the submission name
#                 submission_name = lookup_generate_title(int(submission_id), submissions_meta, user_to_team_name)
#                 display_word_sample(submission_indexed, image_id, num_words, submission_name=submission_name, ax=axs[plot_id+1])
#                 # display the title one the left of the first column
#                 axs[plot_id+1, 0].set_ylabel(submission_name)  # FIXME not displayed
#             fig.suptitle(f"Comparison of submissions to GT for image {image_id}", fontsize=16)

    
#     # adjust layout
#     plt.tight_layout()

#     # save the figure
#     # TODO: save the figure to a file
#     plt.show()

In [None]:
# compare_submissions_to_gt(task_id=3, subset="ign", image_id=random_image_ids[0])