In [3]:
from pathlib import Path
import json

ds_path = "/data/training/master_thesis/datasets/2023-05-23"
mode = "testing"

# Read the inventory of the selected split: "<mode>.txt" lists one sample path
# per line.
with (Path(ds_path) / f"{mode}.txt").open("r") as file:
    # Blank lines (e.g. a stray newline at the end of the file) are skipped:
    # Path("") collapses to ".", which would silently resolve to the dataset
    # root instead of a sample directory.
    samples = [Path(entry) for entry in (line.rstrip() for line in file) if entry]

In [4]:
# For every sample, flatten the best-scoring prediction into one record per
# page and store the list as "ground_truth.json" inside the sample directory.
for letter_id, sample_path in enumerate(samples):
    sample_path = Path(ds_path) / sample_path

    # Context managers close the handle right away instead of leaking one
    # open file per sample.
    with (sample_path / "document.json").open("r") as file:
        document = json.load(file)

    # Only the candidate with the highest score is used.
    best_candidate = max(
        document["prediction"]["candidates"], key=lambda c: c["score"]
    )

    assert len(best_candidate["documents"]) > 0 and len(document["pages"]) > 0

    ground_truth = []
    for doc_id, predicted_doc in enumerate(best_candidate["documents"]):

        # A predicted document that lists no pages is given every page of the
        # source document, in order.
        if "pages" not in predicted_doc or len(predicted_doc["pages"]) == 0:
            pages = [{"sourcePage": i} for i in range(len(document["pages"]))]
        else:
            pages = predicted_doc["pages"]

        # Class of the predicted document, expressed relative to the
        # documentClass of the enclosing document.
        class_identifier = str(
            Path(predicted_doc["documentClass"]).relative_to(
                document["documentClass"]
            )
        )

        for dst_page, page in enumerate(pages):
            src_page = page.get("sourcePage", 0)  # NOTE the default value is 0

            ground_truth.append(
                {
                    "src_page": src_page,
                    "letter_id": letter_id,
                    "doc_id": doc_id,
                    "page_nr": dst_page,
                    "doc_class": class_identifier,
                }
            )

    # "w" is sufficient (the file is never read back); leaving the block
    # flushes and closes the file before the next sample is processed.
    with (sample_path / "ground_truth.json").open("w") as file:
        json.dump(ground_truth, file)

In [6]:
from mosaic_dataset import MosaicDataModule, Bucket
from pathlib import Path
import json

ds_path = "/data/training/master_thesis/datasets/2023-05-23"

# Gather the class identifier of every predicted document across all three
# splits. A set gives constant-time de-duplication; it is turned into a sorted
# list before being written out.
unique_classes = set()

for bucket in ["training", "validation", "testing"]:
    with (Path(ds_path) / f"{bucket}.txt").open("r") as file:
        # Skip blank lines: Path("") collapses to "." and would resolve to the
        # dataset root instead of a sample directory.
        inventory = [
            Path(entry) for entry in (line.rstrip() for line in file) if entry
        ]

    for entry in inventory:
        sample_path = Path(ds_path) / entry

        document = json.loads((sample_path / "document.json").read_bytes().decode())

        # Only the candidate with the highest score is inspected.
        best_candidate = max(document["prediction"]["candidates"], key=lambda c: c["score"])
        assert len(best_candidate["documents"]) > 0 and len(document["pages"]) > 0

        for predicted_doc in best_candidate["documents"]:
            # Class of the predicted document, expressed relative to the
            # documentClass of the enclosing document.
            class_identifier = str(
                Path(predicted_doc["documentClass"]).relative_to(document["documentClass"])
            )
            unique_classes.add(class_identifier)

unique_classes = sorted(unique_classes)

# The context manager flushes and closes the output file deterministically.
with open("/data/training/master_thesis/datasets/bzuf_classes.json", "w") as file:
    json.dump(unique_classes, file)