In [1]:
from pathlib import Path
from typing import List
import json

In [2]:
# Name of the generated dataset file (written as "<DATASET_NAME>.json") and
# the location of the input samples.
DATASET_NAME = "IDL-less_5_pages"
BASE_PATH = "../dataset/ucsf-idl-resized"
SAMPLE_FILE = "samples.json"

# Filters:
# Document types a sample may be labelled with.
allowed_types = [
    "email",
    "letter",
    "agenda",
    "comments",
    "report",
    "conference proceedings",
    "article",
    "memo",
    "photograph",
]
# Inclusive bounds on how many allowed types a sample may carry;
# a falsy value (0 / None) switches that bound off.
min_types, max_types = 1, 1

# Inclusive bounds on the page count; a falsy value switches that bound off.
min_pages, max_pages = 1, 5

In [3]:
# Read the raw sample records. The context manager closes the file as soon as
# parsing is done, and the encoding is pinned so the result does not depend on
# the platform's default locale.
with open(Path(BASE_PATH) / SAMPLE_FILE, encoding="utf-8") as sample_fp:
    samples = json.load(sample_fp)

def _allowed_types_in(raw_type, allowed):
    """Return the entries of ``allowed`` named in ``raw_type``, in order of appearance.

    ``raw_type`` is cut into chunks at "," and ";". A chunk that, once its
    spaces are collapsed, equals an allowed type is taken whole, so multi-word
    types such as "conference proceedings" can match. Any other chunk is
    matched word by word (split on single spaces).
    """
    found = []
    for chunk in raw_type.replace(";", ",").split(","):
        words = [word for word in chunk.split(" ") if word]
        phrase = " ".join(words)
        if phrase in allowed:
            found.append(phrase)
        else:
            found.extend(word for word in words if word in allowed)
    return found


labels = []
for sample in samples:
    # Page-count filter. A falsy bound (0 / None) disables that side, and the
    # page count is only parsed when at least one bound is active.
    if min_pages or max_pages:
        n_pages = int(sample["pages"])
        if min_pages and n_pages < min_pages:
            continue
        if max_pages and n_pages > max_pages:
            continue

    # Type filter: keep the sample only if the number of allowed types it
    # names lies within [min_types, max_types].
    types = _allowed_types_in(sample["type"], allowed_types)
    if min_types and len(types) < min_types:
        continue
    if max_types and len(types) > max_types:
        continue

    # Test for the same key that is read; otherwise fall back to the
    # "images/<id>" folder.
    if "image_folder" in sample:
        image_folder = sample["image_folder"]
    else:
        image_folder = str(Path("images") / str(sample["id"]))

    labels.append({
        "id": sample["id"],
        "image_folder": image_folder,
        "pages": sample["pages"],
        # The first matching type is the label. It is None only when
        # min_types is disabled and the sample names no allowed type.
        "type": types[0] if types else None,
    })


# Bundle the labels with the provenance and filter settings that produced them.
ds = dict(
    labels=labels,
    metadata=dict(
        path=str(Path(BASE_PATH).resolve()),
        sample_file=SAMPLE_FILE,
        filters=dict(
            allowed_types=allowed_types,
            min_types=min_types,
            max_types=max_types,
            min_pages=min_pages,
            max_pages=max_pages,
        ),
    ),
)
# Save the dataset as "<DATASET_NAME>.json" inside BASE_PATH. The context
# manager flushes and closes the file before the cell finishes; plain write
# mode is enough because the file is never read back here.
with open(Path(BASE_PATH) / f"{DATASET_NAME}.json", "w", encoding="utf-8") as out_fp:
    json.dump(ds, out_fp)