In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import json
import logging
import time
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor, as_completed

from pystac import Item

from stactools.hotosm.exceptions import AssetNotFoundError
from stactools.hotosm.oam_metadata import OamMetadata
from stactools.hotosm.oam_metadata_client import OamMetadataClient
from stactools.hotosm.stac import create_collection, create_item

logging.basicConfig()


class QuietLogFormatter(logging.Formatter):
    """Quieter exception logging for the notebook."""

    def format(self, record):
        """Only log the last line of an exception."""
        if not record.exc_text:
            record.exc_text = self.formatException(record.exc_info)
        record.exc_text = record.exc_text.split("\n")[-1]
        return super().format(record)


hotosm_logger = logging.getLogger("stactools.hotosm")
handler = logging.StreamHandler()
handler.setFormatter(QuietLogFormatter())
hotosm_logger.addHandler(handler)

In [3]:
collection = create_collection()
collection

In [4]:
with open("./collection.json", "w") as f:
    f.write(json.dumps(collection.to_dict(), indent=2))

In [5]:
client = OamMetadataClient.new()

In [None]:
total = client.get_count()

oam_items = client.get_all_items(page_size=20_000)


def process_metadata_to_stac(metadata: OamMetadata) -> Item:
    """Worker function to convert metadata to STAC."""
    # correct common issues in metadata, e.g., incorrect SPDX license identifiers
    sanitized_metadata = metadata.sanitize()
    return create_item(sanitized_metadata)


stac_items: list[Item] = []
failures: list[tuple[OamMetadata, Exception]] = []

time1 = time.time()
with ThreadPoolExecutor() as pool:
    futures = {}

    for oam_item in oam_items:
        future = pool.submit(process_metadata_to_stac, oam_item)
        futures[future] = oam_item

    for i, future in enumerate(as_completed(futures)):
        if i % 1000 == 0:
            print(f"Item {i}/{total}")

        oam_item = futures[future]

        try:
            stac_item = future.result()
        except AssetNotFoundError as e:
            print(f"Could not generate STAC Item for {oam_item.id}")
            failures.append((oam_item, e))
        except Exception as e:
            print(f"Unexpected error occurred for {oam_item.id}!")
            failures.append((oam_item, e))
        else:
            stac_items.append(stac_item)

time2 = time.time()
print(f"Took {(time2 - time1) / 60.0:0.3f} minutes to convert catalog to STAC")

Could not parse id=59e62b773d6412ef72209353
TypeError: fromisoformat: argument must be str
ERROR:stactools.hotosm.oam_metadata_client:Could not parse id=59e62b773d6412ef72209353
TypeError: fromisoformat: argument must be str
Could not parse id=59e62b773d6412ef722093b7
TypeError: fromisoformat: argument must be str
ERROR:stactools.hotosm.oam_metadata_client:Could not parse id=59e62b773d6412ef722093b7
TypeError: fromisoformat: argument must be str
Could not parse id=59e62b743d6412ef722091bd
TypeError: fromisoformat: argument must be str
ERROR:stactools.hotosm.oam_metadata_client:Could not parse id=59e62b743d6412ef722091bd
TypeError: fromisoformat: argument must be str


Item 0/17790
Could not generate STAC Item for 59e62b863d6412ef72209b56
Could not generate STAC Item for 59e62b863d6412ef72209b54
Could not generate STAC Item for 59e62b6f3d6412ef72208f1a
Could not generate STAC Item for 59e62b863d6412ef72209b4f
Could not generate STAC Item for 59e62b6f3d6412ef72208f2d
Could not generate STAC Item for 59e62b703d6412ef72208f44
Could not generate STAC Item for 678bbefa1b953900016c6935
Could not generate STAC Item for 59e62b703d6412ef72208f3f
Could not generate STAC Item for 59e62b703d6412ef72208f39
Could not generate STAC Item for 59e62b703d6412ef72208f33
Could not generate STAC Item for 59e62b703d6412ef72208f35
Could not generate STAC Item for 6791aec314a89b000127627d
Could not generate STAC Item for 67f3b4b51caa479c597a2a5e
Could not generate STAC Item for 67f3c0931caa479c597a3dcc
Could not generate STAC Item for 59e62b703d6412ef72208f4a
Could not generate STAC Item for 678a87a31b953900016c6913
Could not generate STAC Item for 679d23f8d5ec7d92c086a43b
C



Could not generate STAC Item for 5a9ffc185a9ef7cb5da92bef
Could not generate STAC Item for 649afe214395b26cd2af7ab3
Could not generate STAC Item for 67e953bfcaf9fa99f53a502c
Could not generate STAC Item for 5a9ffc185a9ef7cb5da92bea
Could not generate STAC Item for 67bb65366c97d8dc7ad569f5
Could not generate STAC Item for 67d44e8ac8ccd45a00a935db
Could not generate STAC Item for 67d991c27725741fc9199512
Could not generate STAC Item for 67d889a51325d3f9345c9a8c
Could not generate STAC Item for 67efef825d8e66413058391b
Could not generate STAC Item for 67ef3bd5ca6f76f97a4b02e7
Could not generate STAC Item for 67e42a25305c6aa345b84e48
Could not generate STAC Item for 649b3fe49ca545000157f81f
Could not generate STAC Item for 67eab77a3486d9a910f3286a
Could not generate STAC Item for 67ebdf693486d9a910f45327
Could not generate STAC Item for 67c75577e364a9da596a8474
Could not generate STAC Item for 678c64ffb320a40001889674
Could not generate STAC Item for 678fcbfcb1da9700012b248a
Could not gene



In [None]:
print(f"Converted {len(stac_items)} of {total} metadata entries to STAC Items")

## Deduplicate on Item ID

In [None]:
dupe_count: dict[str, int] = defaultdict(lambda: 0)
distinct_items = {}
for item in stac_items:
    if (other_item := distinct_items.get(item.id)) is not None:
        dupe_count[item.id] += 1
        item_created = item.assets["metadata"].extra_fields["updated"]
        other_created = other_item.assets["metadata"].extra_fields["updated"]
        if item_created > other_created:
            distinct_items[item.id] = item
    else:
        distinct_items[item.id] = item

In [None]:
print(f"Deduplicated from {len(stac_items)} to {len(distinct_items)} Items")
print(
    f"There were {len(dupe_count)} duplicates ranging between "
    f"{min(dupe_count.values())} to {max(dupe_count.values())} duplicates "
    f"(mean={sum(dupe_count.values()) / len(dupe_count):0.3f} dupes per ID)"
)

### Write to NDJSON for ingestion into (PgSTAC) STAC Catalog

In [None]:
import datetime as dt
import json

destination = f"openaerialmap-{dt.datetime.now().strftime('%Y%m%dT%H%M%S')}.ndjson"
with open(destination, "w") as dst:
    for item in distinct_items.values():
        item_json = json.dumps(item.to_dict())
        dst.write(f"{item_json}\n")