In [None]:
import itertools
import time
from pathlib import Path

import duckdb
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import pandas as pd
import psutil
import pyarrow.compute as pc
import pyarrow.dataset as ds
import pyarrow.fs as fs
import seaborn.objects as so
from cpuinfo import get_cpu_info
from drawarrow import fig_arrow
from overturemaps.core import geodataframe as om_gdf
from psutil._common import bytes2human
from pypalettes import load_cmap
from seaborn import axes_style

import overturemaestro as om
from overturemaestro.release_index import download_existing_release_index

In [None]:
def overturemaps_example(theme, type, bbox) -> None:
    """Benchmark case: load features with the ``overturemaps`` GeoDataFrame reader.

    Args:
        theme: Not used by this case; accepted so that every benchmark
            callable can be invoked as ``function(theme, type, bbox)``.
        type: Overture feature type, passed as the first positional argument.
        bbox: Bounding box, forwarded as the ``bbox`` keyword argument.

    The value returned by the reader is discarded.
    """
    om_gdf(type, bbox=bbox)


def overturemaestro_example(theme, type, bbox) -> None:
    """Benchmark case: convert a bounding box to parquet with OvertureMaestro.

    Args:
        theme: Overture theme name.
        type: Overture feature type within the theme.
        bbox: Bounding box forwarded unchanged to OvertureMaestro.

    The release is pinned to ``2024-09-18.0`` and ``ignore_cache=True`` is
    passed on every call. The value returned by OvertureMaestro is discarded.
    """
    request = dict(
        release="2024-09-18.0",
        theme=theme,
        type=type,
        bbox=bbox,
        ignore_cache=True,
    )
    om.convert_bounding_box_to_parquet(**request)


def _from_duckdb(path_or_paths, column, bbox):
    """Run a bbox-filtered DuckDB ``read_parquet`` query and return one column.

    Args:
        path_or_paths: SQL fragment placed verbatim inside ``read_parquet(...)``;
            callers pass an already-quoted path or a DuckDB list literal.
        column: Column selected first in the query; its values are returned.
        bbox: Indexable as ``(xmin, ymin, xmax, ymax)``.

    Returns:
        List with the first value of every fetched row, i.e. ``column``.

    The query selects ``column`` followed by every other column, so all
    columns are part of the result set even though only the first is kept.
    """
    west, south, east, north = bbox[0], bbox[1], bbox[2], bbox[3]
    query = f"""
    SET s3_region='us-west-2';
    SELECT {column}, * EXCLUDE ({column})
    FROM read_parquet({path_or_paths})
    WHERE
        bbox.xmin < {east}
        AND bbox.xmax > {west}
        AND bbox.ymin < {north}
        AND bbox.ymax > {south};
    """
    relation = duckdb.sql(query)
    return [record[0] for record in relation.fetchall()]


def duckdb_with_index_example(theme, type, bbox):
    """Benchmark case: DuckDB restricted to the files listed in the local bbox index.

    First queries ``_index_{theme}_{type}.parquet`` for the ``filename`` values
    of entries intersecting ``bbox``, then queries those files (prefixed with
    ``s3://``) for ``id``.

    Returns:
        Number of ids returned by the second query.
    """
    index_source = f"'_index_{theme}_{type}.parquet'"
    filenames = _from_duckdb(index_source, "filename", bbox)

    # Render the file names as a DuckDB list literal: ['s3://a','s3://b',...]
    quoted_uris = [f"'s3://{name}'" for name in filenames]
    duckdb_list = "[" + ",".join(quoted_uris) + "]"

    return len(_from_duckdb(duckdb_list, "id", bbox))


def duckdb_example(theme, type, bbox):
    """Benchmark case: DuckDB over every parquet file of a theme/type.

    Queries the ``*.parquet`` glob of the ``2024-09-18.0`` release for the
    given theme and type, filtered by ``bbox``.

    Returns:
        Number of ids returned by the query.
    """
    release_glob = (
        f"'s3://overturemaps-us-west-2/release/2024-09-18.0/theme={theme}/type={type}/*.parquet'"
    )
    return len(_from_duckdb(release_glob, "id", bbox))


def _filter(bbox):
    """Build the PyArrow expression matching rows whose bbox intersects ``bbox``.

    Args:
        bbox: Unpacked as ``(xmin, ymin, xmax, ymax)``.

    Returns:
        Expression over the nested ``bbox`` fields. All comparisons are strict,
        so a row whose box only touches the query box on an edge does not match.
    """
    west, south, east, north = bbox
    starts_before_east = pc.field("bbox", "xmin") < east
    ends_after_west = pc.field("bbox", "xmax") > west
    starts_before_north = pc.field("bbox", "ymin") < north
    ends_after_south = pc.field("bbox", "ymax") > south
    return starts_before_east & ends_after_west & starts_before_north & ends_after_south


def _from_pyarrow(path_or_paths, column, bbox, filesystem=None):
    """Return the values of ``column`` for rows intersecting ``bbox``.

    Args:
        path_or_paths: Path or list of paths handed to ``ds.dataset``.
        column: Name of the column extracted from every record batch.
        bbox: Bounding box used to build the scan filter.
        filesystem: Optional filesystem forwarded to ``ds.dataset``.

    Returns:
        Flat Python list of the column values, in batch order.

    No column projection is passed to ``to_batches``; ``column`` is picked out
    of each batch after the filtered scan.
    """
    dataset = ds.dataset(path_or_paths, filesystem=filesystem)
    values = []
    for batch in dataset.to_batches(filter=_filter(bbox)):
        values.extend(batch[column].to_pylist())
    return values


def pyarrow_index(theme, type, bbox):
    """Benchmark case: PyArrow restricted to the files listed in the local bbox index.

    Reads the ``filename`` values intersecting ``bbox`` from
    ``_index_{theme}_{type}.parquet`` (the index is currently behind a
    non-public location), then reads ``id`` from those files through an
    anonymous S3 filesystem in ``us-west-2``.

    Returns:
        Number of ids read from the listed files.
    """
    index_path = f"_index_{theme}_{type}.parquet"
    filenames = _from_pyarrow(index_path, "filename", bbox)

    s3 = fs.S3FileSystem(anonymous=True, region="us-west-2")
    return len(_from_pyarrow(filenames, "id", bbox, filesystem=s3))


# Benchmark label -> callable invoked as ``function(theme, type, bbox)``.
# The label is stored in the "benchmark" column of the results and is the
# color grouping of the plots, so the insertion order here matters for display.
BENCHMARK_EXAMPLES = {
    "OvertureMaps (PyArrow)": overturemaps_example,
    "DuckDB": duckdb_example,
    "DuckDB with file index": duckdb_with_index_example,
    "PyArrow with file index": pyarrow_index,
    "OvertureMaestro": overturemaestro_example,
}

In [None]:
# Bounding boxes per city, as (xmin, ymin, xmax, ymax) — the order unpacked by `_filter`.
# NOTE(review): values look like longitude/latitude in degrees — confirm.
small_bboxes = {
    "London": (-0.120, 51.498, -0.090, 51.508),
    "Boston": (-71.068, 42.353, -71.058, 42.363),
    "Tokyo": (139.708, 35.643, 139.720, 35.650),
}

# Larger extents for the same three cities.
big_bboxes = {
    "London": (-0.521, 51.388, 0.200, 51.640),
    "Boston": (-71.104, 42.325, -71.002, 42.383),
    "Tokyo": (139.326, 35.552, 139.969, 35.806),
}

# Size label -> {city: bbox}. The label names the results file
# ("{label}_bboxes.parquet") and the saved figure.
bboxes = {
    "small": small_bboxes,
    "big": big_bboxes,
}

In [None]:
# (theme, type) pairs to benchmark; each pair is passed to every benchmark
# callable and stored in the results as "{theme}_{type}".
theme_types = [
    ("buildings", "building"),
    ("transportation", "segment"),
    ("places", "place"),
    ("base", "water"),
]

### Download OvertureMaestro indexes

In [None]:
# Fetch the existing OvertureMaestro release index for the release used by the benchmark.
# NOTE(review): presumably done up front so the index download is not counted
# in the timed runs — confirm.
download_existing_release_index("2024-09-18.0")

### Generate basic index

In [None]:
# NOTE(review): `generate_bbox_index` is a local helper module not shown here;
# presumably it writes the `_index_{theme}_{type}.parquet` files read by the
# index-based examples — confirm against generate_bbox_index.py.
from generate_bbox_index import main as generate_bbox_index_fn

# Build one index per benchmarked (theme, type) pair.
for ov_theme, ov_type in theme_types:
    generate_bbox_index_fn(ov_theme, ov_type)

### Calculate benchmark

In [None]:
# Run every benchmark example for every (theme, type) and bounding box, once
# per bbox size. Results are cached in "{bbox_type}_bboxes.parquet": if that
# file already exists, the whole size is skipped and nothing is re-measured.
for bbox_type, _bboxes in bboxes.items():
    if Path(f"{bbox_type}_bboxes.parquet").exists():
        print(bbox_type, "exists")
        continue

    results = []
    for ov_theme, ov_type in theme_types:
        for bbox_name, bbox in _bboxes.items():
            for benchmark_example_name, function in BENCHMARK_EXAMPLES.items():
                print(ov_theme, ov_type, bbox_name, benchmark_example_name)

                # net_io_counters() reports system-wide totals, so any other
                # network traffic on the machine during the run is included.
                net_io_start = psutil.net_io_counters()
                # perf_counter() is monotonic and high resolution; time.time()
                # is a wall clock that can jump when the system clock is
                # adjusted, which would corrupt the measured duration.
                start_time = time.perf_counter()

                function(ov_theme, ov_type, bbox)

                elapsed_time = time.perf_counter() - start_time
                net_io_end = psutil.net_io_counters()

                bytes_sent = net_io_end.bytes_sent - net_io_start.bytes_sent
                bytes_recv = net_io_end.bytes_recv - net_io_start.bytes_recv

                print(f"Elapsed Time: {elapsed_time:.4f} seconds")
                print(
                    f"Bytes Sent: {bytes2human(bytes_sent)}, Bytes Received: {bytes2human(bytes_recv)}"
                )
                # Only the elapsed time and received bytes are persisted;
                # sent bytes are printed for information only.
                results.append(
                    {
                        "theme_type": f"{ov_theme}_{ov_type}",
                        "bbox": bbox_name,
                        "benchmark": benchmark_example_name,
                        "time": elapsed_time,
                        "bytes_recv": bytes_recv,
                    }
                )

    pd.DataFrame(results).to_parquet(f"{bbox_type}_bboxes.parquet")
    # Read the file back so `results_df` holds the persisted results.
    results_df = pd.read_parquet(f"{bbox_type}_bboxes.parquet")

### Plot results

In [None]:
def _sizeof_fmt(x, pos):
    """Format an axis tick value as a human-readable byte size.

    Args:
        x: Tick value in bytes.
        pos: Tick position passed by ``FuncFormatter``; not used.

    Returns:
        Empty string for negative values, otherwise ``bytes2human(x)``.
    """
    return "" if x < 0 else bytes2human(x)

# Hardware summary, shown as the second line of every figure title.
cpu_info = get_cpu_info()
cpu_name, cpu_cores, cpu_freq = (
    cpu_info[key] for key in ("brand_raw", "count", "hz_advertised_friendly")
)
total_ram = bytes2human(psutil.virtual_memory().total)
title_second_row = f"Run on {cpu_name} ({cpu_cores} cores) with {total_ram} total memory"

cmap = load_cmap("thunder_city2")  # alternative palette: load_cmap("OKeeffe")

# One figure per bbox size: elapsed time on the top row, received bytes on the
# bottom row, one facet column per theme/type.
for bbox_type in bboxes:
    results_df = pd.read_parquet(f"{bbox_type}_bboxes.parquet")

    f = plt.figure(figsize=(15, 10))
    sf1, sf2 = f.subfigures(2, 1)

    theme_dict = {**axes_style("whitegrid"), "grid.linestyle": ":"}

    # The same dodged bar chart is drawn for both metrics, each on its own subfigure.
    # (A named palette such as "colorblind" can be used instead of cmap.colors.)
    for metric, subfigure in (("time", sf1), ("bytes_recv", sf2)):
        (
            so.Plot(results_df, x="bbox", y=metric, color="benchmark")
            .theme(theme_dict)
            .facet(col="theme_type")
            .add(so.Bar(), so.Agg(), so.Dodge())
            .scale(color=cmap.colors)
            .on(subfigure)
            .label(color=str.capitalize)
            .plot()
        )

    # Drop the two legends left on the figure and add a single shared one:
    # handles come from the second legend, label texts from the first.
    l2 = f.legends.pop(1)
    l1 = f.legends.pop(0)
    legend_labels = [text.get_text() for text in l1.texts]
    f.legend(
        l2.legend_handles,
        legend_labels,
        loc="center right",
        bbox_to_anchor=(1.06, 0.5),
    )

    # Only the first axes of each row gets a y label; the byte axis also gets
    # human-readable tick labels.
    sf2.axes[0].yaxis.set_major_formatter(tkr.FuncFormatter(_sizeof_fmt))
    sf1.axes[0].set_ylabel("Time (s)\n(lower is better)")
    sf2.axes[0].set_ylabel("Data downloaded\n(lower is better)")

    for sf in (sf1, sf2):
        for _ax in sf.axes:
            _ax.set_xlabel("Bounding Box")

    title_first_row = f"Benchmark - {bbox_type.capitalize()} bounding boxes (2024-09-18.0 release)"
    plt.suptitle(f"{title_first_row}\n{title_second_row}", y=1.01)

    # Optional annotation arrow, currently disabled:
    # fig_arrow(
    #     head_position=(0.16, 0.65),
    #     tail_position=(0.12, 0.71),
    #     width=2,
    #     radius=0.3,
    #     color="darkred",
    #     fill_head=False,
    #     mutation_scale=2,
    # )

    plt.savefig(f"benchmark_bbox_{bbox_type.lower()}.png", bbox_inches="tight")
    plt.show()