In [None]:
from srai.loaders.osm_loaders.pbf_file_handler import PbfFileHandler
import geopandas as gpd
import pyogrio
from srai.loaders import download_file
from pathlib import Path
from srai.loaders.osm_loaders.filters import GEOFABRIK_LAYERS, merge_osm_tags_filter
from geoarrow.pyarrow import io
import pyarrow as pa
import json

In [None]:
# Show the GDAL version exposed by pyogrio in this environment.
pyogrio.__gdal_version__

In [None]:
# List the drivers pyogrio reports when queried with read=True.
pyogrio.list_drivers(read=True)

In [None]:
# Fetch the Geofabrik Estonia extract; the second argument is presumably the target file name.
# NOTE(review): the next cell points `pbf_file` at a local Monaco file, so this download is
# unused unless the commented-out `pbf_file` assignment there is re-enabled.
download_file(
    "https://download.geofabrik.de/europe/estonia-latest.osm.pbf", "estonia-latest.osm.pbf"
)

In [None]:
# Input PBF file read by the cells below.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run on
# another machine without editing it. The commented line switches to the Estonia extract.
# pbf_file = Path("estonia-latest.osm.pbf")
pbf_file = Path(
    "/mnt/c/Development/Python/srai-1/tests/loaders/osm_loaders/test_files/monaco.osm.pbf"
)

In [None]:
# Enumerate the layers pyogrio finds in the PBF file.
pyogrio.list_layers(pbf_file)

In [None]:
# Keyword arguments forwarded to every exploratory `pyogrio.read_dataframe` call in this cell.
# NOTE(review): CONFIG_FILE is an absolute, machine-specific path.
gdal_options = {
    "INTERLEAVED_READING": True,
    "CONFIG_FILE": "/mnt/c/Development/Python/srai-1/tests/loaders/osm_loaders/test_files/osmconf.ini",
    "use_arrow": True,
}

# One read per layer was tried; the trailing numbers are the author's recorded row counts.
# monaco
# pyogrio.read_dataframe(
#     pbf_file, layer="points", columns=["osm_id", "all_tags", "geometry"], **gdal_options
# )  # 3057
# pyogrio.read_dataframe(pbf_file, layer="lines", **gdal_options)  # 3066
pyogrio.read_dataframe(
    pbf_file, layer="multilinestrings", columns=["osm_id", "osm_way_id", "all_tags", "geometry"], **gdal_options
)  # 60
# pyogrio.read_dataframe(
#     pbf_file,
#     layer="multipolygons",
#     columns=["osm_id", "osm_way_id", "all_tags", "geometry"],
#     **gdal_options,
# )  # 1733
# pyogrio.read_dataframe(pbf_file, layer="other_relations", **gdal_options)  # 101

In [None]:
import pghstore


def read_features_with_pyogrio(pbf_file, config_file=None) -> gpd.GeoDataFrame:
    """Read all features of an OSM PBF file through pyogrio into one GeoDataFrame.

    Every layer reported by ``pyogrio.list_layers`` is read, each feature gets a
    ``feature_id`` of the form ``node/<id>``, ``way/<id>`` or ``relation/<id>``,
    features without tags are dropped, and relations are kept only when their
    ``type`` tag is ``boundary`` or ``multipolygon``.

    Args:
        pbf_file: Path to the ``.osm.pbf`` file to read.
        config_file: Value passed as the ``CONFIG_FILE`` option to pyogrio.
            Defaults to the osmconf.ini path this notebook was written against
            (machine-specific). The config is assumed to expose an ``all_tags``
            column in hstore format -- TODO confirm against the ini file.

    Returns:
        GeoDataFrame indexed by ``feature_id`` with ``tags`` (dict) and ``geometry`` columns.

    Raises:
        ValueError: If the file exposes a layer this function does not know how to
            map to an OSM element kind.
    """
    if config_file is None:
        config_file = (
            "/mnt/c/Development/Python/srai-1/tests/loaders/osm_loaders/test_files/osmconf.ini"
        )
    gdal_options = dict(
        INTERLEAVED_READING=True,
        CONFIG_FILE=config_file,
        use_arrow=True,
    )
    # Layers in which every feature is the same kind of OSM element.
    layer_prefixes = {
        "points": "node/",
        "lines": "way/",
        "multilinestrings": "relation/",
        "other_relations": "relation/",
    }

    gdfs = []
    for layer_info in pyogrio.list_layers(pbf_file):
        layer_name = layer_info[0]

        gdf = pyogrio.read_dataframe(
            pbf_file,
            layer=layer_name,
            columns=["osm_id", "osm_way_id", "all_tags", "geometry"],
            **gdal_options,
        )

        if layer_name in layer_prefixes:
            gdf["feature_id"] = layer_prefixes[layer_name] + gdf["osm_id"]
        elif layer_name == "multipolygons":
            # A multipolygon row comes from a relation when `osm_id` is set and from a
            # closed way otherwise. Vectorised so that both None and NaN count as
            # missing and empty layers are handled without a row-wise apply.
            relation_ids = "relation/" + gdf["osm_id"]
            way_ids = "way/" + gdf["osm_way_id"]
            gdf["feature_id"] = relation_ids.where(gdf["osm_id"].notna(), way_ids)
        else:
            # Without a feature_id the relation filter below would fail obscurely.
            raise ValueError(f"Unexpected layer in OSM file: {layer_name}")

        gdfs.append(gdf)

    final_gdf = gpd.pd.concat(gdfs)
    # Copy after filtering so the column assignment below does not write to a slice.
    final_gdf = final_gdf[final_gdf["all_tags"].notnull()].copy()
    final_gdf["tags"] = final_gdf["all_tags"].apply(pghstore.loads)

    # Keep every node and way; keep relations only when they match the filter
    # the other reader applies:
    # WHERE kind = 'relation' AND len(refs) > 0
    # AND list_contains(map_keys(tags), 'type')
    # AND list_has_any(map_extract(tags, 'type'), ['boundary', 'multipolygon'])
    relations = final_gdf["feature_id"].str.startswith("relation/")
    matching_relations = relations & final_gdf["tags"].apply(
        lambda x: x.get("type") in ("boundary", "multipolygon")
    )
    final_gdf = final_gdf[~relations | matching_relations]
    return final_gdf[["feature_id", "tags", "geometry"]].set_index("feature_id")

In [None]:
# Load every feature through the pyogrio-based reader defined above.
# The trailing numbers are the author's recorded row counts (presumably before -> after
# the relation filter) -- TODO confirm.
gdal_gdf = read_features_with_pyogrio(pbf_file)  # 8017 -> 7845
gdal_gdf

In [None]:
# Load the same file through srai's PbfFileHandler with no tags filter, keeping tags
# unexploded and bypassing the cache, so the result can be compared with `gdal_gdf`.
# The commented lines are alternative filter configurations and an alternative local path.
# handler = PbfFileHandler(tags_filter=merge_osm_tags_filter(GEOFABRIK_LAYERS))
# handler = PbfFileHandler(tags_filter=GEOFABRIK_LAYERS)
handler = PbfFileHandler()
df = handler.get_features_gdf([pbf_file], explode_tags=False, ignore_cache=True)
# df = handler.get_features_gdf(['/mnt/Storage/Programming/srai/library/srai/tests/loaders/osm_loaders/test_files/monaco.osm.pbf'], explode_tags=False, ignore_cache=True)
# The trailing numbers are the author's recorded row counts -- TODO confirm their meaning.
df  # 7904 -> 7847

In [None]:
# Feature-id indexes of the pyogrio result and of the PbfFileHandler result
# (the latter is named `duckdb_*` throughout this notebook).
gdal_index = gdal_gdf.index
duckdb_index = df.index

# Ids present in one result but absent from the other, in both directions.
missing_in_gdal = duckdb_index.difference(gdal_index)
missing_in_duckdb = gdal_index.difference(duckdb_index)

In [None]:
# Ids found by PbfFileHandler but not by the pyogrio reader.
missing_in_gdal

In [None]:
# Number of ids found by PbfFileHandler but not by the pyogrio reader.
len(missing_in_gdal)

In [None]:
# Inspect the PbfFileHandler rows that the pyogrio reader did not return.
df.loc[missing_in_gdal]

In [None]:
# Ids found by the pyogrio reader but not by PbfFileHandler.
missing_in_duckdb

In [None]:
from tqdm import tqdm
from shapely import hausdorff_distance


# Hausdorff distance of every compared feature pair; filled by the comparison loop below.
distances = []


def iou_metric(geom_a, geom_b) -> float:
    """Return the intersection-over-union of two polygonal geometries.

    Args:
        geom_a: First geometry; must expose ``geom_type``, ``area`` and ``intersection``.
        geom_b: Second geometry, same requirements.

    Returns:
        ``intersection area / union area``. Returns ``0.0`` when either geometry is
        not a ``Polygon``/``MultiPolygon``, or when the union has zero area (which
        would otherwise raise ``ZeroDivisionError``).
    """
    polygonal_types = ("Polygon", "MultiPolygon")
    if geom_a.geom_type not in polygonal_types or geom_b.geom_type not in polygonal_types:
        return 0.0
    intersection = geom_a.intersection(geom_b).area
    union = geom_a.area + geom_b.area - intersection
    if union == 0:
        # Both geometries are empty or degenerate; there is no area to compare.
        return 0.0
    return intersection / union


# Numeric tolerance shared by the `equals_exact` check and the IoU threshold.
tolerance = 0.5 * 10 ** (-6)

# Compare every feature returned by the pyogrio reader with the row of the same id
# from PbfFileHandler: tags must match exactly, geometries must match by one of several
# criteria. NOTE(review): `df.loc[...]` raises KeyError for ids listed in
# `missing_in_duckdb`; the loop assumes that set is empty.
for gdal_row_index in tqdm(gdal_index):
    duckdb_row = df.loc[gdal_row_index]
    gdal_row = gdal_gdf.loc[gdal_row_index]
    duckdb_tags = duckdb_row.tags
    gdal_tags = gdal_row.tags

    # Author's note: gdal doesn't register the `area` tag (popping it from the
    # PbfFileHandler tags before comparing was tried as a workaround).
    assert (
        duckdb_tags == gdal_tags
    ), f"Tags aren't equal. ({gdal_row_index}, {duckdb_tags}, {gdal_tags})"

    duckdb_geometry = duckdb_row.geometry
    gdal_geometry = gdal_row.geometry

    geometry_equal = duckdb_geometry.equals(gdal_geometry)
    geometry_almost_equal = duckdb_geometry.equals_exact(gdal_geometry, tolerance)
    iou_value = iou_metric(duckdb_geometry, gdal_geometry)
    geometry_iou_near_one = iou_value >= (1 - tolerance)
    hausdorff_distance_value = hausdorff_distance(duckdb_geometry, gdal_geometry, densify=0.5)
    geometry_close_hausdorff_distance = hausdorff_distance_value < 1e-10
    geometry_both_closed_or_not = duckdb_geometry.is_closed == gdal_geometry.is_closed
    # Accepted mismatch: PbfFileHandler produced a polygon where GDAL produced a line.
    is_different_geometry_type = duckdb_geometry.geom_type in (
        "Polygon",
        "MultiPolygon",
    ) and gdal_geometry.geom_type in ("LineString", "MultiLineString")
    assert (
        geometry_both_closed_or_not
        and (geometry_equal or geometry_almost_equal or geometry_iou_near_one)
    ) or (geometry_close_hausdorff_distance and is_different_geometry_type), (
        f"{gdal_row_index} geometries aren't equal. (equal: {geometry_equal}, almost_equal:"
        f" {geometry_almost_equal}, iou near one: {geometry_iou_near_one}, [{iou_value}], small"
        f" hausdorff distance: {geometry_close_hausdorff_distance} [{hausdorff_distance_value}], gdal"
        f" geom type: {gdal_geometry.geom_type}, duckdb geom type"
        f" {duckdb_geometry.geom_type}, both closed or not: {geometry_both_closed_or_not},"
        f" {duckdb_geometry.is_closed}, {gdal_geometry.is_closed})"
    )
    distances.append(hausdorff_distance_value)

In [None]:
# Histogram of the Hausdorff distances collected by the comparison loop.
gpd.pd.Series(distances).hist()

In [None]:
# Spot check: are the two readers' geometries for way/94452782 equal according to `equals`?
df.loc["way/94452782"].geometry.equals(gdal_gdf.loc["way/94452782"].geometry)

In [None]:
# Spot check: IoU between the two readers' geometries for way/94452782.
iou_metric(df.loc["way/94452782"].geometry, gdal_gdf.loc["way/94452782"].geometry)

In [None]:
# Interactive map of way/1089844285 from the PbfFileHandler result.
df.loc[["way/1089844285"]].explore()

In [None]:
import osmnx as ox

# Look up OSM id "W94452782" through osmnx (by_osmid=True), presumably as an independent
# reference geometry for way/94452782 -- TODO confirm intent.
ox.geocode_to_gdf(query=["W94452782"], by_osmid=True)

In [None]:
# `feature_id` was only assigned in a later cell, so this cell failed on a fresh kernel.
# Define it here with the same id the later cell uses.
feature_id = "way/1089844285"
gdal_gdf.loc[feature_id].geometry.geom_type

In [None]:
# Feature inspected by this cell and by the difference plots further down.
feature_id = "way/1089844285"

# Overlay on one axis: pyogrio geometry in red, PbfFileHandler geometry in blue,
# and their intersection in green.
ax = gdal_gdf.loc[[feature_id]].geometry.plot(color="red")
df.loc[[feature_id]].geometry.plot(ax=ax, color="blue")
gdal_gdf.loc[[feature_id]].geometry.intersection(df.loc[[feature_id]].geometry).plot(
    ax=ax, color="green"
)

In [None]:
# Feature id noted for later inspection. It was written as the bare expression
# `way / 572934026`, which raises NameError; kept as a string so the cell runs.
"way/572934026"

In [None]:
# Plot the area covered only by the pyogrio geometry, then the area covered only by the
# PbfFileHandler geometry. The second line previously subtracted `df` from itself,
# which is always empty.
gdal_gdf.loc[[feature_id]].geometry.difference(df.loc[[feature_id]].geometry).plot()
df.loc[[feature_id]].geometry.difference(gdal_gdf.loc[[feature_id]].geometry).plot()

In [None]:
# Spot check: `equals_exact` for way/4097656 with the same 0.5e-6 tolerance the comparison loop uses.
gdal_gdf.loc["way/4097656"].geometry.equals_exact(df.loc["way/4097656"].geometry, 0.5 * 10 ** (-6))

In [None]:
# Draw way/4097656 from both results on the same interactive map.
m = df.loc[["way/4097656"]].explore()
gdal_gdf.loc[["way/4097656"]].explore(m=m)

In [None]:
# Read a GeoParquet file into a pyarrow table via geoarrow.
# NOTE(review): absolute, machine-specific path; the file name suggests a cached
# PbfFileHandler result for Monaco -- TODO confirm.
tbl = io.read_geoparquet_table(
    "/mnt/Storage/Programming/srai/library/srai/tests/loaders/osm_loaders/files/monaco_7518e8e7c0d46d33b0f8693b4ba560cdea65288b210d9c34e0e66114212dcf10_noclip.geoparquet"
)
tbl

# %%
# Ask pyarrow which pandas dtype a map<string, string> type converts to.
# NOTE(review): the result is not assigned or displayed unless this line ends a cell.
pa.map_(pa.string(), pa.string()).to_pandas_dtype()


# Convert the table without its geometry column to pandas, passing maps_as_pydicts="strict".
tbl.drop("geometry").to_pandas(maps_as_pydicts="strict")
# tbl.column('tags').to_pandas(maps_as_pydicts="strict")
# pa.concat_tables([tbl]).to_pandas().