In [0]:
%pip install scikit-image==0.20.0

In [0]:
import logging
from typing import Tuple, Iterator

import geopandas as gpd
import numpy as np
import pandas as pd
from delta.tables import DeltaTable
from PIL import Image
from pyspark.sql import functions as F
from pyspark.sql import DataFrame, Row
from pyspark.sql.types import StructField, StructType, StringType, DoubleType
from shapely import force_2d
from shapely import wkt
from shapely.geometry import MultiPolygon, Polygon
from shapely.geometry.base import BaseGeometry
from shapely.ops import unary_union
from skimage import measure

# Kept last so that any name it exports keeps shadowing the imports above,
# exactly as before.
from sedona.spark import *

In [0]:
# Unity Catalog namespace that every table in this notebook lives in.
catalog_dev = "`land_topografisk-gdb_dev`"
schema_dev = "ai2025"

# Table names, resolved against the catalog/schema selected below.
bronze_table = "predicted_snuplasser_bronze"
silver_table = "predicted_snuplasser_silver"
endepunkt_silver_table = "endepunkt_silver"

# Volume directory that holds the predicted mask images, one file per source_file.
predicted_masks = "/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/predicted_snuplasser"

# Select the catalog, make sure the schema exists, then make it the default.
spark.sql(f"USE CATALOG {catalog_dev}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_dev}")
spark.sql(f"USE SCHEMA {schema_dev}")

In [0]:
# Create the silver Delta table up front; the statement is a no-op when the
# table already exists. Columns: a row_hash string, the centroid coordinates
# and diameter as doubles, the fototid timestamp, the originating mask file
# name and the ingest timestamp.
q = f"""
CREATE TABLE IF NOT EXISTS {silver_table} (
    row_hash STRING,
    centroid_x DOUBLE,
    centroid_y DOUBLE,
    diameter DOUBLE,
    fototid TIMESTAMP,
    source_file STRING,
    ingest_time TIMESTAMP
) USING DELTA
"""
spark.sql(q)

In [0]:
def mask_to_gdf(
    bbox: tuple[float, float, float, float], source_file: str
) -> tuple[gpd.GeoDataFrame, str]:
    """
    Vectorise the predicted mask image of ``source_file`` into map-space polygons.

    The image is read from ``predicted_masks``, converted to 8-bit greyscale and
    thresholded at 127. The outlines of the resulting binary mask are traced
    with ``skimage.measure.find_contours`` and each outline is turned into a
    polygon whose pixel coordinates are scaled linearly into ``bbox`` (pixel
    row 0 maps to ``y_max``, pixel column 0 maps to ``x_min``).

    Args:
        bbox: Four numbers ``(x_min, y_min, x_max, y_max)`` giving the map
            extent covered by the image. Any 4-item sequence works.
        source_file: File name of the mask inside ``predicted_masks``.

    Returns:
        A tuple ``(gdf, source_file)`` where ``gdf`` is a GeoDataFrame in
        EPSG:25833 holding one valid polygon per usable contour. It is empty
        when the mask contains no usable contour.

    Raises:
        Whatever ``PIL.Image.open`` raises when the file is missing or cannot
        be decoded.
    """
    x_min, y_min, x_max, y_max = bbox

    # Read inside a context manager so the file handle is released even when
    # decoding fails part-way through.
    with Image.open(f"{predicted_masks}/{source_file}") as mask_image:
        width, height = mask_image.size
        mask = np.array(mask_image.convert("L"))

    x_res = (x_max - x_min) / width
    y_res = (y_max - y_min) / height

    mask_bin = (mask > 127).astype(np.uint8)

    polygons = []
    for contour in measure.find_contours(mask_bin, 0.5):
        # Contours that are cut off by the image border come back open and can
        # be as short as two points. A polygon ring needs at least three
        # vertices, and Polygon() raises on fewer, which previously made the
        # caller discard every polygon of the file.
        if len(contour) < 3:
            continue
        # find_contours yields (row, col) pairs; the row axis points down, so
        # the y coordinate is measured from the top edge of the bbox.
        coords = [
            (x_min + col * x_res, y_max - row * y_res) for row, col in contour
        ]
        poly = Polygon(coords)
        # Self-intersecting outlines are dropped rather than repaired.
        if poly.is_valid:
            polygons.append(poly)

    gdf = gpd.GeoDataFrame(geometry=polygons, crs="EPSG:25833")
    return gdf, source_file

In [0]:
def masks_grouped_udf(iterator):
    """
    ``mapInPandas`` worker: vectorise each mask in a batch, one output row per file.

    For every row of an incoming pandas batch the mask is loaded with
    ``mask_to_gdf`` and its polygons are dissolved into a single geometry.
    The geometries are then grouped on ``(source_file, fototid)`` within the
    batch and dissolved once more. Grouping is per batch, so a file whose rows
    land in different batches produces one output row per batch.

    Rows whose mask cannot be read or vectorised are skipped (best effort) and
    reported through the ``masks_grouped_udf`` logger instead of being dropped
    silently.

    Args:
        iterator: Iterator of pandas DataFrames with the columns ``bbox``,
            ``source_file`` and, optionally, ``fototid``.

    Yields:
        One DataFrame per input batch with the columns ``source_file``,
        ``fototid`` (ISO-8601 string, or None when the timestamp is missing),
        ``geometry_wkt``, ``centroid_x`` and ``centroid_y``. The frame is
        empty when no row of the batch produced a geometry.
    """
    logger = logging.getLogger("masks_grouped_udf")
    output_columns = [
        "source_file",
        "fototid",
        "geometry_wkt",
        "centroid_x",
        "centroid_y",
    ]

    for pdf in iterator:
        groups = {}
        for _, row in pdf.iterrows():
            bbox = row["bbox"]
            source_file = row["source_file"]

            # A missing timestamp reaches pandas as NaT (or NaN), not None.
            # NaT.isoformat() returns the literal string "NaT", so missing
            # values have to be detected with pd.isna before formatting.
            raw_fototid = row.get("fototid", None)
            if raw_fototid is None or pd.isna(raw_fototid):
                fototid = None
            elif hasattr(raw_fototid, "isoformat"):
                fototid = raw_fototid.isoformat()
            else:
                # Already a plain value (e.g. a string column); pass it through.
                fototid = str(raw_fototid)

            try:
                gdf, _ = mask_to_gdf(bbox, source_file)
            except Exception as exc:
                logger.warning("Skipping mask %s: %s", source_file, exc)
                continue

            if gdf is None or gdf.empty:
                continue

            merged_geom_for_file = unary_union(list(gdf.geometry))
            if merged_geom_for_file is None or merged_geom_for_file.is_empty:
                continue

            key = (source_file, fototid)
            groups.setdefault(key, []).append(merged_geom_for_file)

        out_rows = []
        for (source_file, fototid), geom_list in groups.items():
            try:
                merged_all = unary_union(geom_list)
                if merged_all is None or merged_all.is_empty:
                    continue
                centroid = merged_all.centroid
                has_centroid = centroid is not None and not centroid.is_empty
                out_rows.append(
                    {
                        "source_file": source_file,
                        "fototid": fototid,
                        "geometry_wkt": merged_all.wkt,
                        # NaN keeps the non-nullable double columns populated
                        # when no centroid can be computed.
                        "centroid_x": float(centroid.x)
                        if has_centroid
                        else float("nan"),
                        "centroid_y": float(centroid.y)
                        if has_centroid
                        else float("nan"),
                    }
                )
            except Exception as exc:
                logger.warning(
                    "Skipping merged geometry for %s: %s", source_file, exc
                )
                continue

        # Passing the column list explicitly gives the same layout for empty
        # and non-empty batches.
        yield pd.DataFrame(out_rows, columns=output_columns)

In [0]:
@F.pandas_udf("double")
def min_width_udf(wkt_series: pd.Series) -> pd.Series:
    """
    Compute the narrowest width of each geometry's convex hull.

    Every WKT string is parsed; for a MultiPolygon only the part with the
    largest area is used. The convex hull of that shape is measured
    perpendicular to each of its edges, and the smallest of those extents is
    the result.

    Args:
        wkt_series: Series of WKT strings.

    Returns:
        Series of floats, one per input geometry. The value is ``inf`` when
        the hull has no edge of non-zero length.
    """

    def hull_min_width(text):
        shape = wkt.loads(text)
        if isinstance(shape, MultiPolygon):
            shape = max(shape.geoms, key=lambda part: part.area)

        # Drop the closing vertex, which repeats the first one.
        ring = list(shape.convex_hull.exterior.coords[:-1])
        vertex_count = len(ring)

        narrowest = float("inf")
        for idx, start in enumerate(ring):
            end = ring[(idx + 1) % vertex_count]
            dx, dy = end[0] - start[0], end[1] - start[1]
            edge_length = (dx**2 + dy**2) ** 0.5
            if edge_length == 0:
                continue
            # Unit normal of the edge; projecting every vertex onto it gives
            # the hull's extent perpendicular to this edge.
            normal_x, normal_y = -dy / edge_length, dx / edge_length
            offsets = [pt[0] * normal_x + pt[1] * normal_y for pt in ring]
            extent = max(offsets) - min(offsets)
            narrowest = min(narrowest, extent)
        return narrowest

    return pd.Series([hull_min_width(text) for text in wkt_series])

In [0]:
def write_delta_table(sdf: DataFrame):
    """
    Upsert ``sdf`` into the silver Delta table, keyed on ``row_hash``.

    If the table does not exist it is created from ``sdf``. Otherwise rows
    whose ``row_hash`` is already present are updated from the source and all
    other rows are inserted.

    The matched clause used to carry the extra condition
    ``target.row_hash != source.row_hash``. That can never hold for rows
    matched on ``target.row_hash = source.row_hash``, so the update never ran.
    The condition is removed so the function does what it documents.

    NOTE(review): if the intent was instead "one row per source_file, updated
    when its hash changes", the merge key should be ``source_file`` and the
    hash comparison should move back into the matched clause. Confirm with the
    table owner.

    Args:
        sdf: Spark DataFrame to write; must contain a ``row_hash`` column.
    """
    # Several source rows with the same key would make the update ambiguous,
    # which Delta rejects. Keeping one row per row_hash avoids that and also
    # prevents inserting the same key twice.
    deduped = sdf.dropDuplicates(["row_hash"])

    if not spark.catalog.tableExists(silver_table):
        deduped.write.format("delta").mode("overwrite").saveAsTable(silver_table)
        return

    assignments = {col: f"source.{col}" for col in deduped.columns}
    (
        DeltaTable.forName(spark, silver_table)
        .alias("target")
        .merge(
            source=deduped.alias("source"),
            condition="target.row_hash = source.row_hash",
        )
        .whenMatchedUpdate(set=assignments)
        .whenNotMatchedInsert(values=assignments)
        .execute()
    )

In [0]:
def main():
    """
    Build the silver table of turning spaces from the bronze predictions.

    Steps:
      1. Read the bronze rows whose ``turning_space`` flag is true.
      2. Derive ``nodeid`` from ``source_file`` and left-join ``fototid`` from
         the endepunkt silver table.
      3. Vectorise the masks with ``masks_grouped_udf``.
      4. Compute the diameter with ``min_width_udf`` and drop the WKT column.
      5. Add ``ingest_time`` and a SHA-256 ``row_hash`` over the remaining
         columns, then hand the result to ``write_delta_table``.
    """
    df_turning_spaces = spark.read.table(bronze_table).filter(F.col("turning_space"))

    # substring(..., 12, length - 15) drops the first 11 and the last 4
    # characters of source_file. Presumably a fixed prefix and the file
    # extension; TODO confirm against the naming scheme.
    df_fototid = df_turning_spaces.withColumn(
        "nodeid", F.expr("substring(source_file, 12, length(source_file)-15)")
    ).join(
        spark.read.table(endepunkt_silver_table).select("nodeid", "fototid"),
        on="nodeid",
        how="left",
    )

    schema = StructType(
        [
            StructField("source_file", StringType(), False),
            StructField("fototid", StringType(), True),
            StructField("geometry_wkt", StringType(), False),
            StructField("centroid_x", DoubleType(), False),
            StructField("centroid_y", DoubleType(), False),
        ]
    )

    # Only these three columns are read by the UDF; projecting first avoids
    # serialising every other bronze column to the Python workers.
    df_grouped = df_fototid.select("bbox", "source_file", "fototid").mapInPandas(
        masks_grouped_udf, schema=schema
    )

    # Smallest width across the polygon of each turning space.
    sdf_diameter = df_grouped.withColumn(
        "diameter", min_width_udf(F.col("geometry_wkt"))
    )
    sdf_clean = sdf_diameter.drop("geometry_wkt")

    # The hash covers the columns of sdf_clean only, so ingest_time does not
    # influence it.
    sdf = sdf_clean.withColumn("ingest_time", F.current_timestamp()).withColumn(
        "row_hash", F.sha2(F.concat_ws("||", *sdf_clean.columns), 256)
    )

    write_delta_table(sdf)

In [0]:
# Run the bronze -> silver pipeline defined above.
main()