In [0]:
%pip install scikit-image==0.20.0

In [0]:
import builtins

import pandas as pd
# NOTE: the wildcard import shadows Python builtins such as max/min with Spark
# column functions; use builtins.max / builtins.min for plain Python values.
from pyspark.sql.functions import *
from pyspark.sql.types import (
    DateType,
    DoubleType,
    StringType,
    StructField,
    StructType,
)
from shapely import wkt
from shapely.geometry import MultiPolygon

from src.data.data_utils import write_delta_table
from src.data.predict_utils import make_masks_grouped_udf

In [0]:
# Path handed to make_masks_grouped_udf as predicted_masks_path.
predicted_masks = "/Volumes/land_auto-gen-kart_dev/external_dev/static_data/DL_bildesegmentering/predicted_snuplasser"

# Unqualified table names; they resolve against the catalog and schema
# selected by the USE statements below.
bronze_table = "predicted_snuplasser_bronze"
silver_table = "predicted_snuplasser_silver"
endepunkt_silver_table = "endepunkt_silver"

# The catalog name contains hyphens, so it is kept backtick-quoted for SQL.
catalog_dev = "`land_auto-gen-kart_dev`"
schema_dev = "dl_bildesegmentering"

# Select the catalog, make sure the schema exists, then make it the default.
for statement in (
    f"USE CATALOG {catalog_dev}",
    f"CREATE SCHEMA IF NOT EXISTS {schema_dev}",
    f"USE SCHEMA {schema_dev}",
):
    spark.sql(statement)

In [0]:
# Create the silver Delta table if it does not exist yet; when the table is
# already present the statement leaves it untouched, so the cell can be re-run.
# The table name is unqualified and resolves against the catalog/schema
# selected earlier in the notebook.
# NOTE(review): photo_time is TIMESTAMP here, while the mapInPandas output
# schema further down declares photo_time as DateType — confirm which type is
# intended and that the write casts as expected.
q = f"""
CREATE TABLE IF NOT EXISTS {silver_table} (
    row_hash STRING,
    centroid_x DOUBLE,
    centroid_y DOUBLE,
    diameter DOUBLE,
    photo_time TIMESTAMP,
    source_file STRING,
    ingest_time TIMESTAMP
) USING DELTA
"""
spark.sql(q)

In [0]:
def _min_caliper_width(geom):
    """
    Return the minimum width of a geometry's convex hull.

    The minimum width of a convex polygon is attained perpendicular to one of
    its edges, so every hull edge is tried: all hull vertices are projected
    onto the edge's unit normal and the spread of the projections is the width
    in that direction. The smallest spread over all edges is returned.

    Args:
        geom: A shapely geometry. For a MultiPolygon only the part with the
            largest area is measured.

    Returns:
        The minimum width as a float (in the units of the coordinates),
        0.0 when the hull degenerates to a point or a line, or None when the
        geometry is empty.
    """
    if geom.is_empty:
        return None
    if isinstance(geom, MultiPolygon):
        # builtins.max: the bare name `max` is shadowed by the Spark column
        # function pulled in through the wildcard import.
        geom = builtins.max(geom.geoms, key=lambda g: g.area)

    hull = geom.convex_hull
    if hull.geom_type != "Polygon":
        # Collinear or single-point input: the hull is a LineString or Point,
        # which has no exterior ring and no extent across.
        return 0.0

    # The ring repeats its first vertex at the end; drop the duplicate.
    coords = list(hull.exterior.coords[:-1])
    widths = []
    for i, a in enumerate(coords):
        b = coords[(i + 1) % len(coords)]
        dx, dy = b[0] - a[0], b[1] - a[1]
        length = (dx**2 + dy**2) ** 0.5
        if length == 0:
            # Repeated vertex: no direction to measure against.
            continue
        # Unit normal of the edge a -> b.
        perp_dx, perp_dy = -dy / length, dx / length
        projections = [p[0] * perp_dx + p[1] * perp_dy for p in coords]
        widths.append(builtins.max(projections) - builtins.min(projections))

    return builtins.min(widths, default=0.0)


@pandas_udf(DoubleType())
def min_width_udf(wkt_series: pd.Series) -> pd.Series:
    """
    Return a Series with the minimum width ("diameter") of each geometry.

    Args:
        wkt_series: Geometries encoded as WKT strings. Null entries are
            allowed and produce a null result.

    Returns:
        A float64 Series with one value per input row: the minimum width of
        the geometry's convex hull, 0.0 for degenerate (point/line) hulls and
        null for null or empty geometries.

    Raises:
        Whatever shapely raises for a non-null string that is not valid WKT;
        malformed input is deliberately not swallowed.
    """
    results = [
        None if pd.isna(w) else _min_caliper_width(wkt.loads(w))
        for w in wkt_series
    ]
    # Explicit dtype keeps empty batches float64 instead of object and turns
    # None into NaN, which is written back to Spark as null.
    return pd.Series(results, dtype="float64")

In [0]:
# Keep only the bronze rows whose turning_space flag is true.
df_turning_spaces = spark.read.table(bronze_table).filter(col("turning_space"))

# Look up photo_time for each turning space.
# nodeid is cut out of source_file by dropping its first 11 and last 4
# characters (SQL substring is 1-based: start at 12, keep length - 15 chars).
# NOTE(review): presumably a fixed filename prefix and a file extension —
# confirm against the bronze source_file format.
df_endepunkt = spark.read.table(endepunkt_silver_table).select("nodeid", "photo_time")
df_photo_time = df_turning_spaces.withColumn(
    "nodeid", expr("substring(source_file, 12, length(source_file)-15)")
).join(
    # Left join: turning spaces without a matching nodeid keep a null photo_time.
    df_endepunkt,
    on="nodeid",
    how="left",
)

# Output schema that the frames produced through mapInPandas must match.
# NOTE(review): photo_time is declared as DateType here while the silver table
# DDL declares TIMESTAMP — confirm which is intended.
schema = StructType(
    [
        StructField("source_file", StringType(), False),
        StructField("photo_time", DateType(), True),
        StructField("geometry_wkt", StringType(), False),
        StructField("centroid_x", DoubleType(), False),
        StructField("centroid_y", DoubleType(), False),
    ]
)

# make_masks_grouped_udf is a project helper; presumably it turns the predicted
# masks under predicted_masks into one polygon row per turning space — verify
# in src.data.predict_utils.
df_grouped = df_photo_time.mapInPandas(
    make_masks_grouped_udf(predicted_masks_path=predicted_masks), schema=schema
)

# Find the smallest diameter across the polygon for each turning space.
sdf_diameter = df_grouped.withColumn("diameter", min_width_udf(col("geometry_wkt")))
# The geometry itself is not stored in the silver table.
sdf_clean = sdf_diameter.drop("geometry_wkt")

# row_hash is built from the columns of sdf_clean only, so ingest_time is not
# part of the hash. concat_ws skips NULL values, so a missing photo_time simply
# drops out of the hashed string.
sdf = sdf_clean.withColumn("ingest_time", current_timestamp()).withColumn(
    "row_hash", sha2(concat_ws("||", *sdf_clean.columns), 256)
)

write_delta_table(sdf, silver_table, id_col="row_hash")