In [0]:
from pyspark.sql.functions import *
from pyspark.sql import DataFrame
from pyspark.sql.types import (
    ArrayType,
    BinaryType,
    DoubleType,
    StringType,
    StructType,
    StructField,
)
from shapely import wkb
from shapely.ops import unary_union

from src.data.geometry_utils import (
    make_envelope,
    envelope_to_bboxes,
    random_adjusted_bbox_centered,
    add_bbox_columns,
)
from src.data.data_utils import write_delta_table
from src.data.token_utils import get_token
from src.data.image_utils import enrich_output

In [0]:
# Session setup: pick the catalog/schema to work in and define the constants
# that the cells below read as notebook globals.
# `spark` is not defined in this notebook; presumably the runtime injects it.

# The catalog name contains hyphens, so it is kept wrapped in backticks for SQL.
catalog_dev = "`land_auto-gen-kart_dev`"
schema_dev = "dl_bildesegmentering"
spark.sql(f"USE CATALOG {catalog_dev}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_dev}")
spark.sql(f"USE SCHEMA {schema_dev}")
# Unqualified table names: they resolve against the catalog/schema selected above.
bronze_table = "helipads_bronze"
silver_table = "helipads_silver"


# Passed to enrich_output; maps the "image" / "mask" kinds to a sub-directory name.
SUBDIR = {"image": "helipad_images", "mask": "helipad_labels"}

# Passed to make_envelope, envelope_to_bboxes and add_bbox_columns.
# NOTE(review): unit is not visible here; presumably metres, since geometries
# are projected to EPSG:25833 before use. TODO confirm.
bbox_length = 128
# Passed to add_bbox_columns together with bbox_length.
max_offset = 30
# Passed to enrich_output as the requested image width and height.
image_width, image_height = 512, 512

In [0]:
# Create the silver Delta table if it does not exist yet, so the final write
# at the end of the notebook has a target with a fixed schema.
# NOTE(review): `Adjusted_bbox` is the only column spelled with a capital
# letter; confirm it matches the column name produced by the helpers.
q = f"""
CREATE TABLE IF NOT EXISTS {silver_table} (
    row_hash STRING,
    lokalids ARRAY<STRING>,
    geometry BINARY,
    bbox ARRAY<DOUBLE>,
    adjusted_struct STRUCT<bbox: ARRAY<DOUBLE>, bbox_str: STRING>,
    Adjusted_bbox ARRAY<DOUBLE>,
    bbox_str STRING,
    image_path STRING,
    mask_path STRING,
    image_wms STRING,
    image_status STRING,
    mask_status STRING,
    type STRING,
    oppdateringsdato STRING,
    ingest_time TIMESTAMP,
    photo_time TIMESTAMP
) USING DELTA
"""
spark.sql(q)

In [0]:
def read_table_to_wkt():
    """
    Read the bronze table and return its polygons reprojected to EPSG:25833.

    The binary `geometry` column is parsed from WKB, tagged with SRID 4326,
    transformed to EPSG:25833 and reduced to two dimensions.

    Returns:
        A Spark DataFrame with the columns `lokalid`, `geometry` and
        `oppdateringsdato`.

    Note: despite the function name, no WKT conversion takes place here.
    """
    bronze = spark.read.table(bronze_table)
    # WKB bytes -> geometry value.
    parsed = bronze.withColumn("geometry", expr("ST_GeomFromWKB(geometry)"))
    # Declare the source CRS in a helper column before transforming.
    tagged = parsed.withColumn("_geom_srid", expr("ST_SetSRID(geometry, 4326)"))
    projected = tagged.withColumn(
        "geometry", expr("ST_Force2D(ST_Transform(_geom_srid, 'EPSG:25833'))")
    )
    # The select also drops the temporary `_geom_srid` column.
    return projected.select("lokalid", "geometry", "oppdateringsdato")

In [0]:
def build_overlap_graph(df: DataFrame) -> DataFrame:
    """
    Pair up rows whose bounding boxes intersect.

    The frame is joined with itself on an `ST_Intersects` test between the
    envelopes built from each side's `bbox` array (elements 0..3).

    Args:
        df: DataFrame with at least a `lokalid` column and a `bbox` array
            column holding four numbers (presumably xmin, ymin, xmax, ymax;
            confirm against the helper that creates `bbox`).

    Returns:
        DataFrame with the columns `lokalid_a` and `lokalid_b`, one row per
        intersecting pair. No extra filter is applied, so a pair is reported
        in both orders and each row is also paired with itself.
    """
    # The previous version added a monotonically_increasing_id() column that
    # was never read or returned; it has been dropped as dead, non-deterministic work.
    return (
        df.alias("a")
        .join(
            df.alias("b"),
            expr(
                """
                ST_Intersects(
                    ST_MakeEnvelope(a.bbox[0], a.bbox[1], a.bbox[2], a.bbox[3]),
                    ST_MakeEnvelope(b.bbox[0], b.bbox[1], b.bbox[2], b.bbox[3])
                )
            """
            ),
        )
        .select(
            col("a.lokalid").alias("lokalid_a"),
            col("b.lokalid").alias("lokalid_b"),
        )
    )

In [0]:
def connected_components(edges: DataFrame) -> DataFrame:
    """
    Label every vertex of a graph with its connected component.

    Labels are propagated iteratively: each vertex starts with its own id as
    label and repeatedly takes the minimum label among itself and its
    neighbours until no label changes.

    Args:
        edges: DataFrame with the columns `src` and `dst`. Edges are treated
            as undirected, so the caller does not have to supply both
            directions.

    Returns:
        DataFrame with the columns `src` (vertex id) and `component`
        (smallest vertex id found in that vertex's component).

    Each iteration triggers a Spark job (`count`) and extends the query plan
    of `vertices`, so graphs with very long chains may become slow.
    """
    # Propagation below only walks src -> dst. Add the reversed edges so that
    # an asymmetric edge list still yields undirected components; for an
    # already symmetric edge list this is just a de-duplication.
    links = (
        edges.select("src", "dst")
        .union(edges.select(col("dst").alias("src"), col("src").alias("dst")))
        .distinct()
    )
    # After symmetrising, every vertex appears in the `src` column.
    vertices = links.select("src").distinct().withColumn("component", col("src"))

    changed = True
    while changed:
        # Offer each vertex's current label to its neighbours.
        propagated = (
            links.alias("e")
            .join(vertices.alias("v"), col("e.src") == col("v.src"))
            .select(col("e.dst").alias("src"), col("v.component"))
        )
        # `min` is Spark's aggregate here (star import from pyspark.sql.functions),
        # not the Python builtin.
        new_vertices = (
            vertices.union(propagated)
            .groupBy("src")
            .agg(min("component").alias("component"))
        )
        # Stop once a full round leaves every label untouched.
        joined = new_vertices.alias("n").join(vertices.alias("v"), "src")
        changed = joined.filter(col("n.component") != col("v.component")).count() > 0
        vertices = new_vertices
    return vertices

In [0]:
def merge_chain_geometries(chains_with_geom: DataFrame) -> DataFrame:
    """
    Merge all polygons that belong to the same chain into one geometry.

    Rows are grouped by `component`; the geometries of each group are merged
    with `merge_udf`, and the result is passed through `make_envelope` and
    `envelope_to_bboxes`.

    Args:
        chains_with_geom: DataFrame with the columns `component`, `lokalid`,
            `geometry` and `oppdateringsdato`.

    Returns:
        One row per component with, among others, `lokalids` (sorted array of
        member ids), `geometry` (merged geometry), `oppdateringsdato` (minimum
        of the group), `type` ("chain") and whatever columns the two envelope
        helpers add.

    Relies on the notebook-global `merge_udf` and `bbox_length` being defined
    before this function is called.
    """
    chains_union = (
        chains_with_geom.groupBy("component")
        .agg(
            collect_list("geometry").alias("geom_list"),
            # collect_list gives no ordering guarantee after a shuffle. The
            # ids are sorted so that `lokalids`, and any hash derived from it,
            # is reproducible between runs. Rows stored earlier with another
            # element order may get a different hash once.
            sort_array(collect_list("lokalid")).alias("lokalids"),
            # Spark's aggregate `min` (star import), not the Python builtin.
            min("oppdateringsdato").alias("oppdateringsdato"),
        )
        .withColumn("merged_wkb", merge_udf("geom_list"))
        .withColumn("geometry", expr("ST_GeomFromWKB(merged_wkb)"))
        .withColumn("type", lit("chain"))
    )
    chains_union = make_envelope(chains_union, bbox_length)
    # NOTE(review): elsewhere in this notebook envelope_to_bboxes is called
    # with bbox_length as second argument; here it is called without it.
    # Confirm that the difference is intended.
    chains_union = envelope_to_bboxes(chains_union)
    return chains_union

In [0]:
def merge_polygons(polys):
    """
    Union a list of geometries into a single geometry and return it as WKB.

    Args:
        polys: iterable of geometries accepted by shapely's `unary_union`;
            `None` entries are skipped.

    Returns:
        The WKB bytes of the union, or `None` when no geometry is left after
        dropping the `None` entries.
    """
    present = list(filter(lambda geom: geom is not None, polys))
    if len(present) == 0:
        return None
    return unary_union(present).wkb

In [0]:
# Pipeline: bronze polygons -> overlap groups -> merged features -> silver table.

# Token handed to enrich_output further down.
token = get_token()

# Spark UDF around merge_polygons (returns binary). merge_chain_geometries
# looks this global up when it runs, so it must exist before that call.
merge_udf = udf(merge_polygons, BinaryType())

# Load the polygons and run them through the two envelope helpers. Together
# these must provide the `bbox` column that the steps below read.
df = read_table_to_wkt()
df = make_envelope(df, bbox_length)
df = envelope_to_bboxes(df, bbox_length)

# Pairs of lokalids with intersecting bboxes, renamed to the src/dst columns
# that connected_components expects.
edges = (
    build_overlap_graph(df)
    .select("lokalid_a", "lokalid_b")
    .withColumnRenamed("lokalid_a", "src")
    .withColumnRenamed("lokalid_b", "dst")
)
# One row per lokalid (`src`) with the label of the group it belongs to.
vertices = connected_components(edges)

# Groups with more than one member are "chains"; fetch their geometries and
# merge each chain into a single feature.
chains = vertices.groupBy("component").agg(collect_list("src").alias("lokalids"))
chains_filtered = chains.filter(size("lokalids") > 1)
chains_exploded = chains_filtered.withColumn("lokalid", explode("lokalids"))
chains_with_geom = chains_exploded.join(df, "lokalid", "inner").select(
    "component", "lokalid", "geometry", "oppdateringsdato"
)
chains_union = merge_chain_geometries(chains_with_geom)

# Everything that is not part of a chain stays as it is ("singleton"); the
# left_anti join keeps only rows of df without a match in chain_ids.
chain_ids = chains_filtered.select(explode("lokalids").alias("lokalid"))
singletons = (
    df.join(chain_ids, "lokalid", "left_anti")
    .select("lokalid", "geometry", "bbox", "oppdateringsdato")
    .withColumn("lokalids", array("lokalid"))
    .withColumn("type", lit("singleton"))
)

# Stack chains and singletons. A singleton uses its own lokalid as `component`
# so both sides expose the same column names for unionByName.
new_df = chains_union.select(
    "component",
    "lokalids",
    "geometry",
    "bbox",
    "type",
    "oppdateringsdato",
).unionByName(
    singletons.select(
        col("lokalid").alias("component"),
        "lokalids",
        "geometry",
        "bbox",
        "type",
        "oppdateringsdato",
    )
)

# row_hash: SHA-256 over the elements of `lokalids` and `bbox` joined with "||".
# The value therefore depends on the element order inside `lokalids`.
df = new_df.withColumn(
    "row_hash", sha2(concat_ws("||", col("lokalids"), col("bbox")), 256)
).drop("component")
# NOTE(review): add_bbox_columns and enrich_output are project helpers whose
# output columns are not visible here; presumably they add the adjusted-bbox
# and image/mask columns declared for the silver table. Confirm.
df = add_bbox_columns(df, bbox_length, max_offset)
df = enrich_output(df, token, "row_hash", SUBDIR, image_width, image_height)
# The silver table declares `geometry` as BINARY, so convert back to WKB.
df = df.withColumn("geometry", expr("ST_AsBinary(geometry)"))

# Write to the silver table, passing "row_hash" as the key column.
# NOTE(review): presumably an upsert keyed on row_hash; confirm in data_utils.
write_delta_table(df, silver_table, "row_hash")