In [0]:
import geopandas as gpd
from shapely.ops import unary_union
from shapely.geometry import MultiPolygon
from shapely.geometry.base import BaseGeometry
from shapely import force_2d
from shapely import wkt
import pandas as pd
from pyspark.sql import functions as F
from pyspark.sql import DataFrame
from pyspark.sql.types import DoubleType
from delta.tables import DeltaTable
from datetime import datetime

In [0]:
# Notebook configuration. These module-level names are read by the functions defined below.

# Directory that check_for_new_predicted_masks() lists for *.geojson mask files.
predicted_masks = "/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/predicted_masks"
# Catalog name, back-tick quoted for use in SQL (the name contains hyphens).
catalog_dev = "`land_topografisk-gdb_dev`"
# Schema that is created (if missing) and selected in the context cell.
schema_dev = "ai2025"
# Fully qualified table read by check_for_new_predicted_masks() to find already-processed masks.
log_table = f"{catalog_dev}.{schema_dev}.logs_predicted_masks"
# Fully qualified Delta table that write_delta_table() creates or merges into.
table = f"{catalog_dev}.{schema_dev}.predicted_silver"

In [0]:
# Set the session context: select the catalog, create the schema if it is missing,
# then make that schema the current one.
spark.sql(f'USE CATALOG {catalog_dev}')
spark.sql(f'CREATE SCHEMA IF NOT EXISTS {schema_dev}')
spark.sql(f'USE SCHEMA {schema_dev}')

In [0]:
def check_for_new_predicted_masks() -> list:
    """
    List the GeoJSON masks that have not been processed yet.

    Lists every ``*.geojson`` file in the ``predicted_masks`` directory and
    removes the entries whose path is found in the ``row_hash`` column of
    ``log_table``.

    Returns:
        list: Paths (as reported by ``dbutils.fs.ls``) of the masks that are
        not present in the log table. If the log table does not exist yet,
        every mask is returned.
    """
    all_masks = [f.path for f in dbutils.fs.ls(predicted_masks) if f.name.endswith(".geojson")]

    # The schema is created on demand by this notebook, so the log table may be
    # missing on a first run; in that case nothing has been processed yet.
    if not spark.catalog.tableExists(log_table):
        return all_masks

    # A set gives constant-time membership tests in the filter below.
    processed_masks = {
        row["row_hash"]
        for row in spark.read.table(log_table).select("row_hash").collect()
    }

    # NOTE(review): file paths are compared against a column named `row_hash`.
    # If that column holds hashes rather than paths, nothing is ever filtered
    # out here — confirm the log table schema and which column identifies a
    # processed mask.
    return [mask for mask in all_masks if mask not in processed_masks]

In [0]:
def calculate_diameter(
    df: DataFrame,
    wkt_col: str = "geometry",
    id_col: str = "row_hash",
    out_col: str = "diameter"
) -> DataFrame:
    """
    Add a column with the minimum caliper width of each geometry.

    For every row the WKT in ``wkt_col`` is parsed; for a MultiPolygon only the
    part with the largest area is kept. The width is measured on the convex
    hull of that geometry, in the units of the geometry's coordinates.

    Args:
        df: Input Spark DataFrame.
        wkt_col: Name of the column holding the geometry as WKT text.
        id_col: Currently unused; kept so existing callers keep working.
        out_col: Name of the double column that is added.

    Returns:
        DataFrame: ``df`` with ``out_col`` appended. The value is null when the
        WKT is null or the geometry is empty, and 0.0 when the convex hull
        degenerates to a point or a line.
    """
    # F.udf is used instead of a bare `udf` so the function does not rely on a
    # name injected by the notebook runtime.
    @F.udf(returnType=DoubleType())
    def _min_caliper_width(wkt_str: str):
        # Null in, null out: there is nothing to measure.
        if wkt_str is None:
            return None

        geom = wkt.loads(wkt_str)
        if geom.is_empty:
            return None

        # Keep only the largest contiguous polygon.
        if isinstance(geom, MultiPolygon):
            geom = sorted(geom.geoms, key=lambda g: g.area)[-1]

        # The convex hull has the same caliper width and far fewer vertices.
        hull = geom.convex_hull
        if hull.geom_type != "Polygon":
            # A point or line hull has no exterior ring and no extent across it.
            return 0.0

        # Drop the closing vertex, which repeats the first one.
        coords = list(hull.exterior.coords[:-1])
        n_coords = len(coords)

        # The minimum width of a convex polygon is attained with one side of
        # the caliper flush against a hull edge, so testing each edge suffices.
        min_width = float("inf")
        for i, a in enumerate(coords):
            b = coords[(i + 1) % n_coords]
            edge_dx = b[0] - a[0]
            edge_dy = b[1] - a[1]
            length = (edge_dx**2 + edge_dy**2)**0.5
            if length == 0:
                continue

            # Unit vector perpendicular to the edge.
            perp_dx, perp_dy = -edge_dy / length, edge_dx / length

            # Project every hull vertex onto the perpendicular; the spread of
            # the projections is the hull's width in that direction.
            projections = [p[0] * perp_dx + p[1] * perp_dy for p in coords]
            min_width = min(min_width, max(projections) - min(projections))

        # No usable edge was found: report null rather than infinity.
        if min_width == float("inf"):
            return None
        return float(min_width)

    return df.withColumn(out_col, _min_caliper_width(F.col(wkt_col)))

In [0]:
def to_wkt_2d(geom):
    """
    Return the WKT text of ``geom`` after forcing it to two dimensions.

    Anything that is not a shapely geometry yields ``None``.
    """
    if not isinstance(geom, BaseGeometry):
        return None
    flattened = force_2d(geom)
    return flattened.wkt

In [0]:
def write_to_sdf(mask_path: str, mask_name: str) -> DataFrame:
    """
    Build a single-row Spark DataFrame describing one GeoJSON mask file.

    All geometries in the file are merged into one geometry (a lone Polygon is
    wrapped into a MultiPolygon). The returned row holds the centroid
    coordinates and the diameter computed by ``calculate_diameter``, plus the
    metadata columns ``ingest_time``, ``source_file`` and ``row_hash``. The
    geometry itself is dropped once the diameter has been computed.

    Args:
        mask_path: Path of the GeoJSON file to read.
        mask_name: File name stored in the ``source_file`` column.

    Returns:
        DataFrame: One row with centroid_x, centroid_y, diameter, ingest_time,
        source_file and row_hash.
    """
    target_crs = "EPSG:25833"
    masks_gdf = gpd.read_file(mask_path)

    # Bring the data into the target CRS when it is in a different one.
    if masks_gdf.crs != target_crs:
        masks_gdf = masks_gdf.set_crs(masks_gdf.crs).to_crs(target_crs)

    # Dissolve every feature into one geometry; normalise a lone Polygon.
    dissolved = unary_union(masks_gdf.geometry)
    if dissolved.geom_type == "Polygon":
        dissolved = MultiPolygon([dissolved])

    geometry_wkt = to_wkt_2d(dissolved)
    center = dissolved.centroid

    # One record -> one pandas row -> one Spark row.
    record = {
        "geometry": geometry_wkt,
        "centroid_x": center.x,
        "centroid_y": center.y,
    }
    single_row_sdf = spark.createDataFrame(pd.DataFrame([record]))

    # The geometry is only needed for the diameter, so it is dropped right after.
    measured_sdf = calculate_diameter(single_row_sdf).drop("geometry")

    # The hash covers the columns present before the metadata is attached.
    hash_inputs = measured_sdf.columns
    row_hash = F.sha2(F.concat_ws("||", *hash_inputs), 256)

    with_ingest_time = measured_sdf.withColumn("ingest_time", F.current_timestamp())
    with_source = with_ingest_time.withColumn("source_file", F.lit(mask_name))
    return with_source.withColumn("row_hash", row_hash)

In [0]:
def write_delta_table(sdf: DataFrame):
    """
    Persist ``sdf`` to the Delta table named by ``table``.

    If the table does not exist it is created from ``sdf``. Otherwise the rows
    are merged on ``row_hash``: rows whose hash is not yet in the table are
    inserted, rows whose hash already exists are left untouched.

    Args:
        sdf: Spark DataFrame to write; it must contain a ``row_hash`` column.
    """
    if not spark.catalog.tableExists(table):
        sdf.write.format("delta").mode("overwrite").saveAsTable(table)
        return

    insert_values = {col: f"source.{col}" for col in sdf.columns}

    # Insert-only merge. The previous matched-update clause required
    # `target.row_hash != source.row_hash` on rows that were matched on
    # `target.row_hash = source.row_hash`, so it could never fire and has been
    # removed; the effective behaviour is unchanged.
    (
        DeltaTable.forName(spark, table)
        .alias("target")
        .merge(source=sdf.alias("source"), condition="target.row_hash = source.row_hash")
        .whenNotMatchedInsert(values=insert_values)
        .execute()
    )

In [0]:
def main():
    """
    Process every new predicted mask and store the result in the Delta table.

    For each path returned by ``check_for_new_predicted_masks`` the file name
    is extracted, the ``dbfs:`` prefix is stripped from the path, and the mask
    is converted with ``write_to_sdf`` and persisted with ``write_delta_table``.
    """
    for new_mask in check_for_new_predicted_masks():
        # Last path component, ignoring any trailing slash.
        mask_name = new_mask.rstrip("/").rsplit("/", 1)[-1]
        print(f"\nProcessing mask: {mask_name}")

        local_path = new_mask.removeprefix("dbfs:")
        mask_sdf = write_to_sdf(local_path, mask_name)
        write_delta_table(mask_sdf)

In [0]:
# Run the pipeline: process all new predicted masks and write them to the Delta table.
main()