In [0]:
import geopandas as gpd
from shapely.ops import unary_union
from shapely.geometry import MultiPolygon
from shapely.geometry.base import BaseGeometry
from shapely import force_2d
import pandas as pd
from pyspark.sql.functions import *
from delta.tables import DeltaTable
from datetime import datetime

In [0]:
# Catalog and schema that hold the tables below. The catalog name is
# backtick-quoted; it contains a hyphen.
catalog_dev = "`land_topografisk-gdb_dev`"
schema_dev = "ai2025"

# Fully qualified table names built from the catalog and schema.
table = f"{catalog_dev}.{schema_dev}.predicted_silver"
log_table = f"{catalog_dev}.{schema_dev}.logs_predicted_masks"

# Directory that is listed for predicted-mask ``.geojson`` files.
predicted_masks = "/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/predicted_masks"

In [0]:
# Set the session context: select the catalog, create the schema if it is
# missing, then select the schema. The order matters, since the schema
# statements run against the catalog selected first.
spark.sql(f'USE CATALOG {catalog_dev}')
spark.sql(f'CREATE SCHEMA IF NOT EXISTS {schema_dev}')
spark.sql(f'USE SCHEMA {schema_dev}')

In [0]:
def check_for_new_predicted_masks() -> list:
    """
    List the predicted-mask GeoJSON files that have not been processed yet.

    Every ``.geojson`` entry directly under ``predicted_masks`` is a candidate.
    Candidates whose path appears in the ``row_hash`` column of ``log_table``
    are treated as already processed and left out.

    If ``log_table`` does not exist yet (for example on the first run in a
    fresh schema) nothing has been logged, so every candidate is returned
    instead of failing on the table read.

    Returns:
        list: Paths as reported by ``dbutils.fs.ls``, in listing order.
    """
    all_masks = [f.path for f in dbutils.fs.ls(predicted_masks) if f.name.endswith(".geojson")]

    # Same existence guard as write_delta_table uses for the silver table.
    if not spark.catalog.tableExists(log_table):
        return all_masks

    # NOTE(review): file paths are compared against a column named ``row_hash``.
    # Confirm that the log table really stores mask paths in that column;
    # if it stores a content hash, no path will ever be filtered out here.
    logged_rows = spark.read.table(log_table).select("row_hash").collect()

    # A set keeps the membership test below constant-time per candidate.
    processed_masks = {row["row_hash"] for row in logged_rows}

    return [mask for mask in all_masks if mask not in processed_masks]

In [0]:
def to_wkt_2d(geom):
    """
    Return the WKT text of ``geom`` after passing it through shapely's ``force_2d``.

    Values that are not ``BaseGeometry`` instances (including ``None``) give ``None``.
    """
    if not isinstance(geom, BaseGeometry):
        return None
    flattened = force_2d(geom)
    return flattened.wkt

In [0]:
def write_to_sdf(mask_path: str, mask_name: str) -> DataFrame:
    """
    Read a GeoJSON mask and return it as a one-row Spark DataFrame.

    All geometries in the file are reprojected to EPSG:25833 (when not already
    in that CRS), merged with ``unary_union`` and stored as 2D WKT together
    with the centroid of the merged geometry. A single ``Polygon`` result is
    wrapped in a ``MultiPolygon``; any other result type is kept as returned.

    Args:
        mask_path: Path of the GeoJSON file, passed to ``geopandas.read_file``.
        mask_name: Value stored in the ``source_file`` column.

    Returns:
        DataFrame: One row with ``geometry`` (WKT), ``centroid_x``,
        ``centroid_y``, ``ingest_time``, ``source_file`` and ``row_hash``.

    Raises:
        ValueError: If the file carries no CRS, since it then cannot be
            reprojected.
    """
    target_crs = "EPSG:25833"
    gdf = gpd.read_file(mask_path)

    # Reproject if needed. A missing CRS is reported explicitly instead of
    # surfacing as an unrelated error from the CRS helpers.
    if gdf.crs is None:
        raise ValueError(f"{mask_name} has no CRS; cannot reproject to {target_crs}")
    if gdf.crs != target_crs:
        gdf = gdf.to_crs(target_crs)

    # Merge all geometries into a single geometry; promote a lone Polygon so
    # the stored type is MultiPolygon in both cases.
    merged_geom = unary_union(gdf.geometry)
    if merged_geom.geom_type == "Polygon":
        merged_geom = MultiPolygon([merged_geom])

    # Centroid of the merged geometry, stored as plain coordinates.
    centroid = merged_geom.centroid

    # Single-row frame: WKT geometry plus centroid coordinates.
    df = pd.DataFrame(
        [
            {
                "geometry": to_wkt_2d(merged_geom),
                "centroid_x": centroid.x,
                "centroid_y": centroid.y,
            }
        ]
    )
    sdf = spark.createDataFrame(df)

    # The hash covers only the content columns (geometry and centroid), so the
    # ingest time and the file name do not influence it. Capture the column
    # list before the metadata columns are added.
    content_columns = sdf.columns

    sdf = (
        sdf.withColumn("ingest_time", current_timestamp())
        .withColumn("source_file", lit(mask_name))
        .withColumn("row_hash", sha2(concat_ws("||", *content_columns), 256))
    )

    return sdf

In [0]:
def write_delta_table(sdf: DataFrame):
    """
    Write ``sdf`` to the Delta table ``table``, skipping rows already present.

    When the table does not exist it is created from ``sdf``. Otherwise the
    rows are merged on ``row_hash``: rows whose hash is not yet in the table
    are inserted, and rows whose hash is already there are left untouched.

    Args:
        sdf: Spark DataFrame with a ``row_hash`` column.
    """
    if not spark.catalog.tableExists(table):
        sdf.write.format("delta").mode("overwrite").saveAsTable(table)
        return

    # A matched row has, by the merge condition, the same row_hash as the
    # source row. An update guarded by "target.row_hash != source.row_hash"
    # can therefore never run, so the merge is insert-only.
    # NOTE(review): if an existing row should be refreshed when a file is
    # re-predicted, the merge key would need to be something other than
    # row_hash (presumably source_file) - confirm the intended behaviour.
    insert_values = {name: f"source.{name}" for name in sdf.columns}
    (
        DeltaTable.forName(spark, table)
        .alias("target")
        .merge(source=sdf.alias("source"), condition="target.row_hash = source.row_hash")
        .whenNotMatchedInsert(values=insert_values)
        .execute()
    )

In [0]:
def main():
    """
    Process every predicted mask that has not been handled yet.

    Each new mask is turned into a Spark DataFrame by ``write_to_sdf`` and
    handed to ``write_delta_table``.
    """
    for mask_uri in check_for_new_predicted_masks():
        # File name is the last path segment; the path passed on drops a
        # leading "dbfs:" scheme if there is one.
        mask_name = mask_uri.rstrip("/").rsplit("/", 1)[-1]
        local_path = mask_uri.removeprefix("dbfs:")
        print(f"\nProcessing mask: {mask_name}")

        write_delta_table(write_to_sdf(local_path, mask_name))

In [0]:
# Run the ingestion for all new predicted masks.
main()