In [0]:
from PIL import Image
import numpy as np
import time
from pyspark.sql.functions import *
from delta.tables import DeltaTable
from datetime import datetime
from pyspark.sql.types import (
    StructType,
    StructField,
    TimestampType,
    IntegerType,
    ArrayType,
    BooleanType,
    StringType,
)
from pyproj import CRS
from rasterio.features import rasterize
from rasterio.transform import from_bounds
from typing import Iterator, List

In [0]:
# Directory that holds the predicted mask PNG files.
mask_path = "/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/predicted_snuplasser"

# Catalog and schema used as the session default for the tables below.
catalog_dev = "`land_topografisk-gdb_dev`"
schema_dev = "ai2025"

# Select the catalog, make sure the schema exists, then select the schema
# (the order matters: the schema is created inside the selected catalog).
for _context_statement in (
    f"USE CATALOG {catalog_dev}",
    f"CREATE SCHEMA IF NOT EXISTS {schema_dev}",
    f"USE SCHEMA {schema_dev}",
):
    spark.sql(_context_statement)

# Unqualified table names; they resolve against the catalog/schema set above.
log_table = "logs_predicted_snuplasser"
table = "predicted_snuplasser_bronze"
endepunkt_silver_table = "endepunkt_silver"

# Fully qualified name of the building table that is read by write_to_sdf.
land_catalog = "land_ngis_dev"
bygning_schema = "silver_fkbbygning"
bygning_table = "bygning"
dataset = ".".join((land_catalog, bygning_schema, bygning_table))

In [0]:
# Create the Delta log table if it is missing. Its columns mirror the schema
# that log_predicted_mask builds: a timestamp plus insert/update/delete counts.
q = f"""
CREATE TABLE IF NOT EXISTS {log_table} (
  processed_time TIMESTAMP,
  num_inserted INT,
  num_updated INT,
  num_deleted INT
) USING DELTA
"""
spark.sql(q)

In [0]:
# Create the bronze Delta table if it is missing. One row per processed mask:
# the pixel statistics, the bounding box, the mask file name, a hash of the
# row content and the ingestion timestamp (see write_to_sdf).
q = f"""
CREATE TABLE IF NOT EXISTS {table} (
    row_hash STRING,
    white_pixels INT,
    turning_space BOOLEAN,
    bbox ARRAY<DOUBLE>,
    source_file STRING,
    ingest_time TIMESTAMP
) USING DELTA
"""
spark.sql(q)

In [0]:
# Set the session context: catalog and schema.
# NOTE(review): these three statements repeat the ones already executed in the
# configuration cell; this cell looks redundant and could probably be removed.
spark.sql(f"USE CATALOG {catalog_dev}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_dev}")
spark.sql(f"USE SCHEMA {schema_dev}")

In [0]:
def log_predicted_mask(log_data: list):
    """
    Append rows with insert/update/delete counts to the Delta log table.

    Args:
        log_data: Rows shaped as (processed_time, num_inserted, num_updated,
            num_deleted); each row becomes one entry in `log_table`.
    """
    # One timestamp column followed by three nullable integer counters.
    log_fields = [StructField("processed_time", TimestampType(), True)]
    for counter_name in ("num_inserted", "num_updated", "num_deleted"):
        log_fields.append(StructField(counter_name, IntegerType(), True))

    log_df = spark.createDataFrame(log_data, schema=StructType(log_fields))
    log_writer = log_df.write.format("delta").mode("append")
    log_writer.saveAsTable(log_table)

In [0]:
def check_for_new_predicted_masks() -> list:
    """
    Return the mask files that are not yet stored in the Delta table.

    Lists every ``.png`` file directly under `mask_path` and removes the ones
    whose file name already occurs in the ``source_file`` column of `table`.

    Returns:
        File names (without directory) of the unprocessed masks, in the order
        returned by the directory listing.
    """
    all_masks = [
        f.path.rstrip("/").split("/")[-1]
        for f in dbutils.fs.ls(mask_path)
        if f.path.endswith(".png")
    ]

    # Only distinct names are pulled to the driver, and they are kept in a set
    # so the membership test below is O(1) per mask instead of a list scan.
    processed_rows = (
        spark.read.table(table).select("source_file").distinct().collect()
    )
    processed_masks = {row["source_file"] for row in processed_rows}

    return [mask for mask in all_masks if mask not in processed_masks]

In [0]:
def get_srid():
    """
    Look up the SRID stored as a metadata tag on the building table.

    Returns:
        The value of the ``SRID`` tag, or None when the table has no such tag.
    """
    srid_query = f"""
        SELECT tag_value
        FROM system.information_schema.table_tags
        WHERE catalog_name = '{land_catalog}'
          AND schema_name = '{bygning_schema}'
          AND table_name = '{bygning_table}'
          AND tag_name = 'SRID'
    """
    tag_rows = spark.sql(srid_query).collect()
    if not tag_rows:
        return None
    # Only the first matching tag row is used.
    return tag_rows[0]["tag_value"]


def crs_is_righthanded(srid: str) -> bool:
    """
    Tell whether the coordinate system is right-handed.

    Right-handed here means the axis order is east, north: the check passes
    when the first axis of the CRS points east.
    """
    first_axis = CRS(srid).axis_info[0]
    return "EAST" == first_axis.direction.upper()


def transform_to_epsg(
    df: "DataFrame",
    col: str = "geometry",
    source_srid: str = "EPSG:5942",
    target_srid: str = "EPSG:25833",
) -> "DataFrame":
    """
    Transform a geometry column to the requested EPSG projection.

    Args:
        df: Frame containing the geometry column `col` and a
            ``kommunenummer`` column.
        col: Name of the geometry column to transform. It is used in every
            SQL expression, so columns not named ``geometry`` work too.
        source_srid: CRS identifier of the incoming geometries.
        target_srid: CRS identifier to transform to.

    Returns:
        A frame with only the transformed `col` and ``kommunenummer``.
    """
    # The annotations are strings on purpose: DataFrame is only imported
    # explicitly in a later cell, and annotations are evaluated at def time.

    # Flip x and y if the source coordinate system is left-handed.
    # NOTE(review): the WKB parsing only happens in this branch, so the two
    # branches hand different input types to ST_Transform - confirm what type
    # the stored geometry column has.
    if not crs_is_righthanded(source_srid):
        df = df.withColumn(col, expr(f"ST_FlipCoordinates(ST_GeomFromWKB({col}))"))

    # Transform from source_srid to target_srid.
    df = df.withColumn(
        col, expr(f"ST_Transform({col}, '{source_srid}', '{target_srid}')")
    )

    # Flip the coordinates back if the target coordinate system is left-handed.
    if not crs_is_righthanded(target_srid):
        df = df.withColumn(col, expr(f"ST_FlipCoordinates({col})"))

    return df.select(col, "kommunenummer")

In [0]:
def get_buildings(
    bbox: List[float], polygons: List, out_shape: tuple = (369, 369)
) -> np.ndarray:
    """
    Rasterize the building polygons inside a bounding box.

    Args:
        bbox: Four values ``(minx, miny, maxx, maxy)``; any sequence that
            unpacks into four numbers works.
        polygons: Building geometries to burn into the raster. May be empty
            or None when no building intersects the bbox.
        out_shape: ``(rows, cols)`` of the raster. Defaults to 369 x 369.
            NOTE(review): presumably the pixel size of the predicted masks -
            confirm.

    Returns:
        A ``uint8`` array of shape `out_shape` with 1 where a building covers
        the pixel and 0 elsewhere.
    """
    minx, miny, maxx, maxy = bbox

    # rasterize() can reject an empty shape list with a ValueError
    # (version-dependent), so "no buildings" is answered directly with zeros.
    if polygons is None or len(polygons) == 0:
        return np.zeros(out_shape, dtype="uint8")

    # from_bounds takes width (columns) before height (rows).
    transform = from_bounds(minx, miny, maxx, maxy, out_shape[1], out_shape[0])

    # NOTE(review): assumes each geometry is in a form rasterize() accepts
    # (GeoJSON-like mapping or an object exposing __geo_interface__) - verify
    # how the geometries arrive from Spark.
    return rasterize(
        [(geom, 1) for geom in polygons],
        out_shape=out_shape,
        transform=transform,
        fill=0,
        dtype="uint8",
    )

In [0]:
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    BooleanType,
    IntegerType,
    ArrayType,
)
import pandas as pd
import numpy as np
from PIL import Image


def write_to_sdf(predicted_masks: list) -> DataFrame:
    """
    Build the Spark DataFrame that is written to the bronze Delta table.

    For every mask file the matching bbox is looked up in the silver table,
    the buildings intersecting that bbox are rasterized and subtracted from
    the mask, and the remaining pure-white pixels are counted.

    Args:
        predicted_masks: File names of the mask PNGs located under `mask_path`.

    Returns:
        A DataFrame with the columns ``white_pixels``, ``turning_space``,
        ``bbox``, ``source_file``, ``row_hash`` (SHA-256 over the first four
        columns) and ``ingest_time``.
    """
    # DoubleType is not among the pyspark.sql.types names imported by the notebook.
    from pyspark.sql.types import DoubleType

    silver = spark.read.table(endepunkt_silver_table).select(
        "nodeid", "bbox", "kommune_id"
    )
    buildings = spark.read.table(dataset).select("kommunenummer", "geometry")
    # NOTE(review): get_srid() returns None when the SRID tag is missing, which
    # transform_to_epsg passes straight to pyproj - confirm the tag always exists.
    buildings = transform_to_epsg(
        buildings, col="geometry", source_srid=get_srid(), target_srid="EPSG:25833"
    )

    df = (
        spark.createDataFrame([(m,) for m in predicted_masks], ["mask"])
        # The node id is the file name minus its first 11 characters and its
        # last 4 characters (the ".png" extension).
        .withColumn("nodeid", F.expr("substring(mask, 12, length(mask)-15)"))
        .join(silver, "nodeid")
        # Left join keeps masks whose municipality has no buildings at all.
        .join(buildings, F.col("kommune_id") == F.col("kommunenummer"), "left")
        .withColumn(
            "intersects",
            F.expr(
                "ST_Intersects(geometry, ST_PolygonFromEnvelope(bbox[0], bbox[1], bbox[2], bbox[3]))"
            ),
        )
        .groupBy("mask", "bbox", "kommune_id")
        # when() without otherwise() yields NULL for non-intersecting rows and
        # collect_list drops NULLs, so only intersecting buildings remain.
        .agg(
            F.collect_list(F.when(F.col("intersects"), F.col("geometry"))).alias(
                "geoms"
            )
        )
    )

    # bbox is declared as doubles to match the ARRAY<DOUBLE> column of the
    # target table.
    schema = StructType(
        [
            StructField("white_pixels", IntegerType(), False),
            StructField("turning_space", BooleanType(), False),
            StructField("bbox", ArrayType(DoubleType()), False),
            StructField("source_file", StringType(), False),
        ]
    )
    output_columns = [field.name for field in schema.fields]

    def process_masks(iterator):
        """Turn each pandas batch of (mask, bbox, geoms) rows into result rows."""
        for pdf in iterator:
            out_rows = []
            for row in pdf.itertuples(index=False):
                # Close the image file as soon as the pixels are copied out.
                with Image.open(f"{mask_path}/{row.mask}") as img:
                    mask_arr = np.array(img.convert("L"))

                # The building raster holds 0/1 and must have the same shape
                # as the mask. Subtracting it turns building pixels from 255
                # into 254, which removes them from the white-pixel count.
                building_arr = get_buildings(row.bbox, row.geoms)
                result_arr = np.clip(
                    mask_arr.astype(int) - building_arr.astype(int), 0, 255
                ).astype(np.uint8)
                count_255 = int((result_arr == 255).sum())

                out_rows.append(
                    {
                        "white_pixels": count_255,
                        "turning_space": count_255 > 0,
                        "bbox": [float(value) for value in row.bbox],
                        "source_file": row.mask,
                    }
                )
            # Explicit columns keep the frame well-formed for an empty batch.
            yield pd.DataFrame(out_rows, columns=output_columns)

    records_df = df.mapInPandas(process_masks, schema)

    # ingest_time is added after hashing so it does not influence row_hash.
    sdf = records_df.withColumn(
        "row_hash", F.sha2(F.concat_ws("||", *records_df.columns), 256)
    ).withColumn("ingest_time", F.current_timestamp())

    return sdf

In [0]:
def write_delta_table(sdf: DataFrame):
    """
    Write the rows to the Delta table, upserting on the mask file name.

    When the table does not exist it is created from `sdf`. Otherwise the
    rows are merged: a row whose ``source_file`` already exists is updated
    when its ``row_hash`` changed, and unknown files are inserted.

    The merge previously matched on ``row_hash`` itself, which made the
    "hash differs" update condition impossible to satisfy.

    Args:
        sdf: Rows produced by write_to_sdf; must contain ``source_file`` and
            ``row_hash``.
    """
    if not spark.catalog.tableExists(table):
        sdf.write.format("delta").mode("overwrite").saveAsTable(table)
        return

    # Every column is copied from the source row on both update and insert.
    column_map = {col: f"source.{col}" for col in sdf.columns}

    (
        DeltaTable.forName(spark, table)
        .alias("target")
        .merge(
            source=sdf.alias("source"),
            condition="target.source_file = source.source_file",
        )
        # Unchanged rows (same hash) are left alone, keeping their ingest_time.
        .whenMatchedUpdate(
            condition="target.row_hash != source.row_hash",
            set=column_map,
        )
        .whenNotMatchedInsert(values=column_map)
        .execute()
    )

In [0]:
def write_to_delta_table(predicted_masks: list):
    """
    Store the new masks in the Delta table and log the resulting row counts.

    Args:
        predicted_masks: File names of the masks to process. An empty list
            writes nothing, but a log row with zero counts is still appended.
    """
    table_exists = spark.catalog.tableExists(table)
    delta_tbl = None
    version_before = None
    if table_exists:
        delta_tbl = DeltaTable.forName(spark, table)
        version_before = delta_tbl.history(1).select("version").collect()[0][0]

    wrote_data = False
    if predicted_masks:
        write_delta_table(write_to_sdf(predicted_masks))
        wrote_data = True

    inserted, updated, deleted = 0, 0, 0
    if table_exists:
        # Version and metrics come from one history read so that both
        # describe the same commit.
        latest = (
            delta_tbl.history(1).select("version", "operationMetrics").collect()[0]
        )
        if latest[0] > version_before:
            metrics = latest[1]
            updated = int(metrics.get("numTargetRowsUpdated", 0))
            inserted = int(metrics.get("numTargetRowsInserted", 0))
            deleted = int(metrics.get("numTargetRowsDeleted", 0))
            print(f"Updated: {updated}, Inserted: {inserted}, Deleted: {deleted}")
        else:
            print("No new Delta version found after merge.")
    else:
        if wrote_data:
            # The table was just created from the new rows, so its row count
            # is the number of inserts; counting the stored table avoids
            # re-running the mask pipeline.
            inserted = spark.read.table(table).count()
        print(f"Updated: {updated}, Inserted: {inserted}, Deleted: {deleted}")

    log_predicted_mask(log_data=[(datetime.now(), inserted, updated, deleted)])

In [0]:
def main():
    """
    Find the masks that are not stored yet and write them to the Delta table.
    """
    write_to_delta_table(check_for_new_predicted_masks())

In [0]:
# Run the ingestion: pick up new masks, write them and log the row counts.
main()