In [0]:
from PIL import Image
import numpy as np
from pyspark.sql.functions import *
from delta.tables import DeltaTable
from datetime import datetime

from src.data.data_utils import write_delta_table
from src.data.log_utils import check_for_new_predicted_masks, log_predicted_masks

In [0]:
# Location of the predicted mask images and the Unity Catalog names used below.
mask_path = "/Volumes/land_auto-gen-kart_dev/external_dev/static_data/DL_bildesegmentering/predicted_helipads"
catalog_dev = "`land_auto-gen-kart_dev`"
schema_dev = "dl_bildesegmentering"

# Table names; unqualified, so they resolve against the catalog/schema selected below.
log_table = "logs_predicted_helipads"
table = "predicted_helipads_bronze"
hospitals_table = "hospitals_gold"

# Select the catalog first, then make sure the schema exists before switching to it.
spark.sql(f"USE CATALOG {catalog_dev}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_dev}")
spark.sql(f"USE SCHEMA {schema_dev}")

In [0]:
# Create the Delta table that stores one row of insert/update/delete counts
# per run of this notebook (no-op if it already exists).
q = f"""
CREATE TABLE IF NOT EXISTS {log_table} (
  processed_time TIMESTAMP,
  num_inserted INT,
  num_updated INT,
  num_deleted INT
) USING DELTA
"""
spark.sql(q)

In [0]:
# Create the Delta table that receives the per-mask records (no-op if it
# already exists). The column names match the record keys built in
# write_to_sdf plus the ingest_time column added there.
q = f"""
CREATE TABLE IF NOT EXISTS {table} (
    white_pixels INT,
    helipad BOOLEAN,
    bbox ARRAY<DOUBLE>,
    source_file STRING,
    row_hash STRING,
    ingest_time TIMESTAMP
) USING DELTA
"""
spark.sql(q)

In [0]:
def read_bbox_from_table(row_hash: str) -> list:
    """
    Look up the bounding box stored for one row of the hospitals table.

    Args:
        row_hash: Value matched against the ``row_hash`` column of
            ``hospitals_table``.

    Returns:
        The ``bbox`` value of the first matching row (presumably a list of
        doubles, given the ARRAY<DOUBLE> ``bbox`` column of the target table;
        confirm against the hospitals table schema).

    Raises:
        ValueError: If no row in ``hospitals_table`` has the given ``row_hash``.
    """
    match = (
        spark.read.table(hospitals_table)
        .filter(col("row_hash") == row_hash)
        .select(col("bbox"))
        .first()
    )
    # first() yields None when the filter matches nothing; fail with context
    # instead of an opaque AttributeError on ``None.bbox``.
    if match is None:
        raise ValueError(
            f"No row with row_hash={row_hash!r} found in {hospitals_table}"
        )
    return match.bbox

In [0]:
# The return annotation is quoted so that defining this function does not
# depend on the wildcard import happening to expose ``DataFrame``.
def write_to_sdf(predicted_masks: list) -> "DataFrame":
    """
    Build a Spark DataFrame with one record per predicted mask image.

    For every mask the image is opened from ``mask_path``, converted to
    8-bit grayscale and the pixels equal to 255 are counted. The bounding
    box is fetched with ``read_bbox_from_table``.

    Args:
        predicted_masks: File names of mask images located under ``mask_path``.

    Returns:
        A Spark DataFrame with the columns ``row_hash``, ``white_pixels``,
        ``helipad``, ``bbox`` and ``source_file``, plus an ``ingest_time``
        column set to the current timestamp.
    """
    records = []
    for mask in predicted_masks:
        # Presumably strips an 11-character file name prefix and a
        # 4-character extension -- confirm against the mask naming convention.
        row_hash = mask[11:-4]
        bbox = read_bbox_from_table(row_hash)

        # Context manager releases the underlying file handle as soon as the
        # pixel data has been copied into the array.
        with Image.open(f"{mask_path}/{mask}") as img:
            pixels = np.array(img.convert("L"))
        white_pixels = int((pixels == 255).sum())

        records.append(
            {
                "row_hash": row_hash,
                "white_pixels": white_pixels,
                # A single white pixel is enough to flag the mask as a helipad.
                "helipad": white_pixels > 0,
                "bbox": bbox,
                "source_file": mask,
            }
        )

    sdf = spark.createDataFrame(records)
    return sdf.withColumn("ingest_time", current_timestamp())

In [0]:
def write_to_delta_table(predicted_masks: list):
    """
    Write new predicted masks to the Delta table and log the resulting counts.

    The latest Delta version is read before and after the write. If the
    version advanced, the insert/update/delete counts are taken from the
    operation metrics of the latest commit; otherwise all counts are zero.
    One row with the counts is always appended to ``log_table``.

    Args:
        predicted_masks: File names of the mask images to process. An empty
            list results in no write and a log row of zeros.
    """
    delta_tbl = None
    version_before = None
    if spark.catalog.tableExists(table):
        delta_tbl = DeltaTable.forName(spark, table)
        version_before = delta_tbl.history(1).select("version").collect()[0][0]

    # Stays None when there is nothing to write, so the branches below never
    # touch an unbound name.
    sdf = None
    if predicted_masks:
        sdf = write_to_sdf(predicted_masks)
        write_delta_table(sdf, table, id_col="row_hash")

    if delta_tbl is not None:
        # Read version and metrics in one lookup so both describe the same commit.
        latest = (
            delta_tbl.history(1).select("version", "operationMetrics").collect()[0]
        )
        version_after, metrics = latest[0], latest[1]
        if version_after > version_before:
            # NOTE(review): the numTargetRows* keys are the ones read here;
            # presumably write_delta_table performs a MERGE -- confirm.
            metrics = metrics or {}
            updated = int(metrics.get("numTargetRowsUpdated", 0))
            inserted = int(metrics.get("numTargetRowsInserted", 0))
            deleted = int(metrics.get("numTargetRowsDeleted", 0))
            print(f"Updated: {updated}, Inserted: {inserted}, Deleted: {deleted}")
        else:
            inserted, updated, deleted = 0, 0, 0
            print("No new Delta version found after merge.")
    else:
        # The table did not exist before the write, so every written row is an insert.
        inserted = sdf.count() if sdf is not None else 0
        updated, deleted = 0, 0
        print(f"Updated: {updated}, Inserted: {inserted}, Deleted: {deleted}")

    log_predicted_masks([(datetime.now(), inserted, updated, deleted)], log_table)

In [0]:
# Entry point of the notebook: find the masks to process and write them.
# Presumably check_for_new_predicted_masks returns the file names under
# mask_path that are not yet recorded in the table -- confirm in
# src.data.log_utils.
predicted_masks = check_for_new_predicted_masks(mask_path, table)
write_to_delta_table(predicted_masks)