In [0]:
%pip install python-dotenv
# NOTE(review): python-dotenv is not imported directly in this notebook; it is
# presumably required by one of the src.data helpers imported below (confirm).
# The version is not pinned, so the installed release can change between runs.

In [0]:
from pyspark.sql.functions import *
from pyspark.sql import Window, DataFrame

from src.data.geometry_utils import (
    make_envelope,
    envelope_to_bboxes,
)
from src.data.data_utils import write_delta_table
from src.data.token_utils import get_token
from src.data.image_utils import enrich_output

In [0]:
# Catalog / schema the notebook works in. The catalog name contains hyphens,
# so the backticks needed to quote it in SQL are part of the value itself.
catalog_dev = "`land_auto-gen-kart_dev`"
schema_dev = "dl_bildesegmentering"

# Table names are unqualified; they resolve against the catalog and schema
# selected by the USE statements at the end of this cell.
bronze_table = "hospitals_bronze"
silver_table = "hospitals_silver"
gold_table = "hospitals_gold"

# Passed to enrich_output together with the image dimensions below.
SUBDIR = {"image": "hospital_images"}

# Passed to make_envelope / envelope_to_bboxes.
# NOTE(review): unit is not visible here, presumably metres (confirm).
bbox_length = 128
image_width = 512
image_height = 512

# Select the catalog, make sure the schema exists, then select it. Order matters.
for _statement in (
    f"USE CATALOG {catalog_dev}",
    f"CREATE SCHEMA IF NOT EXISTS {schema_dev}",
    f"USE SCHEMA {schema_dev}",
):
    spark.sql(_statement)

In [0]:
# Create the gold Delta table up front; CREATE TABLE IF NOT EXISTS makes this a
# no-op when the table is already there. The table name is unqualified, so it
# resolves against the session's current catalog and schema.
# NOTE(review): the column list presumably has to match the DataFrame that is
# later handed to write_delta_table for this table (confirm against the helpers).
q = f"""
CREATE TABLE IF NOT EXISTS {gold_table} (
    row_hash STRING,
    geometry BINARY,
    bbox ARRAY<DOUBLE>,
    bbox_str STRING,
    image_wms STRING,
    image_status STRING,
    image_path STRING,
    kommunenummer BIGINT,
    oppdateringsdato TIMESTAMP,
    ingest_time TIMESTAMP,
    photo_time DATE
) USING DELTA
"""
spark.sql(q)

In [0]:
def read_table_to_wkt():
    """
    Load the polygons from ``silver_table`` and reproject them to EPSG:25833.

    The stored ``geometry`` column is parsed from WKB, tagged with SRID 4326
    and then transformed to EPSG:25833.

    Note: despite the function name, no WKT conversion happens here; the
    returned ``geometry`` column is whatever ``ST_Transform`` produces.

    Returns:
        A Spark DataFrame with the columns ``geometry``, ``kommunenummer``,
        ``oppdateringsdato`` and ``ingest_time``.
    """
    parsed = spark.read.table(silver_table).withColumn(
        "geometry", expr("ST_GeomFromWKB(geometry)")
    )
    # Attach the source SRID in a helper column before transforming.
    tagged = parsed.withColumn("_geom_srid", expr("ST_SetSRID(geometry, 4326)"))
    reprojected = tagged.withColumn(
        "geometry", expr("ST_Transform(_geom_srid, 'EPSG:25833')")
    )
    # The select also drops the helper column.
    return reprojected.select(
        "geometry", "kommunenummer", "oppdateringsdato", "ingest_time"
    )

In [0]:
def merge_tables(
    bronze_df: DataFrame, silver_df: DataFrame, max_distance: float = 100
) -> DataFrame:
    """
    Return the silver rows that are the nearest match of a bronze row.

    Original intent (translated): find the hospitals in FKB that are also in
    N50, within a maximum distance.

    For every bronze row (identified by ``row_hash``) the silver geometries
    within ``max_distance`` are ranked by distance and only the closest one
    is kept. Equal distances are broken arbitrarily by ``row_number``.

    Args:
        bronze_df: Must provide a ``row_hash`` column and a ``geometry``
            column holding WKT text (it is parsed with ``ST_GeomFromWKT``).
        silver_df: Must provide a ``geometry`` column that the ``ST_*``
            functions accept directly.
        max_distance: Search radius in the units of the geometries'
            coordinate system (metres in the original code). Defaults to 100,
            the value that used to be hard-coded.
            NOTE(review): this assumes both inputs share one projected CRS;
            the CRS of ``bronze_df`` is not visible here (confirm).

    Returns:
        A DataFrame with the columns of ``silver_df`` only. There is one row
        per matched bronze row, so a silver row that is the nearest match of
        several bronze rows appears more than once.

    Raises:
        TypeError: If ``max_distance`` is not an int or float.
        ValueError: If ``max_distance`` is negative.
    """
    # The radius is interpolated into SQL text below, so accept plain numbers only.
    if isinstance(max_distance, bool) or not isinstance(max_distance, (int, float)):
        raise TypeError("max_distance must be an int or float")
    if max_distance < 0:
        raise ValueError("max_distance must be non-negative")

    bronze_geom = bronze_df.withColumn("geometry", expr("ST_GeomFromWKT(geometry)"))
    bronze_buffered = bronze_geom.withColumn(
        "geometry_buffer", expr(f"ST_Buffer(geometry, {max_distance})")
    )

    # Same predicate as before: buffer intersection AND an explicit distance test.
    join_condition = expr(
        f"ST_Intersects(b.geometry_buffer, t.geometry) "
        f"AND ST_DWithin(b.geometry, t.geometry, {max_distance})"
    )
    joined = (
        bronze_buffered.alias("b")
        .join(silver_df.alias("t"), join_condition, how="inner")
        .withColumn("distance", expr("ST_Distance(b.geometry, t.geometry)"))
    )

    # Rank the candidates of each bronze row by distance and keep the closest.
    nearest_first = Window.partitionBy("b.row_hash").orderBy(col("distance").asc())
    closest = joined.withColumn("rank", row_number().over(nearest_first)).filter(
        col("rank") == 1
    )

    # Only the silver-side columns are returned; the helper columns are dropped.
    return closest.select("t.*")

In [0]:
# The token is fetched first and handed to enrich_output further down.
token = get_token()

bronze_df = spark.read.table(bronze_table)
silver_df = read_table_to_wkt()

# Match the two sources, then derive envelopes and bounding boxes.
matched_df = merge_tables(bronze_df, silver_df)
envelope_df = make_envelope(matched_df, bbox_length)
bbox_df = envelope_to_bboxes(envelope_df, bbox_length)

# Serialise the geometry to binary, then hash it together with kommunenummer
# to get the row id used for the write.
# NOTE(review): rows sharing geometry and kommunenummer get the same row_hash;
# confirm that the steps above cannot emit several rows per geometry.
hashed_df = bbox_df.withColumn("geometry", expr("ST_AsBinary(geometry)")).withColumn(
    "row_hash", sha2(concat_ws("||", "geometry", "kommunenummer"), 256)
)

df = enrich_output(hashed_df, token, "row_hash", SUBDIR, image_width, image_height)

write_delta_table(df, gold_table, id_col="row_hash")