In [0]:
%pip install python-dotenv

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.window import Window
from delta.tables import DeltaTable
import requests
import time
from pathlib import Path
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from datetime import datetime
from typing import Optional

In [0]:
# Reuse the active Spark session (or create one); every table read/write below goes through it.
spark = SparkSession.builder.getOrCreate()
# The catalog name is backtick-quoted, presumably because it contains a hyphen.
catalog_dev = "`land_topografisk-gdb_dev`"
schema_dev = "ai2025"
# Select the catalog first, then create/select the schema, so the unqualified
# table names below resolve inside this catalog and schema.
spark.sql(f"USE CATALOG {catalog_dev}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_dev}")
spark.sql(f"USE SCHEMA {schema_dev}")

# Unqualified table names for the input (bronze) and output (silver) tables.
bronze_table = "endepunkt_bronze"
silver_table = "endepunkt_silver"
buffer = 128  # Gives an image area of 256*256 (metres) with a pixel area of 0.25 sqm

In [0]:
def generate_dom_url(bbox):
    """Build the WMS GetMap URL for the ``NHM_DOM_25833:skyggerelieff`` layer.

    Args:
        bbox: Iterable of coordinate values; each value is converted with
            ``str`` and the values are joined with commas to form the
            ``BBOX`` query parameter.

    Returns:
        The request URL as a string, asking for a 512x512 PNG in EPSG:25833.
    """
    coords = ",".join(str(value) for value in bbox)
    side_px = 512
    # Controls pixel area: 0.5 gives 0.25 sqm per pixel together with buffer = 128.
    resolution = 0.5
    endpoint = "https://wms.geonorge.no/skwms1/wms.hoyde-dom-nhm-25833?request=GetMap&Format=image/png&"
    layer_args = "GetFeatureInfo=text/plain&CRS=EPSG:25833&Layers=NHM_DOM_25833:skyggerelieff&"
    extent_args = f"BBOX={coords}&width={side_px}&height={side_px}&RESOLUTION={resolution}"
    return endpoint + layer_args + extent_args

# Spark UDF wrapper so generate_dom_url can be applied to a DataFrame column; yields a string column.
generate_dom_url_udf = udf(generate_dom_url, StringType())

def dom_file_exists(nodeid: str) -> str:
    """Report whether the DOM image for ``nodeid`` is already on disk.

    Args:
        nodeid: Identifier interpolated into the file name ``dom_<nodeid>.png``.

    Returns:
        ``"DOWNLOADED"`` if the file exists, otherwise ``"PENDING"``.
    """
    dom_png = f"/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/endepunkt_dom/dom_{nodeid}.png"
    if os.path.exists(dom_png):
        return "DOWNLOADED"
    return "PENDING"

# Spark UDF wrapper around dom_file_exists; yields "DOWNLOADED"/"PENDING" as a string column.
dom_file_status_udf = udf(dom_file_exists, StringType())

In [0]:
def generate_image_url(bbox):
    """Build the WMS GetMap URL for the ``ortofoto`` layer, without a ticket value.

    Args:
        bbox: Iterable of coordinate values; each value is converted with
            ``str`` and the values are joined with commas to form the
            ``BBox`` query parameter.

    Returns:
        The request URL as a string, or the literal ``"INVALID"`` if building
        it raises any ``Exception`` (for example when ``bbox`` is ``None``).
    """
    try:
        side_px = 512
        resolution = 0.5
        coords = ",".join(str(value) for value in bbox)
        # The URL deliberately ends with an empty "TICKET=": per the original
        # note the token is appended after it later; that step is not done here.
        fragments = (
            "https://wms.geonorge.no/skwms1/wms.nib?VERSION=1.3.0",
            "&service=WMS&request=GetMap&Format=image/png&",
            "GetFeatureInfo=text/plain&CRS=EPSG:25833&Layers=ortofoto&",
            f"BBox={coords}&width={side_px}&height={side_px}&RESOLUTION={resolution}&TICKET=",
        )
        return "".join(fragments)
    except Exception:
        return "INVALID"
    
# Spark UDF wrapper so generate_image_url can be applied to a DataFrame column; yields a string column.
generate_image_url_udf = udf(generate_image_url, StringType())

def image_file_exists(nodeid: str) -> str:
    """Report whether the orthophoto image for ``nodeid`` is already on disk.

    Args:
        nodeid: Identifier interpolated into the file name ``image_<nodeid>.png``.

    Returns:
        ``"DOWNLOADED"`` if the file exists, otherwise ``"PENDING"``.
    """
    image_png = f"/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/endepunkt_images/image_{nodeid}.png"
    if os.path.exists(image_png):
        return "DOWNLOADED"
    return "PENDING"

# Spark UDF wrapper around image_file_exists; yields "DOWNLOADED"/"PENDING" as a string column.
image_file_status_udf = udf(image_file_exists, StringType())

In [0]:
# Load variables from a .env file into the process environment so the
# credentials below do not have to be hardcoded in the notebook.
load_dotenv()
# Geonorge credentials used by get_token(); each is None when the variable is unset.
BRUKERID = os.getenv("GEONORGE_BRUKERID")
PASSORD  = os.getenv("GEONORGE_PASSORD")

def get_token():
    """Request a ticket for the ``wms.nib`` service from the Geonorge token endpoint.

    The credentials are read from the module-level ``BRUKERID`` and ``PASSORD``.
    They are passed through ``params`` so that ``requests`` URL-encodes them;
    interpolating them raw into the URL breaks for values containing characters
    such as ``&``, ``#``, ``+`` or ``%``.

    Returns:
        str: The response body with surrounding whitespace and backticks removed.

    Raises:
        RuntimeError: If either credential is missing or empty. Without this
            check the literal text "None" would be sent and the service's
            error body would be returned as if it were a token.
        requests.HTTPError: If the service answers with a non-2xx status.
        requests.RequestException: On connection problems or when the request
            exceeds the timeout.
    """
    if not BRUKERID or not PASSORD:
        raise RuntimeError(
            "GEONORGE_BRUKERID and GEONORGE_PASSORD must be set to request a token"
        )

    # NOTE(review): the endpoint takes the password as a query parameter, so it
    # may end up in proxy/server logs; that is a property of the API used here.
    response = requests.get(
        "https://baat.geonorge.no/skbaatts/req",
        params={
            "brukerid": BRUKERID,
            "passord": PASSORD,
            "tjenesteid": "wms.nib",
            "retformat": "s",
        },
        timeout=30,  # never hang the notebook indefinitely on a stalled connection
    )
    response.raise_for_status()

    # Strip whitespace first so a trailing newline cannot end up inside a URL,
    # then the backticks the original code already removed.
    return response.text.strip().strip("`")

# Fetch an initial token when this cell runs and remember when it was obtained,
# so refresh_token_if_needed() can tell when it is older than token_lifetime.
token = get_token()
token_start_time = time.time()
token_lifetime = 55 * 60  # seconds

def refresh_token_if_needed():
    """Replace the module-level ``token`` once it is older than ``token_lifetime``.

    Compares the time elapsed since ``token_start_time`` with
    ``token_lifetime``. When the limit is exceeded, a new token is fetched with
    ``get_token()`` and both ``token`` and ``token_start_time`` are rebound;
    otherwise nothing happens.
    """
    global token, token_start_time
    token_age = time.time() - token_start_time
    if token_age <= token_lifetime:
        return
    print("🔄 Fornyer token...")
    token = get_token()
    token_start_time = time.time()

In [0]:
def get_fotodato(bbox: str, token: str, timeout: float = 30.0):
    """Look up the "Fotodato" value for the centre pixel of a bounding box.

    Sends a WMS GetFeatureInfo request (512x512 image, pixel I=256/J=256) for
    the ``ortofoto`` layer and scans the first HTML table in the response for a
    row whose first cell is exactly ``Fotodato``.

    Args:
        bbox: Comma-separated bounding box, inserted verbatim into the URL.
        token: Ticket value appended as the ``TICKET`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``datetime.date`` parsed from the ``dd.mm.YYYY`` cell value, or
        ``None`` when there is no table, no ``Fotodato`` row, or the value does
        not match the expected format.

    Raises:
        requests.RequestException: On connection problems or timeout.
    """
    url = (
        "https://wms.geonorge.no/skwms1/wms.nib?SERVICE=WMS&VERSION=1.3.0"
        "&REQUEST=GetFeatureInfo&CRS=EPSG:25833"
        f"&BBOX={bbox}&WIDTH=512&HEIGHT=512&LAYERS=ortofoto&QUERY_LAYERS=ortofoto"
        f"&INFO_FORMAT=text/html&I=256&J=256&TICKET={token}"
    )

    # A timeout keeps one stalled request from blocking the whole batch forever.
    # The HTTP status is deliberately not checked: an error page simply yields
    # no "Fotodato" row and therefore None, as before.
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table")
    if table is None:
        return None

    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) >= 2 and cells[0].text.strip() == "Fotodato":
            raw_date = cells[1].text.strip()
            try:
                return datetime.strptime(raw_date, "%d.%m.%Y").date()
            except ValueError:
                # An unexpected date format should not abort the lookups for
                # every remaining row; report it and treat the date as unknown.
                print(f"Could not parse Fotodato value {raw_date!r} for bbox {bbox}")
                return None

    return None

In [0]:
def add_ortofoto_date(df, token: str):
    """Add a ``fototid`` date column by querying the WMS service per bounding box.

    The ``row_hash`` and ``bbox`` columns are collected to the driver, the
    photo date is fetched with ``get_fotodato`` and the result is joined back
    onto ``df`` with a left join on ``row_hash``.

    Args:
        df: DataFrame that has a ``row_hash`` column and an array-like ``bbox``
            column.
        token: Ticket forwarded to ``get_fotodato``.

    Returns:
        A new DataFrame with all rows of ``df`` plus a nullable ``fototid``
        column (``DateType``).
    """
    # One lookup row per row_hash: duplicate hashes on the lookup side would
    # multiply the matching rows in the join below.
    key_rows = df.select("row_hash", "bbox").dropDuplicates(["row_hash"]).collect()

    # Rows with the same bounding box get the same answer, so each distinct
    # bbox is requested only once.
    date_by_bbox = {}
    hash_date_pairs = []
    for row in key_rows:
        bbox_str = ",".join(str(value) for value in row["bbox"])
        if bbox_str not in date_by_bbox:
            date_by_bbox[bbox_str] = get_fotodato(bbox_str, token)
        hash_date_pairs.append((row["row_hash"], date_by_bbox[bbox_str]))

    # An explicit schema is required so an empty result or all-None dates still
    # produce a well-typed DataFrame.
    schema = StructType([
        StructField("row_hash", StringType(), True),
        StructField("fototid", DateType(), True),
    ])
    bbox_date_df = spark.createDataFrame(hash_date_pairs, schema)

    return df.join(bbox_date_df, on="row_hash", how="left")

In [0]:
# Root folder under which downloaded orthophoto and DOM images are stored.
BASE_PATH = "/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER"

def add_silver_columns(df: DataFrame, buffer: Optional[int], kommune_id: str = "") -> DataFrame:
    """Derive the silver-layer columns for a DataFrame of bronze rows.

    Adds ``bbox``, ``image_wms``, ``dom_wms``, ``image_status``, ``dom_status``,
    ``lastet_tid``, ``kommune_id``, ``row_hash``, ``image_path``, ``dom_path``
    and, via ``add_ortofoto_date``, ``fototid``.

    Args:
        df: Input rows; the expressions below read the columns ``x``, ``y``
            and ``nodeid``.
        buffer: Half side length of the square bounding box around (x, y).
            ``None`` (or any falsy value such as 0) falls back to 128.
        kommune_id: Value written to the ``kommune_id`` column for every row.

    Returns:
        A new DataFrame with the columns listed above added.
    """
    buffer = buffer or 128
    df = df.withColumn("bbox", expr(f"array(x - {buffer}, y - {buffer}, x + {buffer}, y + {buffer})"))

    # The hash covers the incoming columns plus bbox, i.e. the column list as
    # it is at this point, not the derived columns added below.
    hashed_columns = df.columns
    df = (
        df.withColumn("image_wms", generate_image_url_udf(col("bbox")))
        .withColumn("dom_wms", generate_dom_url_udf(col("bbox")))
        .withColumn("image_status", image_file_status_udf(col("nodeid")))
        .withColumn("dom_status", dom_file_status_udf(col("nodeid")))
        .withColumn("lastet_tid", current_timestamp())
        .withColumn("kommune_id", lit(kommune_id))
        .withColumn("row_hash", sha2(concat_ws("||", *hashed_columns), 256))
    )
    df = df.withColumn("image_path", concat(lit(BASE_PATH), lit("/endepunkt_images/image_"), col("nodeid"), lit(".png")))
    df = df.withColumn("dom_path", concat(lit(BASE_PATH), lit("/endepunkt_dom/dom_"), col("nodeid"), lit(".png")))

    # The module-level token is only treated as valid for token_lifetime
    # seconds. This function runs once per kommune, so renew the token here if
    # it has aged out before starting the WMS lookups.
    refresh_token_if_needed()
    df = add_ortofoto_date(df, token)
    return df

In [0]:
def write_delta_table(sdf: DataFrame, mode: str = "merge") -> None:
    """Persist ``sdf`` to the silver Delta table.

    Args:
        sdf: Rows to write.
        mode: ``"overwrite"`` replaces the table contents (with the
            ``mergeSchema`` option set). Any other value performs a Delta
            MERGE keyed on ``nodeid``.

    In the MERGE, a matching target row is updated with every column of
    ``sdf`` only when the source ``hentet_tid`` is newer or the target has no
    ``image_path``; rows without a match are inserted.
    """
    if mode == "overwrite":
        overwrite_writer = (
            sdf.write.format("delta")
            .option("mergeSchema", "true")
            .mode("overwrite")
        )
        overwrite_writer.saveAsTable(silver_table)
        return

    from delta.tables import DeltaTable

    target_table = DeltaTable.forName(spark, silver_table)

    # Every source column is copied one-to-one, for updates and inserts alike.
    column_mapping = {name: f"source.{name}" for name in sdf.columns}

    merge_builder = target_table.alias("target").merge(
        sdf.alias("source"),
        condition="target.nodeid = source.nodeid",
    )
    merge_builder = merge_builder.whenMatchedUpdate(
        condition="target.hentet_tid < source.hentet_tid OR target.image_path IS NULL",
        set=column_mapping,
    )
    merge_builder = merge_builder.whenNotMatchedInsert(values=column_mapping)
    merge_builder.execute()


In [0]:
def _latest_row_per_nodeid(df: DataFrame) -> DataFrame:
    """Keep, for each ``nodeid``, only the row with the greatest ``hentet_tid``."""
    newest_first = Window.partitionBy("nodeid").orderBy(col("hentet_tid").desc())
    return (
        df.withColumn("row_number", row_number().over(newest_first))
        .filter(col("row_number") == lit(1))
        .drop("row_number")
    )


def process_silver_for_kommune(kommune_id: str) -> None:
    """Build the silver rows for one kommune and write them to the silver table.

    Reads the bronze rows whose ``kommune_id`` equals the given value, enriches
    them with ``add_silver_columns`` and reduces them to one row per ``nodeid``.
    If the silver table does not exist it is created by an overwrite;
    otherwise the rows are aligned to the table's schema and merged in.

    Args:
        kommune_id: Kommune identifier; converted to ``str`` before use.
    """
    kommune_id = str(kommune_id)  # defensive: callers may pass a non-string id
    bronze_df = spark.read.table(bronze_table)
    bronze_df = bronze_df.filter(col("kommune_id") == lit(kommune_id))

    silver_df = add_silver_columns(bronze_df, buffer=buffer, kommune_id=kommune_id)

    if not spark.catalog.tableExists(silver_table):
        # The first write must be de-duplicated too: the table is keyed on
        # nodeid by the MERGE, so creating it with several rows per nodeid
        # would leave duplicate keys behind.
        write_delta_table(_latest_row_per_nodeid(silver_df), mode="overwrite")
        return

    expected_schema = spark.table(silver_table).schema

    # Align column order and types with the existing table. Columns the table
    # stores with type "void" are filled with an empty string.
    silver_df = silver_df.select([
        lit("").cast("string").alias(c.name) if c.dataType.typeName() == "void"
        else col(c.name).cast(c.dataType)
        for c in expected_schema
    ])

    write_delta_table(_latest_row_per_nodeid(silver_df))

In [0]:
# Distinct kommune ids found in the bronze table, one dict per id.
kommune_id_rows = [
    record.asDict()
    for record in spark.read.table(bronze_table).select("kommune_id").distinct().collect()
]

# Process the kommuner one at a time; rows without a kommune id are skipped.
for row in kommune_id_rows:
    print(f"Row: {row}, type: {type(row)}, kommune_id: {row['kommune_id']}, type: {type(row['kommune_id'])}")

    kommune_id = row["kommune_id"]
    if kommune_id is None:
        continue

    kommune_id = str(kommune_id)
    process_silver_for_kommune(kommune_id)


In [0]:
#spark.sql(f"DROP TABLE IF EXISTS {silver_table}")

In [0]:
# Show the silver table for visual inspection after the run.
display(spark.read.table(silver_table))

In [0]:
# Write a slimmed-down copy of the silver table, without the listed geometry,
# URL, hash and timestamp columns, to its own table.
df_retable = spark.read.table(silver_table)
df_overview = df_retable.drop(
    "x", "y", "wkt", "kommune_id", "hentet_tid", "row_hash",
    "bbox", "image_wms", "dom_wms", "lastet_tid", "fototid",
)

# Replace the overview table on every run, including its schema.
(
    df_overview.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("endepunkt_status_overview")
)