In [0]:
%pip install python-dotenv

In [0]:
from pyspark.sql import SparkSession
from dotenv import load_dotenv
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DataType, DateType, DoubleType
from pyspark.sql.window import Window
from delta.tables import DeltaTable
import requests
import time
from pathlib import Path
import os
from bs4 import BeautifulSoup
from datetime import datetime

In [0]:
# Catalog and schema that unqualified table names in this notebook resolve against.
# The catalog name is backtick-quoted; it contains a hyphen.
catalog_dev = "`land_topografisk-gdb_dev`"
schema_dev = "ai2025"
spark.sql(f"USE CATALOG {catalog_dev}")
spark.sql(f"USE SCHEMA {schema_dev}")

# Source (bronze) table read by the final processing cell and destination
# (silver) table written by write_delta_table().
bronze_table = "endepunkt_bronze"
silver_table = "endepunkt_silver"
# Distance subtracted from / added to x and y when the bounding box is built in
# the final processing cell. Presumably metres, since the WMS URLs use
# EPSG:25833 — TODO confirm.
buffer= 64

In [0]:
# Load variables from a .env file (if one is found) into the process
# environment, then read the Geonorge credentials used by get_token().
# os.getenv returns None for a variable that is not set.
load_dotenv()
BRUKERID = os.getenv("GEONORGE_BRUKERID")
PASSORD  = os.getenv("GEONORGE_PASSORD")

def get_token():
    """Request a ticket/token from the Geonorge BAAT token service.

    Sends ``BRUKERID`` and ``PASSORD`` together with ``tjenesteid=wms.nib``
    and ``retformat=s`` as query parameters.

    Returns:
        The response body with surrounding whitespace and backticks removed.

    Raises:
        requests.exceptions.HTTPError: if the service answers with a 4xx/5xx
            status (previously the error body was returned as if it were a
            token).
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    # Pass the credentials via `params` so requests URL-encodes them; a
    # password containing '&', '#', '+' or '=' would otherwise corrupt the
    # query string.
    response = requests.get(
        "https://baat.geonorge.no/skbaatts/req",
        params={
            "brukerid": BRUKERID,
            "passord": PASSORD,
            "tjenesteid": "wms.nib",
            "retformat": "s",
        },
        timeout=30,  # never hang the notebook on an unresponsive service
    )
    response.raise_for_status()
    # Strip backticks (as before) and also whitespace, so a trailing newline
    # in the response cannot end up inside later request URLs.
    raw_token = response.text.strip("` \t\r\n")
    return raw_token

# Fetch an initial token when this cell runs and remember when it was obtained.
token = get_token()
token_start_time = time.time()
# Age in seconds (55 minutes) after which refresh_token_if_needed() fetches a new token.
token_lifetime = 55 * 60  

def refresh_token_if_needed():
    """Fetch a new token once the current one is older than ``token_lifetime``.

    Rebinds the module-level ``token`` and ``token_start_time``. Does nothing
    while the current token is still within its lifetime.
    """
    global token, token_start_time

    token_age = time.time() - token_start_time
    if token_age <= token_lifetime:
        # Still fresh: keep the current token.
        return

    print("🔄 Fornyer token...")
    token = get_token()
    token_start_time = time.time()

In [0]:
def generate_dom_url(bbox):
    """Build the WMS GetMap URL for the DOM hillshade layer over ``bbox``.

    Args:
        bbox: Iterable of coordinates; the values are converted with ``str``
            and joined with commas in the order given.

    Returns:
        The request URL for a 512x512 PNG, or ``"INVALID"`` when ``bbox`` is
        not iterable (e.g. ``None``).
    """
    try:
        bbox_str = ",".join(map(str, bbox))
    except TypeError:
        # Same sentinel as generate_image_url: a missing bbox must not raise
        # inside the Spark UDF and fail the whole job.
        return "INVALID"

    width, height = 512, 512
    resolution = 0.2
    return (
        f"https://wms.geonorge.no/skwms1/wms.hoyde-dom-nhm-25833?request=GetMap&Format=image/png&"
        f"GetFeatureInfo=text/plain&CRS=EPSG:25833&Layers=NHM_DOM_25833:skyggerelieff&"
        f"BBOX={bbox_str}&width={width}&height={height}&RESOLUTION={resolution}"
    )

# Spark UDF wrapper (string result) so generate_dom_url can be applied to a DataFrame column.
generate_dom_url_udf = udf(generate_dom_url, StringType())

def dom_file_exists(
    nodeid: str,
    base_dir: str = "/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/storreendepunkt_dom",
) -> str:
    """Report whether the DOM image for ``nodeid`` is already on disk.

    Args:
        nodeid: Node identifier; the file checked is ``dom_<nodeid>.png``.
        base_dir: Directory to look in. Defaults to the volume directory that
            was previously hard-coded, so existing one-argument callers
            (including the Spark UDF wrapper) behave as before.

    Returns:
        ``"DOWNLOADED"`` if the file exists, otherwise ``"PENDING"``.
    """
    path = os.path.join(base_dir, f"dom_{nodeid}.png")
    return "DOWNLOADED" if os.path.exists(path) else "PENDING"

# Spark UDF wrapper (string result) around dom_file_exists.
# NOTE(review): this name is not referenced anywhere else in the visible notebook;
# the final processing cell wraps the same function again as `dom_file_exists_udf` —
# confirm which name is intended.
dom_file_status_udf = udf(dom_file_exists, StringType())

In [0]:
def generate_image_url(bbox):
    """Build the WMS GetMap URL for the ``ortofoto`` layer over ``bbox``.

    The URL ends with an empty ``TICKET=`` parameter; no token is appended
    here.

    Args:
        bbox: Iterable of coordinates, joined with commas in the order given.

    Returns:
        The request URL for a 512x512 PNG, or ``"INVALID"`` if the bbox cannot
        be turned into a comma-separated string.
    """
    width = height = 512
    resolution = 0.2

    try:
        bbox_str = ",".join(str(coord) for coord in bbox)
    except Exception:
        # Best effort: flag the row instead of raising.
        return "INVALID"

    return (
        f"https://wms.geonorge.no/skwms1/wms.nib?VERSION=1.3.0"
        f"&service=WMS&request=GetMap&Format=image/png&"
        f"GetFeatureInfo=text/plain&CRS=EPSG:25833&Layers=ortofoto&"
        f"BBox={bbox_str}&width={width}&height={height}&RESOLUTION={resolution}&TICKET="
    )
    
# Spark UDF wrapper (string result) so generate_image_url can be applied to a DataFrame column.
generate_image_url_udf = udf(generate_image_url, StringType())

def image_file_exists(
    nodeid: str,
    base_dir: str = "/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER/storreendepunkt_images",
) -> str:
    """Report whether the orthophoto image for ``nodeid`` is already on disk.

    Args:
        nodeid: Node identifier; the file checked is ``image_<nodeid>.png``.
        base_dir: Directory to look in. Defaults to the volume directory that
            was previously hard-coded, so existing one-argument callers
            (including the Spark UDF wrapper) behave as before.

    Returns:
        ``"DOWNLOADED"`` if the file exists, otherwise ``"PENDING"``.
    """
    path = os.path.join(base_dir, f"image_{nodeid}.png")
    return "DOWNLOADED" if os.path.exists(path) else "PENDING"

# Spark UDF wrapper (string result) around image_file_exists.
# NOTE(review): this name is not referenced anywhere else in the visible notebook;
# the final processing cell wraps the same function again as `image_file_exists_udf` —
# confirm which name is intended.
image_file_status_udf = udf(image_file_exists, StringType())

In [0]:
def get_fotodato(bbox: str, token: str):
    """Look up the orthophoto capture date ("Fotodato") for a bounding box.

    Sends a WMS GetFeatureInfo request for pixel (256, 256) of a 512x512 map
    over ``bbox`` and scans the first HTML table of the response for a row
    whose first cell reads "Fotodato".

    Args:
        bbox: Comma-separated bounding box, inserted verbatim into the URL.
        token: Fallback ticket. The module-level ``token`` maintained by
            ``refresh_token_if_needed()`` takes precedence when it is set,
            because a caller looping over many rows would otherwise keep
            passing an expired ticket.

    Returns:
        A ``datetime.date`` parsed from ``dd.mm.YYYY``, or ``None`` when the
        response contains no table or no "Fotodato" row.

    Raises:
        ValueError: if the "Fotodato" cell does not match ``%d.%m.%Y``.
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    # Refresh BEFORE building the URL; previously the URL was built first, so
    # the request still carried the old ticket.
    refresh_token_if_needed()
    # The `token` parameter shadows the module-level token that
    # refresh_token_if_needed() rebinds, so read the global explicitly.
    ticket = globals().get("token") or token

    url = f"https://wms.geonorge.no/skwms1/wms.nib?SERVICE=WMS&VERSION=1.3.0&REQUEST=GetFeatureInfo&CRS=EPSG:25833&BBOX={bbox}&WIDTH=512&HEIGHT=512&LAYERS=ortofoto&QUERY_LAYERS=ortofoto&INFO_FORMAT=text/html&I=256&J=256&TICKET={ticket}"
    # Timeout so one unresponsive request cannot stall a long per-row loop.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find("table")
    if not table:
        return None

    for row in table.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) >= 2 and cells[0].text.strip() == "Fotodato":
            return datetime.strptime(cells[1].text.strip(), "%d.%m.%Y").date()

    return None

In [0]:
def add_ortofoto_date(df, token: str):
    """Return ``df`` with a ``fototid`` column holding the orthophoto date.

    Collects ``nodeid`` and ``bbox`` to the driver, calls ``get_fotodato``
    once per collected row with the comma-joined bbox, and left-joins the
    results back onto ``df`` by ``nodeid``.

    Args:
        df: DataFrame with at least ``nodeid`` and ``bbox`` columns.
        token: Ticket forwarded to ``get_fotodato`` for every row.

    Returns:
        A new DataFrame: ``df`` plus a nullable ``fototid`` date column.
    """
    # Shape of the driver-side lookup table built below.
    lookup_schema = StructType([
        StructField("nodeid", StringType(), True),
        StructField("fototid", DateType(), True),
    ])

    # One WMS request per row, executed sequentially on the driver.
    lookups = []
    for node_row in df.select("nodeid", "bbox").collect():
        bbox_param = ','.join(map(str, node_row["bbox"]))
        lookups.append((node_row["nodeid"], get_fotodato(bbox_param, token)))

    lookup_df = spark.createDataFrame(lookups, lookup_schema)

    # Left join keeps every input row, even when no date was found.
    return df.join(lookup_df, on="nodeid", how="left")

In [0]:
def write_delta_table(sdf: DataFrame, mode: str = "merge") -> None:
    """Persist ``sdf`` to the Delta table named by ``silver_table``.

    If the table does not exist yet it is created from ``sdf`` regardless of
    ``mode``.

    Args:
        sdf: DataFrame to write. For ``"merge"`` it must contain ``nodeid``.
        mode: How to write when the table already exists.
            ``"merge"`` (default) upserts on ``nodeid``: matched rows are
            updated with all source columns, unmatched rows are inserted.
            ``"overwrite"`` replaces table contents and schema.
            ``"append"`` adds the rows without matching.
            Previously this argument was accepted but ignored.

    Raises:
        ValueError: if ``mode`` is not one of the supported values.
    """
    supported_modes = ("merge", "overwrite", "append")
    if mode not in supported_modes:
        raise ValueError(
            f"Unsupported mode {mode!r}; expected one of {supported_modes}"
        )

    if mode == "overwrite" or not spark.catalog.tableExists(silver_table):
        (
            sdf.write.format("delta")
            .option("overwriteSchema", "true")
            .mode("overwrite")
            .saveAsTable(silver_table)
        )
        return

    if mode == "append":
        sdf.write.format("delta").mode("append").saveAsTable(silver_table)
        return

    # Default: upsert keyed on nodeid.
    (
        DeltaTable.forName(spark, silver_table)
        .alias("target")
        .merge(
            source=sdf.alias("source"),
            condition="target.nodeid = source.nodeid",
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

In [0]:
# Build the silver table: read bronze, derive bbox / file paths / WMS URLs /
# download status / load timestamp / row hash, then write the result.
df = spark.read.table(bronze_table)

# Spark UDF wrappers (string results) for the helper functions defined above.
generate_dom_url_udf = udf(generate_dom_url, StringType())
generate_image_url_udf = udf(generate_image_url, StringType())
dom_file_exists_udf = udf(dom_file_exists, StringType())
image_file_exists_udf = udf(image_file_exists, StringType())

# NOTE(review): the *_path columns below point at storreimage/storredom, while
# image_file_exists/dom_file_exists look in storreendepunkt_images /
# storreendepunkt_dom — confirm which directories are the intended ones.
BASE_PATH = "/Volumes/land_topografisk-gdb_dev/external_dev/static_data/DL_SNUPLASSER"
SUBDIR = dict(image="storreimage", dom="storredom", mask="storrelabel")

# Fall back to 64 when buffer is unset or 0.
if not buffer:
    buffer = 64

# Box around each point: [x - buffer, y - buffer, x + buffer, y + buffer].
df = df.withColumn("bbox", expr(f"array(x - {buffer}, y - {buffer}, x + {buffer}, y + {buffer})"))
df = df.withColumn("bbox_str", concat_ws(",", col("bbox")))

# One "<kind>_path" column per data kind, in the order image, dom, mask.
for dt, sub in SUBDIR.items():
    path_expr = concat(lit(f"{BASE_PATH}/{sub}/{dt}_"), col("nodeid"), lit(".png"))
    df = df.withColumn(f"{dt}_path", path_expr)

# The row hash covers exactly the columns present at this point (bronze
# columns, bbox, bbox_str and the three path columns); the columns added
# below, including the load timestamp, are not part of it.
hash_columns = df.columns

derived_columns = [
    ("image_wms", generate_image_url_udf(col("bbox"))),
    ("dom_wms", generate_dom_url_udf(col("bbox"))),
    ("image_status", image_file_exists_udf(col("nodeid"))),
    ("dom_status", dom_file_exists_udf(col("nodeid"))),
    ("lastet_tid", current_timestamp()),
    ("row_hash", sha2(concat_ws("||", *hash_columns), 256)),
]
for column_name, column_expr in derived_columns:
    df = df.withColumn(column_name, column_expr)

# Orthophoto date lookup is currently disabled:
#df = add_ortofoto_date(df, token)
write_delta_table(df)

In [0]:
# Show the DataFrame that was just written, for visual inspection in the notebook.
df.display()