In [0]:
import geopandas as gpd
from delta.tables import DeltaTable
from pyspark.sql import DataFrame

# NOTE(review): the wildcard import shadows Python builtins such as sum/min/max/round.
from pyspark.sql.functions import *
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    TimestampType,
    IntegerType,
)

from src.data.data_utils import write_delta_table, write_to_delta_table
from src.data.geometry_utils import to_wkt_2d
from src.data.log_utils import check_for_new_gdbs, log_processed_gdb

In [0]:
# --- Configuration -----------------------------------------------------------
# Landing-zone path that main() hands to check_for_new_gdbs.
gcs_landing_zone = "/Volumes/land_auto-gen-kart_dev/external_dev/landing_zone/"

# The back-ticks are part of the value: the catalog name contains hyphens and
# is interpolated directly into SQL below.
catalog_dev = "`land_auto-gen-kart_dev`"
schema_dev = "dl_bildesegmentering"

# Table names (resolved against the catalog/schema selected below) and the
# geodatabase layer that is ingested.
log_table = "logs_processed_gdbs_utensnuplass"
bronze_table = "utensnuplass_bronze"
layer = "ikke_snuplass"

# NOTE(review): presumably an EPSG code; not referenced by the visible cells.
layer_crs = 32633

# --- Session setup -----------------------------------------------------------
# Order matters: select the catalog, make sure the schema exists, then use it.
spark.sql(f"USE CATALOG {catalog_dev}")
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema_dev}")
spark.sql(f"USE SCHEMA {schema_dev}")

In [0]:
# Create the bronze Delta table if it does not exist yet (no-op otherwise).
# Geometry is stored as a STRING column; the remaining columns are the ingest
# metadata (ingest time, source file/layer, and a per-row hash).
q = f"""
CREATE TABLE IF NOT EXISTS {bronze_table} (
   geometry STRING,
   ingest_time TIMESTAMP,
   source_file STRING,
   source_layer STRING,
   row_hash STRING
) USING DELTA
"""
spark.sql(q)

In [0]:
# Create the processing-log Delta table if it does not exist yet (no-op
# otherwise). One row holds a geodatabase name, a processing timestamp and
# insert/update/delete counters.
q = f"""
CREATE TABLE IF NOT EXISTS {log_table} (
  gdb_name STRING,
  processed_time TIMESTAMP,
  num_inserted INT,
  num_updated INT,
  num_deleted INT
) USING DELTA
"""
spark.sql(q)

In [0]:
def write_to_sdf(
    gdb_path: str, gdb_name: str, layer: str, target_crs: str = "EPSG:25833"
) -> DataFrame:
    """
    Read one layer of a geodatabase into a Spark DataFrame shaped for the bronze table.

    Geometries are converted to text with ``to_wkt_2d`` and every row receives
    the metadata columns ``ingest_time``, ``source_file``, ``source_layer`` and
    ``row_hash``. Only columns that also exist in ``bronze_table`` are kept, and
    rows sharing a ``row_hash`` are collapsed to a single row.

    Args:
        gdb_path: Path of the geodatabase, passed to ``geopandas.read_file``.
        gdb_name: Value stored in the ``source_file`` column.
        layer: Layer to read; also stored in the ``source_layer`` column.
        target_crs: CRS the geometries are expressed in before conversion to
            text. A layer without a declared CRS is assumed to already be in
            this CRS; a layer declaring a different CRS is reprojected to it.

    Returns:
        A Spark DataFrame restricted to the bronze-table columns and
        de-duplicated on ``row_hash``.
    """
    gdf = gpd.read_file(gdb_path, layer=layer)

    # set_crs() raises ValueError when the layer already declares a different
    # CRS, so chaining set_crs(X).to_crs(X) could never reproject anything.
    # Only label CRS-less data; reproject everything else (a no-op when the
    # layer is already in target_crs).
    if gdf.crs is None:
        gdf = gdf.set_crs(target_crs)
    else:
        gdf = gdf.to_crs(target_crs)

    # Replace the geometry objects with their text form before handing the
    # frame to Spark.
    gdf["wkt_geometry"] = gdf["geometry"].apply(to_wkt_2d)
    gdf = gdf.drop(columns=["geometry"])

    sdf = spark.createDataFrame(gdf).withColumnRenamed("wkt_geometry", "geometry")

    # Capture the column list BEFORE the metadata columns are added, so the
    # hash covers only the source attributes and geometry (not ingest_time).
    # NOTE(review): per the Spark docs concat_ws skips NULLs, so rows that
    # differ only in which column is NULL may hash alike - confirm this is
    # acceptable. The expression is left unchanged on purpose: altering it
    # would change every row_hash already stored.
    hashed_cols = sdf.columns
    sdf = (
        sdf.withColumn("ingest_time", current_timestamp())
        .withColumn("source_file", lit(gdb_name))
        .withColumn("source_layer", lit(layer))
        .withColumn("row_hash", sha2(concat_ws("||", *hashed_cols), 256))
    )

    # Keep only the columns the bronze table defines, then drop repeated rows.
    target_cols = DeltaTable.forName(spark, bronze_table).toDF().columns
    sdf = sdf.select([c for c in sdf.columns if c in target_cols])
    sdf = sdf.dropDuplicates(["row_hash"])

    return sdf

In [0]:
def main():
    """
    Ingest every geodatabase returned by ``check_for_new_gdbs`` into the bronze table.

    Each geodatabase is read with ``write_to_sdf`` and then handed to
    ``write_to_delta_table`` together with the bronze and log table names,
    using ``row_hash`` as the key argument.
    """
    for gdb_uri in check_for_new_gdbs(gcs_landing_zone, log_table, "Snuplasser"):
        # Last path component, ignoring any trailing slash.
        name = gdb_uri.rstrip("/").split("/")[-1]
        # Strip the "dbfs:" scheme before the path is passed to write_to_sdf.
        local_path = gdb_uri.removeprefix("dbfs:")

        write_to_delta_table(
            write_to_sdf(local_path, name, layer),
            name,
            bronze_table,
            log_table,
            "row_hash",
        )

In [0]:
# Run the ingestion when this cell is executed.
main()