# PySpark metadata generator

Genereer een JSON-configuratiebestand met datapipeline-metadata op basis van SQL Server-metadata.

## Parameters
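Stel de Fabric-parameters in: de gewenste bron (`source`), het pad naar het parquetbestand met SQL-metadata (`metadata_path`), de load_mode-schakelaar (`use_load_mode_config`) en console-logging (`log_to_console`).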

In [1]:
# Parameters
# `source` selects which source system the metadata JSON is generated for. It is used
# further down to filter the sources configuration, to name the connection and the
# output file, and as the delta schema of every table entry.
source = "anva_concern"
# Other source names that have been used with this notebook:
#source = "anva_meeus"
#source = "ccs_level"
#source = "insurance_data_im"
#source = "ods_reports"
#source = "vizier"

# Location of the parquet file holding the SQL metadata for the selected source.
# NOTE(review): this value is derived from `source` inside the parameter cell. If a
# pipeline overrides only `source` through parameter injection, this path presumably
# still points at the default source - confirm, or pass metadata_path explicitly.
#metadata_path = "Files/metadata/connection_anva_concern_prod_metadata.parquet"
metadata_path = f"Files/metadata/connection_{source}_prod_metadata.parquet"

# Alternative fixed paths:
#metadata_path = "Files/metadata/connection_anva_meeus_prod_metadata.parquet"
#metadata_path = "Files/metadata/sqlmetadata.parquet"

# Toggle: use the load_mode configuration, or treat every table as a snapshot load.
use_load_mode_config = False  # True = use df_load_mode, False = everything snapshot

# Passed to configure_logging() as enable_console_logging.
log_to_console = True

In [2]:
# Module fabric.bootstrap
# ---------------------
# This cell enables a flexible module loading strategy:
#
# PRODUCTION (default): The `Files/code` directory is empty. This function does nothing,
# and Python imports all modules from the stable, versioned Wheel in the Environment.
#
# DEVELOPMENT / HOTFIX: To bypass the 15-20 minute Fabric publish cycle for urgent fixes,
# upload individual .py files to `Files/code` in the Lakehouse. This function prepends
# that path to sys.path, so Python finds the override files first. All other modules
# continue to load from the Wheel - only the uploaded files are replaced.
#
# Usage: Keep `Files/code` empty for production stability. Use it only for rapid
# iteration during development or emergency hotfixes.
#
# Per the strategy described above, this cell has to run before any other `modules.*`
# import in the notebook; otherwise the override path is added too late to matter.

from modules.fabric_bootstrap import ensure_module_path
ensure_module_path()  # Now Python can find the rest

''

## Imports en helper functies

In [3]:
from typing import Iterable, Dict
from modules.logging_utils import configure_logging
from modules.path_utils import resolve_files_path, detect_environment
import logging

from pyspark.sql import functions as F
from pyspark.sql import types as T

import re  
import json 

# Configure logging for this run; console output follows the `log_to_console` parameter.
# The value returned by configure_logging() is logged below as the log file location.
# NOTE(review): the module logger is created after configure_logging() has run - keep
# this order unless the helper is confirmed not to reset or disable existing loggers.
log_file = configure_logging(run_name="fabric_metadata_generator", enable_console_logging=log_to_console)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.info("Logfile: %s", log_file)

2025-12-09 09:12:50,381 [INFO] - Logfile: /data/lakehouse/gh_b_avd/lh_gh_bronze/Files/notebook_outputs/logs/fabric_metadata_generator_20251209_091250.log


In [4]:
from modules.spark_session import get_or_create_spark_session

# One Spark session for the whole notebook.
spark = get_or_create_spark_session(app_name="Metadata_Generator", enable_hive=True)

# Pick the mssparkutils implementation that matches the runtime environment.
environment = detect_environment(spark)
logger.info(f"Detected environment: {environment}")

mssparkutils = None
if environment == 'fabric':
    # Fabric ships its own mssparkutils; it is only replaced if the import fails.
    logger.info("Using Fabric native mssparkutils")
    try:
        from notebookutils import mssparkutils
    except ImportError:
        logger.warning("notebookutils not found, falling back to mock")
else:
    logger.info("Using mock mssparkutils for local/cluster environment")

if mssparkutils is None:
    # Shared fallback: the project-provided stand-in used outside Fabric
    # and when the native import is unavailable.
    from modules.notebook_utils import get_mssparkutils
    mssparkutils = get_mssparkutils(spark)

2025-12-09 09:12:50,387 [INFO] - Creating new Spark session for local/cluster environment...
2025-12-09 09:12:50,389 [INFO] -   - Hive support enabled
2025-12-09 09:12:50,390 [INFO] -   - Detection: Local/Cluster environment detected
2025-12-09 09:12:50,391 [INFO] -   - Python VENV gevonden: /home/sparkadmin/source/repos/dwh_spark_processing/.venv/bin/python
2025-12-09 09:12:50,400 [INFO] -   - Modules gezipt voor distributie: /tmp/dwh_modules_package.zip
25/12/09 09:12:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/09 09:12:51 WARN StandaloneSchedulerBackend: Dynamic allocation enabled without spark.executor.cores explicitly set, you may get more executors allocated than expected. It's recommended to set spark.executor.cores explicitly. Please check SPARK-30299 for more d

In [None]:
# Import metadata generation utilities from module.
# build_base_query, load_metadata and validate_metadata are called in later cells of
# this notebook; make_safe_identifier and column_expression are not referenced in the
# cells visible here.
from modules.metadata_utils import (
    make_safe_identifier,
    column_expression,
    build_base_query,
    load_metadata,
    validate_metadata
)

logger.info("Metadata generation functions imported from modules.metadata_utils")

2025-12-09 09:12:53,754 [INFO] - ✓ Metadata generation functions imported from modules.metadata_utils


## Configuratie DataFrames

In [6]:
# --- CONFIGURATION SCHEMA DEFINITIONS ---
# Every per-table configuration file is keyed on (Bron, schema_name, obj_name);
# the helpers below build those shared fields so each schema only lists its extras.


def _required_string(name):
    """Return a non-nullable string field called *name*."""
    return T.StructField(name, T.StringType(), False)


def _table_key_fields():
    """Return fresh Bron / schema_name / obj_name key fields."""
    return [_required_string(key) for key in ("Bron", "schema_name", "obj_name")]


# Source name -> server / database mapping.
sources_schema = T.StructType([
    _required_string("Bron"),
    _required_string("Server"),
    _required_string("Database"),
])

# Disabled tables: key columns only.
_disabled_schema = T.StructType(_table_key_fields())

# Size class per table.
size_schema = T.StructType(_table_key_fields() + [
    _required_string("size_class"),
])

# Load mode per table, with optional filter column and kind.
load_schema = T.StructType(_table_key_fields() + [
    _required_string("load_mode"),
    T.StructField("filter_column", T.StringType(), True),
    T.StructField("kind", T.StringType(), True),
])

# Window settings per table.
window_schema = T.StructType(_table_key_fields() + [
    _required_string("partition_column"),
    _required_string("granularity"),
    T.StructField("lookback_months", T.IntegerType(), False),
])

# Exclusion flag per table.
excluded_schema = T.StructType(_table_key_fields() + [
    T.StructField("excluded", T.IntegerType(), False),
])

# --- JSON LOADER FUNCTIE ---
def load_config_table(filename: str, schema: T.StructType):
    """Read a JSON configuration file into a DataFrame with the given schema.

    Args:
        filename: File name below ``Files/config/definitions/``.
        schema: Schema applied while reading, and used for the fallback DataFrame.

    Returns:
        The DataFrame read from the file. If anything raises while resolving the
        path or reading, a warning is logged and an empty DataFrame with
        ``schema`` is returned instead.
    """
    try:
        # Resolve the path for the current environment (Fabric or local).
        resolved_path = resolve_files_path(f"Files/config/definitions/{filename}", spark)

        logger.info(f"Loading config from: {filename}")

        # multiline lets the config files be pretty-printed JSON.
        reader = spark.read.schema(schema)
        reader = reader.option("multiline", "true")
        reader = reader.option("mode", "PERMISSIVE")
        return reader.json(resolved_path)
    except Exception as e:
        # Deliberate best-effort: a missing or unreadable file yields an empty table.
        logger.warning(f"Unable to load {filename} (or file does not exist): {e}")
        return spark.createDataFrame([], schema)

# --- LOAD CONFIGURATION ---
# load_config_table() logs a warning and returns an empty DataFrame when a file cannot
# be read, so reaching the end of this cell does not prove that every file was loaded.
df_sources = load_config_table("sources.json", sources_schema)
df_disabled_tables = load_config_table("disabled_tables.json", _disabled_schema)
df_size_class = load_config_table("size_class.json", size_schema)
df_load_mode = load_config_table("load_mode.json", load_schema)
df_window_config = load_config_table("window_config.json", window_schema)
df_excluded_tables = load_config_table("excluded_tables.json", excluded_schema)

# The previous message claimed unconditional success, which was misleading whenever a
# file had silently fallen back to an empty table.
logger.info("Configuration loading finished (files that could not be read fall back to empty tables; see warnings above).")

2025-12-09 09:12:55,101 [INFO] - Loading config from: sources.json
2025-12-09 09:12:56,412 [INFO] - Loading config from: disabled_tables.json
2025-12-09 09:12:57,272 [INFO] - Loading config from: size_class.json
2025-12-09 09:12:58,164 [INFO] - Loading config from: load_mode.json
2025-12-09 09:12:59,033 [INFO] - Loading config from: window_config.json
2025-12-09 09:12:59,960 [INFO] - Loading config from: excluded_tables.json
2025-12-09 09:12:59,974 [INFO] - ✓ All configuration files loaded successfully.


## Metadata inlezen uit Parquet

In [7]:
# Resolve metadata path to correct environment-specific location
resolved_metadata_path = resolve_files_path(metadata_path, spark)
logger.info(f"Resolved metadata path: {resolved_metadata_path}")

metadata_df = validate_metadata(load_metadata(spark, resolved_metadata_path))
logger.info(f"Loaded metadata records: {metadata_df.count()}")

# Server/database mapping row(s) for the selected source.
source_mapping = df_sources.filter(F.col("Bron") == F.lit(source))
if source_mapping.count() == 0:
    raise ValueError(f"Unknown source '{source}' in df_sources")

# Keep only metadata rows whose server and database belong to the selected source.
# source_mapping already contains nothing but rows for `source`, so the inner join
# needs no additional filter on Bron afterwards.
metadata_filtered = (
    metadata_df.alias("m")
    .join(
        source_mapping.alias("s"),
        (F.col("m.server_name") == F.col("s.Server")) & (F.col("m.db_name") == F.col("s.Database")),
        "inner",
    )
    .withColumn("Bron", F.col("s.Bron"))
)

# count() is a Spark action: run it once and reuse the number for both the log line
# and the emptiness check instead of triggering a second job.
filtered_record_count = metadata_filtered.count()
logger.info(f"Filtered metadata records for source '{source}': {filtered_record_count}")
if filtered_record_count == 0:
    raise ValueError("No metadata records found for the specified source")

2025-12-09 09:13:00,810 [INFO] - Resolved metadata path: /data/lakehouse/gh_b_avd/lh_gh_bronze/Files/metadata/connection_anva_concern_prod_metadata.parquet
2025-12-09 09:13:03,208 [INFO] - Loaded metadata records: 1557                  
2025-12-09 09:13:03,998 [INFO] - Filtered metadata records for source 'anva_concern': 1557


## Base query generatie

In [8]:
# Schema of the per-table result: the table key plus its generated base query.
base_schema = T.StructType([
    T.StructField(field_name, T.StringType(), False)
    for field_name in ("Bron", "schema_name", "obj_name", "base_query")
])

# Column attributes bundled into one struct so they can be collected per table.
col_struct = F.struct(
    "ordinal_position",
    "column_name",
    "data_type",
    "numeric_precision",
    "numeric_scale",
    "max_len",
)


def _base_query_row(table_row):
    """Return (Bron, schema_name, obj_name, base_query) for one grouped table row."""
    return (
        table_row.Bron,
        table_row.schema_name,
        table_row.obj_name,
        build_base_query(table_row.schema_name, table_row.obj_name, table_row.columns),
    )


# One row per table, carrying the list of its column structs.
# NOTE(review): collect_list gives no ordering guarantee after the groupBy shuffle;
# presumably build_base_query orders the columns by ordinal_position - confirm.
columns_per_table = (
    metadata_filtered
    .select("Bron", "schema_name", "obj_name", col_struct.alias("column"))
    .groupBy("Bron", "schema_name", "obj_name")
    .agg(F.collect_list("column").alias("columns"))
)

# build_base_query is plain Python, so it is applied through the RDD API.
base_query_df = columns_per_table.rdd.map(_base_query_row).toDF(schema=base_schema)


## Configuratie samenvoegen

In [9]:
# Helper functie voor case-insensitive joins
def normalize_keys(df):
    """Add trimmed, lower-cased join-key columns for Bron, schema_name and obj_name.

    The originals are left untouched; three helper columns (``join_key_bronze``,
    ``join_key_schema``, ``join_key_obj``) are added so joins can ignore case and
    surrounding whitespace.
    """
    key_sources = (
        ("join_key_bronze", "Bron"),
        ("join_key_schema", "schema_name"),
        ("join_key_obj", "obj_name"),
    )
    normalized = df
    for key_column, source_column in key_sources:
        normalized = normalized.withColumn(key_column, F.lower(F.trim(F.col(source_column))))
    return normalized

# 1. Prepare the base table (metadata)
# The alias 'bq' lets later steps select these columns explicitly, because the joined
# config tables carry columns with the same names (Bron, schema_name, obj_name).
tables = normalize_keys(base_query_df).alias("bq")

# 2. Prepare the config tables that are always needed
df_dis_norm = normalize_keys(df_disabled_tables).alias("dis")
df_sz_norm = normalize_keys(df_size_class).alias("sz")
df_ex_norm = normalize_keys(df_excluded_tables).alias("ex")

# Normalized key columns (added by normalize_keys) that every join uses
join_keys = ["join_key_bronze", "join_key_schema", "join_key_obj"]

# 3. Run the standard joins
# NOTE(review): these are left joins, so a config file with more than one entry for the
# same key would duplicate that table in the output - confirm the configs are unique.
tables = (
    tables
    .join(df_dis_norm, join_keys, "left")
    .join(df_sz_norm, join_keys, "left")
    .join(df_ex_norm, join_keys, "left")
)

# 4. Conditional joins for the load mode configuration
if use_load_mode_config:
    logger.info("Using load_mode configuration from df_load_mode")
    df_lm_norm = normalize_keys(df_load_mode).alias("lm")
    df_wnd_norm = normalize_keys(df_window_config).alias("wnd")
    
    tables = (
        tables
        .join(df_lm_norm, join_keys, "left")
        .join(df_wnd_norm, join_keys, "left")
    )
else:
    logger.info("Load_mode toggle OFF - all tables will use 'snapshot' mode")

# 5. Apply the defaults (enabled / size class / excluded) and clean up
#    - enabled:    True unless the table has a match in the disabled list
#    - size_class: "S" when no size class is configured
#    - excluded:   0 when no exclusion entry exists
tables = (
    tables
    .drop(*join_keys)  # Remove the helper columns
    .withColumn("enabled", F.when(F.col("dis.obj_name").isNull(), F.lit(True)).otherwise(F.lit(False)))
    .withColumn("size_class", F.when(F.col("sz.size_class").isNull(), F.lit("S")).otherwise(F.col("sz.size_class")))
    .withColumn("excluded", F.when(F.col("ex.excluded").isNull(), F.lit(0)).otherwise(F.col("ex.excluded")))
)

# 6. Final selection and load mode logic
# Both branches select the same output columns in the same order.
if use_load_mode_config:
    # Take load_mode from the config (falling back to snapshot)
    tables = tables.withColumn("load_mode", 
        F.when(F.col("lm.load_mode").isNull(), F.lit("snapshot")).otherwise(F.col("lm.load_mode"))
    )
    
    # Select the specific columns coming from the config joins
    tables = tables.select(
        F.col("bq.obj_name").alias("name"),
        F.col("bq.schema_name"),  # Use bq.* explicitly to avoid ambiguity
        F.col("bq.Bron"),
        "enabled",
        "size_class",
        "load_mode",
        F.col("bq.base_query"),
        F.col("lm.filter_column"),
        F.col("lm.kind"),
        F.col("wnd.partition_column"),
        F.col("wnd.granularity"),
        F.col("wnd.lookback_months"),
        "excluded",
    )
else:
    # Force snapshot and set the config columns to NULL
    tables = (
        tables
        .withColumn("load_mode", F.lit("snapshot"))
        .select(
            F.col("bq.obj_name").alias("name"),
            F.col("bq.schema_name"),
            F.col("bq.Bron"),
            "enabled",
            "size_class",
            "load_mode",
            F.col("bq.base_query"),
            F.lit(None).cast(T.StringType()).alias("filter_column"),
            F.lit(None).cast(T.StringType()).alias("kind"),
            F.lit(None).cast(T.StringType()).alias("partition_column"),
            F.lit(None).cast(T.StringType()).alias("granularity"),
            F.lit(None).cast(T.IntegerType()).alias("lookback_months"),
            "excluded",
        )
    )

# 7. Filter and sort: keep only tables that are not excluded, ordered by table name
tables = (
    tables
    .filter(F.col("excluded") == 0)
    .orderBy("name")
)

2025-12-09 09:13:04,839 [INFO] - Load_mode toggle OFF - all tables will use 'snapshot' mode


## JSON constructie en wegschrijven

In [10]:
def table_record(row: T.Row) -> Dict[str, object]:
    """Build the JSON entry for one table row.

    Args:
        row: Row exposing name, enabled, size_class, load_mode, base_query,
            partition_column, granularity, lookback_months, filter_column and kind.

    Returns:
        A dict with the table settings. A ``window`` section is added when the
        load mode is "window", an ``incremental_column`` section when it is
        "incremental".
    """
    # Empty or missing values fall back to the defaults.
    load_mode = row.load_mode or "snapshot"
    size_class = row.size_class or "S"

    record = {
        "name": row.name,
        "enabled": bool(row.enabled),
        "size_class": size_class,
        "load_mode": load_mode,
        # The selected source doubles as the delta schema name.
        "delta_schema": source,
        "delta_table": row.name,
        "base_query": row.base_query,
    }

    if load_mode == "window":
        months = row.lookback_months
        record["window"] = {
            "partition_column": row.partition_column,
            "granularity": row.granularity,
            "lookback_months": None if months is None else int(months),
        }
    elif load_mode == "incremental":
        record["incremental_column"] = {
            "column": row.filter_column,
            "kind": row.kind,
        }

    return record


# Default settings written into the "defaults" section of every generated file.
defaults = {
    "concurrency_large": 2,
    "concurrency_small": 8,
    "max_rows_per_file_large": 15_000_000,
    "max_rows_per_file_small": 1_000_000,
}

# Top-level document: source-level settings plus one entry per table.
# tables.collect() brings all table rows to the driver.
result = {
    "source": source,
    "run_date_utc": None,
    "watermarks_path": "config/watermarks.json",
    "base_files": "greenhouse_sources",
    "connection_name": f"connection_{source}_prod",
    "defaults": defaults,
    "tables": [table_record(row) for row in tables.collect()],
}

# Fabric-style path (works in both Fabric and cluster via mssparkutils mock)
output_path = f"Files/config/{source}_metadata.json"
# ensure_ascii=False keeps non-ASCII characters readable in the output file.
json_payload = json.dumps(result, ensure_ascii=False, indent=4)

# Use mssparkutils.fs.put (native in Fabric, mock in cluster)
mssparkutils.fs.put(output_path, json_payload, True)

logger.info(f"Metadata JSON geschreven naar {output_path}")
logger.info(f"  Tables: {len(result['tables'])}")
# len() of a str counts characters, not bytes; with ensure_ascii=False the two differ
# as soon as the payload holds non-ASCII text, so measure the UTF-8 encoding instead.
logger.info(f"  JSON size: {len(json_payload.encode('utf-8'))} bytes")

2025-12-09 09:13:06,833 [INFO] - Metadata JSON geschreven naar Files/config/anva_concern_metadata.json
2025-12-09 09:13:06,835 [INFO] -   Tables: 84
2025-12-09 09:13:06,836 [INFO] -   JSON size: 99879 bytes
