# PySpark metadata generator

Genereer een JSON-configuratiebestand voor datapipeline-metadata op basis van SQL Server-metadata.

## Parameters
Stel de Fabric parameters in voor de gewenste bron en het pad naar het parquetbestand met SQL metadata.

In [None]:
# Parameters
# `source` selects the source system to generate metadata for. It must match a
# "Bron" value in df_sources; otherwise the metadata-loading cell raises ValueError.
source = "anva_concern"
# Alternative sources (uncomment exactly one to switch):
#source = "anva_meeus"
#source = "ccs_level"
#source = "insurance_data_im"
#source = "ods_reports"
#source = "vizier"

# Location of the parquet file with SQL metadata; derived from `source` by default.
#metadata_path = "Files/metadata/connection_anva_concern_prod_metadata.parquet"
metadata_path = f"Files/metadata/connection_{source}_prod_metadata.parquet"

#metadata_path = "Files/metadata/connection_anva_meeus_prod_metadata.parquet"
#metadata_path = "Files/metadata/sqlmetadata.parquet"

# Toggle: use the load_mode configuration, or treat every table as a snapshot load
use_load_mode_config = False  # True = use df_load_mode, False = everything snapshot

# Passed to configure_logging(enable_console_logging=...) when logging is set up.
log_to_console = True

In [None]:
# Module fabric.bootstrap
# ---------------------
# This cell enables a flexible module loading strategy:
#
# PRODUCTION (default): The `Files/code` directory is empty. This function does nothing,
# and Python imports all modules from the stable, versioned Wheel in the Environment.
#
# DEVELOPMENT / HOTFIX: To bypass the 15-20 minute Fabric publish cycle for urgent fixes,
# upload individual .py files to `Files/code` in the Lakehouse. This function prepends
# that path to sys.path, so Python finds the override files first. All other modules
# continue to load from the Wheel - only the uploaded files are replaced.
#
# Usage: Keep `Files/code` empty for production stability. Use it only for rapid
# iteration during development or emergency hotfixes.

# Run the bootstrap before any other `modules.*` import in this notebook, so that
# override files placed in `Files/code` (if any) are picked up ahead of the Wheel.
from modules.fabric_bootstrap import ensure_module_path
ensure_module_path()  # Now Python can find the rest

## Imports en helperfuncties

In [None]:
from typing import Iterable, Dict
from modules.logging_utils import configure_logging
from modules.path_utils import resolve_files_path, detect_environment
import logging

from pyspark.sql import functions as F
from pyspark.sql import types as T

import re  
import json 

# Configure logging for this run; console output follows the `log_to_console` parameter.
# The return value is logged below as the log file (presumably its path - confirm in
# modules.logging_utils).
log_file = configure_logging(run_name="fabric_metadata_generator", enable_console_logging=log_to_console)
logger = logging.getLogger(__name__)
# Make sure the INFO-level progress messages emitted by this notebook are not filtered out.
logger.setLevel(logging.INFO)
logger.info("Logfile: %s", log_file)

In [None]:
from modules.spark_session import get_or_create_spark_session

# Spark session shared by every following cell.
spark = get_or_create_spark_session(app_name="Metadata_Generator", enable_hive=True)

# Choose the mssparkutils implementation that fits the detected environment.
environment = detect_environment(spark)
logger.info(f"Detected environment: {environment}")

# Stays None until a usable implementation has been imported.
mssparkutils = None

if environment == 'fabric':
    # Prefer Fabric's native mssparkutils when running inside Fabric.
    logger.info("Using Fabric native mssparkutils")
    try:
        from notebookutils import mssparkutils
    except ImportError:
        # A failed import leaves `mssparkutils` as None, so the mock is used below.
        logger.warning("notebookutils not found, falling back to mock")
else:
    # Local/cluster environments always use the mock.
    logger.info("Using mock mssparkutils for local/cluster environment")

if mssparkutils is None:
    from modules.notebook_utils import get_mssparkutils
    mssparkutils = get_mssparkutils(spark)

In [None]:
# Import metadata generation utilities from module
from modules.metadata_utils import (
    make_safe_identifier,
    column_expression,
    build_base_query,
    load_metadata,
    validate_metadata
)

# Confirm that the helper functions were imported. The check mark was stored as
# mis-decoded UTF-8 ("âœ“"); it is restored to the intended U+2713 character.
logger.info("✓ Metadata generation functions imported from modules.metadata_utils")

## Configuratie DataFrames

In [None]:
# Maps each source key ("Bron") to the SQL Server instance and database it lives in.
# The metadata-loading cell matches Server/Database against the metadata's
# server_name/db_name columns to select the rows that belong to `source`.
sources_schema = T.StructType([
    T.StructField("Bron", T.StringType(), False),
    T.StructField("Server", T.StringType(), False),
    T.StructField("Database", T.StringType(), False),
])

# NOTE: "anva_concern" and "ccs_level"/"insurance_data_im" share a server, and
# "anva_concern" and "anva_meeus" share a database name; only the (Server, Database)
# pair is unique per source.
df_sources = spark.createDataFrame([
    ("ccs_level", "vmdwhidpweu01", "InsuranceData_CCS_DWH"),
    (r"anva_meeus", r"vmdwhidpweu01\MEEUS", "InsuranceData_ANVA_DWH"),
    ("vizier", "viz-sql01-mi-p.1d57ac4f4d63.database.windows.net", "CRM_DWH"),
    ("ods_reports", "vmdwhodsanvpweu", "OG_ODS_Reports"),
    ("anva_concern", "vmdwhidpweu01", "InsuranceData_ANVA_DWH"),
    ("insurance_data_im", "vmdwhidpweu01", "InsuranceData_OpGroen_DWH"),
], schema=sources_schema)

# Tables listed here are still written to the JSON, but with "enabled": false.
_disabled_schema = T.StructType([
    T.StructField(field_name, T.StringType(), False)
    for field_name in ("Bron", "schema_name", "obj_name")
])

# The same (schema_name, obj_name) pairs are disabled for both ANVA sources.
_disabled_objects = [
    ("dbo", "Jobmonitor"),
    ("dbo", "LaatsteVerversing"),
    ("dbo", "Metadata"),
    ("dbo", "VrijeLabels"),
    ("pbi", "Nulmeting_Clausules"),
    ("pbi", "Nulmeting_CodesDekking"),
    ("pbi", "Nulmeting_CodesNAW"),
    ("pbi", "Nulmeting_CodesPolis"),
    ("pbi", "Nulmeting_LabelDekking"),
    ("pbi", "Nulmeting_LabelNAW"),
    ("pbi", "Nulmeting_LabelPolis"),
    ("pbi", "Nulmeting_NAWDetails"),
    ("pbi", "Nulmeting_NAWLabels"),
    ("pbi", "Nulmeting_PolisDetails"),
    ("pbi", "Nulmeting_PolisLabels"),
    ("pbi", "Nulmeting_Voorwaarden"),
]

# Outer loop over the source keeps the original row order:
# all anva_concern rows first, then all anva_meeus rows.
df_disabled_tables = spark.createDataFrame(
    [
        (bron, schema_name, obj_name)
        for bron in ("anva_concern", "anva_meeus")
        for schema_name, obj_name in _disabled_objects
    ],
    schema=_disabled_schema,
)

# Per-table size class. Tables without an entry here get size class "S" when the
# configuration is merged.
size_schema = T.StructType([
    T.StructField("Bron", T.StringType(), False),
    T.StructField("schema_name", T.StringType(), False),
    T.StructField("obj_name", T.StringType(), False),
    T.StructField("size_class", T.StringType(), False),
])

# NOTE(review): "geintegreerd_model" has no row in df_sources, so with the current
# source list these rows can never match a generated table - confirm whether a
# df_sources entry is missing or these rows are leftovers. The same applies to the
# "geintegreerd_model" rows in df_load_mode and df_window_config below.
df_size_class = spark.createDataFrame([
    ("anva_concern", "pbi", "Fact_PremieFacturen", "L"),
    ("ccs_level", "pbi", "Fact_PremieBoekingen", "L"),
    ("geintegreerd_model", "pbi", "Fact_PremieFacturen", "L"),
    ("anva_meeus", "pbi", "Fact_PremieFacturen", "L"),
], schema=size_schema)

# Per-table load mode. Only joined when use_load_mode_config is True; tables without
# an entry fall back to "snapshot". For "incremental" tables, filter_column and kind
# end up in the JSON "incremental_column" entry.
load_schema = T.StructType([
    T.StructField("Bron", T.StringType(), False),
    T.StructField("schema_name", T.StringType(), False),
    T.StructField("obj_name", T.StringType(), False),
    T.StructField("load_mode", T.StringType(), False),
    T.StructField("filter_column", T.StringType(), True),
    T.StructField("kind", T.StringType(), True),
])

# filter_column values are written to the JSON exactly as spelled here
# (note the mixed casing, e.g. "upd_dt" vs "UPD_DT").
df_load_mode = spark.createDataFrame([
    ("anva_concern", "pbi", "Fact_PremieFacturen", "window", "Boek_Datum", "datetime"),
    ("ccs_level", "pbi", "Fact_PremieBoekingen", "window", "Boek_Datum", "datetime"),
    ("geintegreerd_model", "pbi", "Fact_PremieFacturen", "window", "Boek_Datum", "datetime"),
    ("anva_meeus", "pbi", "Fact_PremieFacturen", "window", "Boek_Datum", "datetime"),
    ("vizier", "dbo", "Relaties", "incremental", "Updatedatum", "stamp17"),
    ("vizier", "dbo", "Contactpersonen", "incremental", "Upd_dt", "stamp17"),
    ("vizier", "dbo", "Sleutels", "incremental", "upd", "stamp17"),
    ("vizier", "dbo", "Polissen", "incremental", "upd_dt", "stamp17"),
    ("vizier", "dbo", "Schades", "incremental", "upd_dt", "stamp17"),
    ("vizier", "dbo", "DnB", "incremental", "upd_dt", "stamp17"),
    ("vizier", "dbo", "Contactmomenten", "incremental", "Upd", "stamp17"),
    ("vizier", "dbo", "Taken", "incremental", "upd", "stamp17"),
    ("vizier", "dbo", "Sales", "incremental", "UPD_DT", "stamp17"),
    ("vizier", "dbo", "Retenties", "incremental", "UPD_DT", "stamp17"),
    ("vizier", "dbo", "Adresbeeld", "incremental", "UPD_DT", "stamp17"),
    ("vizier", "dbo", "UBO_Onderzoeken", "incremental", "UPD_DT", "stamp17"),
    ("vizier", "dbo", "Producten", "incremental", "upd", "stamp17"),
    ("vizier", "dbo", "Medewerkers", "incremental", "id_upd", "stamp17"),
    ("vizier", "dbo", "Klachten", "incremental", "upd_dt", "stamp17"),
    ("vizier", "dbo", "Verkoopkansen", "incremental", "upd", "stamp17"),
    ("vizier", "dbo", "Interesses", "incremental", "UPD_DT", "stamp17"),
], schema=load_schema)

# Window settings for tables whose load_mode is "window". Only joined when
# use_load_mode_config is True; the values end up in the JSON "window" entry.
window_schema = T.StructType([
    T.StructField("Bron", T.StringType(), False),
    T.StructField("schema_name", T.StringType(), False),
    T.StructField("obj_name", T.StringType(), False),
    T.StructField("partition_column", T.StringType(), False),
    T.StructField("granularity", T.StringType(), False),
    T.StructField("lookback_months", T.IntegerType(), False),
])

df_window_config = spark.createDataFrame([
    ("anva_concern", "pbi", "Fact_PremieFacturen", "Boek_Datum", "month", 12),
    ("geintegreerd_model", "pbi", "Fact_PremieFacturen", "Boek_Datum", "month", 12),
    ("anva_meeus", "pbi", "Fact_PremieFacturen", "Boek_Datum", "month", 12),
    ("ccs_level", "pbi", "Fact_PremieBoekingen", "Boek_Datum", "month", 12),
], schema=window_schema)

# Tables that are left out of the generated JSON entirely: after merging, only rows
# with excluded == 0 are kept, and tables without an entry here default to 0.
excluded_schema = T.StructType([
    T.StructField("Bron", T.StringType(), False),
    T.StructField("schema_name", T.StringType(), False),
    T.StructField("obj_name", T.StringType(), False),
    T.StructField("excluded", T.IntegerType(), False),
])

df_excluded_tables = spark.createDataFrame([
    ("vizier", "dbo", "BO_sleutels_Wim_Verheijen", 1),
    ("vizier", "dbo", "UMG_Historie", 1),
], schema=excluded_schema)

## Metadata inlezen uit Parquet

In [None]:
# Resolve metadata path to correct environment-specific location
resolved_metadata_path = resolve_files_path(metadata_path, spark)
logger.info(f"Resolved metadata path: {resolved_metadata_path}")

# Load the parquet metadata and run it through the module's validation step.
metadata_df = validate_metadata(load_metadata(spark, resolved_metadata_path))
logger.info(f"Loaded metadata records: {metadata_df.count()}")

# Look up the server/database pair for the requested source; fail fast on a typo.
source_mapping = df_sources.filter(F.col("Bron") == F.lit(source))
if source_mapping.count() == 0:
    raise ValueError(f"Unknown source '{source}' in df_sources")

# Keep only the metadata rows that belong to the server and database of `source`,
# and tag them with the source key in a "Bron" column.
metadata_filtered = (
    metadata_df.alias("m")
    .join(
        source_mapping.alias("s"),
        (F.col("m.server_name") == F.col("s.Server")) & (F.col("m.db_name") == F.col("s.Database")),
        "inner",
    )
    .withColumn("Bron", F.col("s.Bron"))
    .filter(F.col("s.Bron") == F.lit(source))
)

# Count once and reuse the result: each count() is a Spark action that re-executes
# the join, so the former second `limit(1).count()` check was a redundant job.
filtered_record_count = metadata_filtered.count()
logger.info(f"Filtered metadata records for source '{source}': {filtered_record_count}")
if filtered_record_count == 0:
    raise ValueError("No metadata records found for the specified source")

## Base query generatie

In [None]:
# Result schema: one row per (Bron, schema_name, obj_name) with its generated query.
base_schema = T.StructType([
    T.StructField(field_name, T.StringType(), False)
    for field_name in ("Bron", "schema_name", "obj_name", "base_query")
])

# Per-column attributes handed to build_base_query, bundled into one struct.
col_struct = F.struct(
    "ordinal_position",
    "column_name",
    "data_type",
    "numeric_precision",
    "numeric_scale",
    "max_len",
)


def _base_query_row(row):
    """Turn one grouped row into a (Bron, schema_name, obj_name, base_query) tuple."""
    query = build_base_query(row.schema_name, row.obj_name, row.columns)
    return (row.Bron, row.schema_name, row.obj_name, query)


# Gather all column structs of each object into a single list.
# NOTE(review): collect_list gives no ordering guarantee; this relies on
# build_base_query ordering the columns itself (e.g. by ordinal_position) - confirm.
_columns_per_object = (
    metadata_filtered
    .select("Bron", "schema_name", "obj_name", col_struct.alias("column"))
    .groupBy("Bron", "schema_name", "obj_name")
    .agg(F.collect_list("column").alias("columns"))
)

# build_base_query is plain Python, so it is applied through the RDD API.
base_query_df = _columns_per_object.rdd.map(_base_query_row).toDF(schema=base_schema)


## Configuratie samenvoegen

In [None]:
# Merge the generated base queries with the per-table configuration DataFrames.
# Every configuration table is left-joined on the same three key columns, so tables
# without a configuration entry keep their row and receive defaults further down.
_join_keys = ["Bron", "schema_name", "obj_name"]

tables = (
    base_query_df.alias("bq")
    .join(df_disabled_tables.alias("dis"), _join_keys, "left")
    .join(df_size_class.alias("sz"), _join_keys, "left")
)

# The load-mode and window configuration only take part when the toggle is on.
if use_load_mode_config:
    logger.info("Using load_mode configuration from df_load_mode")
    tables = tables.join(df_load_mode.alias("lm"), _join_keys, "left")
    tables = tables.join(df_window_config.alias("wnd"), _join_keys, "left")
else:
    logger.info("Load_mode toggle OFF - all tables will use 'snapshot' mode")

# The exclusion list is always applied.
tables = tables.join(df_excluded_tables.alias("ex"), _join_keys, "left")

# Derive flags and defaults from the (possibly missing) configuration matches:
#   enabled    - True unless the table is listed in df_disabled_tables
#   size_class - "S" unless df_size_class says otherwise
#   excluded   - 0 unless df_excluded_tables says otherwise
tables = (
    tables
    .withColumn("enabled", F.col("dis.obj_name").isNull())
    .withColumn("size_class", F.coalesce(F.col("sz.size_class"), F.lit("S")))
    .withColumn("excluded", F.coalesce(F.col("ex.excluded"), F.lit(0)))
)

# Load mode and its detail columns: taken from the configuration when the toggle is
# on, otherwise a constant "snapshot" with typed NULL placeholders so that the output
# columns are the same in both cases.
if use_load_mode_config:
    _load_mode_col = F.coalesce(F.col("lm.load_mode"), F.lit("snapshot"))
    _load_detail_cols = [
        F.col("lm.filter_column"),
        F.col("lm.kind"),
        F.col("wnd.partition_column"),
        F.col("wnd.granularity"),
        F.col("wnd.lookback_months"),
    ]
else:
    _load_mode_col = F.lit("snapshot")
    _load_detail_cols = [
        F.lit(None).cast(T.StringType()).alias("filter_column"),
        F.lit(None).cast(T.StringType()).alias("kind"),
        F.lit(None).cast(T.StringType()).alias("partition_column"),
        F.lit(None).cast(T.StringType()).alias("granularity"),
        F.lit(None).cast(T.IntegerType()).alias("lookback_months"),
    ]

# Project the final column set, drop excluded tables and sort by table name.
tables = (
    tables
    .withColumn("load_mode", _load_mode_col)
    .select(
        F.col("bq.obj_name").alias("name"),
        "schema_name",
        "Bron",
        "enabled",
        "size_class",
        "load_mode",
        F.col("bq.base_query"),
        *_load_detail_cols,
        "excluded",
    )
    .filter(F.col("excluded") == 0)
    .orderBy("name")
)

## JSON constructie en wegschrijven

In [None]:
def table_record(row: T.Row) -> Dict[str, object]:
    """Build the JSON table entry for one row of the merged ``tables`` DataFrame.

    Args:
        row: Row exposing ``name``, ``enabled``, ``size_class``, ``load_mode``,
            ``base_query`` and, depending on the load mode, ``partition_column``,
            ``granularity``, ``lookback_months``, ``filter_column`` and ``kind``.

    Returns:
        Dict with the table settings. A ``"window"`` sub-dict is added for load mode
        ``"window"`` and an ``"incremental_column"`` sub-dict for ``"incremental"``.
        ``delta_schema`` is taken from the module-level ``source`` parameter.
    """
    # Missing values fall back to the same defaults used when merging the config.
    load_mode = row.load_mode or "snapshot"

    record = {
        "name": row.name,
        "enabled": bool(row.enabled),
        "size_class": row.size_class or "S",
        "load_mode": load_mode,
        "delta_schema": source,
        "delta_table": row.name,
        "base_query": row.base_query,
    }

    if load_mode == "window":
        lookback = row.lookback_months
        record["window"] = {
            "partition_column": row.partition_column,
            "granularity": row.granularity,
            # Keep a missing lookback as None instead of failing on int(None).
            "lookback_months": None if lookback is None else int(lookback),
        }
    elif load_mode == "incremental":
        record["incremental_column"] = {
            "column": row.filter_column,
            "kind": row.kind,
        }

    return record


# Pipeline-wide defaults written to the JSON next to the per-table settings.
defaults = {
    "concurrency_large": 2,
    "concurrency_small": 8,
    "max_rows_per_file_large": 15_000_000,
    "max_rows_per_file_small": 1_000_000,
}

# Complete configuration document. collect() brings the table rows to the driver so
# they can be converted into plain dicts.
result = {
    "source": source,
    "run_date_utc": None,
    "watermarks_path": "config/watermarks.json",
    "base_files": "greenhouse_sources",
    "connection_name": f"connection_{source}_prod",
    "defaults": defaults,
    "tables": [table_record(row) for row in tables.collect()],
}

# Fabric-style path (works in both Fabric and cluster via mssparkutils mock)
output_path = f"Files/config/{source}_metadata.json"
# ensure_ascii=False keeps non-ASCII characters readable in the output file.
json_payload = json.dumps(result, ensure_ascii=False, indent=4)

# Use mssparkutils.fs.put (native in Fabric, mock in cluster).
# The third argument (True) is passed as the overwrite flag.
mssparkutils.fs.put(output_path, json_payload, True)

# len(json_payload) counts characters, which is not the byte size once the payload
# contains non-ASCII characters; measure the UTF-8 encoding so "bytes" is accurate.
json_size_bytes = len(json_payload.encode("utf-8"))

logger.info(f"Metadata JSON geschreven naar {output_path}")
logger.info(f"  Tables: {len(result['tables'])}")
logger.info(f"  JSON size: {json_size_bytes} bytes")