# Fabric PySpark metadata generator

Genereer een JSON-configuratiebestand voor datapipeline-metadata op basis van SQL Server-metadata.

## Parameters
Stel de Fabric-parameters in voor de gewenste bron en het pad naar het Parquet-bestand met SQL-metadata.

In [None]:
# Parameters
# `source` selects the row of df_sources to generate metadata for;
# `metadata_path` is the Parquet file holding the SQL metadata.
source = "anva_meeus"
metadata_path = "Files/metadata/metadata_anva_meeus.parquet"

## Imports en helper functies

In [None]:
import json
import re
import importlib
from typing import Iterable, Dict

from pyspark.sql import functions as F
from pyspark.sql import types as T

# `import importlib` alone does not guarantee that the `importlib.util`
# submodule is loaded; import it explicitly before calling find_spec.
import importlib.util

if importlib.util.find_spec("notebookutils"):
    # notebookutils is importable: take mssparkutils straight from it.
    from notebookutils import mssparkutils
else:
    # Otherwise build the utilities from DBUtils on the active Spark session.
    from pyspark.dbutils import DBUtils
    dbutils = DBUtils(spark)
    mssparkutils = dbutils.notebookutils


In [None]:

def make_safe_identifier(name: str) -> str:
    """Normalise a column name for use in Delta Lake.

    Every character other than ASCII letters, digits, underscores and spaces
    is dropped, surrounding whitespace is trimmed, and each remaining run of
    whitespace becomes a single underscore. A result that starts with a digit
    gets a leading underscore.

    Args:
        name: Original column name; ``None`` is accepted.

    Returns:
        The normalised name, ``""`` for ``None``, or the unchanged input when
        normalisation leaves nothing behind.
    """
    if name is None:
        return ""

    kept = re.sub(r"[^0-9A-Za-z_ ]+", "", name).strip()
    normalised = re.sub(r"\s+", "_", kept)
    if not normalised:
        # Nothing usable survived the clean-up; hand back the input untouched.
        return name
    if normalised[0].isdigit():
        return f"_{normalised}"
    return normalised


def column_expression(col: T.Row) -> str:
    """Build the T-SQL select-list item for one metadata column.

    The expression casts or converts the source column depending on its
    ``data_type`` and aliases it with the identifier produced by
    ``make_safe_identifier``.

    Args:
        col: Row exposing ``column_name``, ``data_type``,
            ``numeric_precision`` and ``numeric_scale``.

    Returns:
        A string of the form ``<expression> AS [<alias>]``.
    """
    dt = (col.data_type or "").lower()
    # Inside a bracket-quoted identifier a closing bracket must be doubled.
    col_ref = "[{}]".format(str(col.column_name).replace("]", "]]"))

    if dt in ("decimal", "numeric"):
        precision = col.numeric_precision or 38
        # A scale of 0 is valid (e.g. decimal(10,0)); only a missing scale
        # falls back to the default of 18.
        scale = col.numeric_scale if col.numeric_scale is not None else 18
        expr = f"CAST({col_ref} AS decimal({precision},{scale}))"
    elif dt == "money":
        expr = f"CAST({col_ref} AS decimal(19,4))"
    elif dt == "smallmoney":
        expr = f"CAST({col_ref} AS decimal(10,4))"
    elif dt == "tinyint":
        expr = f"CAST({col_ref} AS smallint)"
    elif dt in {"smallint", "int", "bigint", "bit", "float", "real"}:
        # These types are cast to themselves.
        expr = f"CAST({col_ref} AS {dt})"
    elif dt == "date":
        expr = f"CAST({col_ref} AS date)"
    elif dt == "datetime":
        expr = f"CAST({col_ref} AS datetime2(3))"
    elif dt == "smalldatetime":
        expr = f"CAST({col_ref} AS datetime2(0))"
    elif dt == "datetime2":
        expr = f"CAST({col_ref} AS datetime2(6))"
    elif dt == "time":
        # Style 108 renders the time as hh:mi:ss text.
        expr = f"CONVERT(varchar(8), {col_ref}, 108)"
    elif dt == "datetimeoffset":
        # Shift to UTC first, then drop the offset.
        expr = f"CAST(SWITCHOFFSET({col_ref}, '+00:00') AS datetime2(6))"
    elif dt in {"char", "varchar", "nchar", "nvarchar"}:
        expr = col_ref
    elif dt == "text":
        expr = f"CONVERT(varchar(max), {col_ref})"
    elif dt == "ntext":
        expr = f"CONVERT(nvarchar(max), {col_ref})"
    elif dt == "uniqueidentifier":
        expr = f"CONVERT(varchar(36), {col_ref})"
    elif dt == "xml":
        expr = f"CONVERT(nvarchar(max), {col_ref})"
    else:
        # Unknown or missing type: select the column as-is.
        expr = col_ref

    # make_safe_identifier can return the raw name unchanged, so the alias
    # needs the same bracket escaping as the column reference.
    alias = make_safe_identifier(col.column_name).replace("]", "]]")
    return f"{expr} AS [{alias}]"


def build_base_query(schema_name: str, table_name: str, columns):
    """Build the SELECT statement that reads one source table.

    Args:
        schema_name: Schema of the source table.
        table_name: Name of the source table.
        columns: Iterable of column rows; each needs ``ordinal_position``
            plus the attributes read by ``column_expression``.

    Returns:
        A multi-line ``SELECT ... FROM [schema].[table]`` string with one
        select item per line, ordered by ``ordinal_position`` (a missing
        position sorts as 0).
    """
    ordered_cols = sorted(columns, key=lambda r: r.ordinal_position or 0)
    # The separators use "\n" escapes so each literal stays on one source line.
    select_clause = ",\n    ".join(column_expression(col) for col in ordered_cols)
    # Inside a bracket-quoted identifier a closing bracket must be doubled.
    schema_ref = schema_name.replace("]", "]]")
    table_ref = table_name.replace("]", "]]")
    return f"SELECT\n    {select_clause}\nFROM [{schema_ref}].[{table_ref}]"


def load_metadata(path: str):
    """Read the Parquet data at ``path`` with the session-level ``spark`` and return it."""
    reader = spark.read
    return reader.parquet(path)


def validate_metadata(df):
    """Check that the metadata frame has the expected columns and no null column names.

    Args:
        df: Metadata DataFrame to check.

    Returns:
        ``df`` itself when both checks pass.

    Raises:
        ValueError: When required columns are missing (listed in the message)
            or when any row has a null ``column_name``.
    """
    expected = (
        "external_obj_id", "server_type_id", "server_name", "db_name", "schema_name", "obj_name",
        "obj_type_id", "ordinal_position", "column_name", "column_type_id", "is_nullable",
        "data_type", "max_len", "numeric_precision", "numeric_scale", "primary_key_sorting",
        "default_value", "source", "src_obj_id", "obj_def_id", "is_incremental",
        "is_incremental_column", "incremental_criteria",
    )
    present = df.columns
    absent = [name for name in expected if name not in present]
    if absent:
        raise ValueError(f"Metadata ontbreekt kolommen: {', '.join(absent)}")

    # The null check runs only once the column is known to exist.
    null_name_rows = df.filter(F.col('column_name').isNull()).count()
    if null_name_rows > 0:
        raise ValueError("Metadata bevat lege kolomnamen")
    return df


## Configuratie DataFrames

In [None]:
# Maps each source name ("Bron") to the server and database it is read from.
sources_schema = T.StructType([
    T.StructField("Bron", T.StringType(), False),
    T.StructField("Server", T.StringType(), False),
    T.StructField("Database", T.StringType(), False),
])

df_sources = spark.createDataFrame([
    ("ccs_level", "vmdwhidpweu01", "InsuranceData_CCS_DWH"),
    # The backslash is escaped explicitly: "\M" is an invalid escape sequence.
    ("anva_meeus", "vmdwhidpweu01\\MEEUS", "InsuranceData_ANVA_DWH"),
    ("vizier", "viz-sql01-mi-p.1d57ac4f4d63.database.windows.net", "CRM_DWH"),
    ("ods_reports", "vmdwhodsanvpweu", "OG_ODS_Reports"),
    ("anva_concern", "vmdwhidpweu01", "InsuranceData_ANVA_DWH"),
    ("insurance_data_im", "vmdwhidpweu01", "InsuranceData_OpGroen_DWH"),
], schema=sources_schema)

# Three required string columns identifying a table per source.
_disabled_schema = T.StructType([
    T.StructField(_field_name, T.StringType(), False)
    for _field_name in ("Bron", "schema_name", "obj_name")
])

# The same (schema, object) pairs are listed for both ANVA sources.
_disabled_objects = [
    ("dbo", "Jobmonitor"),
    ("dbo", "LaatsteVerversing"),
    ("dbo", "Metadata"),
    ("dbo", "VrijeLabels"),
    ("pbi", "Nulmeting_Clausules"),
    ("pbi", "Nulmeting_CodesDekking"),
    ("pbi", "Nulmeting_CodesNAW"),
    ("pbi", "Nulmeting_CodesPolis"),
    ("pbi", "Nulmeting_LabelDekking"),
    ("pbi", "Nulmeting_LabelNAW"),
    ("pbi", "Nulmeting_LabelPolis"),
    ("pbi", "Nulmeting_NAWDetails"),
    ("pbi", "Nulmeting_NAWLabels"),
    ("pbi", "Nulmeting_PolisDetails"),
    ("pbi", "Nulmeting_PolisLabels"),
    ("pbi", "Nulmeting_Voorwaarden"),
]

# All rows for anva_concern come first, followed by all rows for anva_meeus.
df_disabled_tables = spark.createDataFrame(
    [
        (_bron, _schema, _obj)
        for _bron in ("anva_concern", "anva_meeus")
        for _schema, _obj in _disabled_objects
    ],
    schema=_disabled_schema,
)

# Four required string columns: table identity plus its size class.
size_schema = T.StructType([
    T.StructField(_field_name, T.StringType(), False)
    for _field_name in ("Bron", "schema_name", "obj_name", "size_class")
])

# Every listed table lives in schema "pbi" and gets size class "L".
_size_l_tables = [
    ("anva_concern", "Fact_PremieFacturen"),
    ("ccs_level", "Fact_PremieBoekingen"),
    ("geintegreerd_model", "Fact_PremieFacturen"),
    ("anva_meeus", "Fact_PremieFacturen"),
]

df_size_class = spark.createDataFrame(
    [(_bron, "pbi", _obj, "L") for _bron, _obj in _size_l_tables],
    schema=size_schema,
)

# Table identity and load mode are required; filter column and kind are nullable.
load_schema = T.StructType(
    [
        T.StructField(_field_name, T.StringType(), False)
        for _field_name in ("Bron", "schema_name", "obj_name", "load_mode")
    ]
    + [
        T.StructField(_field_name, T.StringType(), True)
        for _field_name in ("filter_column", "kind")
    ]
)

# Tables in schema "pbi" loaded in "window" mode on Boek_Datum.
_window_mode_tables = [
    ("anva_concern", "Fact_PremieFacturen"),
    ("ccs_level", "Fact_PremieBoekingen"),
    ("geintegreerd_model", "Fact_PremieFacturen"),
    ("anva_meeus", "Fact_PremieFacturen"),
]

# vizier dbo tables loaded in "incremental" mode, paired with their filter column.
_vizier_incremental_tables = [
    ("Relaties", "Updatedatum"),
    ("Contactpersonen", "Upd_dt"),
    ("Sleutels", "upd"),
    ("Polissen", "upd_dt"),
    ("Schades", "upd_dt"),
    ("DnB", "upd_dt"),
    ("Contactmomenten", "Upd"),
    ("Taken", "upd"),
    ("Sales", "UPD_DT"),
    ("Retenties", "UPD_DT"),
    ("Adresbeeld", "UPD_DT"),
    ("UBO_Onderzoeken", "UPD_DT"),
    ("Producten", "upd"),
    ("Medewerkers", "id_upd"),
    ("Klachten", "upd_dt"),
    ("Verkoopkansen", "upd"),
    ("Interesses", "UPD_DT"),
]

# Window-mode rows first, then the vizier incremental rows.
df_load_mode = spark.createDataFrame(
    [
        (_bron, "pbi", _obj, "window", "Boek_Datum", "datetime")
        for _bron, _obj in _window_mode_tables
    ]
    + [
        ("vizier", "dbo", _obj, "incremental", _filter_col, "stamp17")
        for _obj, _filter_col in _vizier_incremental_tables
    ],
    schema=load_schema,
)

# Five required string columns followed by a required integer look-back.
window_schema = T.StructType(
    [
        T.StructField(_field_name, T.StringType(), False)
        for _field_name in (
            "Bron", "schema_name", "obj_name", "partition_column", "granularity",
        )
    ]
    + [T.StructField("lookback_months", T.IntegerType(), False)]
)

# All windowed tables share schema "pbi", Boek_Datum, "month" and 12 months.
_windowed_tables = [
    ("anva_concern", "Fact_PremieFacturen"),
    ("geintegreerd_model", "Fact_PremieFacturen"),
    ("anva_meeus", "Fact_PremieFacturen"),
    ("ccs_level", "Fact_PremieBoekingen"),
]

df_window_config = spark.createDataFrame(
    [
        (_bron, "pbi", _obj, "Boek_Datum", "month", 12)
        for _bron, _obj in _windowed_tables
    ],
    schema=window_schema,
)

# Table identity as required strings plus a required integer "excluded" flag.
excluded_schema = T.StructType(
    [
        T.StructField(_field_name, T.StringType(), False)
        for _field_name in ("Bron", "schema_name", "obj_name")
    ]
    + [T.StructField("excluded", T.IntegerType(), False)]
)

# vizier dbo tables carrying the excluded flag 1.
_excluded_vizier_tables = ["BO_sleutels_Wim_Verheijen", "UMG_Historie"]

df_excluded_tables = spark.createDataFrame(
    [("vizier", "dbo", _obj, 1) for _obj in _excluded_vizier_tables],
    schema=excluded_schema,
)


## Metadata inlezen uit Parquet

In [None]:
# Load the Parquet metadata and fail early when it is structurally invalid.
metadata_df = validate_metadata(load_metadata(metadata_path))

# Keep only the df_sources row that describes the requested source.
source_mapping = df_sources.filter(F.col("Bron") == F.lit(source))
if source_mapping.count() == 0:
    raise ValueError(f"Onbekende source '{source}' in df_sources")

# A metadata row belongs to the source when both server and database match.
_source_join_condition = (
    (F.col("m.server_name") == F.col("s.Server"))
    & (F.col("m.db_name") == F.col("s.Database"))
)

_metadata_with_source = metadata_df.alias("m").join(
    source_mapping.alias("s"), _source_join_condition, "inner"
)
metadata_filtered = (
    _metadata_with_source
    .withColumn("Bron", F.col("s.Bron"))
    .filter(F.col("s.Bron") == F.lit(source))
)

# limit(1) keeps the emptiness check from counting every row.
if metadata_filtered.limit(1).count() == 0:
    raise ValueError("Geen metadata records gevonden voor de opgegeven source")


## Base query generatie

In [None]:
# One row per table: its identity plus the generated SELECT statement.
base_schema = T.StructType([
    T.StructField(_field_name, T.StringType(), False)
    for _field_name in ("Bron", "schema_name", "obj_name", "base_query")
])

# Per-column attributes bundled into one struct so they can be collected per table.
col_struct = F.struct(
    "ordinal_position", "column_name", "data_type", "numeric_precision", "numeric_scale", "max_len"
)


def _to_base_query_row(row):
    """Turn one grouped metadata row into a (Bron, schema, object, query) tuple."""
    query = build_base_query(row.schema_name, row.obj_name, row.columns)
    return (row.Bron, row.schema_name, row.obj_name, query)


# Gather all column structs of a table into a single list per table.
_columns_per_table = (
    metadata_filtered
    .select("Bron", "schema_name", "obj_name", col_struct.alias("column"))
    .groupBy("Bron", "schema_name", "obj_name")
    .agg(F.collect_list("column").alias("columns"))
)

# The query text is built in Python, hence the detour through the RDD API.
base_query_df = _columns_per_table.rdd.map(_to_base_query_row).toDF(schema=base_schema)


## Configuratie samenvoegen

In [None]:
_table_keys = ["Bron", "schema_name", "obj_name"]

# Left-join every lookup frame onto the generated base queries, in a fixed order.
_tables_joined = base_query_df.alias("bq")
for _lookup in (
    df_disabled_tables.alias("dis"),
    df_size_class.alias("sz"),
    df_load_mode.alias("lm"),
    df_window_config.alias("wnd"),
    df_excluded_tables.alias("ex"),
):
    _tables_joined = _tables_joined.join(_lookup, _table_keys, "left")

tables = (
    _tables_joined
    # A table is enabled when it has no match in the disabled list.
    .withColumn("enabled", F.col("dis.obj_name").isNull())
    # Tables without an explicit entry get the defaults "S", "snapshot" and 0.
    .withColumn("size_class", F.coalesce(F.col("sz.size_class"), F.lit("S")))
    .withColumn("load_mode", F.coalesce(F.col("lm.load_mode"), F.lit("snapshot")))
    .withColumn("excluded", F.coalesce(F.col("ex.excluded"), F.lit(0)))
    .select(
        F.col("bq.obj_name").alias("name"),
        "schema_name",
        "Bron",
        "enabled",
        "size_class",
        "load_mode",
        F.col("bq.base_query"),
        F.col("lm.filter_column"),
        F.col("lm.kind"),
        F.col("wnd.partition_column"),
        F.col("wnd.granularity"),
        F.col("wnd.lookback_months"),
        "excluded",
    )
    # Drop tables that are flagged as excluded.
    .filter(F.col("excluded") == 0)
    .orderBy("name")
)

tables.cache()


## JSON constructie en wegschrijven

In [None]:
def table_record(row: T.Row) -> Dict[str, object]:
    """Convert one row of the ``tables`` frame into its JSON-ready dictionary.

    Args:
        row: Row exposing ``name``, ``enabled``, ``size_class``, ``load_mode``,
            ``base_query``, ``partition_column``, ``granularity``,
            ``lookback_months``, ``filter_column`` and ``kind``.

    Returns:
        A dictionary with the table settings. A ``"window"`` entry is added
        for load mode ``window`` and an ``"incremental_column"`` entry for
        load mode ``incremental``.
    """
    load_mode = row.load_mode or "snapshot"
    table_name = row.name

    record = {
        "name": table_name,
        "enabled": bool(row.enabled),
        "size_class": row.size_class or "S",
        "load_mode": load_mode,
        # The Delta schema is the module-level source name.
        "delta_schema": source,
        "delta_table": table_name,
        "base_query": row.base_query,
    }

    if load_mode == "window":
        months = row.lookback_months
        record["window"] = {
            "partition_column": row.partition_column,
            "granularity": row.granularity,
            "lookback_months": None if months is None else int(months),
        }
    elif load_mode == "incremental":
        record["incremental_column"] = {
            "column": row.filter_column,
            "kind": row.kind,
        }
    return record


# Settings written under the "defaults" key of the generated configuration.
defaults = dict(
    concurrency_large=2,
    concurrency_small=8,
    max_rows_per_file_large=15_000_000,
    max_rows_per_file_small=1_000_000,
)

# Bring the table rows to the driver and convert each into its JSON record.
_table_records = [table_record(_row) for _row in tables.collect()]

result = {
    "source": source,
    "run_date_utc": None,
    "watermarks_path": "config/watermarks.json",
    "base_files": "greenhouse_sources",
    "connection_name": f"connection_{source}_prod",
    "defaults": defaults,
    "tables": _table_records,
}

output_path = f"Files/config/{source}_metadata.json"
# ensure_ascii=False keeps non-ASCII characters unescaped in the output.
json_payload = json.dumps(result, ensure_ascii=False, indent=4)

# The third argument is passed as True, as in the original call.
mssparkutils.fs.put(output_path, json_payload, True)
print(f"Metadata JSON geschreven naar {output_path}")
