# 00 — Master Orchestrator: Bronze → Silver Processing

Main orchestration notebook for processing parquet files through Bronze and Silver layers.

## Architecture Overview

```
Parquet Files (Files/{source}/{run_ts}/)
    ↓
Bronze Layer (append with run_ts for CDC)
    ↓
Silver Layer (CDC merge: INSERT/UPDATE/DELETE)
    ↓
Watermark Update (incremental tables only)
```

## Process Flow

1. **Load Configuration** (DAG, enabled tables, retry filter)
2. **Check Incremental** → Run watermark merge if needed
3. **Bronze Processing** → Parallel table loading (10 workers)
4. **Bronze Logging** → Batch log all results
5. **Silver Processing** → Parallel CDC merge (tables with business_keys)
6. **Silver Logging** → Batch log all results
7. **Summary Statistics** → Performance metrics, efficiency

## Key Features

- **Parallel Processing**: ThreadPoolExecutor for 5-10x speedup
- **Idempotency**: Check logs before reprocessing
- **Retry Support**: Process only specific tables
- **Error Resilience**: Continue on failure, comprehensive logging
- **Performance Tracking**: Efficiency metrics (theoretical vs actual time)

## Parameters

- `source`: Source system name (e.g., "vizier")
- `run_ts`: Run timestamp (e.g., "20251105T142752505")
- `dag_path`: Path to DAG configuration JSON
- `retry_tables`: Optional list of tables to retry
- `force_reload`: If true, ignore the processing logs and reload all tables
- `max_workers`: Parallel workers (default: 10)
- `debug`: Enable debug output
- `log_to_console`: Also stream logs to stdout/stderr

In [None]:
# Parameters (Papermill compatible)
# Every name below can be overridden per run; the values are the defaults.

# Source system name
source = "anva_meeus"
# Run timestamp
run_ts = "20251001T183103260"
# DAG configuration path
dag_path = "config/dag_anva_meeus_week.json"
# Optional: list of table names to retry
retry_tables = None
# If True, ignore logs and reload all
force_reload = True
# Parallel workers for table processing
max_workers = 10
# Enable debug output
debug = True
# Also stream logs to stdout/stderr
log_to_console = True


## [1] Setup and Imports

In [None]:
from datetime import datetime, timezone
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Dict, Any
import json
from uuid import uuid4
from modules.logging_utils import configure_logging
import logging

# Set up logging through the project helper; its return value is reported
# below as the log file for this run.
log_file = configure_logging(run_name="master_orchestrator", enable_console_logging=log_to_console)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.info("Logfile: %s", log_file)

# Startup banner: echo the effective run parameters, framed by separator lines.
separator = "="*80
startup_banner = (
    separator,
    "MASTER ORCHESTRATOR STARTING",
    separator,
    f"Source: {source}",
    f"Run TS: {run_ts}",
    f"DAG: {dag_path}",
    f"Retry tables: {retry_tables}",
    f"Force reload: {force_reload}",
    f"Max workers: {max_workers}",
    f"Debug: {debug}",
    separator,
)
for banner_line in startup_banner:
    logger.info(banner_line)


## [2] Load Utility Notebooks

# Load logging utilities
# Load config utilities


In [None]:
## [1.5] Initialize Spark Session

# Hosted runtimes (Fabric/Databricks) provide `spark` by default. Referencing
# the bare name raises NameError when it is missing, in which case a session
# is created here for local use.
try:
    spark
except NameError:
    logger.info("Creating Spark session for local environment...")
    from pyspark.sql import SparkSession

    # TODO: use spark_config.py and override spark defaults. Hive support must stay enabled.
    spark = (
        SparkSession.builder
        .appName("DWH_Bronze_Silver_Processing")
        .enableHiveSupport()
        .getOrCreate()
    )

    logger.info("✓ Spark session created")
else:
    logger.info("✓ Spark session already available")

# Set all four parquet rebase modes (datetime and INT96, read and write) to CORRECTED.
for rebase_setting in (
    "spark.sql.parquet.datetimeRebaseModeInRead",
    "spark.sql.parquet.int96RebaseModeInRead",
    "spark.sql.parquet.datetimeRebaseModeInWrite",
    "spark.sql.parquet.int96RebaseModeInWrite",
):
    spark.conf.set(rebase_setting, "CORRECTED")

# Verify Spark session
spark_version = spark.version
application_id = spark.sparkContext.applicationId
logger.info(f"  Spark version: {spark_version}")
logger.info(f"  Application ID: {application_id}")

In [None]:
# Ad-hoc inspection: print the first 20 rows of Dim_Tekenjaar without truncating values.
tekenjaar_df = spark.table("anva_meeus.Dim_Tekenjaar")
tekenjaar_df.show(20, truncate=False)

In [None]:
from pyspark.sql import functions as F

# Count rows per Index value in Dim_DekkingVariabel and show the values that
# occur more than once, most frequent first.
index_counts = spark.table("anva_meeus.Dim_DekkingVariabel").groupBy("Index").count()
repeated_index_values = index_counts.filter(F.col("count") > 1)
# Optional projection, left disabled: .select("Index")
repeated_index_values.orderBy(F.desc("count")).show(20, truncate=False)

In [None]:
# Same duplicate-Index check expressed in SQL: expose the table as a temp view
# and list the Index values that appear more than once.
dekking_variabel_df = spark.table("anva_meeus.Dim_DekkingVariabel")
dekking_variabel_df.createOrReplaceTempView("mytable")

duplicate_index_query = """
    SELECT Index
    FROM mytable
    GROUP BY Index
    HAVING COUNT(1) > 1
"""
spark.sql(duplicate_index_query).show(20, truncate=False)

In [None]:
# Scan every dimension table (name starting with "dim_", case-insensitive) in
# the anva_meeus schema for rows that are identical across all columns.
tables = spark.catalog.listTables("anva_meeus")
dim_tables = [t.name for t in tables if t.name.lower().startswith("dim_")]

for table_name in dim_tables:
    logger.info(f"Processing table: {table_name}")
    df = spark.table(f"anva_meeus.{table_name}")
    total_count = df.count()
    logger.info(f"  Total records: {total_count}")

    # Group on every column: a group with more than one row is an exact duplicate.
    duplicate_counts = df.groupBy(df.columns).count().filter(F.col("count") > 1)
    dup_count = duplicate_counts.count()

    if dup_count == 0:
        logger.info(f"  No duplicate records found in {table_name}")
        continue

    logger.warning(f"  Found {dup_count} duplicate records in {table_name}")
    duplicate_counts.show(20, truncate=False)

In [None]:
# Check fact tables (currently restricted to Fact_Agenda) for rows sharing the
# same key columns: every column ending in "_id" plus a few extra key fields.
tables = spark.catalog.listTables("anva_meeus")
# "fact_agenda" already carries the "fact_" prefix, so the name match alone selects it.
fact_tables = [t.name for t in tables if t.name.lower() == "fact_agenda"]

# Non-"_id" columns that are also part of the duplicate key (compared lower-cased).
extra_key_columns = {"schadenummer", "taak_datum", "aanmaak_datum"}

for table_name in fact_tables:
    logger.info(f"Processing table {table_name} for _id duplicates..")
    df = spark.table(f"anva_meeus.{table_name}")
    id_columns = [
        name
        for name in df.columns
        if name.lower().endswith("_id") or name.lower() in extra_key_columns
    ]
    if not id_columns:
        logger.info(f"  No _id columns found in {table_name}, skipping.")
        continue

    # A key combination that occurs more than once marks duplicate rows.
    duplicate_counts = df.groupBy(id_columns).count().filter(F.col("count") > 1)
    dup_count = duplicate_counts.count()

    if dup_count == 0:
        logger.info(f"  No duplicate records found based on _id columns in {table_name}")
        continue

    logger.warning(f"  Found {dup_count} duplicate records based on _id columns in {table_name}")
    duplicate_counts.show(20, truncate=False)

In [None]:
# Show the Fact_Agenda rows for one specific key combination.
#
# The predicates must be combined inside ONE SQL condition string. Chaining
# Python string literals with `and` evaluates to the last literal only, so
# just the Schadenummer predicate would reach Spark and the other three
# would be silently ignored.
#
# Alternative sample key:
#   Medewerker_Id == 'A0779275', Relatie_Id == '1658625',
#   Polis_Id == '1658625020032003', Schadenummer == '3034776'
agenda_key_predicates = [
    "Medewerker_Id == 'REGGSOE'",
    "Relatie_Id == '3066025'",
    "Polis_Id == '3066025020012001'",
    "Schadenummer == '2743553'",
]
agenda_key_condition = " AND ".join(agenda_key_predicates)

(
spark.table("anva_meeus.Fact_Agenda")
    .where(agenda_key_condition)
    .show(20, truncate=False)
)

In [None]:
#spark.table("logs.bronze_processing_log").show(20, truncate=False)
#spark.sql("DESCRIBE EXTENDED logs.bronze_processing_log").show(200, truncate=False)
#spark.sql("DESCRIBE EXTENDED logs.bronze_run_summary").show(200, truncate=False)
#spark.catalog.refreshTable("logs.bronze_processing_log")

# detail = spark.sql("DESCRIBE DETAIL logs.bronze_processing_log")
# detail.show(truncate=False)


#spark.sql("create table logs.bronze_processing_log_copy USING DELTA AS SELECT * FROM logs.bronze_processing_log;")

#spark.table("logs.bronze_processing_log_copy").show(20, truncate=False)
# loc = (spark.sql("DESCRIBE DETAIL logs.bronze_processing_log")
#          .select("location")
#          .collect()[0][0])

# spark.sql("DROP TABLE IF EXISTS logs.bronze_processing_log")

# spark.sql(f"""
# CREATE TABLE logs.bronze_processing_log
# USING DELTA
# LOCATION '{loc}'
# """)

# spark.sql(f"""
# INSERT INTO logs.bronze_processing_log
# SELECT * FROM logs.bronze_processing_log_copy
# """)

#spark.table("logs.bronze_processing_log").show(truncate=False)

#spark.sql("DESCRIBE EXTENDED logs.bronze_processing_log").show(200, truncate=False)

#spark.sql("DESCRIBE EXTENDED anva_meeus.dim_agent").show(200,truncate=False)
#spark.catalog.listColumns("logs.bronze_processing_log")

# spark.sql("""
# CREATE OR REPLACE VIEW logs.bronze_processing_log_pbi AS
# SELECT
#     log_id,
#     run_log_id,
#     run_id,
#     run_date,
#     run_ts,
#     source,
#     table_name,
#     partition_key,
#     load_mode,
#     status,
#     rows_processed,
#     start_time,
#     end_time,
#     duration_seconds,
#     error_message
# FROM logs.bronze_processing_log
# """)

# spark.sql("""
# CREATE OR REPLACE VIEW logs.bronze_run_summary_pbi AS
# SELECT
#     log_id
#     ,run_id
#     ,run_date
#     ,run_ts
#     ,source
#     ,status
#     ,run_start
#     ,run_end
#     ,duration_seconds
#     ,total_tables
#     ,tables_success
#     ,tables_empty
#     ,tables_failed
#     ,tables_skipped
#     ,total_rows
#     ,workers
#     ,sum_task_seconds
#     ,theoretical_min_sec
#     ,actual_time_sec
#     ,efficiency_pct
#     ,failed_tables
#     ,error_message
# FROM logs.bronze_run_summary
# """)
