# Master Orchestrator: Bronze → Silver (→ Gold)

Orchestrates bronze and silver layer processing for a single source and run_ts. The silver step is currently a placeholder and is skipped until the silver transform notebook is built.

Called by the pipeline after the parquet files have been written.

In [21]:
# Parameters (passed from pipeline)
# Every value below is only a default; the calling pipeline is expected to
# supply the real values for a run.
source = "ods_reports"  # Will be set by pipeline
run_ts = "20250923T060123389"  # Will be set by pipeline
dag_path = "/lakehouse/default/Files/config/dag_ods_reports_week.json"  # Will be set by pipeline
drop_existing_tables = False  # Optional: force recreate
retry_tables = None #['Fact_Polis']

# Fail fast when any of the three required parameters is empty or None.
# NOTE(review): if this is the tagged parameters cell, pipeline overrides may
# be injected *after* it, in which case this check only sees the defaults
# above - confirm, and move the check to the next cell if so.
if not (source and run_ts and dag_path):
    raise ValueError("Missing required parameters: source, run_ts, and dag_path must be provided")

# Echo the effective run configuration as one banner, one item per line.
print(
    "Master Orchestrator Starting...",
    f"Source: {source}",
    f"Run TS: {run_ts}",
    f"DAG path: {dag_path}",
    f"Retry tables: {retry_tables}",
    "=" * 80,
    sep="\n",
)

StatementMeta(, 53c29526-6348-40e2-a89f-697e3d9be508, 23, Finished, Available, Finished)

Master Orchestrator Starting...
Source: ods_reports
Run TS: 20250923T060123389
DAG path: /lakehouse/default/Files/config/dag_ods_reports_week.json
Retry tables: None


## Step 2b: Check for Incremental Tables

Parse the DAG configuration to identify tables with incremental load mode. If incremental tables are present, execute the incremental processing logic within the current Spark session to avoid additional startup overhead.

**Processing Logic:**
- Scans DAG for enabled tables with `load_mode: incremental`
- Executes incremental notebook if needed (session reuse)
- Skips if no incremental tables present

This optimization ensures that incremental processing, bronze loading, and silver transformation all occur within a single Spark session, eliminating redundant 4-minute startup times.

In [22]:
import json

# Decide whether the incremental (watermark merge) notebook must run before
# the bronze load, and run it if so.
# Reads:  dag_path, retry_tables, source, run_ts (from the parameters cell).
# Writes: dag_config, retry_names, incremental_tables and, when incremental
#         tables exist, wm_configpath, run_id, wm_folder, result.
# Raises: whatever the child notebook run raises (re-raised after logging).
print("Checking DAG for incremental tables...")

# The DAG config is JSON, so read it as UTF-8 explicitly instead of relying on
# the platform's default encoding.
with open(dag_path, 'r', encoding='utf-8') as f:
    dag_config = json.load(f)

# Normalise retry_tables to a list of table names. A list/tuple is used as-is;
# None or an empty value means "no retry filter". A string is also accepted
# (comma-separated names, or the "None" sentinel this notebook itself passes
# to the bronze notebook). Without this, `name in retry_tables` on a string
# would be a substring test and could match the wrong tables.
if isinstance(retry_tables, str):
    retry_names = [name.strip() for name in retry_tables.split(',')]
    retry_names = [name for name in retry_names if name and name.lower() != 'none']
else:
    retry_names = list(retry_tables) if retry_tables else []

# Enabled tables with load_mode == 'incremental'. When a retry filter is
# present, only the tables named in it are considered.
incremental_tables = [
    t for t in dag_config['tables']
    if t.get('load_mode') == 'incremental'
    and t.get('enabled', True)
    and (not retry_names or t['name'] in retry_names)
]

if incremental_tables:
    print(f"Found {len(incremental_tables)} incremental tables")
    # Only the first five names are printed to keep the log short.
    print(f"Tables: {[t['name'] for t in incremental_tables[:5]]}")
    print("Running incremental logic within this session...")

    # Watermarks config path comes from the DAG, with a fixed fallback.
    wm_configpath = dag_config.get('watermarks_path', 'config/watermarks.json')

    # run_id is run_ts as a string, with no prefix added.
    # NOTE(review): an earlier comment described the format as
    # "run_20251005T142752505"; confirm which form the watermark notebook expects.
    run_id = str(run_ts)

    # Folder holding this run's watermark files.
    wm_folder = f"runtime/{source}/{run_id}/"

    print(f"Config: wm_configpath={wm_configpath}")
    print(f"Run ID: {run_id}")
    print(f"WM Folder: {wm_folder}")

    try:
        result = mssparkutils.notebook.run(
            path="1.1 nb_inc_merge_watermarkfiles",
            timeout_seconds=1800,
            arguments={
                "wm_configpath": wm_configpath,
                "run_id": run_id,
                "source": source,
                "wm_folder": wm_folder
            }
        )
        print("  ✓ Incremental processing complete")
    except Exception as e:
        # Log for the notebook output, then re-raise so the pipeline run fails.
        print(f"  ✗ Incremental processing FAILED: {str(e)}")
        raise
else:
    print("○ No incremental tables found - skipping incremental processing")

print("=" * 80)

StatementMeta(, 53c29526-6348-40e2-a89f-697e3d9be508, 24, Finished, Available, Finished)

Checking DAG for incremental tables...
○ No incremental tables found - skipping incremental processing


In [23]:
from datetime import datetime

# Run the bronze load notebook for this source/run and report how long it took.
# Reads:  source, run_ts, dag_path, drop_existing_tables, retry_tables.
# Writes: bronze_start, retry_names, retry_tables_arg, result, bronze_duration.
# Raises: whatever the child notebook run raises (re-raised after logging).
bronze_start = datetime.now()
print("BRONZE: Loading parquet to Delta Tables...")

# Normalise retry_tables to a list of names before serialising it. A list or
# tuple is used as-is; None or an empty value means "no retry filter". A string
# (comma-separated names or the "None" sentinel) is split into names first:
# calling ','.join() directly on a string would insert a comma between every
# character.
if isinstance(retry_tables, str):
    retry_names = [name.strip() for name in retry_tables.split(',')]
    retry_names = [name for name in retry_names if name and name.lower() != 'none']
else:
    retry_names = list(retry_tables) if retry_tables else []

# The child notebook receives retry tables as one comma-separated string, or
# the literal "None" when there is no retry filter.
retry_tables_arg = ','.join(retry_names) if retry_names else "None"

try:
    result = mssparkutils.notebook.run(
        "1. nb_load_bronze",
        timeout_seconds=3600,
        arguments={
            "source": source,
            "run_ts": run_ts,
            "dag_path": dag_path,
            # Passed as the lowercase string form of the flag ("true"/"false").
            "drop_existing_tables": str(drop_existing_tables).lower(),
            "retry_tables": retry_tables_arg
        }
    )

    bronze_duration = (datetime.now() - bronze_start).total_seconds()
    print(f"Bronze completed in {bronze_duration:.1f}s")

except Exception as e:
    print(f"Bronze FAILED: {str(e)}")
    raise  # Stop pipeline

StatementMeta(, 53c29526-6348-40e2-a89f-697e3d9be508, 25, Finished, Available, Finished)

BRONZE: Loading parquet to Delta Tables...


Bronze completed in 860.5s


In [24]:
# Silver step. No silver notebook is invoked yet: the cell only logs that the
# step is skipped and records the (near-zero) elapsed time.
silver_start = datetime.now()
print("SILVER: Transforming to business layer...")

try:
    # TODO: Build Silver_Transform notebook
    # result = mssparkutils.notebook.run("Silver_Transform", ...)
    print("  (Silver notebook not built yet - skipping)")

    # Elapsed wall-clock seconds since the step started.
    silver_duration = (datetime.now() - silver_start).total_seconds()
    print(f"Silver completed in {silver_duration:.1f}s")

except Exception as exc:
    # Log for the notebook output, then propagate so the run fails.
    print(f"Silver FAILED: {exc!s}")
    raise

StatementMeta(, 53c29526-6348-40e2-a89f-697e3d9be508, 26, Finished, Available, Finished)

SILVER: Transforming to business layer...
  (Silver notebook not built yet - skipping)
Silver completed in 0.0s


In [25]:
# # Test cell - direct testing
# test_table = "Dim_Agent"  # Small table
# test_path = f"/lakehouse/default/Files/greenhouse_sources/vizier/2025/10/05/20251005T144446117/Contactmomenten/Contactmomenten_I_00000.parquet"

# print(f"Testing file:// protocol...")
# print(f"Path: {test_path}")

# try:
#     # Test 1: file:// protocol
#     df1 = spark.read.parquet(f"file://{test_path}")
#     print(f"✓ file:// WORKS - {df1.count()} rows")
# except Exception as e:
#     print(f"✗ file:// FAILED: {str(e)[:150]}")
    
#     # Test 2: PyArrow fallback
#     print(f"\nTrying PyArrow fallback...")
#     import pyarrow.dataset as ds
#     arrow_table = ds.dataset(test_path, format='parquet').to_table()
#     print(f"✓ PyArrow works - {arrow_table.num_rows} rows")

# df = spark.read.parquet("Files/greenhouse_sources/vizier/2025/10/05/20251005T144446117/Contactmomenten/Contactmomenten_I_00000.parquet")
# # df now is a Spark DataFrame containing parquet data from "Files/greenhouse_sources/vizier/2025/10/05/20251005T144446117/Contactmomenten/Contactmomenten_I_00000.parquet".
# display(df)

StatementMeta(, 53c29526-6348-40e2-a89f-697e3d9be508, 27, Finished, Available, Finished)