# 90 – Master per source (bronze load orchestrator)

This notebook orchestrates the full bronze load for **one source** and **one run_ts**.

Responsibilities:

- Read the DAG.json for the given `SOURCE_NAME`.
- Determine `base_files` and the `delta_schema` of every table.
- Sort tables largest first, based on the `rows_written` of each table's most recent successful run in `logs.bronze_processing_log`.
- Run all tables in parallel using `ThreadPoolExecutor` on the driver.
- Collect per-table results (status, timing, rowcounts).
- Write **one batch** of log rows into `logs.bronze_processing_log`.
- Print a concise summary including a parallelism efficiency metric.


## [1] Parameter cell (for pipeline and manual testing)

Adjust these values when running manually. In a pipeline, they can be
injected by the ForEach / pipeline parameters.


In [None]:
# [1] Parameters (for pipeline and manual testing)
#
# SOURCE_NAME : name of the source as used in the DAG.json
# RUN_TS      : run timestamp for this batch, e.g. "20251005T142752505"
# DAG_PATH    : path to the DAG.json in OneLake
#
# Exactly one preset below should be active; the commented-out presets are
# kept for manual test runs against other sources.

#SOURCE_NAME = "anva_meeus"
#RUN_TS = "20251005T142752505"
#DAG_PATH = "/lakehouse/default/Files/config/dag_anva_meeus_week.json"

#SOURCE_NAME = "ods_reports"
#RUN_TS = "20250923T060123389"
#DAG_PATH = "/lakehouse/default/Files/config/dag_ods_reports_week.json"

SOURCE_NAME = "insurance_data_im"
RUN_TS = "20250923T050205479"
DAG_PATH = "/lakehouse/default/Files/config/dag_insurance_data_im_week.json"

#SOURCE_NAME = "anva_concern"
#RUN_TS = "20250923T190122062"
#DAG_PATH = "/lakehouse/default/Files/config/dag_anva_concern_week.json"

#SOURCE_NAME = "ccs_level"
#RUN_TS = "20250923T040214235"
#DAG_PATH = "/lakehouse/default/Files/config/dag_ccs_level_week.json"

# NOTE(review): this preset pairs source "vizier" with the insurance_data_im
# DAG file and RUN_TS - looks like a copy/paste slip; confirm the intended
# DAG path before enabling it. (It was listed twice; the duplicate is dropped.)
#SOURCE_NAME = "vizier"
#RUN_TS = "20250923T050205479"
#DAG_PATH = "/lakehouse/default/Files/config/dag_insurance_data_im_week.json"


# Enable extra debug logging (gates the [MASTER] prints and is forwarded to
# process_bronze_table).
DEBUG = False

# How many historical runs to take into account (1..5); passed as
# lookback_runs to choose_worker_profile_from_history.
HISTORY_LOOKBACK_RUNS = 3

# Min/max number of workers for this environment; passed as min_workers and
# max_workers_cap to choose_worker_profile_from_history.
MIN_WORKERS = 2
MAX_WORKERS_CAP = 12



StatementMeta(, , -1, SessionError, , SessionError)

InvalidHttpRequestToLivy: [CapacityLimitExceeded] Unable to complete the action because your organization’s Fabric compute capacity has exceeded its limits. Try again later. HTTP status code: 429.

## [2] Import utility notebooks

These `%run` statements must be **standalone** in their own cells.
They load:

- `/01_utils_logging` – logging schema + helpers
- `/02_utils_config` – DAG and configuration utilities
- `/10_bronze_load` – worker function for a single table


In [None]:
%run "/01_utils_logging"

StatementMeta(, , -1, Cancelled, , Cancelled)

In [None]:
%run "/02_utils_config"

StatementMeta(, , -1, Cancelled, , Cancelled)

In [None]:
%run "/10_bronze_load"

StatementMeta(, , -1, Cancelled, , Cancelled)

## [3] Load DAG, validate source and prepare table list

This cell:

- Validates the parameters.
- Reads the DAG.json from OneLake.
- Ensures it belongs to the requested `SOURCE_NAME`.
- Extracts the list of enabled tables.
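- Determines `base_files` and the set of unique `delta_schema` values across the tables.
- Creates each of those schemas if it does not exist yet (before any parallel work starts).
- Sorts tables largest first, using `rows_written` of the most recent successful run in `logs.bronze_processing_log`; tables without history sort last.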


In [None]:
import uuid
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

from pyspark.sql import Window
from pyspark.sql import functions as F

# Set the Parquet rebase mode to CORRECTED for both value kinds (datetime and
# INT96) and both directions (read and write). The nested loop issues the same
# four spark.conf.set calls in the same order as listing them one by one.
for _direction in ("Read", "Write"):
    for _value_kind in ("datetime", "int96"):
        spark.conf.set(
            f"spark.sql.parquet.{_value_kind}RebaseModeIn{_direction}",
            "CORRECTED",
        )

StatementMeta(, , -1, Cancelled, , Cancelled)

In [None]:
# [3] Load DAG, validate source and prepare tables
#
# Produces the globals used by the following cells:
#   dag, tables, base_files, unique_schemas, size_map, tables_sorted

# --- Parameter validation ---------------------------------------------------
if not SOURCE_NAME:
    raise ValueError("Parameter 'SOURCE_NAME' is required (usually provided by Pipeline ForEach).")

if not RUN_TS:
    raise ValueError("Parameter 'RUN_TS' is required.")

if not DAG_PATH:
    raise ValueError("Parameter 'DAG_PATH' is required.")

if DEBUG:
    print(f"[MASTER] source={SOURCE_NAME} run_ts={RUN_TS} dag={DAG_PATH}")

# --- Read and validate the DAG ----------------------------------------------
dag = read_dag(DAG_PATH)
validate_dag_for_source(dag, SOURCE_NAME)

# Table definitions for this source. Fetched once and reused for both the
# schema creation and the sorting below.
tables = get_tables_for_source(dag, SOURCE_NAME)

# Global setting forwarded to every process_bronze_table call.
base_files = get_base_files(dag)

if DEBUG:
    print(f"[MASTER] found {len(tables)} enabled tables in DAG")

# --- Ensure target schemas exist (BEFORE parallel processing) ---------------
# Collect every distinct, non-empty delta_schema and create each one exactly
# once here, so the worker threads never issue CREATE SCHEMA concurrently.
unique_schemas = {t.get("delta_schema") for t in tables if t.get("delta_schema")}

for schema in sorted(unique_schemas):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")
    if DEBUG:
        print(f"[MASTER] Ensured schema exists: {schema}")

if DEBUG:
    print(f"[MASTER] source={SOURCE_NAME} run_ts={RUN_TS} → {len(tables)} tables")
    print(f"[MASTER] Unique schemas: {sorted(unique_schemas)}")

# --- Sort by historical row counts (largest first) --------------------------
# For each table take rows_written of its most recent successful, non-empty
# run. The DataFrame API is used instead of an f-string SQL statement so that
# SOURCE_NAME (a pipeline parameter) is compared as a value and never spliced
# into SQL text.
try:
    _latest_first = Window.partitionBy("table_name").orderBy(F.col("run_ts").desc())
    size_results = (
        spark.table("logs.bronze_processing_log")
        .where(
            (F.col("source") == SOURCE_NAME)
            & (F.col("status") == "SUCCESS")
            & (F.col("rows_written") > 0)
        )
        .withColumn("rn", F.row_number().over(_latest_first))
        .where(F.col("rn") == 1)
        .select("table_name", "rows_written")
        .collect()
    )

    # Lookup: table name -> rows written in its latest successful run.
    size_map = {row.table_name: row.rows_written for row in size_results}
except Exception as e:
    # Best-effort: without usable history (e.g. the log table cannot be read)
    # every table gets size 0 and the DAG order is kept. Report it instead of
    # hiding it, so an unexpectedly unsorted run can be diagnosed.
    size_map = {}
    print(
        f"[MASTER] WARNING: could not read historical sizes for source={SOURCE_NAME}; "
        f"keeping DAG order ({type(e).__name__}: {str(e)[:300]})"
    )


def get_size(table_def):
    """Return the historical row count for a table definition, or 0 if unknown."""
    return size_map.get(table_def["name"], 0)


# Largest first; sorted() is stable, so tables with equal (or unknown) size
# keep their relative DAG order.
tables_sorted = sorted(tables, key=get_size, reverse=True)

if DEBUG:
    print("\nTable processing order (by historical size):")
    total_estimated = 0
    for i, t in enumerate(tables_sorted, 1):
        size = size_map.get(t["name"], 0)
        total_estimated += size
        print(f"  {i:2d}. {t['name']:40s} ~{size:>12,} rows")
    print(f"\nEstimated total: {total_estimated:,} rows")

StatementMeta(, , -1, Cancelled, , Cancelled)

## [4] Execute bronze loads in parallel

This cell:

- Starts a `ThreadPoolExecutor` with `MAX_WORKERS`.
- Submits one `process_bronze_table(...)` task per table.
- Collects results into a list of dicts (`results`).
- Prints progress:
  - With `DEBUG=True`: more detailed lines.
  - With `DEBUG=False`: one concise line per table.


In [None]:
# [4] Run bronze loads in parallel
#
# Produces the globals used by the summary cells:
#   RUN_ID, start_all, end_all, total_duration, MAX_WORKERS, results

# One identifier for this master run; also written to the run summary.
RUN_ID = uuid.uuid4().hex   # alternative: str(uuid.uuid4())

# Wall-clock start of the whole batch (naive UTC datetime).
start_all = datetime.utcnow()
results = []

# Maximum number of tables to process in parallel on the driver. The value is
# delegated to choose_worker_profile_from_history (defined in a utility
# notebook), bounded by MIN_WORKERS / MAX_WORKERS_CAP.
MAX_WORKERS = choose_worker_profile_from_history(
    source_name      = SOURCE_NAME,
    default_workers  = 8,                 # starting point when there is no history yet
    min_workers      = MIN_WORKERS,
    max_workers_cap  = MAX_WORKERS_CAP,
    lookback_runs    = HISTORY_LOOKBACK_RUNS,
)

# Never more workers than tables - but also never fewer than 1:
# ThreadPoolExecutor raises ValueError for max_workers <= 0, which would
# otherwise crash the run when the DAG yields no tables.
MAX_WORKERS = max(1, min(MAX_WORKERS, len(tables_sorted)))

if DEBUG:
    print(f"[MASTER] starting ThreadPool, using MAX_WORKERS={MAX_WORKERS} (tables={len(tables_sorted)})")

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit in sorted order so the largest tables start first.
    future_to_table = {
        executor.submit(
            process_bronze_table,
            t,
            SOURCE_NAME,
            RUN_TS,
            base_files,
            DEBUG
        ): t
        for t in tables_sorted
    }

    # Handle results in completion order, not submission order.
    for future in as_completed(future_to_table):
        t = future_to_table[future]
        t_name = t.get("name")
        try:
            result = future.result()
        except Exception as e:
            # This should be rare; process_bronze_table normally catches its own errors.
            # The per-table start time is unknown here, so start_time and
            # duration_seconds are measured from the start of the whole batch.
            end_time = datetime.utcnow()
            result = {
                "log_id": f"{SOURCE_NAME}:{t_name}:{RUN_TS}:ERROR",
                "run_id": RUN_ID,
                "run_ts": RUN_TS,
                "source": SOURCE_NAME,
                "table_name": t_name,
                "load_mode": t.get("load_mode"),
                "status": "FAILED",
                "rows_read": None,
                "rows_written": None,
                "start_time": start_all,
                "end_time": end_time,
                "duration_seconds": int((end_time - start_all).total_seconds()),
                "error_message": f"Unexpected error in master: {str(e)}",
                "parquet_path": None,
                "delta_table": t.get("delta_table"),
            }

        # RUN_ID is not passed to process_bronze_table, so a worker result
        # cannot carry this run's id on its own. Fill it in when absent so the
        # per-table log rows share the run_id written to the run summary.
        if not result.get("run_id"):
            result["run_id"] = RUN_ID

        results.append(result)

        if DEBUG:
            print(f"[MASTER] {result['table_name']} → {result['status']} (read:{result['rows_read']} written:{result['rows_written']} {result['duration_seconds']}s)")
        else:
            # Minimal but still visible progress
            print(f"[{result['table_name']}] {result['status']} read:{result['rows_read']} written:{result['rows_written']} ({result['duration_seconds']}s)")

# Wall-clock end and total runtime of the batch, in seconds.
end_all = datetime.utcnow()
total_duration = (end_all - start_all).total_seconds()


StatementMeta(, , -1, Cancelled, , Cancelled)

## [5] Batch logging and run summary

This final cell:

- Writes **all** per-table log records in a single batch using
  `log_table_processing_batch(results)`.
- Computes and prints:
  - counts of SUCCESS / EMPTY / FAILED / SKIPPED,
  - total rows,
  - total runtime,
  - theoretical minimum runtime (sum of per-table durations / workers),
  - parallelism efficiency = theoretical_min / actual_time.
- Appends exactly one run-summary record for this run to `logs.bronze_run_summary`.


In [None]:
# [5] Batch logging and run summary

# 5a) Hand every per-table record to the batch logger in one call.
log_table_processing_batch(results)


# 5b) Summary statistics, derived purely from the in-memory results.
def _results_with_status(wanted):
    """Return the result records whose "status" equals `wanted`."""
    return [rec for rec in results if rec["status"] == wanted]


total_tables = len(results)
success_tables = _results_with_status("SUCCESS")
empty_tables   = _results_with_status("EMPTY")
failed_tables  = _results_with_status("FAILED")
skipped_tables = _results_with_status("SKIPPED")

total_success, total_empty = len(success_tables), len(empty_tables)
total_failed, total_skipped = len(failed_tables), len(skipped_tables)

# Missing or None row counts / durations count as 0.
total_rows = 0
sum_task_time = 0
for _rec in results:
    total_rows += _rec.get("rows_written") or 0
    sum_task_time += _rec.get("duration_seconds") or 0

# Lower bound on runtime if the work were spread perfectly over the workers,
# and the ratio of that bound to the measured wall-clock time.
if MAX_WORKERS > 0:
    theoretical_min = sum_task_time / MAX_WORKERS
else:
    theoretical_min = sum_task_time

if total_duration > 0:
    efficiency = theoretical_min / total_duration
else:
    efficiency = 0.0

# --- Write single run-summary record to Delta ---

# If your table has a different name, replace 'logs.bronze_run_summary'.
RUN_SUMMARY_TABLE = "logs.bronze_run_summary"

# Counters for the persisted summary. Here "skipped" is derived as whatever
# remains of the planned tables after SUCCESS / EMPTY / FAILED (never below 0).
# NOTE(review): this can differ from total_skipped above, which counts only
# records with status "SKIPPED" - confirm which definition is intended.
tables_total   = len(tables_sorted)
tables_success = total_success
tables_empty   = total_empty
tables_failed  = total_failed
tables_skipped = max(0, tables_total - tables_success - tables_empty - tables_failed)

summary_row = dict(
    run_id=RUN_ID,
    source=SOURCE_NAME,
    run_ts=RUN_TS,
    run_start=start_all,
    run_end=end_all,
    total_tables=int(tables_total),
    tables_success=int(tables_success),
    tables_empty=int(tables_empty),
    tables_failed=int(tables_failed),
    tables_skipped=int(tables_skipped),
    total_rows=int(total_rows),
    duration_seconds=int(total_duration),
    workers=int(MAX_WORKERS),
    sum_task_seconds=float(sum_task_time),
    theoretical_min_sec=float(theoretical_min),
    actual_time_sec=float(total_duration),
    efficiency_pct=float(efficiency*100),
)

summary_df = spark.createDataFrame([summary_row])

# Append exactly one record for this run.
(
    summary_df.write
    .format("delta")
    .mode("append")
    .saveAsTable(RUN_SUMMARY_TABLE)
)

if DEBUG:
    print(f"[MASTER] Run summary written to {RUN_SUMMARY_TABLE}")



StatementMeta(, , -1, Cancelled, , Cancelled)

In [None]:
# Human-readable run report. The lines are assembled first and written with a
# single print; the resulting text is the same as printing them one by one.
_rule = "============================================================"
_throughput = (total_rows / total_duration) if total_duration > 0 else 0.0

_report_lines = [
    "\n" + _rule,
    f"SUMMARY: {SOURCE_NAME} @ {RUN_TS}",
    _rule,
    f"Total tables:    {total_tables}",
    f"✓ Success:       {total_success}",
    f"  - Empty:       {total_empty}",
    f"✗ Failed:        {total_failed}",
    f"⏭ Skipped:       {total_skipped}",
    f"Total rows:      {total_rows:,}",
    f"Duration:        {total_duration:.1f}s ({total_duration/60.0:.1f}m)",
    f"Throughput:      {_throughput:,.0f} rows/sec",
    "",
    "PARALLELIZATION:",
    f"Workers:         {MAX_WORKERS}",
    f"Sum of tasks:    {sum_task_time:.1f}s",
    f"Theoretical min: {theoretical_min:.1f}s",
    f"Actual time:     {total_duration:.1f}s",
    f"Efficiency:      {efficiency*100:.1f}%",
    _rule,
]

print("\n".join(_report_lines))

StatementMeta(, , -1, Cancelled, , Cancelled)