# 10 – Bronze load worker

This notebook defines the *worker* function that loads a **single table** from
parquet into a bronze Delta table.

Design:

- Does **not** write to the log table.
- Reads parquet from OneLake using the relative path `Files/<base_files>/<source>/<yyyy>/<mm>/<dd>/<run_ts>/<table>/*.parquet`.
- Supports three `load_mode` values, coming from the DAG:
  - `snapshot` – overwrite the entire Delta table.
  - `window` – treated like `snapshot` (the parquet already contains the window).
  - `incremental` – append-only.
- Returns a Python `dict` with all metrics needed for logging and summary.
- Handles:
  - Missing parquet files → status = `EMPTY`.
  - Unknown `load_mode` → status = `SKIPPED` (no Spark work).
  - Probable corrupt Delta table → optional drop+recreate on write error.

This notebook is intended to be imported via `%run "/10_bronze_load"` from the
master notebook.


## [1] Imports and helper functions

This cell defines:

- `build_parquet_dir(base_files, source_name, run_ts, table_name)` – builds the
  OneLake directory for parquet.
- `is_missing_path_error(exc)` – detects "no parquet files found".
- `is_probably_corrupt_delta(exc)` – detects a corrupt or incompatible Delta table.


In [None]:
# [1] Imports and helper functions

from pyspark.sql import functions as F
from datetime import datetime

def build_parquet_dir(base_files: str,
                      source_name: str,
                      run_ts: str,
                      table_name: str) -> str:
    """
    Build the directory that holds the parquet files of one table and run_ts.

    The first eight characters of ``run_ts`` (yyyymmdd) become the
    year/month/day path segments; the complete ``run_ts`` is kept as its own
    segment. The returned path is relative and starts at ``Files/``.

    Example result:
      Files/greenhouse_sources/anva_meeus/2025/10/05/20251005T142752505/Dim_Kantoor

    Raises:
        ValueError: if ``run_ts`` is empty, shorter than 8 characters, or does
            not start with 8 digits.
    """
    date_part = run_ts[:8] if run_ts else ""
    # A length check alone would let values such as "2025-10-05" through and
    # silently produce a nonsensical year/month/day folder structure.
    if len(date_part) < 8 or not (date_part.isascii() and date_part.isdigit()):
        raise ValueError(f"run_ts '{run_ts}' is not in expected yyyymmddThhmmss format")

    year, month, day = date_part[0:4], date_part[4:6], date_part[6:8]

    return f"Files/{base_files}/{source_name}/{year}/{month}/{day}/{run_ts}/{table_name}"

def is_missing_path_error(exc: Exception) -> bool:
    """
    Tell whether an exception looks like a 'path / file not found' failure.

    This is a heuristic: it does a case-insensitive substring match on the
    exception text against a fixed list of known phrases.
    """
    needles = (
        "path does not exist",
        "no such file or directory",
        "file not found",
        "cannot find path",
    )
    text = str(exc).lower()
    return any(needle in text for needle in needles)


def is_probably_corrupt_delta(exc: Exception) -> bool:
    """
    Guess whether an exception points at a broken or incompatible Delta table.

    This is a heuristic: it does a case-insensitive substring match on the
    exception text. A message is considered a hit when it contains one of the
    known phrases, or when it mentions both "delta log" and "error".
    """
    text = str(exc).lower()

    for fragment in ("is not a delta table",
                     "failed to merge fields",
                     "incompatible format"):
        if fragment in text:
            return True

    return "delta log" in text and "error" in text


## [2] Core worker: process_bronze_table

This function:

- Loads **one** table's parquet files for a specific `run_ts`.
- Writes to a bronze Delta table in the schema named by `table_def["delta_schema"]`
  (falls back to `unk_schema` when that key is missing).
- Does *not* log to the log table; it only returns a dict.

Signature:

```python
process_bronze_table(
    table_def,        # dict from DAG["tables"][...]
    source_name,      # e.g. "anva_meeus"
    run_ts,           # e.g. "20251005T142752505"
    base_files,       # e.g. "greenhouse_sources"
    debug=False
) -> dict
```


In [None]:
# [2] Core worker function
from uuid import uuid4

def process_bronze_table(table_def: dict,
                         source_name: str,
                         run_ts: str,
                         base_files: str,
                         debug: bool = False) -> dict:
    """
    Load a single table's parquet files for a given run_ts into a bronze Delta table.

    Returns a dict with all fields needed for logging and summary.
    This function does not write to the log table itself; the master notebook
    is responsible for batch logging.

    Relies on two names that are not defined in this function and must exist
    in the notebook session: ``spark`` and ``RUN_ID``.

    Args:
        table_def: Table definition. Keys read here: ``name`` (required),
            ``load_mode`` (defaults to "snapshot"), ``delta_table`` (defaults
            to ``name``) and ``delta_schema`` (defaults to "unk_schema").
        source_name: Source system name; used in the parquet path and log_id.
        run_ts: Run timestamp; used in the parquet path and log_id.
        base_files: Base folder below ``Files/`` that holds the exports.
        debug: When True, print progress messages.

    Returns:
        dict with keys ``log_id``, ``run_id``, ``run_ts``, ``source``,
        ``table_name``, ``load_mode``, ``status``, ``rows_read``,
        ``rows_written``, ``start_time``, ``end_time``, ``duration_seconds``,
        ``error_message``, ``parquet_path`` and ``delta_table``.

        ``status`` is always one of:
          - "SKIPPED": unsupported ``load_mode`` (Spark is not touched).
          - "EMPTY":   no parquet files found, or parquet contains 0 rows.
          - "SUCCESS": data was written (possibly after drop+recreate).
          - "FAILED":  reading or writing failed.

    Raises:
        ValueError: if ``table_def`` has no ``name`` (or ``run_ts`` is rejected
            by ``build_parquet_dir``).
    """
    table_name = table_def.get("name")
    if not table_name:
        raise ValueError("table_def is missing 'name'")

    load_mode = (table_def.get("load_mode") or "snapshot").lower()
    supported_modes = {"snapshot", "window", "incremental"}

    log_id = f"{source_name}:{table_name}:{run_ts}:{uuid4().hex[:8]}"

    # Naive UTC timestamps are kept on purpose so the type of start_time /
    # end_time seen by the caller does not change.
    start_time = datetime.utcnow()

    # Filled in once the load_mode has been validated; the result builder
    # below reads whatever value they hold at the time it is called.
    parquet_dir = None
    delta_table_full = None

    def _result(status, rows_read, rows_written, error_message):
        """Build the result dict, stamping end_time and duration at call time."""
        end_time = datetime.utcnow()
        return {
            "log_id": log_id,
            "run_id": RUN_ID,
            "run_ts": run_ts,
            "source": source_name,
            "table_name": table_name,
            "load_mode": load_mode,
            "status": status,
            "rows_read": rows_read,
            "rows_written": rows_written,
            "start_time": start_time,
            "end_time": end_time,
            "duration_seconds": int((end_time - start_time).total_seconds()),
            "error_message": error_message,
            "parquet_path": parquet_dir,
            "delta_table": delta_table_full,
        }

    # If load_mode is not recognised, do not even touch Spark
    if load_mode not in supported_modes:
        result = _result("SKIPPED", None, None, f"Unsupported load_mode '{load_mode}'")
        if debug:
            print(f"[{table_name}] SKIPPED: unsupported load_mode '{load_mode}'")
        return result

    target_table = table_def.get("delta_table") or table_name
    delta_schema = table_def.get("delta_schema") or "unk_schema"
    delta_table_full = f"{delta_schema}.{target_table}"

    parquet_dir = build_parquet_dir(base_files, source_name, run_ts, table_name)
    parquet_glob = f"{parquet_dir}/*.parquet"

    # 1) Read parquet
    try:
        df = spark.read.parquet(parquet_glob)
        rows_read = df.count()
    except Exception as e:
        if is_missing_path_error(e):
            # Treat as empty export: no data for this table in this run_ts.
            # Reported as EMPTY (not SKIPPED) so it matches the debug message
            # and stays distinguishable from an unsupported load_mode.
            result = _result("EMPTY", 0, 0,
                             f"No parquet files found in folder {parquet_dir}.")
            if debug:
                print(f"[{table_name}] EMPTY: no parquet files found in folder {parquet_dir}")
            return result

        result = _result("FAILED", None, None, f"Read parquet failed: {str(e)}")
        if debug:
            print(f"[{table_name}] FAILED while reading parquet: {str(e)}")
        return result

    if debug:
        print(f"[{table_name}] rows_read={rows_read}")

    def _overwrite_target():
        """Replace the whole target table (data and schema) with df."""
        (df.write
           .format("delta")
           .mode("overwrite")
           .option("overwriteSchema", "true")
           .saveAsTable(delta_table_full))

    # 2) Write to Delta
    status = "FAILED"
    error_message = None
    rows_written = None
    try:
        if load_mode == "incremental":
            # Append only; no pre-read health-check for performance
            (df.write
               .format("delta")
               .mode("append")
               .saveAsTable(delta_table_full))
            # Counting the target table here would report its total size,
            # not what this run appended.
            rows_written = rows_read
        else:
            # snapshot / window: overwrite the entire table, then verify by
            # counting what actually landed.
            _overwrite_target()
            rows_written = spark.table(delta_table_full).count()

        # Check for an empty dataset. The write above has already happened,
        # so in snapshot/window mode the target table is now empty as well.
        if rows_read == 0:
            result = _result("EMPTY", 0, 0, "Parquet exists but contains 0 rows.")
            if debug:
                print(f"[{table_name}] EMPTY: parquet exists but contains 0 rows")
            return result

        if rows_written:
            status = "SUCCESS"
        else:
            # Rows were read but the target is empty: never leave the
            # transient "RUNNING" state in the result.
            status = "FAILED"
            error_message = (
                f"Write finished but {delta_table_full} contains 0 rows "
                f"although {rows_read} rows were read."
            )

    except Exception as e:
        # Try one recovery attempt if the target Delta table looks corrupt
        if is_probably_corrupt_delta(e):
            if debug:
                print(f"[{table_name}] write failed, attempting drop+recreate due to probable corrupt Delta table: {str(e)}")
            try:
                # NOTE(review): this also runs for load_mode "incremental",
                # where dropping the table discards previously appended rows.
                spark.sql(f"DROP TABLE IF EXISTS {delta_table_full}")
                _overwrite_target()
                status = "SUCCESS"
                rows_written = spark.table(delta_table_full).count()
                error_message = f"Initial write failed but table was recreated. Original error: {str(e)}"
            except Exception as e2:
                status = "FAILED"
                error_message = f"Write failed and recovery failed: {str(e2)}"
        else:
            status = "FAILED"
            error_message = f"Write failed: {str(e)}"

    result = _result(status, rows_read, rows_written, error_message)

    if debug:
        print(f"[{table_name}] {status} in {result['duration_seconds']}s")

    return result
