# 01 – Logging utilities for bronze processing

This notebook defines the logging schema and helpers for the bronze processing
pipeline:

- Creates an append-only Delta table `logs.bronze_processing_log` (plus the run summary table `logs.bronze_run_summary`).
- Ensures the table is partitioned by `run_date` and `table_name`.
- Provides helper functions to:
  - Convert `run_ts` (e.g. `20251005T142752505`) into a `DATE` column (`run_date`).
  - Write many log records in one batch (`log_table_processing_batch`).
  - Write a single log record (`log_table_processing`).
  - Optionally read the latest log row per table (`get_latest_logs`).

The design goals are:
- **One physical log row per table per run**, written in **one batch** from the master notebook.
- **No MERGE**, only `INSERT` (append) for maximum concurrency.


In [None]:
# [1] Setup logging schema and Delta table

from pyspark.sql.types import (
    StructType, StructField, StringType, LongType, TimestampType, DateType, DoubleType
)
from pyspark.sql import Row
from pyspark.sql import functions as F
from datetime import datetime, date

# Run summary log: database, table name and schema
SUMMARY_LOG_DB = "logs"
SUMMARY_LOG_TABLE = "bronze_run_summary"
SUMMARY_LOG_TABLE_FULLNAME = f"{SUMMARY_LOG_DB}.{SUMMARY_LOG_TABLE}"

summary_log_schema = StructType([
    StructField("run_id",              StringType(), False),
    StructField("source",              StringType(),  False),
    StructField("run_ts",              StringType(),  False),
    StructField("run_start",           TimestampType(),  False),
    StructField("run_end",             TimestampType(),  True),
    StructField("total_tables",        LongType(), False),
    StructField("tables_success",      LongType(), True),
    StructField("tables_empty",        LongType(), True),
    StructField("tables_failed",       LongType(), True),
    StructField("tables_skipped",      LongType(), True),
    StructField("total_rows",          LongType(), True),
    StructField("duration_seconds",    LongType(), True),
    StructField("workers",             LongType(),    True),
    StructField("sum_task_seconds",    DoubleType(),  True),
    StructField("theoretical_min_sec", DoubleType(),  True),
    StructField("actual_time_sec",     DoubleType(),  True),
    StructField("efficiency_pct",      DoubleType(),  True),
])

# Database and table name for the processing log
LOG_DB = "logs"
LOG_TABLE = "bronze_processing_log"
LOG_TABLE_FULLNAME = f"{LOG_DB}.{LOG_TABLE}"

# Log table schema (run_date is a DATE, not STRING)
log_schema = StructType([
    StructField("log_id",           StringType(),  False),
    StructField("run_id",           StringType(),  False),
    StructField("run_date",         DateType(),    False),
    StructField("run_ts",           StringType(),  False),
    StructField("source",           StringType(),  False),
    StructField("table_name",       StringType(),  False),
    StructField("load_mode",        StringType(),  True),
    StructField("status",           StringType(),  False),
    StructField("rows_read",        LongType(),    True),
    StructField("rows_written",     LongType(),    True),
    StructField("start_time",       TimestampType(), True),
    StructField("end_time",         TimestampType(), True),
    StructField("duration_seconds", LongType(),    True),
    StructField("error_message",    StringType(),  True),
    StructField("parquet_path",     StringType(),  True),
    StructField("delta_table",      StringType(),  True),
])

# Ensure every schema exists BEFORE creating any table inside it; otherwise
# the very first saveAsTable on a fresh workspace fails because the schema
# is missing. dict.fromkeys de-duplicates while keeping order, so a shared
# schema name is only created once.
for _schema_name in dict.fromkeys((SUMMARY_LOG_DB, LOG_DB)):
    spark.sql(f"CREATE SCHEMA IF NOT EXISTS {_schema_name}")

# Idempotent create of the Delta run summary table (unpartitioned)
if not spark.catalog.tableExists(SUMMARY_LOG_TABLE_FULLNAME):
    empty_sdf = spark.createDataFrame([], summary_log_schema)

    (empty_sdf.write
         .format("delta")
         .mode("overwrite")
         .saveAsTable(SUMMARY_LOG_TABLE_FULLNAME))

# Idempotent create of the Delta processing log table,
# partitioned by run_date and table_name
if not spark.catalog.tableExists(LOG_TABLE_FULLNAME):
    empty_df = spark.createDataFrame([], log_schema)

    (empty_df.write
         .format("delta")
         .partitionBy("run_date", "table_name")
         .mode("overwrite")
         .saveAsTable(LOG_TABLE_FULLNAME))

## [2] Logging helper functions

This cell defines:

- `build_run_date(run_ts)` – converts `20251005T142752505` → `date(2025, 10, 5)` using Python.
- `log_table_processing_batch(records)` – writes a list of log dicts in **one** append operation.
- `log_table_processing(**kwargs)` – convenience wrapper for a single record.
- `get_latest_logs(run_ts)` – returns the latest log row per `(source, table_name)` for a given run.


In [None]:
# [2] Logging helper functions

from pyspark.sql.window import Window

def build_run_date(run_ts: str) -> date:
    """
    Convert a run_ts like '20251005T142752505' into a Python date(2025, 10, 5).

    Only the first 8 characters (yyyymmdd) are used; anything after them is
    ignored. We do this in Python to avoid Spark date parsing issues with
    ANSI mode.

    Args:
        run_ts: Run timestamp string starting with an 8-digit yyyymmdd prefix.

    Returns:
        The calendar date encoded in the prefix.

    Raises:
        ValueError: If run_ts is empty, shorter than 8 characters, has a
            prefix that is not exactly 8 ASCII digits, or encodes an
            impossible calendar date (e.g. month 13, Feb 30).
    """
    if not run_ts or len(run_ts) < 8:
        raise ValueError(f"run_ts '{run_ts}' is not in expected yyyymmddThhmmss format")

    prefix = run_ts[:8]
    # int() accepts signs, surrounding whitespace and '_' separators, so a
    # malformed prefix such as '2025 1 5' or '20_51005' would otherwise be
    # silently turned into a (wrong) date. Require plain ASCII digits.
    if not (prefix.isascii() and prefix.isdigit()):
        raise ValueError(f"run_ts '{run_ts}' is not in expected yyyymmddThhmmss format")

    try:
        return date(int(prefix[0:4]), int(prefix[4:6]), int(prefix[6:8]))
    except ValueError as err:
        # date() rejects out-of-range components; re-raise naming the input.
        raise ValueError(
            f"run_ts '{run_ts}' does not start with a valid calendar date"
        ) from err


def log_table_processing_batch(records, log_db: str = LOG_DB, log_table: str = LOG_TABLE):
    """
    Append a batch of log records to the processing log in one Delta write.

    records: iterable of dict-like objects (read via .get). Keys consumed:
      - log_id, run_id, run_ts, source, table_name, load_mode, status
      - rows_read, rows_written, start_time, end_time, duration_seconds
      - error_message, parquet_path, delta_table
      - run_date (optional; derived from run_ts via build_run_date when absent/None)

    log_db / log_table: target table, written as f"{log_db}.{log_table}".

    Returns None. Does nothing when records is empty.
    Raises ValueError when any record has a missing/empty run_ts.
    """
    batch = list(records)
    if not batch:
        return

    # Same order as the fields of log_schema: the Row values must line up
    # with the schema columns.
    column_order = (
        "log_id", "run_id", "run_date", "run_ts", "source", "table_name",
        "load_mode", "status", "rows_read", "rows_written", "start_time",
        "end_time", "duration_seconds", "error_message", "parquet_path",
        "delta_table",
    )

    def _as_row(rec):
        ts = rec.get("run_ts")
        if not ts:
            raise ValueError("log record is missing run_ts")

        values = {name: rec.get(name) for name in column_order}
        # Re-assigning an existing key keeps its position in the dict.
        if values["run_date"] is None:
            values["run_date"] = build_run_date(ts)
        return Row(**values)

    log_rows = [_as_row(rec) for rec in batch]

    target = f"{log_db}.{log_table}"
    frame = spark.createDataFrame(log_rows, schema=log_schema)
    writer = frame.write.format("delta").mode("append")
    writer.saveAsTable(target)


def log_table_processing(**kwargs):
    """
    Log one record by delegating to log_table_processing_batch.

    The keyword arguments form the record dict, so they use the same keys
    the batch function reads (log_id, run_id, run_ts, source, table_name,
    status, ...).

    Example:
        log_table_processing(
            log_id=...,
            run_id=...,
            run_ts=...,
            source=...,
            table_name=...,
            status="SUCCESS",
            ...
        )
    """
    single_record = kwargs
    # A one-element batch; the default log_db / log_table are used.
    log_table_processing_batch((single_record,))


def get_latest_logs(run_ts: str, log_db=None, log_table=None):
    """
    Return, for a given run_ts, the latest log row per (source, table_name).

    With the current design we write exactly one row per table per run_ts,
    but this function still works if you ever decide to log multiple times.

    Args:
        run_ts: Run timestamp string to filter the log on (exact match).
        log_db: Optional database name; defaults to LOG_DB.
        log_table: Optional table name; defaults to LOG_TABLE. Together
            these mirror the overrides accepted by
            log_table_processing_batch, so logs written to a non-default
            table can be read back.

    Returns:
        A Spark DataFrame with the log table's columns and at most one row
        per (source, table_name).
    """
    # Resolve defaults at call time so the default target is the same table
    # as LOG_TABLE_FULLNAME.
    db_name = LOG_DB if log_db is None else log_db
    table_name = LOG_TABLE if log_table is None else log_table

    df = spark.table(f"{db_name}.{table_name}").where(F.col("run_ts") == run_ts)

    # "Latest" = greatest end_time, then greatest start_time (nulls last).
    # log_id is a final tie-breaker so the chosen row is deterministic when
    # both timestamps tie or are null.
    w = Window.partitionBy("source", "table_name").orderBy(
        F.col("end_time").desc_nulls_last(),
        F.col("start_time").desc_nulls_last(),
        F.col("log_id").desc(),
    )

    return (df
            .withColumn("rn", F.row_number().over(w))
            .where(F.col("rn") == 1)
            .drop("rn"))
