# 01 — Logging Utilities for Bronze and Silver Processing

This notebook defines the logging infrastructure for the data pipeline:

## Bronze Logging
- `logs.bronze_processing_log` - Per-table processing results
- `logs.bronze_run_summary` - Aggregated run statistics

## Silver Logging
- `logs.silver_processing_log` - Per-table CDC merge results
- `logs.silver_run_summary` - Aggregated CDC statistics

## Key Features
- Batch logging (one write per run, not per table)
- Append-only (no MERGE, maximum concurrency)
- Processing logs are partitioned by `run_date` and `table_name`; run summaries are partitioned by `run_date` only
- Helper functions for log retrieval and analysis

**Architecture:** Bronze uses append with run_ts history for full CDC capability

In [None]:
# Parameters (Papermill compatible)
# These can be overridden when running via notebook orchestration
# LOG_SCHEMA is interpolated into every fully-qualified log table name and into
# the CREATE SCHEMA statement, so changing it relocates all log tables.
LOG_SCHEMA = "logs"  # Database for all log tables

## [1] Setup: Imports and Schemas

In [None]:
from pyspark.sql.types import (
    StructType, StructField, StringType, LongType, IntegerType,
    TimestampType, DateType, DoubleType
)
from pyspark.sql import Row
from pyspark.sql import functions as F
from datetime import date
from typing import List, Dict, Any, Optional
import sys, os
from pyspark.sql import DataFrame 
from delta.tables import DeltaTable
import shutil

## [2] Bronze Processing Log Schema

In [None]:
# Bronze processing log: one row per table load attempt within a run
BRONZE_LOG_TABLE = "bronze_processing_log"
BRONZE_LOG_TABLE_FULLNAME = f"{LOG_SCHEMA}.{BRONZE_LOG_TABLE}"

# Built field by field; the third argument of .add() is the nullable flag.
bronze_log_schema = (
    StructType()
    # --- Identifiers ---
    .add("log_id", StringType(), False)            # unique id of this log entry
    .add("run_id", StringType(), False)            # identifier of the run
    .add("run_date", DateType(), False)            # partition key, derived from run_ts
    .add("run_ts", StringType(), False)            # run timestamp (yyyyMMddTHHmmssSSS)
    # --- Source information ---
    .add("source", StringType(), False)            # source system name
    .add("table_name", StringType(), False)        # table name (partition key)
    .add("load_mode", StringType(), True)          # snapshot, incremental, window
    # --- Processing results ---
    .add("status", StringType(), False)            # SUCCESS, FAILED, SKIPPED, EMPTY
    .add("rows_read", LongType(), True)            # rows read from parquet
    .add("rows_written", LongType(), True)         # rows written to Bronze
    # --- Timing ---
    .add("start_time", TimestampType(), True)
    .add("end_time", TimestampType(), True)
    .add("duration_seconds", LongType(), True)
    # --- Error handling ---
    .add("error_message", StringType(), True)      # cut to 1000 chars plus a truncation marker
    # --- Source/target paths ---
    .add("parquet_path", StringType(), True)       # source parquet folder
    .add("delta_table", StringType(), True)        # target Bronze table
)

print(f"Bronze log schema defined: {BRONZE_LOG_TABLE_FULLNAME}")

## [3] Bronze Run Summary Schema

In [None]:
# Bronze run summary: one aggregated row of statistics per run
BRONZE_SUMMARY_TABLE = "bronze_run_summary"
BRONZE_SUMMARY_TABLE_FULLNAME = f"{LOG_SCHEMA}.{BRONZE_SUMMARY_TABLE}"

# Built field by field; the third argument of .add() is the nullable flag.
bronze_summary_schema = (
    StructType()
    # --- Run identifiers ---
    .add("run_id", StringType(), False)
    .add("source", StringType(), False)
    .add("run_ts", StringType(), False)
    .add("run_date", DateType(), False)
    # --- Timing ---
    .add("run_start", TimestampType(), False)
    .add("run_end", TimestampType(), True)
    .add("duration_seconds", LongType(), True)
    # --- Table counts ---
    .add("total_tables", LongType(), False)
    .add("tables_success", LongType(), True)
    .add("tables_empty", LongType(), True)
    .add("tables_failed", LongType(), True)
    .add("tables_skipped", LongType(), True)
    # --- Row counts ---
    .add("total_rows", LongType(), True)
    # --- Performance metrics ---
    .add("workers", IntegerType(), True)             # parallel workers used
    .add("sum_task_seconds", DoubleType(), True)     # sum of all table durations
    .add("theoretical_min_sec", DoubleType(), True)  # sum / workers
    .add("actual_time_sec", DoubleType(), True)      # wall clock time
    .add("efficiency_pct", DoubleType(), True)       # (theoretical / actual) * 100
    # --- Failed tables list ---
    .add("failed_tables", StringType(), True)        # JSON array of failed table names
)

print(f"Bronze summary schema defined: {BRONZE_SUMMARY_TABLE_FULLNAME}")

## [4] Silver Processing Log Schema

In [None]:
# Silver processing log: one row per table CDC merge attempt within a run
SILVER_LOG_TABLE = "silver_processing_log"
SILVER_LOG_TABLE_FULLNAME = f"{LOG_SCHEMA}.{SILVER_LOG_TABLE}"

# Built field by field; the third argument of .add() is the nullable flag.
silver_log_schema = (
    StructType()
    # --- Identifiers ---
    .add("log_id", StringType(), False)
    .add("run_id", StringType(), False)
    .add("run_date", DateType(), False)
    .add("run_ts", StringType(), False)
    # --- Source information ---
    .add("source", StringType(), False)
    .add("table_name", StringType(), False)
    .add("load_mode", StringType(), True)
    # --- Processing results ---
    .add("status", StringType(), False)            # SUCCESS, FAILED, SKIPPED
    # --- CDC statistics ---
    .add("rows_inserted", LongType(), True)        # new rows added to Silver
    .add("rows_updated", LongType(), True)         # existing rows updated
    .add("rows_deleted", LongType(), True)         # rows marked as deleted
    .add("rows_unchanged", LongType(), True)       # rows with no changes
    .add("total_silver_rows", LongType(), True)    # total rows in Silver after merge
    # --- Bronze source info ---
    .add("bronze_rows", LongType(), True)          # rows processed from Bronze
    .add("bronze_table", StringType(), True)       # source Bronze table
    # --- Timing ---
    .add("start_time", TimestampType(), True)
    .add("end_time", TimestampType(), True)
    .add("duration_seconds", LongType(), True)
    # --- Error handling ---
    .add("error_message", StringType(), True)
    # --- Target ---
    .add("silver_table", StringType(), True)       # target Silver table
)

print(f"Silver log schema defined: {SILVER_LOG_TABLE_FULLNAME}")

## [5] Silver Run Summary Schema

In [None]:
# Silver run summary: one aggregated row of CDC statistics per run
SILVER_SUMMARY_TABLE = "silver_run_summary"
SILVER_SUMMARY_TABLE_FULLNAME = f"{LOG_SCHEMA}.{SILVER_SUMMARY_TABLE}"

# Built field by field; the third argument of .add() is the nullable flag.
silver_summary_schema = (
    StructType()
    # --- Run identifiers ---
    .add("run_id", StringType(), False)
    .add("source", StringType(), False)
    .add("run_ts", StringType(), False)
    .add("run_date", DateType(), False)
    # --- Timing ---
    .add("run_start", TimestampType(), False)
    .add("run_end", TimestampType(), True)
    .add("duration_seconds", LongType(), True)
    # --- Table counts ---
    .add("total_tables", LongType(), False)
    .add("tables_success", LongType(), True)
    .add("tables_failed", LongType(), True)
    .add("tables_skipped", LongType(), True)
    # --- Aggregate CDC statistics ---
    .add("total_inserts", LongType(), True)
    .add("total_updates", LongType(), True)
    .add("total_deletes", LongType(), True)
    .add("total_unchanged", LongType(), True)
    # --- Failed tables ---
    .add("failed_tables", StringType(), True)
)

print(f"Silver summary schema defined: {SILVER_SUMMARY_TABLE_FULLNAME}")

## [6] Create Log Tables (Idempotent)

In [None]:
# The schema must be present before any table DDL below can run
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {LOG_SCHEMA}")
print(f"✓ Schema '{LOG_SCHEMA}' ready")

# Bronze processing log: created only when absent, so re-running this cell is a no-op
if spark.catalog.tableExists(BRONZE_LOG_TABLE_FULLNAME):
    print(f"✓ Exists: {BRONZE_LOG_TABLE_FULLNAME}")
else:
    spark.sql(f"""
        CREATE TABLE {BRONZE_LOG_TABLE_FULLNAME} (
            log_id STRING,
            run_id STRING,
            run_date DATE,
            run_ts STRING,
            source STRING,
            table_name STRING,
            load_mode STRING,
            status STRING,
            rows_read LONG,
            rows_written LONG,
            start_time TIMESTAMP,
            end_time TIMESTAMP,
            duration_seconds LONG,
            error_message STRING,
            parquet_path STRING,
            delta_table STRING
        )
        USING DELTA
        PARTITIONED BY (run_date, table_name)
    """)
    print(f"✓ Created: {BRONZE_LOG_TABLE_FULLNAME}")

# Create Bronze run summary (only when absent, so re-running this cell is a no-op).
# Column types mirror bronze_summary_schema: `workers` is an INT and the three
# timing metrics are DOUBLEs. Declaring them as LONG would make appends of the
# DataFrame built from bronze_summary_schema mismatch the table (or drop the
# fractional seconds).
# NOTE: a table created earlier with the old LONG columns is left untouched by
# this cell and has to be migrated separately.
if not spark.catalog.tableExists(BRONZE_SUMMARY_TABLE_FULLNAME):
    spark.sql(f"""
        CREATE TABLE {BRONZE_SUMMARY_TABLE_FULLNAME} (
            run_id STRING,
            source STRING,
            run_ts STRING,
            run_date DATE,
            run_start TIMESTAMP,
            run_end TIMESTAMP,
            duration_seconds LONG,
            total_tables LONG,
            tables_success LONG,
            tables_empty LONG,
            tables_failed LONG,
            tables_skipped LONG,
            total_rows LONG,
            workers INT,
            sum_task_seconds DOUBLE,
            theoretical_min_sec DOUBLE,
            actual_time_sec DOUBLE,
            efficiency_pct DOUBLE,
            failed_tables STRING
        )
        USING DELTA
        PARTITIONED BY (run_date)
    """)
    print(f"✓ Created: {BRONZE_SUMMARY_TABLE_FULLNAME}")
else:
    print(f"✓ Exists: {BRONZE_SUMMARY_TABLE_FULLNAME}")

# Silver processing log: created only when absent, so re-running this cell is a no-op
if spark.catalog.tableExists(SILVER_LOG_TABLE_FULLNAME):
    print(f"✓ Exists: {SILVER_LOG_TABLE_FULLNAME}")
else:
    spark.sql(f"""
        CREATE TABLE {SILVER_LOG_TABLE_FULLNAME} (
            log_id STRING,
            run_id STRING,
            run_date DATE,
            run_ts STRING,
            source STRING,
            table_name STRING,
            load_mode STRING,
            status STRING,
            rows_inserted LONG,
            rows_updated LONG,
            rows_deleted LONG,
            rows_unchanged LONG,
            total_silver_rows LONG,
            bronze_rows LONG,
            bronze_table STRING,
            start_time TIMESTAMP,
            end_time TIMESTAMP,
            duration_seconds LONG,
            error_message STRING,
            silver_table STRING
        )
        USING DELTA
        PARTITIONED BY (run_date, table_name)
    """)
    print(f"✓ Created: {SILVER_LOG_TABLE_FULLNAME}")

# Silver run summary: created only when absent, so re-running this cell is a no-op
if spark.catalog.tableExists(SILVER_SUMMARY_TABLE_FULLNAME):
    print(f"✓ Exists: {SILVER_SUMMARY_TABLE_FULLNAME}")
else:
    spark.sql(f"""
        CREATE TABLE {SILVER_SUMMARY_TABLE_FULLNAME} (
            run_id STRING,
            source STRING,
            run_ts STRING,
            run_date DATE,
            run_start TIMESTAMP,
            run_end TIMESTAMP,
            duration_seconds LONG,
            total_tables LONG,
            tables_success LONG,
            tables_failed LONG,
            tables_skipped LONG,
            total_inserts LONG,
            total_updates LONG,
            total_deletes LONG,
            total_unchanged LONG,
            failed_tables STRING
        )
        USING DELTA
        PARTITIONED BY (run_date)
    """)
    print(f"✓ Created: {SILVER_SUMMARY_TABLE_FULLNAME}")


# Closing status line for the table-creation cell
print("\n✓ All log tables ready")

## [7] Helper Functions

In [None]:
def build_run_date(run_ts: str) -> date:
    """
    Convert a run_ts like '20251005T142752505' into a Python date(2025, 10, 5).

    Only the first eight characters (yyyymmdd) are inspected; the rest of the
    string is ignored. Parsing in Python avoids Spark date parsing issues with
    ANSI mode.

    Args:
        run_ts: Run timestamp string starting with an eight-digit yyyymmdd date.

    Returns:
        The calendar date encoded in the first eight characters.

    Raises:
        ValueError: If run_ts is empty, shorter than eight characters, does not
            start with eight ASCII digits, or encodes an impossible date.
    """
    prefix = run_ts[:8] if run_ts else ""

    # int() tolerates whitespace and signs (int(" 1") == 1), so require plain
    # ASCII digits explicitly instead of letting malformed stamps slip through.
    if len(prefix) < 8 or any(ch not in "0123456789" for ch in prefix):
        raise ValueError(f"run_ts '{run_ts}' is not in expected yyyymmddThhmmss format")

    try:
        return date(int(prefix[0:4]), int(prefix[4:6]), int(prefix[6:8]))
    except ValueError as exc:
        # date() rejects things like month 13; re-raise naming the offending run_ts.
        raise ValueError(
            f"run_ts '{run_ts}' does not start with a valid yyyymmdd calendar date"
        ) from exc


def truncate_error_message(error_msg: Optional[str], max_length: int = 1000) -> Optional[str]:
    """
    Cap the size of an error message so log tables do not bloat.

    Args:
        error_msg: Message to store; None or an empty string means "no error".
        max_length: Number of leading characters kept when the message is cut.

    Returns:
        None for a missing/empty message, the message unchanged when it fits
        within max_length, otherwise its first max_length characters followed
        by a truncation marker.
    """
    if not error_msg:
        return None

    marker = "... [TRUNCATED]"
    too_long = len(error_msg) > max_length
    return error_msg[:max_length] + marker if too_long else error_msg


print("✓ Helper functions defined")

## [8] Bronze Logging Functions

In [None]:
def log_bronze_batch(records: List[Dict[str, Any]]) -> None:
    """
    Append a batch of Bronze processing log records in one Delta write.

    Args:
        records: Dicts describing per-table Bronze results. Keys read from
            each dict: log_id, run_id, run_ts, run_date (optional), source,
            table_name, load_mode, status, rows_read, rows_written,
            start_time, end_time, duration_seconds, error_message,
            parquet_path, delta_table. Missing keys are stored as None.

    Raises:
        ValueError: If any record has no run_ts.

    An empty or None `records` is a no-op.
    """
    if not records:
        return

    def _to_row(rec):
        # One log Row per record; run_ts is mandatory because run_date falls back to it.
        ts = rec.get("run_ts")
        if not ts:
            raise ValueError("Bronze log record is missing run_ts")

        # Prefer an explicit run_date, otherwise derive it from run_ts.
        partition_date = rec.get("run_date")
        if partition_date is None:
            partition_date = build_run_date(ts)

        return Row(
            log_id=rec.get("log_id"),
            run_id=rec.get("run_id"),
            run_date=partition_date,
            run_ts=ts,
            source=rec.get("source"),
            table_name=rec.get("table_name"),
            load_mode=rec.get("load_mode"),
            status=rec.get("status"),
            rows_read=rec.get("rows_read"),
            rows_written=rec.get("rows_written"),
            start_time=rec.get("start_time"),
            end_time=rec.get("end_time"),
            duration_seconds=rec.get("duration_seconds"),
            error_message=truncate_error_message(rec.get("error_message")),
            parquet_path=rec.get("parquet_path"),
            delta_table=rec.get("delta_table"),
        )

    log_rows = [_to_row(rec) for rec in records]
    log_df = spark.createDataFrame(log_rows, schema=bronze_log_schema)

    # Single append for the whole batch
    writer = log_df.write.format("delta").mode("append")
    writer.saveAsTable(BRONZE_LOG_TABLE_FULLNAME)

    print(f"✓ Logged {len(records)} Bronze records to {BRONZE_LOG_TABLE_FULLNAME}")


def log_bronze_summary(summary: Dict[str, Any]) -> None:
    """
    Write Bronze run summary.

    Appends one row built from `summary` to the Bronze run summary table.

    Args:
        summary: Dict with run-level statistics. Keys read: run_id, source,
            run_ts, run_date (optional, derived from run_ts when absent),
            run_start, run_end, duration_seconds, total_tables,
            tables_success, tables_empty, tables_failed, tables_skipped,
            total_rows, workers, sum_task_seconds, theoretical_min_sec,
            actual_time_sec, efficiency_pct, failed_tables. Missing keys are
            stored as None.

    Raises:
        ValueError: If run_ts is missing or empty.
    """
    import json

    run_ts = summary.get("run_ts")
    if not run_ts:
        raise ValueError("Summary missing run_ts")

    run_date = summary.get("run_date")
    if run_date is None:
        run_date = build_run_date(run_ts)

    # Convert failed_tables list to JSON string (an empty list is stored as NULL)
    failed_tables = summary.get("failed_tables", [])
    failed_tables_json = json.dumps(failed_tables) if failed_tables else None

    def _as_double(key: str) -> Optional[float]:
        # PySpark's schema verification rejects a Python int for a DoubleType
        # field, and these metrics are easily ints (e.g. a sum of whole-second
        # durations), so normalise them to float while keeping None as None.
        value = summary.get(key)
        return None if value is None else float(value)

    row = Row(
        run_id              = summary.get("run_id"),
        source              = summary.get("source"),
        run_ts              = run_ts,
        run_date            = run_date,
        run_start           = summary.get("run_start"),
        run_end             = summary.get("run_end"),
        duration_seconds    = summary.get("duration_seconds"),
        total_tables        = summary.get("total_tables"),
        tables_success      = summary.get("tables_success"),
        tables_empty        = summary.get("tables_empty"),
        tables_failed       = summary.get("tables_failed"),
        tables_skipped      = summary.get("tables_skipped"),
        total_rows          = summary.get("total_rows"),
        workers             = summary.get("workers"),
        sum_task_seconds    = _as_double("sum_task_seconds"),
        theoretical_min_sec = _as_double("theoretical_min_sec"),
        actual_time_sec     = _as_double("actual_time_sec"),
        efficiency_pct      = _as_double("efficiency_pct"),
        failed_tables       = failed_tables_json,
    )

    df = spark.createDataFrame([row], schema=bronze_summary_schema)

    (df.write
       .format("delta")
       .mode("append")
       .saveAsTable(BRONZE_SUMMARY_TABLE_FULLNAME))

    print(f"✓ Logged Bronze summary to {BRONZE_SUMMARY_TABLE_FULLNAME}")


print("✓ Bronze logging functions defined")

## [9] Silver Logging Functions

In [None]:
def log_silver_batch(records: List[Dict[str, Any]]) -> None:
    """
    Append a batch of Silver processing log records in one Delta write.

    Args:
        records: Dicts describing per-table Silver CDC merge results. Keys
            read from each dict: log_id, run_id, run_ts, run_date (optional),
            source, table_name, load_mode, status, rows_inserted,
            rows_updated, rows_deleted, rows_unchanged, total_silver_rows,
            bronze_rows, bronze_table, start_time, end_time,
            duration_seconds, error_message, silver_table. Missing keys are
            stored as None.

    Raises:
        ValueError: If any record has no run_ts.

    An empty or None `records` is a no-op.
    """
    if not records:
        return

    def _to_row(rec):
        # One log Row per record; run_ts is mandatory because run_date falls back to it.
        ts = rec.get("run_ts")
        if not ts:
            raise ValueError("Silver log record is missing run_ts")

        # Prefer an explicit run_date, otherwise derive it from run_ts.
        partition_date = rec.get("run_date")
        if partition_date is None:
            partition_date = build_run_date(ts)

        return Row(
            log_id=rec.get("log_id"),
            run_id=rec.get("run_id"),
            run_date=partition_date,
            run_ts=ts,
            source=rec.get("source"),
            table_name=rec.get("table_name"),
            load_mode=rec.get("load_mode"),
            status=rec.get("status"),
            rows_inserted=rec.get("rows_inserted"),
            rows_updated=rec.get("rows_updated"),
            rows_deleted=rec.get("rows_deleted"),
            rows_unchanged=rec.get("rows_unchanged"),
            total_silver_rows=rec.get("total_silver_rows"),
            bronze_rows=rec.get("bronze_rows"),
            bronze_table=rec.get("bronze_table"),
            start_time=rec.get("start_time"),
            end_time=rec.get("end_time"),
            duration_seconds=rec.get("duration_seconds"),
            error_message=truncate_error_message(rec.get("error_message")),
            silver_table=rec.get("silver_table"),
        )

    log_rows = [_to_row(rec) for rec in records]
    log_df = spark.createDataFrame(log_rows, schema=silver_log_schema)

    # Single append for the whole batch
    writer = log_df.write.format("delta").mode("append")
    writer.saveAsTable(SILVER_LOG_TABLE_FULLNAME)

    print(f"✓ Logged {len(records)} Silver records to {SILVER_LOG_TABLE_FULLNAME}")


def log_silver_summary(summary: Dict[str, Any]) -> None:
    """
    Append one Silver run summary row.

    Args:
        summary: Dict with run-level CDC statistics. Keys read: run_id,
            source, run_ts, run_date (optional, derived from run_ts when
            absent), run_start, run_end, duration_seconds, total_tables,
            tables_success, tables_failed, tables_skipped, total_inserts,
            total_updates, total_deletes, total_unchanged, failed_tables.
            Missing keys are stored as None.

    Raises:
        ValueError: If run_ts is missing or empty.
    """
    ts = summary.get("run_ts")
    if not ts:
        raise ValueError("Summary missing run_ts")

    partition_date = summary.get("run_date")
    if partition_date is None:
        partition_date = build_run_date(ts)

    import json

    # failed_tables is stored as a JSON string; an empty/missing list becomes NULL
    failed = summary.get("failed_tables", [])
    failed_json = None
    if failed:
        failed_json = json.dumps(failed)

    summary_row = Row(
        run_id=summary.get("run_id"),
        source=summary.get("source"),
        run_ts=ts,
        run_date=partition_date,
        run_start=summary.get("run_start"),
        run_end=summary.get("run_end"),
        duration_seconds=summary.get("duration_seconds"),
        total_tables=summary.get("total_tables"),
        tables_success=summary.get("tables_success"),
        tables_failed=summary.get("tables_failed"),
        tables_skipped=summary.get("tables_skipped"),
        total_inserts=summary.get("total_inserts"),
        total_updates=summary.get("total_updates"),
        total_deletes=summary.get("total_deletes"),
        total_unchanged=summary.get("total_unchanged"),
        failed_tables=failed_json,
    )

    summary_df = spark.createDataFrame([summary_row], schema=silver_summary_schema)

    writer = summary_df.write.format("delta").mode("append")
    writer.saveAsTable(SILVER_SUMMARY_TABLE_FULLNAME)

    print(f"✓ Logged Silver summary to {SILVER_SUMMARY_TABLE_FULLNAME}")


print("✓ Silver logging functions defined")

## [10] Query Helper Functions

In [None]:
def get_bronze_logs_for_run(run_ts: str) -> DataFrame:
    """
    Return the Bronze processing log rows whose run_ts equals the given value.

    Args:
        run_ts: Run timestamp to match exactly.

    Returns:
        A (lazy) DataFrame over the Bronze processing log filtered on run_ts.
    """
    bronze_logs = spark.table(BRONZE_LOG_TABLE_FULLNAME)
    matches_run = F.col("run_ts") == run_ts
    return bronze_logs.where(matches_run)


def get_silver_logs_for_run(run_ts: str) -> DataFrame:
    """
    Return the Silver processing log rows whose run_ts equals the given value.

    Args:
        run_ts: Run timestamp to match exactly.

    Returns:
        A (lazy) DataFrame over the Silver processing log filtered on run_ts.
    """
    silver_logs = spark.table(SILVER_LOG_TABLE_FULLNAME)
    matches_run = F.col("run_ts") == run_ts
    return silver_logs.where(matches_run)


def get_failed_tables(run_ts: str, layer: str = "bronze") -> List[str]:
    """
    Get list of failed table names for a run_ts.

    Args:
        run_ts: Run timestamp
        layer: "bronze" selects the Bronze processing log; any other value
            selects the Silver processing log.

    Returns:
        Distinct table names with status='FAILED' for this run_ts
    """
    table = BRONZE_LOG_TABLE_FULLNAME if layer == "bronze" else SILVER_LOG_TABLE_FULLNAME

    # Column expressions pass run_ts as a literal value instead of splicing it
    # into SQL text, so a quote in the value cannot break or alter the predicate.
    failed = (
        spark.table(table)
        .where((F.col("run_ts") == run_ts) & (F.col("status") == "FAILED"))
        .select("table_name")
        .distinct()
        .collect()
    )

    return [row.table_name for row in failed]


def get_successful_tables(run_ts: str, layer: str = "bronze") -> List[str]:
    """
    Get list of successful table names for a run_ts.

    Args:
        run_ts: Run timestamp
        layer: "bronze" selects the Bronze processing log; any other value
            selects the Silver processing log.

    Returns:
        Distinct table names with status='SUCCESS' for this run_ts
    """
    table = BRONZE_LOG_TABLE_FULLNAME if layer == "bronze" else SILVER_LOG_TABLE_FULLNAME

    # Column expressions pass run_ts as a literal value instead of splicing it
    # into SQL text, so a quote in the value cannot break or alter the predicate.
    success = (
        spark.table(table)
        .where((F.col("run_ts") == run_ts) & (F.col("status") == "SUCCESS"))
        .select("table_name")
        .distinct()
        .collect()
    )

    return [row.table_name for row in success]


def is_table_processed(run_ts: str, table_name: str, layer: str = "bronze") -> bool:
    """
    Check if a specific table was successfully processed for a run_ts.

    Args:
        run_ts: Run timestamp
        table_name: Table name to look up
        layer: "bronze" selects the Bronze processing log; any other value
            selects the Silver processing log.

    Returns:
        True if table has status='SUCCESS' for this run_ts
    """
    table = BRONZE_LOG_TABLE_FULLNAME if layer == "bronze" else SILVER_LOG_TABLE_FULLNAME

    # Column expressions pass the values as literals instead of splicing them
    # into SQL text, so a quote in run_ts or table_name cannot break the filter.
    matches = spark.table(table).where(
        (F.col("run_ts") == run_ts)
        & (F.col("table_name") == table_name)
        & (F.col("status") == "SUCCESS")
    )

    # Only existence matters, so stop after the first matching row.
    return matches.limit(1).count() > 0


def get_latest_run_summary(source: str, layer: str = "bronze") -> Optional[Dict[str, Any]]:
    """
    Get the most recent run summary for a source.

    "Most recent" is the row with the greatest run_ts, compared as a string.

    Args:
        source: Source system name to match exactly
        layer: "bronze" selects the Bronze run summary; any other value
            selects the Silver run summary.

    Returns:
        Dict with summary data, or None if no runs found
    """
    table = BRONZE_SUMMARY_TABLE_FULLNAME if layer == "bronze" else SILVER_SUMMARY_TABLE_FULLNAME

    # A Column expression passes `source` as a literal value instead of splicing
    # it into SQL text, so a quote in the name cannot break or alter the filter.
    latest = (
        spark.table(table)
        .where(F.col("source") == source)
        .orderBy(F.col("run_ts").desc())
        .limit(1)
        .collect()
    )

    if not latest:
        return None

    return latest[0].asDict()


print("✓ Query helper functions defined")

## [11] Verification

Quick verification that all tables exist and are queryable.

In [None]:
# Report banner
print("=" * 80)
print("LOGGING INFRASTRUCTURE VERIFICATION")
print("=" * 80)

# The four log tables this notebook is responsible for
tables_to_check = [
    BRONZE_LOG_TABLE_FULLNAME,
    BRONZE_SUMMARY_TABLE_FULLNAME,
    SILVER_LOG_TABLE_FULLNAME,
    SILVER_SUMMARY_TABLE_FULLNAME,
]

# One status line per table: row count, NOT FOUND, or the error hit while counting
for table_name in tables_to_check:
    if spark.catalog.tableExists(table_name):
        try:
            count = spark.table(table_name).count()
            print(f"✓ {table_name:<40} {count:>10,} rows")
        except Exception as e:
            # Report the failure and keep checking the remaining tables
            print(f"! {table_name:<40} ERROR: {type(e).__name__}: {e}")
    else:
        print(f"✗ {table_name:<40} NOT FOUND")

print("\n✓ Logging infrastructure ready for Bronze and Silver processing")