# 01 — Logging Utilities for Bronze and Silver Processing

This notebook defines the logging infrastructure for the data pipeline:

## Bronze Logging
- `logs.bronze_processing_log` - Per-table processing results
- `logs.bronze_run_summary` - Aggregated run statistics

## Silver Logging
- `logs.silver_processing_log` - Per-table CDC merge results
- `logs.silver_run_summary` - Aggregated CDC statistics

## Key Features
- Batch logging (one write per run, not per table)
- Append-only (no MERGE, maximum concurrency)
- Processing log tables are partitioned by `run_date` and `table_name` (run summary tables are unpartitioned)
- Helper functions for log retrieval and analysis

**Architecture:** Bronze uses append with run_ts history for full CDC capability

In [None]:
# Parameters (Papermill compatible)
# These can be overridden when running via notebook orchestration.
# NOTE(review): the table-creation cell later runs
# `from modules.log_schemas import LOG_SCHEMA`, which rebinds this name, so a
# value injected here is replaced before the schema is created — confirm
# whether an override is meant to take effect.
LOG_SCHEMA = "logs"  # Database for all log tables

## [1] Setup: Imports and Schemas

In [None]:
import logging

# Module-level logger used for the progress messages emitted by this notebook.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# When neither this logger nor any ancestor has a handler, the logging module
# falls back to its last-resort handler, which only emits WARNING and above, so
# every INFO progress message in this notebook would be silently dropped.
# Attach a plain stream handler in that case only; an environment that has
# already configured logging is left untouched.
if not logger.hasHandlers():
    _notebook_log_handler = logging.StreamHandler()
    _notebook_log_handler.setFormatter(logging.Formatter("%(message)s"))
    logger.addHandler(_notebook_log_handler)

logger.info("✓ Imports loaded")

## [2] Bronze Processing Log Schema

In [None]:
# Import Bronze log schema from module
from modules.log_schemas import (
    BRONZE_LOG_TABLE,
    BRONZE_LOG_TABLE_FULLNAME,
    bronze_processing_log_schema
)

logger.info(f"✓ Bronze log schema imported: {BRONZE_LOG_TABLE_FULLNAME}")

## [3] Bronze Run Summary Schema

In [None]:
# Import Bronze summary schema from module
from modules.log_schemas import (
    BRONZE_SUMMARY_TABLE,
    BRONZE_SUMMARY_TABLE_FULLNAME,
    bronze_run_summary_schema
)

logger.info(f"✓ Bronze summary schema imported: {BRONZE_SUMMARY_TABLE_FULLNAME}")

## [4] Silver Processing Log Schema

In [None]:
# Import Silver log schema from module
from modules.log_schemas import (
    SILVER_LOG_TABLE,
    SILVER_LOG_TABLE_FULLNAME,
    silver_processing_log_schema
)

logger.info(f"✓ Silver log schema imported: {SILVER_LOG_TABLE_FULLNAME}")

## [5] Silver Run Summary Schema

In [None]:
# Import Silver summary schema from module
from modules.log_schemas import (
    SILVER_SUMMARY_TABLE,
    SILVER_SUMMARY_TABLE_FULLNAME,
    silver_run_summary_schema
)

logger.info(f"✓ Silver summary schema imported: {SILVER_SUMMARY_TABLE_FULLNAME}")

## [6] Create Log Tables (Idempotent)

In [None]:
# Make sure the logs schema exists before any table is created inside it.
# NOTE: this import rebinds LOG_SCHEMA, replacing any value assigned to that
# name earlier in the notebook.
from modules.log_schemas import LOG_SCHEMA

spark.sql(f"CREATE SCHEMA IF NOT EXISTS {LOG_SCHEMA}")
logger.info(f"✓ Schema '{LOG_SCHEMA}' ready")

# One entry per log table, in creation order:
# (fully qualified name, schema, partition columns — empty means unpartitioned).
_log_table_specs = [
    (BRONZE_LOG_TABLE_FULLNAME, bronze_processing_log_schema, ("run_date", "table_name")),
    (BRONZE_SUMMARY_TABLE_FULLNAME, bronze_run_summary_schema, ()),
    (SILVER_LOG_TABLE_FULLNAME, silver_processing_log_schema, ("run_date", "table_name")),
    (SILVER_SUMMARY_TABLE_FULLNAME, silver_run_summary_schema, ()),
]

for _full_name, _table_schema, _partition_cols in _log_table_specs:
    if spark.catalog.tableExists(_full_name):
        # Already present: nothing is written, which keeps reruns harmless.
        continue

    # Writing an empty DataFrame with the target schema materialises the table.
    _writer = spark.createDataFrame([], _table_schema).write.format("delta")
    if _partition_cols:
        _writer = _writer.partitionBy(*_partition_cols)
    _writer.mode("append").saveAsTable(_full_name)
    logger.info(f"✓ Created table: {_full_name}")

logger.info("\n✓ All log tables ready")

## [7] Helper Functions

In [None]:
# Import helper functions from module
from modules.logging_utils import (
    build_run_date,
    truncate_error_message
)

logger.info("✓ Helper functions imported from modules.logging_utils")

## [8] Logging Functions (Bronze and Silver)

In [None]:
# Note: log_batch() and log_summary() functions are defined in this notebook
# because they depend on the Spark session and schemas being available.
# These are also available in modules.logging_utils but the notebook versions
# allow for interactive setup and testing.

from uuid import uuid4
from pyspark.sql import Row
from pyspark.sql import functions as F
from typing import List, Dict, Any, Optional

def _prepare_bronze_rows(bronze_results: List[Dict[str, Any]], run_log_id: str):
    rows = []
    for r in bronze_results:
        log_id = r.get("log_id") or f"{run_log_id}_{uuid4().hex[:8]}"
        partition_key = r.get("partition_key") or r.get("run_ts")
        error_msg = truncate_error_message(r.get("error_message"))

        rows.append(
            (
                log_id,
                run_log_id,
                r.get("run_id"),
                r.get("run_date"),
                r.get("run_ts"),
                r.get("source"),
                r.get("table_name"),
                partition_key,
                r.get("load_mode"),
                r.get("status"),
                r.get("rows_processed"),
                r.get("start_time"),
                r.get("end_time"),
                r.get("duration_seconds"),
                error_msg,
                r.get("parquet_path"),
                r.get("delta_table"),
            )
        )
    return rows


def _prepare_silver_rows(records: List[Dict[str, Any]]):
    rows = []
    for r in records:
        run_ts = r.get("run_ts")
        if not run_ts:
            raise ValueError("Silver log record is missing run_ts")

        run_date = r.get("run_date")
        if run_date is None:
            run_date = build_run_date(run_ts)

        error_msg = truncate_error_message(r.get("error_message"))

        rows.append(Row(
            log_id           = r.get("log_id"),
            run_id           = r.get("run_id"),
            run_date         = run_date,
            run_ts           = run_ts,
            source           = r.get("source"),
            table_name       = r.get("table_name"),
            load_mode        = r.get("load_mode"),
            status           = r.get("status"),
            rows_inserted    = r.get("rows_inserted"),
            rows_updated     = r.get("rows_updated"),
            rows_deleted     = r.get("rows_deleted"),
            rows_unchanged   = r.get("rows_unchanged"),
            total_silver_rows= r.get("total_silver_rows"),
            bronze_rows      = r.get("bronze_rows"),
            bronze_table     = r.get("bronze_table"),
            start_time       = r.get("start_time"),
            end_time         = r.get("end_time"),
            duration_seconds = r.get("duration_seconds"),
            error_message    = error_msg,
            silver_table     = r.get("silver_table"),
        ))
    return rows


def log_batch(records: List[Dict[str, Any]], layer: str, run_log_id: Optional[str] = None) -> None:
    """
    Append a batch of per-table log records to the log table of one layer.

    All records are written with a single Delta append.

    Args:
        records: Per-table result dicts. An empty or None value is a no-op and
            returns before ``layer`` is inspected.
        layer: ``"bronze"`` or ``"silver"`` (case-insensitive).
        run_log_id: Run identifier attached to every Bronze row. Required for
            Bronze, unused for Silver.

    Raises:
        ValueError: If ``layer`` is not recognised, or ``run_log_id`` is missing
            for a Bronze batch.
    """
    if not records:
        return

    tier = layer.lower()
    if tier not in ("bronze", "silver"):
        raise ValueError("layer must be 'bronze' or 'silver'")

    if tier == "bronze":
        if not run_log_id:
            raise ValueError("run_log_id is required for Bronze batch logging")
        prepared = _prepare_bronze_rows(records, run_log_id)
        target_schema, target_table = bronze_processing_log_schema, BRONZE_LOG_TABLE_FULLNAME
    else:
        prepared = _prepare_silver_rows(records)
        target_schema, target_table = silver_processing_log_schema, SILVER_LOG_TABLE_FULLNAME

    batch_df = spark.createDataFrame(prepared, schema=target_schema)
    batch_df.write.format("delta").mode("append").saveAsTable(target_table)

    logger.info(f"✓ Logged {len(records)} {tier.capitalize()} records to {target_table}")


def _bronze_summary_row(summary: Dict[str, Any]) -> Dict[str, Any]:
    """Build the Bronze run-summary record as a dict keyed by column name."""
    entry_id = summary.get("log_id") or uuid4().hex
    timestamp = summary["run_ts"]
    return {
        "log_id":               entry_id,
        # Without an explicit run_id, combine run_ts with the log id prefix.
        "run_id":               summary.get("run_id") or f"{timestamp}_{entry_id[:8]}",
        "run_date":             summary["run_date"],
        "run_ts":               timestamp,
        "source":               summary.get("source"),
        "status":               summary.get("status", "SUCCESS"),
        "run_start":            summary["run_start"],
        "run_end":              summary["run_end"],
        "duration_seconds":     summary.get("duration_seconds"),
        "total_tables":         summary["total_tables"],
        "tables_success":       summary["tables_success"],
        "tables_empty":         summary["tables_empty"],
        "tables_failed":        summary["tables_failed"],
        "tables_skipped":       summary["tables_skipped"],
        "total_rows":           summary["total_rows"],
        "workers":              summary["workers"],
        "sum_task_seconds":     summary.get("sum_task_seconds"),
        "theoretical_min_sec":  summary.get("theoretical_min_sec"),
        "actual_time_sec":      summary.get("actual_time_sec"),
        "efficiency_pct":       summary.get("efficiency_pct"),
        "failed_tables":        summary.get("failed_tables"),
        "error_message":        summary.get("error_message"),
    }


def _silver_summary_row(summary: Dict[str, Any]):
    """Build the Silver run-summary record as a Row; requires a truthy run_ts."""
    import json

    timestamp = summary.get("run_ts")
    if not timestamp:
        raise ValueError("Summary missing run_ts")

    # Only an absent/None run_date is derived; any supplied value is kept.
    day = summary.get("run_date")
    if day is None:
        day = build_run_date(timestamp)

    # The failed-table list is stored as JSON text, or None when it is empty.
    failures = summary.get("failed_tables", [])
    failures_json = json.dumps(failures) if failures else None

    return Row(
        run_id=summary.get("run_id"),
        source=summary.get("source"),
        run_ts=timestamp,
        run_date=day,
        run_start=summary.get("run_start"),
        run_end=summary.get("run_end"),
        duration_seconds=summary.get("duration_seconds"),
        total_tables=summary.get("total_tables"),
        tables_success=summary.get("tables_success"),
        tables_failed=summary.get("tables_failed"),
        tables_skipped=summary.get("tables_skipped"),
        total_inserts=summary.get("total_inserts"),
        total_updates=summary.get("total_updates"),
        total_deletes=summary.get("total_deletes"),
        total_unchanged=summary.get("total_unchanged"),
        failed_tables=failures_json,
    )


def log_summary(summary: Dict[str, Any], layer: str) -> Optional[str]:
    """
    Append one run-summary record to the summary table of the given layer.

    Args:
        summary: Run-level statistics. Bronze reads several keys with
            subscript access and raises KeyError when one is missing; Silver
            only insists on a truthy ``run_ts``.
        layer: ``"bronze"`` or ``"silver"`` (case-insensitive).

    Returns:
        The Bronze log id (supplied or freshly generated) so batch rows can be
        linked to the run; None for Silver.

    Raises:
        ValueError: If ``layer`` is not recognised, or a Silver summary has no
            ``run_ts``.
        KeyError: If a Bronze summary lacks a mandatory key.
    """
    tier = layer.lower()

    if tier == "bronze":
        record = _bronze_summary_row(summary)
        log_id = record["log_id"]
        target_schema, target_table = bronze_run_summary_schema, BRONZE_SUMMARY_TABLE_FULLNAME
    elif tier == "silver":
        record = _silver_summary_row(summary)
        log_id = None
        target_schema, target_table = silver_run_summary_schema, SILVER_SUMMARY_TABLE_FULLNAME
    else:
        raise ValueError("layer must be 'bronze' or 'silver'")

    summary_df = spark.createDataFrame([record], schema=target_schema)
    summary_df.write.format("delta").mode("append").saveAsTable(target_table)

    logger.info(f"✓ Logged {tier.capitalize()} summary to {target_table}")
    return log_id


logger.info("✓ Logging functions defined for Bronze and Silver layers")

## [9] Silver Logging Functions (handled by the generic `log_batch` / `log_summary` defined in [8])

In [None]:
logger.info("✓ Generic log_batch/log_summary functions ready for use")


## [10] Query Helper Functions

In [None]:
# Import query helper functions from module
from modules.logging_utils import (
    get_bronze_logs_for_run,
    get_silver_logs_for_run,
    get_failed_tables,
    get_successful_tables,
    is_table_processed,
    get_latest_run_summary
)

logger.info("✓ Query helper functions imported from modules.logging_utils")

## [11] Verification

Quick verification that all tables exist and are queryable. The verification code in the cell below is currently commented out; uncomment it to run the check.

In [None]:
# logger.info("=" * 80)
# logger.info("LOGGING INFRASTRUCTURE VERIFICATION")
# logger.info("=" * 80)

# tables_to_check = [
#     BRONZE_LOG_TABLE_FULLNAME,
#     BRONZE_SUMMARY_TABLE_FULLNAME,
#     SILVER_LOG_TABLE_FULLNAME,
#     SILVER_SUMMARY_TABLE_FULLNAME,
# ]

# for table_name in tables_to_check:
#     if not spark.catalog.tableExists(table_name):
#         logger.info(f"✗ {table_name:<40} NOT FOUND")
#         continue

#     try:
#         count = spark.table(table_name).count()
#         logger.info(f"✓ {table_name:<40} {count:>10,} rows")
#     except Exception as e:
#         logger.info(f"! {table_name:<40} ERROR: {type(e).__name__}: {e}")

# logger.info("\n✓ Logging infrastructure ready for Bronze and Silver processing")