# 02 — Configuration Utilities for Bronze and Silver Processing

This notebook provides configuration management for the data pipeline:

## Configuration Files
- **DAG files** (`dag_<source>_<schedule>.json`) - Table definitions and load modes
- **Watermarks** (`watermarks.json`) - Incremental loading state (READ-ONLY)
- **Runplan** (`runplan.json`) - Scheduling configuration

## Key Features
- DAG loading and validation
- Path resolution (Fabric vs Local)
- Table filtering (enabled, retry_tables)
- Load mode validation
- Watermark reading (read-only; the file itself is managed by the extraction pipeline)

**Important:** Notebooks NEVER modify `watermarks.json`; the file is managed exclusively by the extraction pipeline.

In [None]:
# Parameters (Papermill compatible)
# Optional override for the Files base directory. When this stays None, the
# path-detection cell below calls detect_base_path() to choose a location.
config_base_path = None  # Will be auto-detected if None

## [1] Imports and Path Detection

In [None]:
import json
import os
from typing import Dict, List, Any, Optional
import logging

# Notebook-level logger. The stream handler is attached only when the logger
# has no handlers yet, so re-running this cell does not attach a second one.
logger = logging.getLogger(__name__)
if not logger.handlers:
    handler = logging.StreamHandler()
    formatter = logging.Formatter("%(asctime)s %(levelname)s %(name)s - %(message)s")
    handler.setFormatter(formatter)
    logger.addHandler(handler)
# INFO is required for the "✓ ..." progress messages in this notebook to be emitted.
logger.setLevel(logging.INFO)

logger.info("✓ Imports loaded")

In [None]:
def detect_base_path() -> str:
    """
    Choose the Files directory that serves as the base path.

    Candidates are probed in order and the first hit wins:
    1. /lakehouse/default/Files (Microsoft Fabric)
    2. Any path matching /data/lakehouse/**/Files (custom cluster with
       mounted storage). The search is recursive, so a match may sit at any
       depth below /data/lakehouse; the lexicographically smallest match is
       returned.
    3. Files (relative path for local development / repository)

    Returns:
        Absolute or relative path to Files directory
    """
    import glob

    # Candidate 1: Microsoft Fabric default lakehouse mount
    fabric_root = '/lakehouse/default/Files'
    if os.path.exists(fabric_root):
        return fabric_root

    # Candidate 2: custom cluster with mounted lakehouse storage
    cluster_present = os.path.exists('/data/lakehouse')
    logger.info(f"Checking for custom cluster Files directory /data/lakehouse... {cluster_present}")
    if cluster_present:
        candidates = glob.glob('/data/lakehouse/**/Files', recursive=True)
        logger.info(f"Detected matches for custom cluster Files directories: {candidates}")
        if candidates:
            # Smallest string == first element of the sorted list (stable choice)
            return min(candidates)

    # Candidate 3: relative path (local development / repository)
    return 'Files'

# Set base path: an explicit config_base_path parameter wins over auto-detection
BASE_PATH = detect_base_path() if config_base_path is None else config_base_path

# Detect environment type (used for logging only in this cell)
if not BASE_PATH.startswith('/'):
    env_type = 'Local/Relative'
elif BASE_PATH == '/lakehouse/default/Files':
    env_type = 'Fabric'
elif BASE_PATH.startswith('/data/lakehouse/'):
    env_type = 'Custom Cluster'
else:
    env_type = 'Absolute Path'

logger.info(f"✓ Base path: {BASE_PATH}")
logger.info(f"✓ Environment: {env_type}")

## [2] Configuration Paths

In [None]:
# Standard config locations (all derived from BASE_PATH)
CONFIG_DIR = f"{BASE_PATH}/config"
WATERMARKS_PATH = f"{CONFIG_DIR}/watermarks.json"
RUNPLAN_PATH = f"{CONFIG_DIR}/runplan.json"

# Data paths
DATA_BASE = f"{BASE_PATH}/greenhouse_sources"  # Default, can be overridden in DAG

# Log the resolved locations so the run output shows which paths are in effect
logger.info(f"✓ Config directory: {CONFIG_DIR}")
logger.info(f"✓ Watermarks path: {WATERMARKS_PATH}")
logger.info(f"✓ Runplan path: {RUNPLAN_PATH}")
logger.info(f"✓ Data base path: {DATA_BASE}")

## [3] DAG Loading and Validation

In [None]:
def load_dag(dag_path: str) -> Dict[str, Any]:
    """
    Load and validate a DAG configuration file.

    Relative paths are resolved against BASE_PATH unless they already start
    with BASE_PATH as a whole leading path component. Absolute paths are used
    unchanged.

    Args:
        dag_path: Path to DAG JSON file (relative to BASE_PATH or absolute)

    Returns:
        Dict with DAG configuration

    Raises:
        FileNotFoundError: If DAG file doesn't exist
        ValueError: If the file is not valid JSON or DAG validation fails
    """
    if not os.path.isabs(dag_path):
        # Compare on a path-component boundary. A bare string-prefix test would
        # also accept siblings such as "Files_old/dag.json" when BASE_PATH is
        # "Files" and leave them unresolved.
        base_prefix = BASE_PATH if BASE_PATH.endswith("/") else f"{BASE_PATH}/"
        already_rooted = (
            BASE_PATH == ""
            or dag_path == BASE_PATH
            or dag_path.startswith(base_prefix)
        )
        if not already_rooted:
            dag_path = f"{BASE_PATH}/{dag_path}"

    if not os.path.exists(dag_path):
        raise FileNotFoundError(f"DAG file not found: {dag_path}")

    # Read as UTF-8 explicitly; otherwise open() falls back to the platform's
    # locale encoding, which can mis-decode non-ASCII content.
    with open(dag_path, 'r', encoding='utf-8') as f:
        dag = json.load(f)

    # Validate required fields
    validate_dag(dag)

    return dag


def validate_dag(dag: Dict[str, Any]) -> None:
    """
    Validate DAG structure and required fields.

    Checks performed:
    - the DAG is a dict containing "source" and "tables"
    - "source" is a non-empty string
    - "tables" is a non-empty list of dicts, each with a "name" key
    - "load_mode", when present, is a string

    A string load_mode outside {"snapshot", "incremental", "window"}
    (compared case-insensitively) is only logged as a warning, not rejected.

    Args:
        dag: Parsed DAG configuration

    Raises:
        ValueError: If validation fails
    """
    if not isinstance(dag, dict):
        raise ValueError(f"DAG must be a JSON object, got {type(dag).__name__}")

    # Required top-level fields
    for field in ("source", "tables"):
        if field not in dag:
            raise ValueError(f"DAG missing required field: {field}")

    # Validate source name
    if not dag["source"] or not isinstance(dag["source"], str):
        raise ValueError(f"Invalid source name: {dag.get('source')}")

    # Validate tables
    if not isinstance(dag["tables"], list):
        raise ValueError("DAG 'tables' must be a list")

    if len(dag["tables"]) == 0:
        raise ValueError("DAG has no tables defined")

    # Validate each table
    valid_load_modes = {"snapshot", "incremental", "window"}

    for idx, table in enumerate(dag["tables"]):
        # A non-dict entry would otherwise surface as a TypeError (or, for a
        # string containing "name", slip through) instead of a ValueError.
        if not isinstance(table, dict):
            raise ValueError(
                f"Table at index {idx} must be an object, got {type(table).__name__}"
            )

        if "name" not in table:
            raise ValueError(f"Table at index {idx} missing 'name' field")

        # Validate load_mode if present
        if "load_mode" in table:
            raw_mode = table["load_mode"]
            # e.g. an explicit null would fail on .lower() with AttributeError
            if not isinstance(raw_mode, str):
                raise ValueError(
                    f"Table '{table['name']}' has non-string load_mode: {raw_mode!r}"
                )

            load_mode = raw_mode.lower()
            if load_mode not in valid_load_modes:
                # Emitted at WARNING so level-based filtering treats it as one
                logger.warning(f"⚠️  Warning: Table '{table['name']}' has unsupported load_mode: {load_mode}")


# Progress marker: logged after the DAG loading/validation definitions above have run
logger.info("✓ DAG loading functions defined")

## [4] Table Filtering

In [None]:
def get_enabled_tables(dag: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Get all enabled tables from DAG.

    A table is enabled if its 'enabled' field is:
    - missing (default=enabled)
    - True or 1
    - the string "1" or "true" in any letter case, ignoring surrounding
      whitespace (so "TRUE" and " true " count as enabled as well)

    Any other value (False, 0, None, "false", "0", ...) disables the table.

    Args:
        dag: DAG configuration containing a "tables" list

    Returns:
        List of table definitions, in DAG order
    """
    def _is_enabled(flag: Any) -> bool:
        # Strings are normalised first so hand-edited JSON such as "TRUE" is
        # not silently treated as disabled.
        if isinstance(flag, str):
            return flag.strip().lower() in ("1", "true")
        return flag in (True, 1)

    return [
        table for table in dag["tables"]
        if _is_enabled(table.get("enabled", True))
    ]


def filter_retry_tables(
    tables: List[Dict[str, Any]], 
    retry_tables: Optional[List[str]]
) -> List[Dict[str, Any]]:
    """
    Filter tables to only those in retry_tables list.

    Args:
        tables: List of table definitions
        retry_tables: List of table names to retry, or None / empty for all.
            A single table name passed as a plain string is treated as a
            one-element list.

    Returns:
        Filtered list of tables (the original list object when no retry
        filter is given)
    """
    if retry_tables is None or len(retry_tables) == 0:
        return tables

    # set("orders") would split the name into characters, so wrap a lone
    # string before building the lookup set.
    if isinstance(retry_tables, str):
        retry_tables = [retry_tables]

    retry_set = set(retry_tables)
    filtered = [t for t in tables if t["name"] in retry_set]

    # Warn about requested names that matched nothing
    missing = retry_set.difference(t["name"] for t in filtered)
    if missing:
        # Emitted at WARNING so level-based filtering treats it as one
        logger.warning(f"⚠️  Warning: Retry tables not found in DAG: {sorted(missing)}")

    return filtered


def get_tables_by_load_mode(
    tables: List[Dict[str, Any]], 
    load_mode: str
) -> List[Dict[str, Any]]:
    """
    Select the tables whose load_mode equals the requested one.

    The comparison ignores letter case, and a table without a "load_mode"
    key is treated as "snapshot".

    Args:
        tables: List of table definitions
        load_mode: Load mode to filter (e.g., "incremental", "snapshot")

    Returns:
        Filtered list of tables
    """
    wanted_mode = load_mode.lower()
    matching = []
    for table_def in tables:
        table_mode = table_def.get("load_mode", "snapshot")
        if table_mode.lower() == wanted_mode:
            matching.append(table_def)
    return matching


def get_tables_to_process(
    dag: Dict[str, Any],
    retry_tables: Optional[List[str]] = None,
    only_enabled: bool = True
) -> List[Dict[str, Any]]:
    """
    Resolve the final list of tables to load.

    Main entry point for table selection: start from the DAG's tables,
    optionally keep only the enabled ones, then optionally narrow the result
    to the names in retry_tables.

    Args:
        dag: DAG configuration
        retry_tables: Optional list of specific tables to retry
        only_enabled: If True, only return enabled tables

    Returns:
        List of table definitions to process
    """
    all_tables = dag["tables"]

    # Step 1: enabled filter (skipped when only_enabled is False)
    candidates = get_enabled_tables(dag) if only_enabled else all_tables

    # Step 2: retry filter (skipped when retry_tables is None or empty)
    if not retry_tables:
        return candidates
    return filter_retry_tables(candidates, retry_tables)


# Progress marker: logged after the table filtering definitions above have run
logger.info("✓ Table filtering functions defined")

## [5] Watermark Management (READ-ONLY)

In [None]:
def load_watermarks() -> Dict[str, Any]:
    """
    Load watermarks configuration from WATERMARKS_PATH.

    NOTE: This is READ-ONLY. Watermarks are managed by the data extraction pipeline.

    Returns:
        Dict with watermarks configuration (the parsed JSON content)

    Raises:
        FileNotFoundError: If the watermarks file does not exist
    """
    if not os.path.exists(WATERMARKS_PATH):
        raise FileNotFoundError(f"Watermarks file not found: {WATERMARKS_PATH}")

    # Read as UTF-8 explicitly; otherwise open() falls back to the platform's
    # locale encoding, which can mis-decode non-ASCII content.
    with open(WATERMARKS_PATH, 'r', encoding='utf-8') as f:
        watermarks = json.load(f)

    return watermarks


def get_source_watermarks(source: str) -> Optional[Dict[str, Any]]:
    """
    Look up the table watermarks recorded for one source.

    The watermarks file is expected to look like
    {"source": [{"name": "vizier", "tables": {...}}]}; the first entry whose
    "name" equals `source` is used.

    Args:
        source: Source system name (e.g., "vizier", "anva_concern")

    Returns:
        Dict with table watermarks (empty dict if the entry has no "tables"
        key), or None if source not found
    """
    source_entries = load_watermarks().get("source", [])

    for entry in source_entries:
        if entry.get("name") != source:
            continue
        return entry.get("tables", {})

    return None


def get_table_watermark(
    source: str, 
    table_name: str
) -> Optional[Any]:
    """
    Fetch the watermark stored for one table of one source.

    Args:
        source: Source system name
        table_name: Table name

    Returns:
        Watermark value as stored in the watermarks file, or None when
        either the source or the table has no entry
    """
    tables_wm = get_source_watermarks(source)
    return None if tables_wm is None else tables_wm.get(table_name)


# Progress marker plus a reminder that this notebook only reads watermarks
logger.info("✓ Watermark functions defined (READ-ONLY)")
logger.info("⚠️  NOTE: Watermarks are managed by extraction pipeline, not by notebooks")

## [6] Path Building Utilities

In [None]:
def build_bronze_table_name(
    table_def: Dict[str, Any],
    default_schema: str = "bronze"
) -> str:
    """
    Derive the fully qualified Bronze table name for a table definition.

    The table part comes from "delta_table", falling back to "name". If that
    value already contains a dot it is returned unchanged; otherwise it is
    prefixed with "delta_schema" from the definition, or with default_schema
    when the definition has none.

    Args:
        table_def: Table definition from DAG
        default_schema: Default schema if not specified in table_def

    Returns:
        Full table name (schema.table)
    """
    target = table_def.get("delta_table", table_def.get("name"))

    # Unqualified name: prepend the schema
    if "." not in target:
        schema_name = table_def.get("delta_schema", default_schema)
        target = f"{schema_name}.{target}"

    return target


def build_silver_table_name(
    table_def: Dict[str, Any],
    default_schema: str = "silver"
) -> str:
    """
    Derive the fully qualified Silver table name for a table definition.

    A "delta_table" value that starts with the literal prefix "silver." is
    returned as-is. In every other case the result is the table's "name"
    prefixed with default_schema.

    Args:
        table_def: Table definition from DAG
        default_schema: Default schema if not specified

    Returns:
        Full table name (schema.table)
    """
    explicit_target = table_def.get("delta_table")
    targets_silver = bool(explicit_target) and explicit_target.startswith("silver.")
    if targets_silver:
        return explicit_target

    # Fall back to the plain table name under the default Silver schema
    return "{}.{}".format(default_schema, table_def.get("name"))


# Progress marker: logged after the Bronze/Silver table-name builders above have run
logger.info("✓ Path building functions defined")

## [7] DAG Query Helpers

In [None]:
def get_dag_metadata(dag: Dict[str, Any]) -> Dict[str, Any]:
    """
    Collect the DAG-level metadata fields into one dict.

    Missing fields fall back to: None for "source" and "connection_name",
    "greenhouse_sources" for "base_files", "config/watermarks.json" for
    "watermarks_path", and an empty dict for "defaults".

    Returns:
        Dict with metadata fields
    """
    # (field, fallback) pairs, in the order they appear in the result
    field_fallbacks = (
        ("source", None),
        ("base_files", "greenhouse_sources"),
        ("watermarks_path", "config/watermarks.json"),
        ("connection_name", None),
        ("defaults", {}),
    )
    return {field: dag.get(field, fallback) for field, fallback in field_fallbacks}


def get_business_keys(table_def: Dict[str, Any]) -> Optional[List[str]]:
    """
    Read the "business_keys" entry of a table definition (used for CDC merge).

    Returns:
        The stored business key column names, or None if the entry is absent
    """
    business_keys = table_def.get("business_keys", None)
    return business_keys


def get_incremental_column(table_def: Dict[str, Any]) -> Optional[str]:
    """
    Get incremental column for a table.

    Reads table_def["incremental"]["column"]. A missing "incremental" entry
    and an explicit null one are both treated as "no incremental config".

    Returns:
        Column name used for incremental loading, or None
    """
    incremental = table_def.get("incremental")

    # "incremental": null makes .get return None even with a default supplied,
    # and calling .get on None would raise AttributeError.
    if incremental is None:
        return None

    return incremental.get("column")


def get_window_config(table_def: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Read the "window" entry of a table definition.

    Returns:
        Dict with window config (column, granularity, lookback, etc.),
        or None if the entry is absent
    """
    window_cfg = table_def.get("window", None)
    return window_cfg


def get_partitioning_config(table_def: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Read the "partitioning" entry of a table definition.

    Returns:
        Dict with partitioning config (type, year_col, month_col),
        or None if the entry is absent
    """
    partitioning_cfg = table_def.get("partitioning", None)
    return partitioning_cfg


def summarize_dag(dag: Dict[str, Any]) -> Dict[str, Any]:
    """
    Generate summary statistics for a DAG.

    load_mode values are counted for enabled tables only. String values are
    lower-cased before counting, matching the case-insensitive handling in
    validate_dag and get_tables_by_load_mode, so "Incremental" and
    "incremental" land in the same bucket. A table without "load_mode" is
    counted as "snapshot".

    Returns:
        Dict with counts by load_mode, enabled status, etc.
    """
    tables = dag["tables"]
    enabled = get_enabled_tables(dag)

    # Count by load_mode (enabled tables only)
    load_mode_counts: Dict[Any, int] = {}
    for table in enabled:
        mode = table.get("load_mode", "snapshot")
        # Non-string values are left untouched so they stay visible in the summary
        if isinstance(mode, str):
            mode = mode.lower()
        load_mode_counts[mode] = load_mode_counts.get(mode, 0) + 1

    return {
        "source": dag.get("source"),
        "total_tables": len(tables),
        "enabled_tables": len(enabled),
        "disabled_tables": len(tables) - len(enabled),
        "load_mode_counts": load_mode_counts,
    }


# Progress marker: logged after the DAG query helper definitions above have run
logger.info("✓ DAG query helper functions defined")

## [8] Runplan Management

In [None]:
def load_runplan() -> List[Dict[str, Any]]:
    """
    Load runplan configuration from RUNPLAN_PATH.

    A missing runplan file is not an error: a warning is logged and an empty
    list is returned.

    Returns:
        List of scheduled runs (the parsed JSON content), or [] if the file
        does not exist
    """
    if not os.path.exists(RUNPLAN_PATH):
        # Emitted at WARNING so level-based filtering treats it as one
        logger.warning(f"⚠️  Warning: Runplan file not found: {RUNPLAN_PATH}")
        return []

    # Read as UTF-8 explicitly; otherwise open() falls back to the platform's
    # locale encoding, which can mis-decode non-ASCII content.
    with open(RUNPLAN_PATH, 'r', encoding='utf-8') as f:
        runplan = json.load(f)

    return runplan


def get_source_schedule(source: str) -> List[Dict[str, Any]]:
    """
    Collect the runplan entries that belong to one source.

    Args:
        source: Source system name

    Returns:
        List of schedule entries (may be multiple for weekday/weekend);
        empty when no entry has a matching "source" value
    """
    matching_entries = []
    for schedule_entry in load_runplan():
        if schedule_entry.get("source") == source:
            matching_entries.append(schedule_entry)
    return matching_entries


# Progress marker: logged after the runplan definitions above have run
logger.info("✓ Runplan functions defined")

## [9] Verification and Testing

In [None]:
# logger.info("=" * 80)
# logger.info("CONFIGURATION UTILITIES VERIFICATION")
# logger.info("=" * 80)

# # Test environment detection
# logger.info(f"\n1. Environment Detection:")
# logger.info(f"   Base path: {BASE_PATH}")
# logger.info(f"   Environment: {'Fabric' if '/lakehouse' in BASE_PATH else 'Local'}")

# # Test config paths
# logger.info(f"\n2. Configuration Paths:")
# logger.info(f"   Config dir: {CONFIG_DIR}")
# logger.info(f"   Watermarks: {WATERMARKS_PATH}")
# logger.info(f"   Runplan: {RUNPLAN_PATH}")

# # Check if config files exist
# logger.info(f"\n3. Config Files Status:")
# logger.info(f"   Config dir exists: {os.path.exists(CONFIG_DIR)}")
# logger.info(f"   Watermarks exists: {os.path.exists(WATERMARKS_PATH)}")
# logger.info(f"   Runplan exists: {os.path.exists(RUNPLAN_PATH)}")

# # Try to list DAG files
# if os.path.exists(CONFIG_DIR):
#     dag_files = [f for f in os.listdir(CONFIG_DIR) if f.startswith('dag_') and f.endswith('.json')]
#     logger.info(f"\n4. Available DAG files: {len(dag_files)}")
#     for dag_file in sorted(dag_files)[:5]:  # Show first 5
#         logger.info(f"   - {dag_file}")
#     if len(dag_files) > 5:
#         logger.info(f"   ... and {len(dag_files) - 5} more")

# logger.info("\n✓ Configuration utilities ready")

## [10] Example Usage

Example of how to use these configuration functions in other notebooks.

In [None]:
# Example: Load a DAG and get tables to process
# 
# dag_path = f"{CONFIG_DIR}/dag_vizier_weekday.json"
# dag = load_dag(dag_path)
# 
# # Get all enabled tables
# tables = get_tables_to_process(dag)
# logger.info(f"Found {len(tables)} enabled tables")
# 
# # Get only incremental tables
# incremental_tables = get_tables_by_load_mode(tables, "incremental")
# logger.info(f"Found {len(incremental_tables)} incremental tables")
# 
# # Get metadata
# metadata = get_dag_metadata(dag)
# logger.info(f"Source: {metadata['source']}")
# logger.info(f"Base files: {metadata['base_files']}")
# 
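# # Build parquet path for a table
# # NOTE: build_parquet_path is not defined in this notebook; it must be provided by another utilities notebook.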
# table = tables[0]
# parquet_path = build_parquet_path(
#     source=metadata['source'],
#     run_ts="20251105T120000123",
#     table_name=table['name'],
#     base_files=metadata['base_files']
# )
# logger.info(f"Parquet path: {parquet_path}")
# 
# # Get watermark for incremental table
# if table.get('load_mode') == 'incremental':
#     watermark = get_table_watermark(metadata['source'], table['name'])
#     logger.info(f"Watermark for {table['name']}: {watermark}")

#logger.info("✓ Example usage documented (commented out)")