# 02 — Configuration Utilities for Bronze and Silver Processing

This notebook provides configuration management for the data pipeline:

## Configuration Files
- **DAG files** (`dag_<source>_<schedule>.json`) - Table definitions and load modes
- **Watermarks** (`watermarks.json`) - Incremental loading state (READ-ONLY)
- **Runplan** (`runplan.json`) - Scheduling configuration

## Key Features
- DAG loading and validation
- Path resolution (Fabric, custom cluster, absolute path, or local/relative path)
- Table filtering (enabled, retry_tables)
- Load mode validation
- Watermark reading (the watermark file itself is managed by the extraction pipeline)

**Important:** Notebooks NEVER modify `watermarks.json` — the file is managed exclusively by the extraction pipeline.

In [None]:
# Parameters (Papermill compatible)
# Leave as None to have the path-detection cell below call get_base_path();
# set it to a path string to force a specific Files base path.
config_base_path = None  # Will be auto-detected if None

## [1] Imports and Path Detection

In [None]:
# Standard library
import json
import os
from typing import Dict, List, Any, Optional
import logging

# Project-local: central helper used below to determine the Files base path
from modules.path_utils import get_base_path

# Notebook-scoped logger; level INFO so the progress markers are not filtered at logger level
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Logged only after every import above has succeeded
logger.info("✓ Imports loaded")


In [None]:
# Determine the Files base path: an explicit notebook parameter wins,
# otherwise defer to the central helper
BASE_PATH = get_base_path() if config_base_path is None else config_base_path

# Classify the environment from the base path; the checks run in order
# and the first one that matches decides the label
env_type = (
    'Fabric' if BASE_PATH == '/lakehouse/default/Files'
    else 'Custom Cluster' if '/data/lakehouse/' in BASE_PATH
    else 'Absolute Path' if BASE_PATH.startswith('/')
    else 'Local/Relative'
)

logger.info(f"✓ Base path: {BASE_PATH}")
logger.info(f"✓ Environment: {env_type}")


## [2] Configuration Paths

In [None]:
# Standard config locations, all derived from BASE_PATH (set in the path-detection cell)
CONFIG_DIR = f"{BASE_PATH}/config"
WATERMARKS_PATH = f"{CONFIG_DIR}/watermarks.json"  # notebooks only ever read this file
RUNPLAN_PATH = f"{CONFIG_DIR}/runplan.json"

# Data paths
DATA_BASE = f"{BASE_PATH}/greenhouse_sources"  # Default, can be overridden in DAG

# Echo the resolved locations so the run log shows which paths were used
logger.info(f"✓ Config directory: {CONFIG_DIR}")
logger.info(f"✓ Watermarks path: {WATERMARKS_PATH}")
logger.info(f"✓ Runplan path: {RUNPLAN_PATH}")
logger.info(f"✓ Data base path: {DATA_BASE}")

## [3] DAG Loading and Validation

In [None]:
# Import all configuration functions from module
# NOTE(review): get_enabled_tables, filter_retry_tables, get_tables_by_load_mode and
# get_tables_to_process are defined again in the "[4] Table Filtering" cell below;
# once that cell runs, its definitions shadow the ones imported here.
from modules.config_utils import (
    load_dag,
    validate_dag,
    get_enabled_tables,
    filter_retry_tables,
    get_tables_by_load_mode,
    get_tables_to_process
)

logger.info("✓ DAG loading and filtering functions imported from modules.config_utils")

## [4] Table Filtering

In [None]:
def get_enabled_tables(dag: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Get all enabled tables from DAG.

    A table is enabled when its 'enabled' field is missing (default=enabled)
    or holds a true marker: True, 1, or the strings "1" / "true" in any
    letter case and with optional surrounding whitespace.

    Args:
        dag: DAG configuration containing a "tables" list of table definitions

    Returns:
        List of enabled table definitions, in DAG order

    Raises:
        KeyError: If the DAG has no "tables" key
    """
    def _is_enabled(flag: Any) -> bool:
        # Strings are normalized so "TRUE", "True" and " true " all count as enabled
        if isinstance(flag, str):
            return flag.strip().lower() in ("1", "true")
        # Non-string flags keep plain equality semantics: True and 1 match
        return flag in (True, 1)

    return [
        table for table in dag["tables"]
        # Default to enabled if the field is missing
        if _is_enabled(table.get("enabled", True))
    ]


def filter_retry_tables(
    tables: List[Dict[str, Any]], 
    retry_tables: Optional[List[str]]
) -> List[Dict[str, Any]]:
    """
    Filter tables to only those in retry_tables list.

    Requested names that match no table are reported once, at WARNING level,
    so the message is not filtered out by handlers that only pass warnings.

    Args:
        tables: List of table definitions (each must have a "name" key)
        retry_tables: List of table names to retry, or None/empty for all

    Returns:
        Filtered list of tables, in their original order. When no retry
        list is given, the input list itself is returned unchanged.

    Raises:
        KeyError: If a retry list is given and a table definition has no "name"
    """
    # No retry list means "process everything"
    if retry_tables is None or len(retry_tables) == 0:
        return tables

    retry_set = set(retry_tables)
    filtered = [t for t in tables if t["name"] in retry_set]

    # Warn about requested tables that do not exist in the DAG
    found_names = {t["name"] for t in filtered}
    missing = retry_set - found_names
    if missing:
        logger.warning(f"⚠️  Warning: Retry tables not found in DAG: {sorted(missing)}")

    return filtered


def get_tables_by_load_mode(
    tables: List[Dict[str, Any]], 
    load_mode: str
) -> List[Dict[str, Any]]:
    """
    Filter tables by load_mode (case-insensitive).

    A table whose "load_mode" is missing or explicitly null is treated as
    "snapshot".

    Args:
        tables: List of table definitions
        load_mode: Load mode to filter (e.g., "incremental", "snapshot")

    Returns:
        Filtered list of tables, in their original order
    """
    load_mode_lower = load_mode.lower()

    def _mode_of(table: Dict[str, Any]) -> str:
        mode = table.get("load_mode")
        # A JSON null arrives as None; without this fallback .lower() would raise
        return "snapshot" if mode is None else mode

    return [t for t in tables if _mode_of(t).lower() == load_mode_lower]


def get_tables_to_process(
    dag: Dict[str, Any],
    retry_tables: Optional[List[str]] = None,
    only_enabled: bool = True
) -> List[Dict[str, Any]]:
    """
    Resolve the list of tables a run should load.

    Main entry point for table selection: starts from the DAG's tables,
    optionally keeps only the enabled ones, then optionally narrows the
    result to an explicit retry list.

    Args:
        dag: DAG configuration
        retry_tables: Optional list of specific tables to retry
        only_enabled: If True, only return enabled tables

    Returns:
        List of table definitions to process
    """
    # Read the raw list first so a DAG without "tables" fails right here
    all_tables = dag["tables"]
    candidates = get_enabled_tables(dag) if only_enabled else all_tables

    # Without a retry list there is nothing left to narrow down
    if not retry_tables:
        return candidates

    return filter_retry_tables(candidates, retry_tables)


# Progress marker emitted once the filtering helpers in this cell are defined
logger.info("✓ Table filtering functions defined")

## [5] Watermark Management (READ-ONLY)

In [None]:
# Import watermark functions from module
# Only load/get helpers are imported; presumably all three are read-only
# accessors — confirm in modules.config_utils.
from modules.config_utils import (
    load_watermarks,
    get_source_watermarks,
    get_table_watermark
)

# Repeat the read-only policy in the run log
logger.info("✓ Watermark functions imported from modules.config_utils (READ-ONLY)")
logger.info("⚠️  NOTE: Watermarks are managed by extraction pipeline, not by notebooks")

## [6] Path Building Utilities

In [None]:
# Import path building utilities from module
# NOTE(review): both helpers are named *_table_name although this section is
# titled "Path Building" — confirm whether they return table names or paths.
from modules.config_utils import (
    build_bronze_table_name,
    build_silver_table_name
)

logger.info("✓ Path building functions imported from modules.config_utils")

## [7] DAG Query Helpers

In [None]:
# Import DAG query helpers from module
# These are only brought into the notebook namespace here; within this notebook
# just get_dag_metadata is referenced, in the commented-out example at the end.
from modules.config_utils import (
    get_dag_metadata,
    get_business_keys,
    get_incremental_column,
    get_window_config,
    get_partitioning_config,
    summarize_dag
)

logger.info("✓ DAG query helper functions imported from modules.config_utils")

## [8] Runplan Management

In [None]:
# Import runplan management functions from module
# Presumably these operate on the file at RUNPLAN_PATH (defined in the config-paths
# cell); their signatures are not visible here — confirm in modules.config_utils.
from modules.config_utils import (
    load_runplan,
    get_source_schedule
)

logger.info("✓ Runplan functions imported from modules.config_utils")

## [9] Verification and Testing

In [None]:
# logger.info("=" * 80)
# logger.info("CONFIGURATION UTILITIES VERIFICATION")
# logger.info("=" * 80)

# # Test environment detection
# logger.info(f"\n1. Environment Detection:")
# logger.info(f"   Base path: {BASE_PATH}")
# logger.info(f"   Environment: {env_type}")  # reuse the label computed in the path-detection cell

# # Test config paths
# logger.info(f"\n2. Configuration Paths:")
# logger.info(f"   Config dir: {CONFIG_DIR}")
# logger.info(f"   Watermarks: {WATERMARKS_PATH}")
# logger.info(f"   Runplan: {RUNPLAN_PATH}")

# # Check if config files exist
# logger.info(f"\n3. Config Files Status:")
# logger.info(f"   Config dir exists: {os.path.exists(CONFIG_DIR)}")
# logger.info(f"   Watermarks exists: {os.path.exists(WATERMARKS_PATH)}")
# logger.info(f"   Runplan exists: {os.path.exists(RUNPLAN_PATH)}")

# # Try to list DAG files
# if os.path.exists(CONFIG_DIR):
#     dag_files = [f for f in os.listdir(CONFIG_DIR) if f.startswith('dag_') and f.endswith('.json')]
#     logger.info(f"\n4. Available DAG files: {len(dag_files)}")
#     for dag_file in sorted(dag_files)[:5]:  # Show first 5
#         logger.info(f"   - {dag_file}")
#     if len(dag_files) > 5:
#         logger.info(f"   ... and {len(dag_files) - 5} more")

# logger.info("\n✓ Configuration utilities ready")

## [10] Example Usage

Example of how to use these configuration functions in other notebooks.

In [None]:
# Example: Load a DAG and get tables to process
# 
# dag_path = f"{CONFIG_DIR}/dag_vizier_weekday.json"
# dag = load_dag(dag_path)
# 
# # Get all enabled tables
# tables = get_tables_to_process(dag)
# logger.info(f"Found {len(tables)} enabled tables")
# 
# # Get only incremental tables
# incremental_tables = get_tables_by_load_mode(tables, "incremental")
# logger.info(f"Found {len(incremental_tables)} incremental tables")
# 
# # Get metadata
# metadata = get_dag_metadata(dag)
# logger.info(f"Source: {metadata['source']}")
# logger.info(f"Base files: {metadata['base_files']}")
# 
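# # Build parquet path for a table
# # NOTE: build_parquet_path is not imported anywhere in this notebook;
# # import it from its module before running this example.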
# table = tables[0]
# parquet_path = build_parquet_path(
#     source=metadata['source'],
#     run_ts="20251105T120000123",
#     table_name=table['name'],
#     base_files=metadata['base_files']
# )
# logger.info(f"Parquet path: {parquet_path}")
# 
# # Get watermark for incremental table
# if table.get('load_mode') == 'incremental':
#     watermark = get_table_watermark(metadata['source'], table['name'])
#     logger.info(f"Watermark for {table['name']}: {watermark}")

#logger.info("✓ Example usage documented (commented out)")