# 11 — Bronze Watermark Merge

Merges watermark values from incremental load runtime files back into the central watermarks configuration.

## Process Flow

1. **Data Extraction Pipeline** (out of scope):
   - Reads `watermarks.json` to get last processed value
   - Extracts incremental data (rows > watermark)
   - Writes parquet files to `Files/{source}/{run_ts}/{table}/`
   - Writes new watermark to `runtime/{source}/{run_id}/{table}/watermark.parquet`

2. **This Notebook**:
   - Reads watermark parquet files from runtime folder
   - Extracts max watermark per table
   - Updates `watermarks.json` with new values
   - Uses atomic write (tmp file + replace)

3. **Next Run**:
   - Extraction pipeline reads updated watermarks
   - Continues from new watermark value

## Important Notes

- **This is the ONLY notebook that modifies watermarks.json**
- Only runs for sources with incremental tables (e.g., vizier)
- Uses PyArrow for reading (OneLake compatibility)
- Atomic file updates prevent corruption
- Runtime watermark files are NOT deleted (kept for audit)

In [None]:
# Parameters (Papermill compatible)
# The None defaults are placeholders meant to be overridden at run time:
# `source` and `run_id` are validated as required further down, and
# `wm_folder` falls back to "runtime/{source}/{run_id}/" when left as None.
# Both relative paths are later resolved against BASE_PATH.
wm_configpath = "config/watermarks.json"  # Relative path to watermarks config
run_id = None                             # Run identifier (e.g., "20251105T120000123")
source = None                             # Source system name (e.g., "vizier")
wm_folder = None                          # Runtime watermark folder (e.g., "runtime/vizier/{run_id}/")

## [1] Imports and Path Setup

In [None]:
import os
import json
from typing import Dict, Any, Optional
import pyarrow.parquet as pq
import pyarrow.dataset as ds

import logging

# Make INFO output visible. With no handler configured, Python falls back to
# its last-resort handler, which only emits WARNING and above -- every
# logger.info() in this notebook (including the RESULT_JSON line) would be
# dropped. basicConfig() does nothing when the root logger already has
# handlers, so a host runtime that configured logging itself is left untouched.
logging.basicConfig(level=logging.INFO, format="%(message)s")

# Notebook-wide logger used by every cell below.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

logger.info("‚úì Imports loaded")

In [None]:
# Resolve the root that all relative paths hang off (Fabric vs local run)
from modules.path_utils import get_base_path

BASE_PATH = get_base_path()
logger.info(f"‚úì Base path: {BASE_PATH}")

# A '/lakehouse' fragment in the base path is used as the Fabric marker.
if '/lakehouse' in BASE_PATH:
    _env_label = 'Fabric'
else:
    _env_label = 'Local/Relative'
logger.info(f"‚úì Environment: {_env_label}")


In [None]:
# Both identifiers are mandatory; reject missing or empty values up front
if not (source and run_id):
    raise ValueError("Parameters 'source' and 'run_id' are required")

# Fall back to the conventional per-run runtime folder when none was supplied
if wm_folder is None:
    wm_folder = f"runtime/{source}/{run_id}/"

# Anchor both relative locations under the detected base path
wm_configpath_full = f'{BASE_PATH}/{wm_configpath}'
wm_folder_full = f'{BASE_PATH}/{wm_folder}'

# Echo the effective settings, one log record per line
for _line in (
    "\nParameters:",
    f"  Source: {source}",
    f"  Run ID: {run_id}",
    f"  Watermark config: {wm_configpath_full}",
    f"  Runtime folder: {wm_folder_full}",
):
    logger.info(_line)

## [2] Load Watermarks Configuration

In [None]:
# Load the watermarks config into `cfg`.
# Raises FileNotFoundError when the config file is missing, and lets
# json.JSONDecodeError propagate when the file is not valid JSON.
if not os.path.exists(wm_configpath_full):
    raise FileNotFoundError(f"Watermarks config not found: {wm_configpath_full}")

# Pin the encoding: without it open() uses the platform's locale default,
# which can mis-decode non-ASCII content on hosts that are not UTF-8.
with open(wm_configpath_full, 'r', encoding='utf-8') as f:
    cfg = json.load(f)

logger.info(f"‚úì Loaded watermarks config")

In [None]:
# Find this source's entry in the config's "source" list.
# Raises ValueError when no entry has a matching "name".
sources = cfg.get("source", [])
src = next((s for s in sources if s.get("name") == source), None)

if not src:
    raise ValueError(f"Source '{source}' not found in watermarks config")

# tables_map must be the very dict stored inside cfg: it is mutated in place
# and cfg is what gets serialised. `src.get("tables", {})` would return a
# detached dict when the key is absent, so new watermarks would be silently
# lost on save. A null "tables" value is normalised to an empty mapping too.
tables_map = src.get("tables")
if tables_map is None:
    tables_map = src["tables"] = {}

logger.info(f"‚úì Found source '{source}' with {len(tables_map)} table watermarks")

## [3] Read Watermark Updates with PyArrow

Uses PyArrow to read watermark parquet files in parallel.
This approach is 10-100x faster than sequential file reading.

In [None]:
# Collect the newest watermark per table from this run's runtime folder.
# `updates` maps table folder name -> max watermark value found in that folder.
# A failure on one table is logged and skipped so the remaining tables are
# still processed (best-effort per table).
updates = {}

if not os.path.exists(wm_folder_full):
    logger.info(f"‚ö†Ô∏è  No watermark runtime folder found: {wm_folder_full}")
    logger.info(f"   This is normal if no incremental tables were processed")
else:
    logger.info(f"\nüìÇ Reading watermark updates from: {wm_folder_full}")

    # Each sub-folder is named after the table whose watermark it holds
    table_folders = [d for d in os.listdir(wm_folder_full)
                     if os.path.isdir(os.path.join(wm_folder_full, d))]

    logger.info(f"   Found {len(table_folders)} table folders")

    for table_name in table_folders:
        table_path = os.path.join(wm_folder_full, table_name)

        try:
            # Skip folders without parquet output before involving PyArrow
            parquet_files = [f for f in os.listdir(table_path) if f.endswith('.parquet')]

            if not parquet_files:
                logger.info(f"   ‚ö†Ô∏è  {table_name}: No parquet files found")
                continue

            # Load everything under the table folder as one Arrow table
            dataset = ds.dataset(table_path, format='parquet')
            table = dataset.to_table()

            if table.num_rows == 0 or 'watermark' not in table.column_names:
                logger.info(f"   ‚ö†Ô∏è  {table_name}: No watermark column or empty table")
                continue

            # Convert the column to Python values once (instead of calling
            # as_py() repeatedly per element) and drop null/empty entries.
            valid_watermarks = [
                wm for wm in table.column('watermark').to_pylist()
                if wm is not None and wm != ''
            ]

            if not valid_watermarks:
                logger.info(f"   ‚ö†Ô∏è  {table_name}: No valid watermarks found")
                continue

            max_wm = max(valid_watermarks)
            updates[table_name] = max_wm
            logger.info(f"   ‚úì {table_name}: {max_wm}")

        except Exception as e:
            # A skipped table keeps its old watermark, so surface the failure
            # at WARNING level with the traceback instead of a truncated INFO line.
            logger.warning(
                f"   ‚ö†Ô∏è  {table_name}: Error reading watermark - {str(e)[:80]}",
                exc_info=True,
            )

    logger.info(f"\n‚úì Found {len(updates)} watermark updates")

## [4] Compare Old vs New Watermarks

In [None]:
# Log a side-by-side report of stored vs. freshly read watermarks.
# This cell only reports; nothing is modified here.
if not updates:
    logger.info("\n‚ÑπÔ∏è  No watermark updates to process")
else:
    logger.info("\nüîç Comparing old vs new watermarks:")
    logger.info(f"{'Table':<30} {'Old Watermark':<25} {'New Watermark':<25} {'Status':<15}")
    logger.info("-" * 95)

    # Table names are unique dict keys, so ordering by key alone gives the
    # same sequence as ordering the (key, value) pairs.
    for table in sorted(updates):
        new_wm = updates[table]
        old_wm = tables_map.get(table)

        status = (
            "‚úì MATCH" if old_wm == new_wm
            else "üÜï NEW" if old_wm is None
            else "üîÑ UPDATE"
        )

        logger.info(f"{table:<30} {str(old_wm):<25} {str(new_wm):<25} {status:<15}")

## [5] Update Configuration

Updates watermarks in the configuration dictionary where changes are detected.

In [None]:
# Write every differing watermark into tables_map and record what changed.
# `changes` is the audit list; `changed` is its length.
changed, changes = 0, []

for table, new_wm in updates.items():
    old_wm = tables_map.get(table)

    if old_wm != new_wm:
        tables_map[table] = new_wm
        changes.append(dict(table=table, old=old_wm, new=new_wm))
        logger.info(f"  ‚úì Updated {table}: {old_wm} ‚Üí {new_wm}")

changed = len(changes)

logger.info(
    f"\n‚úì {changed} watermark(s) will be updated"
    if changed > 0
    else "\n‚ÑπÔ∏è  No watermark changes detected"
)

## [6] Save Configuration (Atomic Write)

Uses atomic file replacement to prevent corruption:
1. Write to temporary file
2. Replace original file with temp file
3. The swap is a single `os.replace` call, which is atomic on POSIX when the temp file and the target are on the same filesystem

In [None]:
# Persist cfg only when at least one watermark changed.
# The JSON is written to a sibling ".tmp" file and then swapped in with
# os.replace(), so the config is never left half-written in place.
# Any failure is logged and re-raised after a best-effort temp-file cleanup.
if changed > 0:
    logger.info("\nüíæ Saving updated watermarks config...")

    # Same directory as the target, so the replace stays on one filesystem
    tmp_path = wm_configpath_full + ".tmp"

    try:
        # Write to temporary file (explicit encoding: do not depend on locale)
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(cfg, f, indent=2)

        logger.info(f"  ‚úì Wrote temporary file: {tmp_path}")

        # Atomic replace
        os.replace(tmp_path, wm_configpath_full)

        logger.info(f"  ‚úì Replaced original file: {wm_configpath_full}")
        logger.info("\n‚úì Watermarks config saved successfully")

    except Exception as e:
        # A failed save is an error, not routine information
        logger.error(f"\n‚úó Failed to save config: {str(e)}")

        # Cleanup temp file if it exists
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
                logger.info(f"  ‚úì Cleaned up temporary file")
            except OSError as cleanup_err:
                # Best-effort cleanup: report it, but never let it mask the
                # original failure (and never swallow KeyboardInterrupt /
                # SystemExit the way a bare `except:` would).
                logger.warning(f"  Could not remove temporary file {tmp_path}: {cleanup_err}")

        raise
else:
    logger.info("\n‚ÑπÔ∏è  No changes to save")

## [7] Return Result

Outputs processing summary for pipeline orchestration.

In [None]:
# Assemble the run summary returned to the orchestrating pipeline
result = dict(
    source=source,
    run_id=run_id,
    updates_processed=len(updates),
    watermarks_changed=changed,
    config_updated=changed > 0,
    changes=changes,
)

# Human-readable banner: one log record per line, framed by 80-char rules
_rule = "=" * 80
logger.info("\n" + _rule)
logger.info("WATERMARK MERGE SUMMARY")
logger.info(_rule)
for _label, _key in (
    ("Source", "source"),
    ("Run ID", "run_id"),
    ("Watermark files processed", "updates_processed"),
    ("Watermarks changed", "watermarks_changed"),
    ("Config file updated", "config_updated"),
):
    logger.info(f"{_label}: {result[_key]}")
logger.info(_rule)

# Machine-readable form of the same summary, for pipeline parsing
result_json = json.dumps(result)
logger.info(f"\nRESULT_JSON: {result_json}")

## [8] Exit for Notebook Orchestration

If using mssparkutils.notebook.run(), this provides the exit value.

In [None]:
# Exit with result (for mssparkutils orchestration)
try:
    mssparkutils.notebook.exit(result_json)
except NameError:
    # mssparkutils not available (local environment)
    logger.info("\n‚ÑπÔ∏è  mssparkutils not available (local environment)")
    logger.info("   Notebook completed successfully")