In [0]:
# ============================================
# SLA Monitoring Configuration
# ============================================

from pyspark.sql.functions import *
from datetime import datetime, timedelta

CATALOG = "fraud_detection"
SCHEMA = "raw"

SLA_METRICS_TABLE = f"{CATALOG}.{SCHEMA}.gold_sla_metrics"
PERFORMANCE_TABLE = f"{CATALOG}.{SCHEMA}.gold_system_performance"

print("✅ SLA Monitoring loaded")


✅ SLA Monitoring loaded


In [0]:
# ============================================
# Create SLA Metrics Table
# ============================================

spark.sql(f"""
CREATE TABLE IF NOT EXISTS {SLA_METRICS_TABLE} (
    check_timestamp TIMESTAMP,
    metric_name STRING,
    metric_value DOUBLE,
    sla_target DOUBLE,
    status STRING,
    breach_severity STRING,
    message STRING
) USING DELTA
""")

print(f"✅ SLA metrics table created: {SLA_METRICS_TABLE}")


✅ SLA metrics table created: fraud_detection.raw.gold_sla_metrics


In [0]:
# ============================================
# SLA Check #1: End-to-End Latency < 5 seconds
# ============================================

print("\n📊 SLA CHECK #1: LATENCY")
print("=" * 70)

# Get p95 latency from last hour
latency_result = spark.sql(f"""
    SELECT 
        PERCENTILE(avg_latency_ms, 0.95) as p95_latency_ms,
        AVG(avg_latency_ms) as avg_latency_ms,
        MAX(avg_latency_ms) as max_latency_ms
    FROM {PERFORMANCE_TABLE}
    WHERE batch_timestamp >= CURRENT_TIMESTAMP() - INTERVAL 1 HOUR
""").first()

if latency_result and latency_result.p95_latency_ms:
    p95_latency = latency_result.p95_latency_ms
    sla_target = 5000.0  # 5 seconds
    
    status = "✅ PASS" if p95_latency <= sla_target else "❌ BREACH"
    severity = "CRITICAL" if p95_latency > sla_target * 1.5 else "WARNING" if p95_latency > sla_target else "OK"
    
    print(f"Target: < {sla_target:.0f}ms (p95)")
    print(f"Current p95: {p95_latency:.0f}ms")
    print(f"Current avg: {latency_result.avg_latency_ms:.0f}ms")
    print(f"Current max: {latency_result.max_latency_ms:.0f}ms")
    print(f"Status: {status}")
    
    # Save result
    spark.createDataFrame([{
        "check_timestamp": datetime.now(),
        "metric_name": "p95_latency_ms",
        "metric_value": float(p95_latency),
        "sla_target": sla_target,
        "status": "PASS" if p95_latency <= sla_target else "BREACH",
        "breach_severity": severity,
        "message": f"p95 latency: {p95_latency:.0f}ms (target: {sla_target:.0f}ms)"
    }]).write.format("delta").mode("append").saveAsTable(SLA_METRICS_TABLE)
else:
    print("⚠️  No data available yet")



📊 SLA CHECK #1: LATENCY
⚠️  No data available yet


In [0]:
# ============================================
# SLA Check #2: Data Freshness < 1 minute
# ============================================

print("\n📊 SLA CHECK #2: DATA FRESHNESS")
print("=" * 70)

latest_batch = spark.sql(f"""
    SELECT MAX(batch_timestamp) as latest
    FROM {PERFORMANCE_TABLE}
""").first()

if latest_batch and latest_batch.latest:
    freshness_sec = (datetime.now() - latest_batch.latest).total_seconds()
    sla_target = 60.0  # 1 minute
    
    status = "✅ PASS" if freshness_sec <= sla_target else "❌ BREACH"
    severity = "CRITICAL" if freshness_sec > 300 else "WARNING" if freshness_sec > sla_target else "OK"
    
    print(f"Target: < {sla_target:.0f} seconds")
    print(f"Current: {freshness_sec:.0f} seconds")
    print(f"Last batch: {latest_batch.latest}")
    print(f"Status: {status}")
    
    # Save result
    spark.createDataFrame([{
        "check_timestamp": datetime.now(),
        "metric_name": "data_freshness_sec",
        "metric_value": freshness_sec,
        "sla_target": sla_target,
        "status": "PASS" if freshness_sec <= sla_target else "BREACH",
        "breach_severity": severity,
        "message": f"Data freshness: {freshness_sec:.0f}s (target: {sla_target:.0f}s)"
    }]).write.format("delta").mode("append").saveAsTable(SLA_METRICS_TABLE)
else:
    print("⚠️  No data available yet")



📊 SLA CHECK #2: DATA FRESHNESS
⚠️  No data available yet


In [0]:
# ============================================
# SLA Check #3: Throughput > 100 txns/minute
# ============================================

print("\n📊 SLA CHECK #3: THROUGHPUT")
print("=" * 70)

throughput_result = spark.sql(f"""
    SELECT 
        SUM(transactions_processed) / 
        (COUNT(*) * 10.0 / 60.0) as txns_per_minute
    FROM {PERFORMANCE_TABLE}
    WHERE batch_timestamp >= CURRENT_TIMESTAMP() - INTERVAL 1 HOUR
""").first()

if throughput_result and throughput_result.txns_per_minute:
    throughput = throughput_result.txns_per_minute
    sla_target = 100.0  # 100 transactions per minute
    
    status = "✅ PASS" if throughput >= sla_target else "❌ BREACH"
    
    print(f"Target: > {sla_target:.0f} txns/min")
    print(f"Current: {throughput:.0f} txns/min")
    print(f"Status: {status}")
    
    # Save result
    spark.createDataFrame([{
        "check_timestamp": datetime.now(),
        "metric_name": "throughput_txns_per_min",
        "metric_value": throughput,
        "sla_target": sla_target,
        "status": "PASS" if throughput >= sla_target else "BREACH",
        "breach_severity": "WARNING" if throughput < sla_target else "OK",
        "message": f"Throughput: {throughput:.0f} txns/min (target: {sla_target:.0f})"
    }]).write.format("delta").mode("append").saveAsTable(SLA_METRICS_TABLE)
else:
    print("⚠️  No data available yet")



📊 SLA CHECK #3: THROUGHPUT
⚠️  No data available yet


In [0]:
# ============================================
# SLA Summary - Last 24 Hours
# ============================================

print("\n📊 SLA SUMMARY (Last 24 Hours)")
print("=" * 70)

summary = spark.sql(f"""
    SELECT 
        metric_name,
        COUNT(*) as total_checks,
        SUM(CASE WHEN status = 'PASS' THEN 1 ELSE 0 END) as passed,
        SUM(CASE WHEN status = 'BREACH' THEN 1 ELSE 0 END) as breached,
        ROUND(AVG(metric_value), 2) as avg_value,
        MAX(sla_target) as target
    FROM {SLA_METRICS_TABLE}
    WHERE check_timestamp >= CURRENT_TIMESTAMP() - INTERVAL 24 HOUR
    GROUP BY metric_name
""")

summary.show(truncate=False)

# Check for breaches
breach_count = spark.sql(f"""
    SELECT COUNT(*) as breaches
    FROM {SLA_METRICS_TABLE}
    WHERE check_timestamp >= CURRENT_TIMESTAMP() - INTERVAL 24 HOUR
    AND status = 'BREACH'
""").first().breaches

if breach_count > 0:
    print(f"\n⚠️  {breach_count} SLA BREACHES in last 24 hours!")
    
    # Show recent breaches
    print("\n🚨 Recent breaches:")
    spark.sql(f"""
        SELECT check_timestamp, metric_name, metric_value, sla_target, message
        FROM {SLA_METRICS_TABLE}
        WHERE status = 'BREACH'
        AND check_timestamp >= CURRENT_TIMESTAMP() - INTERVAL 24 HOUR
        ORDER BY check_timestamp DESC
        LIMIT 10
    """).show(truncate=False)
else:
    print("\n✅ ALL SLAs MET - System performing within targets!")

print("\n💡 Schedule this notebook to run every 15 minutes")



📊 SLA SUMMARY (Last 24 Hours)
+-----------+------------+------+--------+---------+------+
|metric_name|total_checks|passed|breached|avg_value|target|
+-----------+------------+------+--------+---------+------+
+-----------+------------+------+--------+---------+------+


✅ ALL SLAs MET - System performing within targets!

💡 Schedule this notebook to run every 15 minutes
