# Module 3: Incremental Data Processing and Delta Lake
## Laboratory Exercises

Welcome to the hands-on laboratory exercises for Module 3! Today we'll build sophisticated incremental processing systems using Auto Loader, Structured Streaming, and the Medallion Architecture.

### Prerequisites
- Access to your Databricks workspace
- A running cluster on Databricks Runtime 8.x or higher (Delta Lake included)
- Approximately 3.5 hours for completion

### Lab Structure
1. **Lab 1**: Auto Loader and Bronze Layer Implementation (45 minutes)
2. **Lab 2**: Building Silver Layer with CDC (60 minutes)
3. **Lab 3**: Structured Streaming and Gold Layer (60 minutes)
4. **Lab 4**: Production Patterns and Monitoring (45 minutes)

---
## Lab Setup: Preparing Your Environment

In [0]:
# Module 3 Setup - Complete Environment Initialization
import time
from datetime import datetime, timedelta
from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable
import random
import json

# Root DBFS folder that every lab in this module reads from and writes to
module_path = "dbfs:/databricks_course/module_03_incremental"
dbutils.fs.mkdirs(module_path)

# Switch to the module's own database, creating it on the first run
for setup_statement in (
    "CREATE DATABASE IF NOT EXISTS module3_incremental",
    "USE module3_incremental",
):
    spark.sql(setup_statement)

# Start from a clean slate: remove tables left behind by earlier runs
tables_to_drop = [
    "bronze_sales",
    "silver_sales",
    "gold_realtime_sales_metrics",
    "gold_sales_trends",
    "gold_customer_sessions",
    "gold_executive_dashboard",
    "bronze_ingestion_metrics",
    "pipeline_health_metrics",
]

for stale_table in tables_to_drop:
    spark.sql(f"DROP TABLE IF EXISTS {stale_table}")

# Report the resulting environment
print("✅ Environment initialized!")
print(f"📁 Working directory: {module_path}")
print(f"🗄️  Current database: {spark.catalog.currentDatabase()}")
print(f"⚡ Spark version: {spark.version}")

In [0]:
# Create data generators for realistic GlobalMart scenarios
class GlobalMartDataGenerator:
    """Generates realistic streaming data for GlobalMart retail scenarios.

    The catalog (100 products, 5 stores in each of 4 regions) is randomised
    once at construction time; every generated transaction references it.
    """

    def __init__(self):
        # Product catalog with categories
        self.products = [
            {"id": f"PROD_{i:04d}",
             "name": f"Product {i}",
             "category": random.choice(["Electronics", "Clothing", "Food", "Home", "Sports"]),
             "price": random.uniform(10, 500)}
            for i in range(1, 101)
        ]

        # Store locations across regions
        self.stores = [
            {"id": f"STORE_{region}_{i:03d}",
             "region": region,
             "type": random.choice(["Flagship", "Standard", "Express"])}
            for region in ["NORTH", "SOUTH", "EAST", "WEST"]
            for i in range(1, 6)
        ]

        # Customer segments
        self.customer_segments = ["Premium", "Regular", "Budget", "New"]

    def generate_sales_transaction(self, timestamp_base=None):
        """Generate a single sales transaction.

        Args:
            timestamp_base: datetime the transaction is anchored to. Defaults
                to ``datetime.now()``. Roughly 10% of transactions are shifted
                0-120 minutes into the past to mimic late-arriving data.

        Returns:
            A dict of JSON-serialisable fields. ``transaction_time`` is always
            an ISO-8601 string with six fractional-second digits, so every
            record has the same timestamp layout.
        """
        if timestamp_base is None:
            timestamp_base = datetime.now()

        product = random.choice(self.products)
        store = random.choice(self.stores)

        # Simulate late-arriving data (10% chance)
        if random.random() < 0.1:
            time_offset = -random.randint(0, 120)
        else:
            time_offset = 0

        event_time = timestamp_base + timedelta(minutes=time_offset)

        transaction = {
            "transaction_id": f"TXN_{int(time.time() * 1000000)}_{random.randint(1000, 9999)}",
            "store_id": store["id"],
            "store_region": store["region"],
            "product_id": product["id"],
            "product_category": product["category"],
            "customer_id": f"CUST_{random.randint(10000, 99999)}",
            "customer_segment": random.choice(self.customer_segments),
            "quantity": random.randint(1, 10),
            "unit_price": product["price"],
            # A plain isoformat() omits the fraction when microsecond == 0,
            # which would give two different layouts; pin it to microseconds.
            "transaction_time": event_time.isoformat(timespec="microseconds"),
            "payment_method": random.choice(["credit_card", "debit_card", "cash", "mobile_pay"]),
            "is_online": random.choice([True, False]),
            "promotion_applied": random.choice([True, False])
        }

        # Calculate totals: a promotion takes a random 5-25% off the base amount
        base_amount = transaction["quantity"] * transaction["unit_price"]
        if transaction["promotion_applied"]:
            discount = random.uniform(0.05, 0.25)
            transaction["discount_amount"] = base_amount * discount
        else:
            transaction["discount_amount"] = 0.0

        transaction["total_amount"] = base_amount - transaction["discount_amount"]
        return transaction

    def generate_batch(self, num_records=1000, timestamp_base=None):
        """Generate a batch of transactions.

        Args:
            num_records: number of transactions to create.
            timestamp_base: passed through to ``generate_sales_transaction``.

        Returns:
            A list of ``num_records`` transaction dicts.
        """
        return [self.generate_sales_transaction(timestamp_base)
                for _ in range(num_records)]

# Initialize data generator
data_generator = GlobalMartDataGenerator()
print("✅ GlobalMart data generator initialized")
print(f"📦 Products: {len(data_generator.products)}")
print(f"🏪 Stores: {len(data_generator.stores)}")

In [0]:
# Generate initial streaming data files
def generate_streaming_files(base_path, num_files=5, records_per_file=1000):
    """Generate JSON files simulating streaming data arrival.

    One newline-delimited JSON file is written per simulated hour, ending at
    the current hour. Files already present under ``base_path`` are left in
    place, so repeated calls add to the folder rather than replacing it.

    Args:
        base_path: folder the files are written to (via ``dbutils.fs.put``).
        num_files: number of files to write.
        records_per_file: transactions per file.

    Returns:
        List of the file paths that were written, in creation order.
    """
    files_created = []
    base_time = datetime.now() - timedelta(hours=num_files)

    for file_num in range(num_files):
        # Generate transactions for this time period
        file_time = base_time + timedelta(hours=file_num)
        transactions = data_generator.generate_batch(records_per_file, file_time)

        # Write to JSON file (one JSON document per line)
        file_path = f"{base_path}/sales_{file_num:04d}_{int(file_time.timestamp())}.json"
        json_content = "\n".join(json.dumps(t) for t in transactions)
        dbutils.fs.put(file_path, json_content, overwrite=True)

        files_created.append(file_path)
        time.sleep(0.5)  # Simulate time between file arrivals

    return files_created

# Generate initial data
initial_file_count = 5
initial_records_per_file = 1000
streaming_input_path = f"{module_path}/streaming_input"
initial_files = generate_streaming_files(
    streaming_input_path,
    num_files=initial_file_count,
    records_per_file=initial_records_per_file,
)

print(f"✅ Generated {len(initial_files)} initial data files")
print(f"📁 Files location: {streaming_input_path}")
# Derive the total from what was actually written instead of a literal
print(f"📊 Total records: {len(initial_files) * initial_records_per_file:,}")

---
## Lab 1: Auto Loader and Bronze Layer Implementation (45 minutes)

### Objectives
- Configure Auto Loader for incremental file ingestion
- Set up Bronze tables with proper metadata
- Handle schema evolution and malformed files
- Monitor ingestion progress and performance

### Exercise 1.1: Basic Auto Loader Configuration

In [0]:
%sql
SHOW TABLES

In [0]:
# Inspect the input folder; the sales_*.json files written by generate_streaming_files should appear here.
# NOTE(review): path has no "dbfs:" prefix - presumably resolves to the same location as streaming_input_path; confirm.
dbutils.fs.ls('/databricks_course/module_03_incremental/streaming_input/')

In [0]:
# Inspect the checkpoints folder that the streaming queries in this module write to.
# NOTE(review): on a first run this folder may not exist yet - confirm how dbutils.fs.ls behaves in that case.
dbutils.fs.ls('/databricks_course/module_03_incremental/checkpoints/')

In [0]:
# Configure paths for Bronze layer
bronze_table_path = f"{module_path}/tables/bronze_sales"
bronze_checkpoint = f"{module_path}/checkpoints/bronze_sales"
bronze_schema_location = f"{module_path}/schemas/bronze_sales"

# Remove any checkpoint left by an earlier run so the stream below starts from
# scratch. Best-effort: a failure is reported but never aborts the cell.
try:
    # NOTE(review): dbutils.fs.rm is expected to return False when the path
    # does not exist - confirm against the dbutils reference.
    if dbutils.fs.rm(bronze_checkpoint, recurse=True):
        print("✅ Cleared old checkpoint")
    else:
        print("ℹ️ No old checkpoint to clear")
except Exception as exc:  # deliberately broad: cleanup must not stop the lab
    print(f"⚠️ Could not clear old checkpoint: {exc}")


# Configure Auto Loader stream for Bronze ingestion
bronze_stream = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", bronze_schema_location)
    .option("cloudFiles.inferColumnTypes", "true")
    .option("cloudFiles.schemaEvolutionMode", "addNewColumns")
    .option("cloudFiles.maxFilesPerTrigger", 1)
    .load(streaming_input_path)
    # Audit columns: when the row was ingested and which file it came from
    .withColumn("ingestion_timestamp", current_timestamp())
    .withColumn("source_file_name", col("_metadata.file_path"))
    .withColumn("file_modification_time", col("_metadata.file_modification_time"))
    .withColumn("bronze_processing_date", current_date())
)

# Start the Bronze ingestion stream (append-only, new micro-batch every 10 seconds)
bronze_query = (
    bronze_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", bronze_checkpoint)
    .option("mergeSchema", "true")
    .trigger(processingTime="10 seconds")
    .table("bronze_sales")
)

print("🚀 Auto Loader stream started for Bronze layer")

In [0]:
# Best-effort removal of the Bronze checkpoint folder.
# NOTE(review): this cell sits between the cell that starts bronze_query and
# the one that stops it - clearing the checkpoint of a running stream looks
# unintended; confirm whether the stream should be stopped first.
try:
    # NOTE(review): dbutils.fs.rm is expected to return False when the path
    # does not exist - confirm against the dbutils reference.
    if dbutils.fs.rm(f"{module_path}/checkpoints/bronze_sales", recurse=True):
        print("✅ Cleared old checkpoint")
    else:
        print("ℹ️ No old checkpoint to clear")
except Exception as exc:  # deliberately broad: cleanup must not stop the lab
    print(f"⚠️ Could not clear old checkpoint: {exc}")

dbfs:/databricks_course/module_03_incremental/streaming_input/sales_evolved_0000_1753829818.json

In [0]:
# Give the stream a couple of trigger intervals to ingest files
time.sleep(20)

# How many rows have landed in Bronze so far?
bronze_count = spark.table("bronze_sales").count()
print(f"\nTotal records in Bronze layer: {bronze_count:,}")

# Show the five most recently ingested rows
preview_columns = [
    "transaction_id",
    "store_id",
    "total_amount",
    "transaction_time",
    "ingestion_timestamp",
]
newest_first = col("ingestion_timestamp").desc()
latest_rows = (
    spark.table("bronze_sales")
    .select(*preview_columns)
    .orderBy(newest_first)
    .limit(5)
)
display(latest_rows)

### Exercise 1.2: Handling Schema Evolution

In [0]:
# Generate data with evolved schema
def generate_evolved_schema_data():
    """Return 500 transactions that carry three extra fields.

    Each record is a regular generated transaction extended with
    ``loyalty_card_id`` (None for roughly 30% of records),
    ``loyalty_points_earned`` (0 when there is no card) and
    ``delivery_method``.
    """
    def with_loyalty_fields(record):
        # Decide first whether this customer has a card, then derive the rest
        has_card = random.random() > 0.3
        record["loyalty_card_id"] = f"LOYAL_{random.randint(1000, 9999)}" if has_card else None
        if record["loyalty_card_id"]:
            record["loyalty_points_earned"] = random.randint(10, 500)
        else:
            record["loyalty_points_earned"] = 0
        record["delivery_method"] = random.choice(["in_store", "curbside", "home_delivery"])
        return record

    return [
        with_loyalty_fields(data_generator.generate_sales_transaction())
        for _ in range(500)
    ]

# Drop two evolved-schema files into the input folder, one second apart
for i in range(2):
    evolved_data = generate_evolved_schema_data()
    file_path = f"{streaming_input_path}/sales_evolved_{i:04d}_{int(time.time())}.json"
    json_lines = (json.dumps(record) for record in evolved_data)
    json_content = "\n".join(json_lines)
    dbutils.fs.put(file_path, json_content, overwrite=True)
    time.sleep(1)

print("✅ Generated files with evolved schema")

In [0]:
# Allow the stream time to pick up the evolved files
time.sleep(30)

# Inspect the table schema as it stands now
evolved_df = spark.table("bronze_sales")
print("Schema after evolution:")
evolved_df.printSchema()

# Sample a few rows that actually carry a loyalty card
new_field_columns = [
    "transaction_id",
    "loyalty_card_id",
    "loyalty_points_earned",
    "delivery_method",
]
has_loyalty_card = col("loyalty_card_id").isNotNull()
display(evolved_df.where(has_loyalty_card).select(*new_field_columns).limit(5))

In [0]:
%sql
SELECT * FROM bronze_sales

In [0]:
# Stop the Bronze Auto Loader query (bronze_query) started in Exercise 1.1.
# After this point, new files in the input folder are no longer ingested into bronze_sales.
bronze_query.stop()
print("⏹️  Bronze stream stopped")

---
## Lab 2: Building Silver Layer with CDC (60 minutes)

### Objectives
- Enable Change Data Feed on Bronze tables
- Create Silver layer transformations using merge
- Handle late-arriving data scenarios
- Build quality metrics and monitoring

### Exercise 2.1: Enable Change Data Feed

In [0]:
# Turn on Change Data Feed for the Bronze table
enable_cdf_sql = """
    ALTER TABLE bronze_sales 
    SET TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""
spark.sql(enable_cdf_sql)

print("✅ Change Data Feed enabled on bronze_sales table")

# Remember the newest table version; the CDC stream later starts reading from it
latest_commit = spark.sql("DESCRIBE HISTORY bronze_sales LIMIT 1").select("version").first()
current_version = latest_commit[0]
print(f"📌 Current Bronze table version: {current_version}")

In [0]:
# Create Silver table with business rules
# Silver table definition: typed business columns, data-quality columns and
# Bronze lineage columns, partitioned by transaction date.
silver_sales_ddl = """
    CREATE TABLE IF NOT EXISTS silver_sales (
        transaction_id STRING NOT NULL,
        store_id STRING NOT NULL,
        store_region STRING,
        product_id STRING NOT NULL,
        product_category STRING,
        customer_id STRING NOT NULL,
        customer_segment STRING,
        transaction_timestamp TIMESTAMP,
        transaction_date DATE,
        quantity INT,
        unit_price DECIMAL(10,2),
        discount_amount DECIMAL(10,2),
        total_amount DECIMAL(10,2),
        payment_method STRING,
        channel STRING,
        delivery_method STRING,
        promotion_applied BOOLEAN,
        loyalty_card_id STRING,
        loyalty_points_earned INT,
        is_valid BOOLEAN,
        validation_errors ARRAY<STRING>,
        data_quality_score DECIMAL(3,2),
        silver_processing_timestamp TIMESTAMP,
        bronze_file_name STRING,
        bronze_ingestion_timestamp TIMESTAMP
    )
    USING DELTA
    PARTITIONED BY (transaction_date)
    TBLPROPERTIES (
        delta.enableChangeDataFeed = true,
        delta.autoOptimize.optimizeWrite = true,
        delta.autoOptimize.autoCompact = true
    )
"""
spark.sql(silver_sales_ddl)

print("✅ Silver table created with CDF and optimization enabled")

### Exercise 2.2: Implement Silver Layer Transformations

In [0]:
# Define data quality rules and transformations
def apply_silver_transformations(bronze_df):
    """Apply business rules and data quality checks for Silver layer.

    Args:
        bronze_df: DataFrame (batch or streaming) holding the Bronze columns
            read below: transaction_time, is_online, discount_amount,
            loyalty_points_earned, total_amount, quantity, customer_id,
            source_file_name and ingestion_timestamp.

    Returns:
        DataFrame with transaction_timestamp, transaction_date, channel,
        validation_errors, is_valid, data_quality_score and Bronze lineage
        columns added, and the Bronze-only helper columns dropped.
    """
    transformed_df = (
        bronze_df
        # Parse timestamps. The fractional part is optional so ISO strings
        # without microseconds are parsed instead of being turned into NULL.
        .withColumn("transaction_timestamp",
                   to_timestamp(col("transaction_time"), "yyyy-MM-dd'T'HH:mm:ss[.SSSSSS]"))
        .withColumn("transaction_date", to_date(col("transaction_timestamp")))

        # Standardize channel
        .withColumn("channel",
                   when(col("is_online") == True, "ONLINE").otherwise("IN_STORE"))

        # Handle nulls: missing discount / loyalty points count as zero
        .withColumn("discount_amount", coalesce(col("discount_amount"), lit(0.0)))
        .withColumn("loyalty_points_earned", coalesce(col("loyalty_points_earned"), lit(0)))

        # Initialize validation with an empty list of error messages
        .withColumn("validation_errors", array().cast("array<string>"))
    )

    # Each rule is (failure condition, message appended when the condition holds)
    validation_rules = [
        (col("total_amount") <= 0, "Invalid total amount"),
        (col("quantity") <= 0, "Invalid quantity"),
        (~col("customer_id").rlike("^CUST_[0-9]{5}$"), "Invalid customer ID format"),
        (col("transaction_timestamp").isNull(), "Invalid timestamp")
    ]

    for condition, error_msg in validation_rules:
        transformed_df = transformed_df.withColumn(
            "validation_errors",
            when(condition, array_union(col("validation_errors"), array(lit(error_msg))))
            .otherwise(col("validation_errors"))
        )

    # Final calculations: score is 1.0 with no errors, 0.7 with one, 0.3 otherwise
    error_count = size(col("validation_errors"))
    final_df = (
        transformed_df
        .withColumn("is_valid", error_count == 0)
        .withColumn("data_quality_score",
                   when(error_count == 0, 1.0)
                   .when(error_count == 1, 0.7)
                   .otherwise(0.3))
        .withColumn("silver_processing_timestamp", current_timestamp())
        .withColumn("bronze_file_name", col("source_file_name"))
        .withColumn("bronze_ingestion_timestamp", col("ingestion_timestamp"))
    )

    # Drop Bronze-only columns that were renamed or replaced above
    return final_df.drop("source_file_name", "ingestion_timestamp", "file_modification_time",
                         "bronze_processing_date", "_metadata", "is_online", "transaction_time")

# Test transformation on a small Bronze sample
bronze_sample = spark.table("bronze_sales").limit(100)
silver_sample = apply_silver_transformations(bronze_sample)

print("✅ Silver transformation function created")
display(
    silver_sample
    .groupBy("is_valid")
    .agg(count("*").alias("count"), avg("data_quality_score").alias("avg_quality"))
)

In [0]:
# Best-effort removal of the Silver CDC checkpoint folder so the next stream
# start does not resume from an earlier run.
try:
    # NOTE(review): dbutils.fs.rm is expected to return False when the path
    # does not exist - confirm against the dbutils reference.
    if dbutils.fs.rm(f"{module_path}/checkpoints/silver_cdc", recurse=True):
        print("✅ Cleared old checkpoint")
    else:
        print("ℹ️ No old checkpoint to clear")
except Exception as exc:  # deliberately broad: cleanup must not stop the lab
    print(f"⚠️ Could not clear old checkpoint: {exc}")

In [0]:
# Implement streaming CDC from Bronze to Silver
from pyspark.sql.window import Window


def create_bronze_to_silver_stream():
    """Create streaming pipeline from Bronze to Silver using CDC.

    Reads the Bronze change feed starting at ``current_version``, applies
    ``apply_silver_transformations`` and merges each micro-batch into
    ``silver_sales`` keyed on ``transaction_id``.

    Returns:
        The started StreamingQuery.
    """

    # Read CDC stream from Bronze; keep only rows that represent current state
    bronze_cdc_stream = (
        spark.readStream
        .format("delta")
        .option("readChangeFeed", "true")
        .option("startingVersion", current_version)
        .table("bronze_sales")
        .filter(col("_change_type").isin(["insert", "update_postimage"]))
    )

    # Apply transformations
    silver_stream = apply_silver_transformations(bronze_cdc_stream)

    # Change-feed bookkeeping and rescue columns that are not written to Silver
    technical_columns = ["_rescued_data", "_change_type", "_commit_version", "_commit_timestamp"]

    # Define merge function
    def merge_to_silver(batch_df, batch_id):
        """Merge one micro-batch into the Silver table."""
        # The batch is used twice (merge + count); cache it so it is computed once
        batch_df.persist()
        try:
            # Deduplicate within batch, keeping the row from the newest Bronze
            # commit. silver_processing_timestamp cannot be used for this: it is
            # stamped with current_timestamp() and is the same for the whole batch.
            window_spec = Window.partitionBy("transaction_id").orderBy(
                col("_commit_version").desc(),
                col("bronze_ingestion_timestamp").desc(),
            )

            deduped_df = (
                batch_df
                .withColumn("row_rank", row_number().over(window_spec))
                .filter(col("row_rank") == 1)
                .drop("row_rank", *technical_columns)
            )

            # Merge into Silver: update only when the incoming row is newer
            silver_table = DeltaTable.forName(spark, "silver_sales")

            silver_table.alias("target").merge(
                deduped_df.alias("source"),
                "target.transaction_id = source.transaction_id"
            ).whenMatchedUpdate(
                condition = "source.silver_processing_timestamp > target.silver_processing_timestamp",
                set = {column_name: f"source.{column_name}" for column_name in deduped_df.columns}
            ).whenNotMatchedInsertAll().execute()

            print(f"Batch {batch_id}: Processed {batch_df.count()} records")
        finally:
            batch_df.unpersist()

    # Create streaming query
    query = (
        silver_stream
        .writeStream
        .foreachBatch(merge_to_silver)
        .outputMode("update")
        .trigger(processingTime="30 seconds")
        .option("checkpointLocation", f"{module_path}/checkpoints/silver_cdc")
        .start()
    )

    return query

# Start CDC stream
silver_cdc_query = create_bronze_to_silver_stream()
print("🚀 Bronze → Silver CDC stream started")

In [0]:
%sql
SELECT * FROM silver_sales

In [0]:
# Add two more input files, then give the pipeline time to process them.
# NOTE(review): bronze_query was stopped at the end of Lab 1; unless it has been
# restarted, these files will not reach bronze_sales (and therefore not Silver) - confirm.
generate_streaming_files(streaming_input_path, num_files=2, records_per_file=500)
time.sleep(45)

# Summarise row count, valid rows and average quality score in Silver
silver_stats_sql = """
    SELECT 
        COUNT(*) as total_records,
        SUM(CASE WHEN is_valid THEN 1 ELSE 0 END) as valid_records,
        AVG(data_quality_score) as avg_quality_score
    FROM silver_sales
"""
silver_stats = spark.sql(silver_stats_sql)

print("📊 Silver Layer Statistics:")
display(silver_stats)

In [0]:
# Write four more input files of 500 records each into the streaming input folder.
generate_streaming_files(streaming_input_path, num_files=4, records_per_file=500)

In [0]:
# Stop the Bronze -> Silver CDC query (silver_cdc_query) started in Exercise 2.2.
silver_cdc_query.stop()
print("⏹️  Silver CDC stream stopped")

---
## Lab 3: Structured Streaming and Gold Layer (60 minutes)

### Objectives
- Develop streaming aggregations for real-time metrics
- Implement windowing operations
- Create Gold layer tables optimized for queries

### Exercise 3.1: Real-Time Sales Metrics

In [0]:
%sql
DROP TABLE IF EXISTS gold_realtime_sales_metrics

In [0]:
# Create Gold table for real-time metrics
# Gold table definition: one row per 5-minute window and store
gold_metrics_ddl = """
    CREATE TABLE IF NOT EXISTS gold_realtime_sales_metrics (
        window_start TIMESTAMP,
        window_end TIMESTAMP,
        store_id STRING,
        store_region STRING,
        total_transactions BIGINT,
        total_revenue DECIMAL(20,2),
        avg_transaction_value DECIMAL(14,6),
        unique_customers BIGINT,
        online_revenue DECIMAL(22,2),
        store_revenue DECIMAL(22,2),
        last_updated TIMESTAMP
    )
    USING DELTA
"""
spark.sql(gold_metrics_ddl)

print("✅ Gold real-time metrics table created")

In [0]:
# Best-effort removal of the Gold metrics checkpoint folder so the next stream
# start does not resume from an earlier run.
try:
    # NOTE(review): dbutils.fs.rm is expected to return False when the path
    # does not exist - confirm against the dbutils reference.
    if dbutils.fs.rm(f"{module_path}/checkpoints/gold_realtime_metrics", recurse=True):
        print("✅ Cleared old checkpoint")
    else:
        print("ℹ️ No old checkpoint to clear")
except Exception as exc:  # deliberately broad: cleanup must not stop the lab
    print(f"⚠️ Could not clear old checkpoint: {exc}")

In [0]:
# Create streaming aggregation
def create_realtime_sales_metrics():
    """Start the streaming aggregation that feeds the real-time dashboard.

    Valid Silver rows are read from the change feed, grouped into 5-minute
    windows per store and region, and written to
    ``gold_realtime_sales_metrics`` in complete output mode.

    Returns:
        The started StreamingQuery.
    """
    # Source: Silver change feed, restricted to current-state rows that passed validation
    current_state_rows = col("_change_type").isin(["insert", "update_postimage"])
    passed_validation = col("is_valid") == True
    silver_changes = (
        spark.readStream
        .format("delta")
        .option("readChangeFeed", "true")
        .table("silver_sales")
        .filter(current_state_rows & passed_validation)
    )

    # Declare a 30-minute watermark on event time
    late_tolerant = silver_changes.withWatermark("transaction_timestamp", "30 minutes")

    # Grouping keys and measures for the windowed aggregation
    grouping_keys = [
        window(col("transaction_timestamp"), "5 minutes"),
        col("store_id"),
        col("store_region"),
    ]
    measures = [
        count("*").alias("total_transactions"),
        sum("total_amount").alias("total_revenue"),
        avg("total_amount").alias("avg_transaction_value"),
        approx_count_distinct("customer_id").alias("unique_customers"),
        sum(when(col("channel") == "ONLINE", col("total_amount")).otherwise(0)).alias("online_revenue"),
        sum(when(col("channel") == "IN_STORE", col("total_amount")).otherwise(0)).alias("store_revenue"),
    ]
    output_columns = [
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        "store_id",
        "store_region",
        "total_transactions",
        "total_revenue",
        "avg_transaction_value",
        "unique_customers",
        "online_revenue",
        "store_revenue",
    ]

    windowed_metrics = (
        late_tolerant
        .groupBy(*grouping_keys)
        .agg(*measures)
        .select(*output_columns)
        .withColumn("last_updated", current_timestamp())
    )

    # Sink: Gold Delta table, refreshed every 30 seconds
    return (
        windowed_metrics
        .writeStream
        .format("delta")
        .outputMode("complete")
        .option("checkpointLocation", f"{module_path}/checkpoints/gold_realtime_metrics")
        .trigger(processingTime="30 seconds")
        .table("gold_realtime_sales_metrics")
    )

# Start real-time metrics
realtime_query = create_realtime_sales_metrics()
print("🚀 Real-time sales metrics stream started")

In [0]:
# Generate continuous data
import threading

def generate_continuous_data():
    """Write three single-file batches of 200 records, pausing 15 seconds after each."""
    for _ in range(3):
        generate_streaming_files(streaming_input_path, num_files=1, records_per_file=200)
        time.sleep(15)

# Start data generation in background
# NOTE(review): the Bronze and Silver streams were stopped in earlier cells; unless
# they are running again these files will not flow through to Gold - confirm.
data_thread = threading.Thread(target=generate_continuous_data)
data_thread.start()

# Wait for the stream to process, then make sure the generator has really
# finished so the query below does not race with a still-running thread
time.sleep(60)
data_thread.join()

# Query latest metrics (windows ending within the last 15 minutes)
latest_metrics = spark.sql("""
    SELECT 
        window_start,
        window_end,
        store_region,
        SUM(total_transactions) as transactions,
        SUM(total_revenue) as revenue
    FROM gold_realtime_sales_metrics
    WHERE window_end >= current_timestamp() - INTERVAL 15 MINUTES
    GROUP BY window_start, window_end, store_region
    ORDER BY window_end DESC
    LIMIT 10
""")

print("📊 Latest Sales Metrics:")
display(latest_metrics)

In [0]:
%sql
SELECT 
        window_start,
        window_end,
        store_region,
        SUM(total_transactions) as transactions,
        SUM(total_revenue) as revenue
    FROM gold_realtime_sales_metrics
    --WHERE window_end >= current_timestamp() - INTERVAL 15 MINUTES
    GROUP BY window_start, window_end, store_region
    ORDER BY window_end DESC
    LIMIT 10

In [0]:
# Stop the Gold real-time metrics query (realtime_query) started in Exercise 3.1.
realtime_query.stop()
print("⏹️  Real-time metrics stream stopped")

---
## Lab 4: Production Patterns and Monitoring (45 minutes)

### Objectives
- Implement circuit breakers and error handling
- Set up comprehensive monitoring
- Create data quality dashboards

### Exercise 4.1: Implement Circuit Breaker Pattern

In [0]:
# Create monitoring table
# Monitoring table definition: one row per pipeline health check
pipeline_health_ddl = """
    CREATE TABLE IF NOT EXISTS pipeline_health_metrics (
        pipeline_name STRING,
        check_timestamp TIMESTAMP,
        metric_name STRING,
        metric_value DOUBLE,
        status STRING,
        alert_triggered BOOLEAN
    )
    USING DELTA
"""
spark.sql(pipeline_health_ddl)

print("✅ Pipeline health metrics table created")

In [0]:
# Simple circuit breaker implementation
class CircuitBreaker:
    """Counts consecutive failures and blocks attempts once a threshold is hit.

    The breaker opens when ``failure_count`` reaches ``failure_threshold`` and
    allows attempts again once more than ``reset_timeout`` seconds have passed
    since the last recorded failure.
    """

    def __init__(self, failure_threshold=5, reset_timeout=300):
        self.failure_threshold = failure_threshold
        self.reset_timeout = reset_timeout
        # Mutable state: closed, no failures seen yet
        self.failure_count = 0
        self.last_failure_time = None
        self.is_open = False

    def record_success(self):
        """Clear the failure counter and close the breaker."""
        self.failure_count, self.is_open = 0, False

    def record_failure(self):
        """Register one failure; return True if the breaker is open afterwards."""
        self.failure_count += 1
        self.last_failure_time = time.time()

        threshold_reached = self.failure_count >= self.failure_threshold
        if threshold_reached:
            self.is_open = True
            print(f"Circuit breaker OPEN - {self.failure_count} failures")
        return threshold_reached

    def should_attempt(self):
        """Return True when a call may be attempted right now."""
        if self.is_open:
            seconds_since_failure = time.time() - self.last_failure_time
            if seconds_since_failure <= self.reset_timeout:
                return False

            # Timeout elapsed: close the breaker and start counting afresh
            print("Circuit breaker attempting reset")
            self.is_open = False
            self.failure_count = 0

        return True

# Test circuit breaker
cb = CircuitBreaker(failure_threshold=3)
print("✅ Circuit breaker initialized")

### Exercise 4.2: Data Quality Monitoring

In [0]:
# Create data quality dashboard
def create_quality_dashboard():
    """Display overall quality figures and an error breakdown for silver_sales."""

    # Section 1: record count, validity rate and mean quality score
    overall_sql = """
        SELECT 
            'Overall' as metric_category,
            COUNT(*) as total_records,
            AVG(CASE WHEN is_valid THEN 1 ELSE 0 END) * 100 as validity_rate,
            AVG(data_quality_score) * 100 as avg_quality_score
        FROM silver_sales
    """

    # Section 2: how often each validation error message occurs
    errors_sql = """
        WITH errors AS (
            SELECT explode(validation_errors) as error_type
            FROM silver_sales
            WHERE size(validation_errors) > 0
        )
        SELECT 
            error_type,
            COUNT(*) as error_count
        FROM errors
        GROUP BY error_type
        ORDER BY error_count DESC
    """

    sections = [
        ("\n1. Overall Quality:", spark.sql(overall_sql)),
        ("\n2. Error Breakdown:", spark.sql(errors_sql)),
    ]

    print("📊 Data Quality Dashboard:")
    for heading, frame in sections:
        print(heading)
        display(frame)

# Generate dashboard
create_quality_dashboard()

In [0]:
# Closing banner
banner_rule = "="*50
print("\n" + banner_rule)
print("✅ Module 3 Complete!")
print(banner_rule)

# Row counts for one table in each medallion layer
layer_counts_sql = """
    SELECT 
        'Bronze Layer' as layer,
        (SELECT COUNT(*) FROM bronze_sales) as record_count
    UNION ALL
    SELECT 
        'Silver Layer',
        (SELECT COUNT(*) FROM silver_sales)
    UNION ALL
    SELECT 
        'Gold Layer',
        (SELECT COUNT(*) FROM gold_realtime_sales_metrics)
"""
summary = spark.sql(layer_counts_sql)

display(summary)

# Recap of the topics covered
print("\n🎉 Congratulations! You've mastered:")
for topic_line in (
    "- Auto Loader for incremental ingestion",
    "- Change Data Feed for CDC",
    "- Structured Streaming for real-time analytics",
    "- Medallion Architecture implementation",
    "- Production monitoring patterns",
):
    print(topic_line)