# 🏔️ Lakehouse Analysis with PySpark + Iceberg

This notebook demonstrates querying the Iceberg Lakehouse tables using PySpark.

## Medallion Architecture Layers
- **Bronze**: Raw events from Kafka (ingested by Flink)
- **Silver**: Cleaned, deduplicated facts and SCD Type 2 dimensions
- **Gold**: Aggregated metrics for BI/Analytics

## Prerequisites
Run the medallion flow test first:
```bash
docker exec spark-master /opt/spark/bin/spark-submit \
    --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0,org.apache.hadoop:hadoop-aws:3.3.4,com.amazonaws:aws-java-sdk-bundle:1.12.262 \
    --conf spark.extraListeners= \
    /opt/spark-jobs/test_medallion_flow.py
```


In [None]:
import os

from pyspark.sql import SparkSession
import pandas as pd
import matplotlib.pyplot as plt

# Connection settings. Each one can be overridden with an environment
# variable; the fallbacks are the values this notebook originally hard-coded,
# so behaviour is unchanged when nothing is set.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")
LAKEHOUSE_WAREHOUSE = os.environ.get("LAKEHOUSE_WAREHOUSE", "s3a://lakehouse/warehouse")

# Maven coordinates resolved by Spark at session start-up.
SPARK_PACKAGES = ",".join([
    "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.5.0",
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
])

# Keep the S3A SSL flag in step with the endpoint scheme instead of
# hard-coding it ("false" for the default http:// endpoint).
S3A_SSL_ENABLED = "true" if MINIO_ENDPOINT.lower().startswith("https://") else "false"

# Configure Spark for Iceberg on MinIO using Hadoop catalog
# (No Hive Metastore required - uses file-based catalog)
# NOTE: getOrCreate() reuses an already-running session if there is one; in
# that case start-up settings such as spark.jars.packages are not re-applied.
spark = (
    SparkSession.builder
    .appName("Lakehouse Analysis")
    .config("spark.jars.packages", SPARK_PACKAGES)
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.lakehouse", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.lakehouse.type", "hadoop")
    .config("spark.sql.catalog.lakehouse.warehouse", LAKEHOUSE_WAREHOUSE)
    .config("spark.sql.catalog.lakehouse.io-impl", "org.apache.iceberg.hadoop.HadoopFileIO")
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", S3A_SSL_ENABLED)
    .getOrCreate()
)

# Keep Spark's own logging out of the notebook output.
spark.sparkContext.setLogLevel("WARN")
print(f"✅ Spark version: {spark.version}")
print("📦 Iceberg catalog configured: lakehouse (Hadoop catalog)")
print(f"🪣 MinIO endpoint: {MINIO_ENDPOINT}")


In [None]:
# List tables in each Medallion layer, with a row count per table.
for layer in ['bronze', 'silver', 'gold']:
    print(f"\n{'='*60}")
    print(f"📁 {layer.upper()} LAYER")
    print('='*60)
    try:
        # Collect once: calling count() and then collect() on the same
        # DataFrame would execute SHOW TABLES twice.
        table_rows = spark.sql(f"SHOW TABLES IN lakehouse.{layer}").collect()
    except Exception as e:
        # Best-effort listing: report the problem and move on to the next layer.
        print(f"   ❌ Error: {e}")
        continue

    if not table_rows:
        print("   (no tables)")
        continue

    for row in table_rows:
        short_name = row['tableName']
        table_name = f"lakehouse.{layer}.{short_name}"
        try:
            count = spark.table(table_name).count()
        except Exception as e:
            # A single unreadable table must not hide the rest of the layer.
            print(f"   └── {short_name}: ❌ Error: {e}")
            continue
        print(f"   └── {short_name}: {count} rows")


In [None]:
# Query Gold layer tables
print("📊 GOLD LAYER ANALYTICS")
print("="*60)


def _show_gold_table(table_name, heading):
    """Print *heading* followed by the contents of ``lakehouse.gold.<table_name>``.

    Args:
        table_name: Table name inside the ``lakehouse.gold`` namespace.
        heading: Text printed (after a blank line, followed by ``:``) above
            the table contents.

    Returns:
        The Spark DataFrame for the table, or ``None`` if loading or showing
        it raised. The error is printed instead of propagated so that the
        remaining tables are still displayed.
    """
    try:
        df = spark.table(f"lakehouse.gold.{table_name}")
        print(f"\n{heading}:")
        df.show(truncate=False)
    except Exception as e:
        print(f"❌ {table_name}: {e}")
        return None
    return df


# The DataFrames stay bound at notebook level so that other cells can use
# them; each one is None when its table could not be read.
daily_df = _show_gold_table("daily_conversion_summary", "📅 Daily Conversion Summary")
users_df = _show_gold_table("user_metrics", "👥 User Metrics")
popular_df = _show_gold_table("popular_numbers", "🔥 Most Popular Numbers")

# Visualization: horizontal bar chart of request counts per output value.
if popular_df is not None:
    try:
        # toPandas() pulls the whole table to the driver.
        # NOTE(review): assumes popular_numbers is small enough for that - confirm.
        pdf = popular_df.toPandas()
        if not pdf.empty:
            fig, ax = plt.subplots(figsize=(10, 5))
            # Labels are cast to str so they are plotted as categories.
            ax.barh(pdf['output_value'].astype(str), pdf['request_count'], color='steelblue')
            ax.set_xlabel('Request Count')
            ax.set_ylabel('Roman Numeral')
            ax.set_title('Most Popular Roman Numeral Conversions')
            plt.tight_layout()
            plt.show()
    except Exception as e:
        print(f"❌ popular_numbers: {e}")
