# Lakehouse Analysis with PySpark

This notebook demonstrates querying the Iceberg Lakehouse tables using PySpark.

## Medallion Architecture Layers
- **Bronze**: Raw events from Kafka (ingested by Flink)
- **Silver**: Cleaned, deduplicated facts and Type 2 slowly changing dimensions (SCD Type 2)
- **Gold**: Aggregated metrics for BI/Analytics


In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql import SparkSession

# Configure Spark for Iceberg on MinIO.
#
# Connection settings are read from environment variables so that real
# credentials never have to be committed with the notebook. Each fallback is
# the value that used to be hardcoded here, so an environment with none of
# these variables set behaves exactly as before.
HIVE_METASTORE_URI = os.environ.get("HIVE_METASTORE_URI", "thrift://hive-metastore:9083")
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")

spark = (
    SparkSession.builder
    .appName("Lakehouse Analysis")
    # Register the Iceberg SQL extensions and a Hive-backed catalog named "lakehouse".
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.lakehouse", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.lakehouse.type", "hive")
    .config("spark.sql.catalog.lakehouse.uri", HIVE_METASTORE_URI)
    # S3A filesystem settings pointing at the MinIO endpoint (path-style access enabled).
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # getOrCreate() reuses an already-running session if one exists in this kernel.
    .getOrCreate()
)

spark.sparkContext.setLogLevel("WARN")
print(f"Spark version: {spark.version}")


In [None]:
# Print a banner and the table listing for each Medallion layer in turn.
# A layer whose listing fails is reported and skipped so the remaining
# layers are still shown.
for db in ('bronze', 'silver', 'gold'):
    rule = '=' * 50
    print(f"\n{rule}")
    print(f"{db.upper()} LAYER TABLES")
    print(rule)
    try:
        layer_tables = spark.sql(f"SHOW TABLES IN lakehouse.{db}")
        layer_tables.show(truncate=False)
    except Exception as err:
        print(f"Database {db} not found or empty: {err}")


In [None]:
# Gold layer: number popularity.
# Selects the rows of lakehouse.gold.fact_number_popularity whose
# popularity_rank is at most 20, ordered by that rank.
popularity_query = """
        SELECT 
            input_value,
            total_conversions,
            unique_users,
            popularity_rank
        FROM lakehouse.gold.fact_number_popularity
        WHERE popularity_rank <= 20
        ORDER BY popularity_rank
    """

try:
    # Both the query and the display stay inside the try: a failure in either
    # one is reported below instead of raising out of the cell.
    popularity = spark.sql(popularity_query)
    print("Top 20 Most Popular Numbers:")
    popularity.show()
except Exception as err:
    print(f"Gold layer not available: {err}")
    print("Run the Airflow Gold ETL DAG to populate.")
