## 1. Initialize Spark Session with Iceberg

In [None]:
import os

import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Create (or reuse) a Spark session with the Iceberg SQL extensions and a
# Hadoop-type Iceberg catalog named "local" whose warehouse is s3a://warehouse/.
#
# The S3 endpoint and credentials are read from the environment so that real
# secrets never need to be committed with the notebook. When a variable is
# unset, the demo values this notebook originally hard-coded are used, so the
# cell behaves exactly as before in the stock setup.
S3_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
S3_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
S3_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")

spark = (
    SparkSession.builder
    .appName("IcebergDemo")
    .master("spark://spark-master:7077")
    # Iceberg integration: SQL extensions plus the "local" catalog definition
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.local.type", "hadoop")
    .config("spark.sql.catalog.local.warehouse", "s3a://warehouse/")
    # S3A filesystem settings for the object store backing the warehouse
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    # getOrCreate() hands back an already-running session if the kernel has one
    .getOrCreate()
)

print(f"Spark Version: {spark.version}")
print(f"Spark App ID: {spark.sparkContext.applicationId}")

## 2. Create an Iceberg Table

In [None]:
# Define the customers table in the "local" Iceberg catalog.
# IF NOT EXISTS turns the statement into a no-op when the table is already
# present, so this cell can be re-run safely.
CREATE_CUSTOMERS_DDL = """
    CREATE TABLE IF NOT EXISTS local.default.customers (
        customer_id INT,
        name STRING,
        email STRING,
        age INT,
        city STRING
    )
    USING iceberg
"""
spark.sql(CREATE_CUSTOMERS_DDL)

print("Table 'customers' created successfully!")

## 3. Insert Sample Data

In [None]:
# Seed the table with five sample customers.
#
# MERGE INTO ... WHEN NOT MATCHED is used instead of a plain INSERT so that the
# cell is idempotent: re-running it (or doing Restart & Run All against a
# warehouse that already holds the table) does not append duplicate rows for
# customer_ids that are already present. On an empty table the outcome is the
# same five rows a plain INSERT would have written.
spark.sql("""
    MERGE INTO local.default.customers AS t
    USING (
        SELECT * FROM VALUES
            (1, 'Alice Johnson', 'alice@example.com', 30, 'San Francisco'),
            (2, 'Bob Smith', 'bob@example.com', 25, 'New York'),
            (3, 'Charlie Brown', 'charlie@example.com', 35, 'Boston'),
            (4, 'Diana Prince', 'diana@example.com', 28, 'Seattle'),
            (5, 'Eve Wilson', 'eve@example.com', 32, 'Austin')
        AS seed(customer_id, name, email, age, city)
    ) AS s
    ON t.customer_id = s.customer_id
    WHEN NOT MATCHED THEN INSERT *
""")

print("Sample data inserted!")

## 4. Query the Table

In [None]:
# Load every row of the customers table into a Spark DataFrame
CUSTOMERS_QUERY = "SELECT * FROM local.default.customers"
result = spark.sql(CUSTOMERS_QUERY)

# Plain-text rendering produced by Spark itself
result.show()

# Trailing expression: the notebook displays the pandas copy of the same rows
result.toPandas()

## 5. Iceberg Features - Time Travel

In [None]:
# Remember which snapshot is current *before* the update so it can be read
# back afterwards; without this the section never actually travels in time.
# The list is empty if the table has no snapshots yet.
snapshots_before_update = spark.sql("""
    SELECT snapshot_id
    FROM local.default.customers.snapshots
    ORDER BY committed_at DESC
    LIMIT 1
""").collect()
previous_snapshot_id = (
    snapshots_before_update[0]["snapshot_id"] if snapshots_before_update else None
)

# Update data (creates a new version)
spark.sql("""
    UPDATE local.default.customers
    SET age = 31
    WHERE customer_id = 1
""")

print("Data updated!")

# Query current version
print("\nCurrent version:")
spark.sql("SELECT * FROM local.default.customers WHERE customer_id = 1").show()

# Time travel: read the same row as of the snapshot captured before the update.
# NOTE: if this cell has already been run against the same warehouse, the
# captured snapshot already contains age = 31.
if previous_snapshot_id is not None:
    print(f"\nPrevious version (snapshot {previous_snapshot_id}):")
    (
        spark.read
        .option("snapshot-id", previous_snapshot_id)
        .format("iceberg")
        .load("local.default.customers")
        .where("customer_id = 1")
        .show()
    )

## 6. Table Statistics and Metadata

In [None]:
# Summary statistics over the table's columns, printed by Spark
customers_df = spark.sql("SELECT * FROM local.default.customers")
customers_df.describe().show()

# Row count: the aggregate query yields a single row with a single column,
# so the first row's record_count field is the total.
count_row = spark.sql(
    "SELECT COUNT(*) as record_count FROM local.default.customers"
).first()
count = count_row["record_count"]
print(f"\nTotal records: {count}")

## 7. Analytics Example

In [None]:
# Per-city analytics: customer count plus average / minimum / maximum age.
#
# Rows are ordered by customer_count (largest first). city is added as a
# secondary sort key because cities with equal counts (every city in the
# sample data has exactly one customer) would otherwise come back in an
# arbitrary order that can differ between runs.
spark.sql("""
    SELECT
        city,
        COUNT(*) as customer_count,
        AVG(age) as avg_age,
        MIN(age) as min_age,
        MAX(age) as max_age
    FROM local.default.customers
    GROUP BY city
    ORDER BY customer_count DESC, city
""").show()