In [None]:
# Install Java 8 (required for PySpark)
# Setup cell: installs a JDK with apt-get, points JAVA_HOME at it, then installs
# the Python packages with %pip. It needs network access and an apt-based
# runtime in which the kernel user is allowed to run apt-get.
!apt-get update
!apt-get install -y openjdk-8-jdk-headless
import os
# Set for the current kernel process only; assumes the openjdk-8 package
# installs under this amd64 path on the runtime image — TODO confirm.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

# Install PySpark with Iceberg support
# pyspark is pinned to 3.4.1, matching the "iceberg-spark-runtime-3.4_2.12" JAR
# that the Spark setup cell downloads. pyiceberg and pandas are pinned as well;
# matplotlib and seaborn are installed at whatever version pip resolves.
%pip install pyspark==3.4.1
%pip install pyiceberg[s3fs,duckdb]==0.5.1
%pip install pandas==2.0.3
%pip install matplotlib seaborn

# Printed unconditionally: the "!" and "%pip" lines above are not checked for
# failure here, so this message alone does not prove that every install succeeded.
print("‚úÖ Dependencies installed successfully!")


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
import pandas as pd
import os

# Download Iceberg JAR for Spark
!wget -q https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/1.4.2/iceberg-spark-runtime-3.4_2.12-1.4.2.jar -O /content/iceberg-spark-runtime.jar

# Configure Spark with Iceberg
spark = SparkSession.builder \
    .appName("Iceberg Enterprise Demo") \
    .config("spark.jars", "/content/iceberg-spark-runtime.jar") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hadoop") \
    .config("spark.sql.catalog.local.warehouse", "/content/iceberg-warehouse") \
    .config("spark.sql.warehouse.dir", "/content/iceberg-warehouse") \
    .getOrCreate()

# Set log level to reduce noise
spark.sparkContext.setLogLevel("WARN")

print(f"‚úÖ Spark {spark.version} with Iceberg initialized successfully!")
print(f"üìÅ Warehouse location: /content/iceberg-warehouse")


In [None]:
from datetime import datetime, timedelta
import random

# Build synthetic customer records
def generate_customer_data(num_customers=1000):
    """Return a list of ``num_customers`` randomly generated customer dicts.

    Records are numbered from 0; the index is embedded in ``customer_id``
    (zero-padded to six digits), the names and the email address. The
    remaining fields are drawn with the ``random`` module:

    * ``registration_date`` - 2020-01-01 plus 0..1400 days (inclusive)
    * ``customer_segment``  - 'Premium', 'Standard' or 'Basic'
    * ``credit_limit``      - integer in 1000..50000 (inclusive)
    * ``country``           - one of five country names
    * ``is_active``         - True or False
    """
    segment_options = ['Premium', 'Standard', 'Basic']
    country_options = ['USA', 'Canada', 'UK', 'Germany', 'France']
    earliest_registration = datetime(2020, 1, 1)

    def build_record(index):
        # Dict values are evaluated top to bottom, so the random draws for a
        # record always happen in the order the keys are listed here.
        return {
            'customer_id': f'CUST_{index:06d}',
            'first_name': f'FirstName{index}',
            'last_name': f'LastName{index}',
            'email': f'customer{index}@enterprise.com',
            'registration_date': earliest_registration + timedelta(days=random.randint(0, 1400)),
            'customer_segment': random.choice(segment_options),
            'credit_limit': random.randint(1000, 50000),
            'country': random.choice(country_options),
            'is_active': random.choice([True, False]),
        }

    return [build_record(index) for index in range(num_customers)]

# Generate sample sales data
def generate_sales_data(num_transactions=5000, num_customers=1000, start_id=0):
    """Return a list of randomly generated sales-transaction dicts.

    Args:
        num_transactions: Number of records to generate.
        num_customers: Size of the customer population to draw ``customer_id``
            from; ids are picked uniformly from ``CUST_000000`` up to
            ``CUST_{num_customers - 1:06d}``. The default of 1000 keeps the
            range the function has always used.
        start_id: Number embedded in the first ``transaction_id``; later
            records count up from it. Pass the number of transactions already
            stored to generate a batch whose ids do not collide with them.

    Returns:
        A list of dicts with the keys ``transaction_id``, ``customer_id``,
        ``product_id``, ``transaction_date``, ``quantity``, ``unit_price``
        (rounded to two decimals), ``discount_percentage`` (a fraction in
        0..0.3), ``payment_method`` and ``sales_rep``.

    Raises:
        ValueError: If ``num_customers`` is smaller than 1.
    """
    # The notebook runs `from pyspark.sql.functions import *`, which rebinds the
    # global name `round` to Spark's Column function; that function raises a
    # TypeError when handed a plain Python float. Resolve the built-in explicitly.
    import builtins

    if num_customers < 1:
        raise ValueError("num_customers must be at least 1")

    sales = []
    for i in range(num_transactions):
        sales.append({
            'transaction_id': f'TXN_{start_id + i:08d}',
            'customer_id': f'CUST_{random.randint(0, num_customers - 1):06d}',
            'product_id': f'PROD_{random.randint(1, 100):03d}',
            # randint is inclusive on both ends, so dates span 2023-01-01..2024-01-01
            'transaction_date': datetime(2023, 1, 1) + timedelta(days=random.randint(0, 365)),
            'quantity': random.randint(1, 10),
            'unit_price': builtins.round(random.uniform(10.0, 500.0), 2),
            'discount_percentage': random.uniform(0, 0.3),
            'payment_method': random.choice(['Credit Card', 'Debit Card', 'Cash', 'Bank Transfer']),
            'sales_rep': f'REP_{random.randint(1, 50):03d}'
        })
    return sales

# Generate the raw records, then turn them into Spark DataFrames
customers_data = generate_customer_data(1000)
sales_data = generate_sales_data(5000)

customers_df = spark.createDataFrame(customers_data)
sales_df = spark.createDataFrame(sales_data)

# Net amount per transaction: quantity x unit price, reduced by the discount fraction
net_amount = col('quantity') * col('unit_price') * (1 - col('discount_percentage'))
sales_df = sales_df.withColumn('total_amount', net_amount)

# Report row counts and overall revenue.
# `sum` here is pyspark.sql.functions.sum (star-imported), not the built-in.
print("‚úÖ Sample data generated:")
print(f"   üìä Customers: {customers_df.count():,} records")
print(f"   üí∞ Sales: {sales_df.count():,} transactions")
print(f"   üíµ Total revenue: ${sales_df.select(sum('total_amount')).collect()[0][0]:,.2f}")


In [None]:
# Create customers Iceberg table
# No explicit "path" option is passed: `local` is configured as a Hadoop
# (path-based) catalog, which derives each table's location from the warehouse
# directory and the identifier (<warehouse>/db/<table>). Asking for any other
# location on such a catalog is expected to be rejected by Iceberg.
customers_df.write \
    .format("iceberg") \
    .mode("overwrite") \
    .saveAsTable("local.db.customers")

# Create sales Iceberg table with partitioning (enterprise best practice)
# One identity partition per distinct transaction_date value. That is fine for
# a demo; NOTE(review): a coarser transform (e.g. months) may suit real
# volumes better — confirm against expected partition sizes.
sales_df.write \
    .format("iceberg") \
    .mode("overwrite") \
    .partitionBy("transaction_date") \
    .saveAsTable("local.db.sales")

print("‚úÖ Iceberg tables created successfully!")

# Show table information
print("\nüìã Table Details:")
spark.sql("SHOW TABLES IN local.db").show()

# Show customers table schema
print("\nüë• Customers Table Schema:")
spark.sql("DESCRIBE local.db.customers").show()

# Show sales table schema
print("\nüí∞ Sales Table Schema:")
spark.sql("DESCRIBE local.db.sales").show()


In [None]:
# Load both tables back from the `local` Iceberg catalog
customers_iceberg = spark.read.format("iceberg").table("local.db.customers")
sales_iceberg = spark.read.format("iceberg").table("local.db.sales")

print("üìä Data Summary:")
print(f"   Customers: {customers_iceberg.count():,}")
print(f"   Sales Transactions: {sales_iceberg.count():,}")

# Example 1: number of customers per segment, largest first
print("\nüîç Top 5 Customer Segments by Count:")
segment_counts = customers_iceberg.groupBy("customer_segment").count()
segment_counts.orderBy(col("count").desc()).show()

# Example 2: transaction count and revenue per payment method, highest revenue first.
# `count`, `round` and `sum` are the star-imported pyspark.sql.functions versions.
print("\nüí≥ Sales by Payment Method:")
payment_method_totals = sales_iceberg.groupBy("payment_method").agg(
    count("*").alias("transaction_count"),
    round(sum("total_amount"), 2).alias("total_revenue"),
)
payment_method_totals.orderBy(col("total_revenue").desc()).show()


In [None]:
# View table history
print("üìú Sales Table History:")
spark.sql("SELECT * FROM local.db.sales.history").show(truncate=False)

# Get current snapshot info
print("\nüì∏ Current Snapshots:")
spark.sql("SELECT * FROM local.db.sales.snapshots").show(truncate=False)

# Make some changes to demonstrate time travel
print("\nüîÑ Making changes to demonstrate time travel...")

# Add new sales data (simulating new transactions)
# generate_sales_data numbers its records from 0, so an unmodified batch would
# reuse transaction ids that are already in the table. Renumber the batch to
# continue after the rows stored so far (assumes the stored ids are the
# contiguous range 0..count-1, which is how this notebook populates the table).
existing_txn_count = spark.read.format("iceberg").table("local.db.sales").count()
new_sales_data = generate_sales_data(100)
for batch_offset, new_txn in enumerate(new_sales_data):
    new_txn['transaction_id'] = f'TXN_{existing_txn_count + batch_offset:08d}'

new_sales_df = spark.createDataFrame(new_sales_data)
new_sales_df = new_sales_df.withColumn(
    'total_amount',
    col('quantity') * col('unit_price') * (1 - col('discount_percentage'))
)

# Append to existing table (adds a new snapshot; earlier snapshots stay in the history)
new_sales_df.write \
    .format("iceberg") \
    .mode("append") \
    .saveAsTable("local.db.sales")

print(f"‚úÖ Added {new_sales_df.count()} new transactions")
print(f"üìä Total transactions now: {spark.read.format('iceberg').table('local.db.sales').count():,}")

# Show updated history
print("\nüìú Updated Table History:")
spark.sql("SELECT * FROM local.db.sales.history ORDER BY made_current_at").show(truncate=False)


In [None]:
# Schema-evolution walkthrough: inspect, add a column, backfill it, re-query.

# SQL used below, kept together so the statements can be read side by side
add_loyalty_column_sql = """
    ALTER TABLE local.db.customers 
    ADD COLUMN loyalty_points INT AFTER credit_limit
"""
backfill_premium_points_sql = """
    UPDATE local.db.customers 
    SET loyalty_points = CAST(credit_limit * 0.1 AS INT)
    WHERE customer_segment = 'Premium'
"""
premium_points_summary_sql = """
    SELECT customer_segment, 
           COUNT(*) as customer_count,
           AVG(loyalty_points) as avg_loyalty_points
    FROM local.db.customers 
    WHERE customer_segment = 'Premium'
    GROUP BY customer_segment
"""
segment_counts_sql = """
    SELECT customer_segment, COUNT(*) as count
    FROM local.db.customers 
    GROUP BY customer_segment
    ORDER BY count DESC
"""

# Schema before the change
print("üìã Current Customers Schema:")
spark.sql("DESCRIBE local.db.customers").show()

# Add the new column, positioned right after credit_limit
print("\nüîß Adding new column: loyalty_points")
spark.sql(add_loyalty_column_sql)

# Schema after the change
print("\nüìã Updated Schema:")
spark.sql("DESCRIBE local.db.customers").show()

# Fill the new column for Premium customers only (10% of credit_limit, as INT)
print("\nüìù Updating loyalty points for Premium customers...")
spark.sql(backfill_premium_points_sql)

# Confirm the update took effect
print("\n‚úÖ Premium customers with loyalty points:")
spark.sql(premium_points_summary_sql).show()

# A query that does not reference the new column still runs unchanged
print("\nüîÑ Backward compatibility check - old queries still work:")
spark.sql(segment_counts_sql).show()


In [None]:
# Closing cell: print the best-practice checklist, a metrics table, and shut Spark down
print("üè¢ ENTERPRISE APACHE ICEBERG BEST PRACTICES")
print("=" * 60)

# Checklist lines, grouped by topic; empty strings produce the blank separator lines
best_practices = [
    "üéØ **Partitioning Strategy**",
    "   ‚Ä¢ Use date/time partitioning for time-series data",
    "   ‚Ä¢ Consider business-specific partitions (region, department)",
    "   ‚Ä¢ Avoid over-partitioning (aim for 100MB+ per partition)",
    "",
    "üîß **Table Maintenance**",
    "   ‚Ä¢ Schedule regular compaction jobs",
    "   ‚Ä¢ Implement snapshot cleanup policies",
    "   ‚Ä¢ Monitor table statistics and file counts",
    "",
    "üîí **Data Governance**",
    "   ‚Ä¢ Use schema evolution carefully with proper testing",
    "   ‚Ä¢ Implement data lineage tracking",
    "   ‚Ä¢ Set up proper access controls and auditing",
    "",
    "üìä **Performance Optimization**",
    "   ‚Ä¢ Use vectorized readers when available",
    "   ‚Ä¢ Implement predicate pushdown in queries",
    "   ‚Ä¢ Optimize file sizes (128MB-1GB per file)",
    "",
    "üõ°Ô∏è **Reliability & Recovery**",
    "   ‚Ä¢ Implement backup strategies for metadata",
    "   ‚Ä¢ Test disaster recovery procedures",
    "   ‚Ä¢ Use time travel for audit and compliance",
    "",
    "üîó **Integration**",
    "   ‚Ä¢ Standardize on Iceberg across analytics engines",
    "   ‚Ä¢ Implement proper CI/CD for schema changes",
    "   ‚Ä¢ Use catalog services for metadata management"
]

# One entry per output line
print("\n".join(best_practices))

# Metrics table
print("\n\nüìà DEMO SUMMARY")
print("=" * 30)

# Four single-row SELECTs stacked into one (metric, value) table; every value is
# cast to STRING so the UNION ALL branches share a column type.
summary_query = """
    SELECT 
        'Total Customers' as metric,
        CAST(COUNT(*) AS STRING) as value
    FROM local.db.customers
    UNION ALL
    SELECT 
        'Total Sales Transactions',
        CAST(COUNT(*) AS STRING)
    FROM local.db.sales
    UNION ALL
    SELECT 
        'Total Revenue',
        CONCAT('$', CAST(ROUND(SUM(total_amount), 2) AS STRING))
    FROM local.db.sales
    UNION ALL
    SELECT 
        'Active Customers',
        CAST(SUM(CASE WHEN is_active THEN 1 ELSE 0 END) AS STRING)
    FROM local.db.customers
"""
summary_stats = spark.sql(summary_query)

summary_stats.show(truncate=False)

print("\n‚úÖ Apache Iceberg Enterprise Demo Completed Successfully!")
print("\nüöÄ Ready for production deployment with proper configuration!")

# Shut the session down; cells above cannot be re-run until a new session is built
print("\n‚èπÔ∏è Stopping Spark session...")
spark.stop()
print("‚úÖ Demo completed! Your Iceberg tables are preserved in /content/iceberg-warehouse/")
print("üìö To learn more, visit: https://iceberg.apache.org/")
