In [None]:
# =============================================================================
# CELL 1: Configuration
# =============================================================================
# Replace these values with your project details

import os
import subprocess
import time

from pyspark.sql import SparkSession

# --- Google Cloud project -----------------------------------------------------
# Project ID, the region used for BigQuery/GCS, and the bucket that holds the
# Iceberg data (its name is derived from the project ID).
PROJECT_ID = "my-project-id"
REGION = "us-central1"
BUCKET_NAME = "{}-docker-bucket".format(PROJECT_ID)

# --- Docker -------------------------------------------------------------------
CONTAINER_NAME = "biglake-iceberg-env"
JUPYTER_PORT = 8888
SPARK_UI_PORT = 4040
# Service account key: file name on the host (looked up relative to the
# working directory) and the path it is mounted at inside the container.
GCP_KEY_FILE = "my-project-id.json"
GCP_KEY_PATH = "/home/jovyan/gcp-key.json"

# --- BigLake metastore --------------------------------------------------------
# BigQuery dataset backing the metastore, the Iceberg catalog name, and the
# fully qualified BigQuery connection resource name.
BIGLAKE_DATASET = "my_iceberg_metastore"
BIGLAKE_CATALOG = "iceberg_on_bq"
BIGLAKE_CONNECTION = "/".join(
    ("projects", PROJECT_ID, "locations", REGION, "connections", "default-" + REGION)
)

# One-shot summary of the values most likely to need editing.
print(
    "✅ Configuration loaded",
    f"Project: {PROJECT_ID}",
    f"BigLake Dataset: {BIGLAKE_DATASET}",
    f"Iceberg Catalog: {BIGLAKE_CATALOG}",
    sep="\n",
)

In [None]:
# =============================================================================
# CELL 2: Docker Environment Setup
# =============================================================================
# Creates minimal Docker environment for BigLake + Iceberg

def check_prerequisites():
    """Verify that the host has everything the Docker setup cell needs.

    Checks, in order:
      1. the ``docker`` CLI can be executed,
      2. the ``docker-compose`` CLI can be executed (this cell later uses it
         to build and start the container),
      3. ``GCP_KEY_FILE`` exists and is a regular file.

    Raises:
        Exception: if a CLI is missing / exits non-zero, or the key file is
            absent. The message names the missing prerequisite and the
            underlying error is chained as ``__cause__``.
    """
    required_tools = (("docker", "Docker"), ("docker-compose", "docker-compose"))
    for executable, label in required_tools:
        try:
            subprocess.run([executable, "--version"],
                           check=True, capture_output=True)
        except (subprocess.CalledProcessError, FileNotFoundError) as exc:
            raise Exception(
                f"❌ {label} is not installed or not in PATH") from exc
        print(f"✅ {label} is available")

    # isfile() rather than exists(): a directory with the key's name (for
    # example one left behind by a bind mount of a missing path) is not a
    # usable service account key.
    if not os.path.isfile(GCP_KEY_FILE):
        raise Exception(
            f"❌ GCP key file '{GCP_KEY_FILE}' not found. Please place your service account key file here.")

    print(f"✅ GCP key file found: {GCP_KEY_FILE}")


# Fail fast before generating any files if the host is not ready.
check_prerequisites()

# Image definition: the Spark 3.5.0 PySpark notebook image plus the three JARs
# needed for Iceberg on BigQuery/GCS (Iceberg Spark runtime, BigQuery Iceberg
# catalog, GCS connector) and the Google Cloud Python clients.
# Note: `\\` below is an escaped backslash, i.e. a Dockerfile line continuation.
dockerfile_content = """
FROM jupyter/pyspark-notebook:spark-3.5.0

# Switch to root to install packages
USER root

# Install required system packages
RUN apt-get update && apt-get install -y wget curl && rm -rf /var/lib/apt/lists/*

# Download only required JARs for BigLake + Iceberg
RUN wget -P /usr/local/spark/jars/ https://storage-download.googleapis.com/maven-central/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.6.1/iceberg-spark-runtime-3.5_2.12-1.6.1.jar && \\
    wget -P /usr/local/spark/jars/ https://storage.googleapis.com/spark-lib/bigquery/iceberg-bigquery-catalog-1.6.1-1.0.1-beta.jar && \\
    wget -P /usr/local/spark/jars/ https://storage.googleapis.com/hadoop-lib/gcs/gcs-connector-hadoop3-2.2.5.jar

# Install Python packages for GCP integration
RUN pip install --no-cache-dir google-cloud-storage google-cloud-bigquery

# Switch back to jovyan user
USER jovyan

# Set environment variables for BigLake
ENV SPARK_OPTS="--driver-memory 4g --executor-memory 4g"
"""

# Persist the definition next to the notebook so `docker-compose build` finds it.
with open("Dockerfile", "w") as dockerfile_handle:
    dockerfile_handle.write(dockerfile_content)

print("✅ Dockerfile created")

# Create docker-compose.yml
#
# Port mappings are "<host>:<container>". JUPYTER_PORT / SPARK_UI_PORT select
# the *host* side only; the container side stays at 8888 / 4040 because nothing
# in the Dockerfile or this file reconfigures the ports Jupyter and the Spark
# UI listen on inside the container. (Using the host setting on both sides
# would break the mapping for any non-default port.)
#
# The service account key is mounted read-only at GCP_KEY_PATH, and
# GOOGLE_APPLICATION_CREDENTIALS points at that same path, so the mount target
# and the credentials variable cannot drift apart.
compose_content = f"""
version: '3.8'
services:
  biglake-iceberg:
    build: .
    container_name: {CONTAINER_NAME}
    ports:
      - "{JUPYTER_PORT}:8888"
      - "{SPARK_UI_PORT}:4040"
    volumes:
      - ./notebooks:/home/jovyan/work
      - ./{GCP_KEY_FILE}:{GCP_KEY_PATH}:ro
    environment:
      - JUPYTER_ENABLE_LAB=yes
      - GOOGLE_APPLICATION_CREDENTIALS={GCP_KEY_PATH}
    working_dir: /home/jovyan/work
"""

with open("docker-compose.yml", "w") as f:
    f.write(compose_content)

print("✅ docker-compose.yml created")

# Create the host directory that is bind-mounted as the container's work dir
os.makedirs("notebooks", exist_ok=True)

# Build the image, then start the container in detached mode. Each step is
# described by: progress banner, command, failure log prefix, exception text.
# Output is captured so that stderr can be reported if a step fails.
compose_steps = (
    ("🐳 Building Docker image... (this may take a few minutes)",
     ["docker-compose", "build"],
     "❌ Build failed",
     "Docker build failed"),
    ("🚀 Starting container...",
     ["docker-compose", "up", "-d"],
     "❌ Container start failed",
     "Container start failed"),
)

for banner, command, failure_prefix, error_message in compose_steps:
    print(banner)
    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode != 0:
        # Surface the tool's stderr before aborting the cell.
        print(f"{failure_prefix}: {result.stderr}")
        raise Exception(error_message)

# Wait for the container to announce its Jupyter URL
print("⏳ Waiting for container to be ready...")

# The URL Jupyter logs uses the port it listens on inside the container (8888).
url_marker = "http://127.0.0.1:8888/lab?token="
startup_timeout_s = 30   # give up polling after this long
poll_interval_s = 2      # pause between `docker logs` calls

jupyter_url = None
deadline = time.monotonic() + startup_timeout_s
while jupyter_url is None:
    result = subprocess.run(["docker", "logs", CONTAINER_NAME],
                            capture_output=True, text=True)
    # `docker logs` replays the container's stdout on stdout and its stderr on
    # stderr. Jupyter's startup messages normally go to stderr, so both
    # streams have to be searched.
    logs = result.stdout + result.stderr

    for line in logs.splitlines():
        marker_at = line.find(url_marker)
        if marker_at != -1:
            # Keep only the URL itself, not the log prefix in front of it.
            jupyter_url = line[marker_at:].strip()
            break

    if jupyter_url is None:
        if time.monotonic() >= deadline:
            break
        time.sleep(poll_interval_s)

if jupyter_url is not None:
    # Show the host-side port (JUPYTER_PORT) instead of the container port.
    jupyter_url = jupyter_url.replace(":8888/", f":{JUPYTER_PORT}/", 1)
    print("✅ Jupyter Lab is ready!")
    print(f"🔗 Access at: {jupyter_url}")
else:
    print("⚠️ Jupyter URL not found in logs. Container may still be starting.")
    print(f"🔗 Try accessing: http://localhost:{JUPYTER_PORT}")

---

In [None]:
# =============================================================================
# CELL 3: Spark Session + Sample Data
# =============================================================================
# Initialize Spark with BigLake catalog and create sample data
# Run this inside the Jupyter container (copy to notebook cell)

from pyspark.sql import SparkSession

# Configuration (these should match your settings from Cell 1).
# They are repeated here because this cell runs in the container's kernel,
# which does not share variables with the host notebook.
PROJECT_ID = "my-project-id"  # Your Google Cloud project ID
REGION = "us-central1"        # The region for BigQuery/GCS
BUCKET_NAME = f"{PROJECT_ID}-docker-bucket"
BIGLAKE_DATASET = "my_iceberg_metastore"  # BigQuery dataset for metastore
BIGLAKE_CATALOG = "iceberg_on_bq"  # Iceberg catalog name
GCP_KEY_PATH = "/home/jovyan/gcp-key.json"  # Path inside the Docker container

# Every catalog-scoped Spark property shares this prefix.
catalog_conf = f"spark.sql.catalog.{BIGLAKE_CATALOG}"

# Create Spark session configured for BigLake metastore
spark = (
    SparkSession.builder
    .appName("BigLake_Iceberg_Demo")
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    # Iceberg catalog backed by the BigQuery metastore
    .config(catalog_conf, "org.apache.iceberg.spark.SparkCatalog")
    .config(f"{catalog_conf}.catalog-impl", "org.apache.iceberg.gcp.bigquery.BigQueryMetastoreCatalog")
    .config(f"{catalog_conf}.gcp_project", PROJECT_ID)
    # Google's documented property for the catalog region is `gcp_location`
    # (the counterpart of `gcp_project` above).
    .config(f"{catalog_conf}.gcp_location", REGION)
    # NOTE(review): original key, kept for backward compatibility. It is
    # presumably ignored by the catalog - confirm and remove.
    .config(f"{catalog_conf}.location", REGION)
    .config(f"{catalog_conf}.warehouse", f"gs://{BUCKET_NAME}/{BIGLAKE_CATALOG}")
    # GCS access through the mounted service account key
    .config("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .config("spark.hadoop.google.cloud.auth.service.account.json.keyfile", GCP_KEY_PATH)
    .config("spark.hadoop.fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
    .config("spark.hadoop.fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
    .getOrCreate()
)

print("✅ Spark Session with BigLake catalog created successfully!")
print("🔗 Spark UI: http://localhost:4040")

# Five demo employees as (name, age, department, salary) tuples.
employee_columns = ["name", "age", "department", "salary"]
sample_data = [
    ("Alice", 25, "Engineering", 75000),
    ("Bob", 30, "Marketing", 65000),
    ("Charlie", 35, "Engineering", 85000),
    ("Diana", 28, "Sales", 60000),
    ("Eve", 32, "Engineering", 90000),
]

# Column types are inferred by Spark from the Python values.
df = spark.createDataFrame(sample_data, employee_columns)

print("📊 Sample DataFrame created:")
df.show()

# Recap of where the data will be written in the following cells.
print("✅ Ready to work with BigLake Iceberg tables!")
print(f"📋 Catalog: {BIGLAKE_CATALOG}")
print(f"🗄️ Dataset: {BIGLAKE_DATASET}")

In [None]:
# =============================================================================
# CELL 4: BigLake Iceberg Operations
# =============================================================================
# Create and work with Iceberg tables in BigLake metastore

# Fully qualified table name: <catalog>.<dataset>.<table>
TABLE_NAME = ".".join((BIGLAKE_CATALOG, BIGLAKE_DATASET, "employees"))

print("🏔️ Working with BigLake Iceberg tables...")

# Step 1: (re)create the Iceberg table in the BigLake metastore
print("📝 Creating Iceberg table in BigLake metastore...")

try:
    # The table is dropped first so the demo can be re-run from scratch.
    drop_stmt = f"DROP TABLE IF EXISTS {TABLE_NAME}"

    # Table definition; the `bq_connection` property names the BigQuery
    # connection resource for this project and region.
    create_stmt = f"""
    CREATE TABLE {TABLE_NAME} (
        name STRING,
        age INT,
        department STRING,
        salary BIGINT
    )
    USING ICEBERG
    TBLPROPERTIES (
        'bq_connection'='projects/{PROJECT_ID}/locations/{REGION}/connections/default-{REGION}'
    )
    """

    for statement in (drop_stmt, create_stmt):
        spark.sql(statement)

    print(f"✅ Iceberg table created: {TABLE_NAME}")

except Exception as exc:
    # Best-effort demo step: report the problem and a hint, but do not abort.
    print(f"⚠️ Table creation failed: {exc}")
    print("💡 Make sure BigQuery dataset and connection exist")

In [None]:
# Step 2: append the sample DataFrame to the Iceberg table
print("📥 Inserting sample data...")

try:
    # Append mode adds rows to the existing table instead of replacing it.
    iceberg_writer = df.write.format("iceberg").mode("append")
    iceberg_writer.saveAsTable(TABLE_NAME)
except Exception as exc:
    # Report the failure without aborting the notebook.
    print(f"⚠️ Data insertion failed: {exc}")
else:
    print("✅ Data inserted successfully")

In [None]:
# Step 3: read the table back and run two example queries
print("🔍 Querying the Iceberg table...")

try:
    # Full table contents.
    result = spark.sql(f"SELECT * FROM {TABLE_NAME}")
    print("--- All Records ---")
    result.show()

    # Engineering staff, highest salary first.
    engineering_sql = f"""
        SELECT name, age, salary
        FROM {TABLE_NAME}
        WHERE department = 'Engineering'
        ORDER BY salary DESC
    """

    # Head count and salary statistics per department.
    summary_sql = f"""
        SELECT department,
               COUNT(*) as employee_count,
               AVG(salary) as avg_salary,
               MAX(salary) as max_salary
        FROM {TABLE_NAME}
        GROUP BY department
        ORDER BY avg_salary DESC
    """

    example_queries = (
        ("--- Engineering Department ---", engineering_sql),
        ("--- Department Summary ---", summary_sql),
    )
    for heading, statement in example_queries:
        print(heading)
        spark.sql(statement).show()

    print("✅ Queries executed successfully")

except Exception as exc:
    # Report the failure without aborting the notebook.
    print(f"⚠️ Query failed: {exc}")

In [None]:
# Step 4: Show table metadata
print("📋 Table Information:")

try:
    # Show table schema
    print("--- Table Schema ---")
    # truncate=False: show() cuts cell values at 20 characters by default,
    # which would clip long entries.
    spark.sql(f"DESCRIBE {TABLE_NAME}").show(truncate=False)

    # Show table properties
    print("--- Table Properties ---")
    # Property values such as the `bq_connection` resource name are far longer
    # than 20 characters, so they must not be truncated either.
    spark.sql(f"SHOW TBLPROPERTIES {TABLE_NAME}").show(truncate=False)

except Exception as e:
    # Best-effort: report the failure without aborting the notebook.
    print(f"⚠️ Metadata query failed: {e}")

print("🎉 BigLake Iceberg demo completed!")
print(f"📊 Table: {TABLE_NAME}")
print("🔗 You can now use this table for your data operations")

---

In [None]:
# =============================================================================
# CLEANUP (Optional)
# =============================================================================
# Uncomment to stop Spark session and clean up

# print("🧹 Cleaning up...")
# spark.stop()
#
# # To stop Docker containers:
# # !docker-compose down
#
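# # To also remove the volumes managed by Compose (the bind-mounted ./notebooks
# # folder and the data in GCS/BigQuery are NOT deleted by this):
# # !docker-compose down -v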