In [None]:
import os
import sys

print("🔬 Google Colab Apache Iceberg Setup")
print("=" * 40)

# Check if running in Google Colab
if 'google.colab' in sys.modules:
    print("✅ Running in Google Colab")
    
    # Install Java (usually pre-installed in Colab)
    print("☕ Setting up Java for Colab...")
    !apt-get update -qq
    !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null 2>&1
    
    # Set JAVA_HOME for Colab
    os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
    print(f"🏠 JAVA_HOME: {os.environ['JAVA_HOME']}")
    
else:
    print("❌ This notebook is designed for Google Colab")
    print("💡 Please run this in Google Colab for best results")
    
print("✅ Java setup completed!")


In [None]:
# Install Python packages optimized for Google Colab
print("📦 Installing packages for Google Colab...")

# Use Colab-compatible installation approach
%pip install -q pyspark>=3.4.0
%pip install -q pyiceberg --no-deps  # No deps to avoid conflicts with Colab packages
%pip install -q s3fs>=2023.1.0  # For S3 support

print("✅ Package installation completed!")

# Test imports
try:
    import pyspark
    print(f"⚡ PySpark: {pyspark.__version__}")
    
    # Use existing pandas/numpy from Colab
    import pandas as pd
    import numpy as np
    print(f"📊 Pandas: {pd.__version__}")
    print(f"🔢 NumPy: {np.__version__}")
    
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("💡 Try restarting runtime if errors persist")
    
print("🚀 Colab setup completed!")


In [None]:
# Spark + Iceberg Setup for Google Colab
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, avg, count
import os

# Colab-specific paths
jar_url = "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/1.4.2/iceberg-spark-runtime-3.4_2.12-1.4.2.jar"
jar_path = "/content/iceberg-spark-runtime.jar"
warehouse_path = "/content/iceberg-warehouse"

print("📥 Downloading Iceberg JAR for Colab...")
!wget -q {jar_url} -O {jar_path}

print("⚡ Initializing Spark with Iceberg in Colab...")
spark = SparkSession.builder \
    .appName("Iceberg Telecom Demo - Colab") \
    .config("spark.jars", jar_path) \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hadoop") \
    .config("spark.sql.catalog.local.warehouse", warehouse_path) \
    .config("spark.sql.warehouse.dir", warehouse_path) \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.driver.memory", "2g") \
    .config("spark.driver.maxResultSize", "1g") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")

print(f"✅ Spark {spark.version} with Iceberg ready in Colab!")
print(f"📁 Warehouse: {warehouse_path}")

# Generate Telecom Data for Colab
from datetime import datetime, timedelta
import random

print("\n📡 Generating Telecom Data for Colab...")

# Colab-optimized configuration (smaller dataset)
chunk_size_seconds = 30
site_count = 50  # Reduced for Colab
start_time = datetime(2023, 1, 1)
demo_chunks = 20  # Reduced for Colab

regions = ["North", "South", "East", "West"]
cities = {
    "North": ["Dendam", "Rondburg"],
    "South": ["Schieveste"],
    "East": ["Schipstad", "Dort"],
    "West": ["Damstad"],
}
technologies = ["6G", "7G"]
vendors = ["Ericia", "Noson", "Weihu"]

# Generate site metadata
sites = []
for i in range(site_count):
    region = random.choice(regions)
    city = random.choice(cities[region])
    tech = random.choices(technologies, weights=[0.5, 0.5])[0]
    vendor = random.choices(vendors, weights=[0.4, 0.4, 0.2])[0]
    site_id = f"SITE_{i:05d}"
    sites.append({
        "site_id": site_id,
        "region": region,
        "city": city,
        "technology": tech,
        "vendor": vendor,
    })

def generate_telecom_data_chunk(second_offset, site_list=None, base_time=None):
    """Build one synthetic metrics record per site for a single second.

    Args:
        second_offset: Number of seconds added to ``base_time`` to obtain the
            timestamp shared by every record in the chunk.
        site_list: Iterable of site dicts with the keys ``site_id``,
            ``region``, ``city``, ``technology`` and ``vendor``. Defaults to
            the notebook-level ``sites`` list.
        base_time: ``datetime`` the offset is applied to. Defaults to the
            notebook-level ``start_time``.

    Returns:
        A list of dicts, one per site, holding the site attributes, the
        timestamp and five metrics rounded to two decimals. Metrics are
        builtin ``float`` values rather than ``numpy.float64`` so the records
        can be handed to ``spark.createDataFrame`` without numpy scalar types
        in the rows.
    """
    if site_list is None:
        site_list = sites
    if base_time is None:
        base_time = start_time

    timestamp = base_time + timedelta(seconds=second_offset)
    records = []
    for site in site_list:
        # Simplified telecom metrics generation for Colab: "7G" sites get a
        # stronger signal baseline and a lower latency baseline than the rest.
        is_7g = site["technology"] == "7G"
        rssi = np.random.normal(loc=-75 if is_7g else -85, scale=3)
        latency = np.random.normal(loc=35 if is_7g else 60, scale=8)

        data_volume = np.random.exponential(scale=6)
        # Clip so the simulated CPU usage stays within 0-100.
        cpu_usage = np.clip(np.random.normal(loc=50, scale=10), 0, 100)
        drop_rate = np.random.beta(1, 200) * 100

        records.append({
            "timestamp": timestamp,
            "region": site["region"],
            "city": site["city"],
            "site_id": site["site_id"],
            "technology": site["technology"],
            "vendor": site["vendor"],
            # float(...) converts numpy scalars to plain Python floats.
            "rssi_dbm": round(float(rssi), 2),
            "latency_ms": round(float(latency), 2),
            "data_volume_mb": round(float(data_volume), 2),
            "drop_rate_percent": round(float(drop_rate), 2),
            "cpu_usage_percent": round(float(cpu_usage), 2),
        })
    return records

# Generate data: one batch of per-site records for every second covered by
# demo_chunks chunks of chunk_size_seconds seconds each, in ascending order.
total_seconds = demo_chunks * chunk_size_seconds
all_records = [
    record
    for second_offset in range(total_seconds)
    for record in generate_telecom_data_chunk(second_offset)
]

# Create DataFrames (schemas are inferred from the dict records)
sites_df = spark.createDataFrame(sites)
telecom_metrics_df = spark.createDataFrame(all_records)

site_total = sites_df.count()
metric_total = telecom_metrics_df.count()

print(f"✅ Colab telecom data generated!")
print(f"   📡 Sites: {site_total:,}")
print(f"   📊 Metrics: {metric_total:,}")

# Preview the first few metric rows
telecom_metrics_df.show(5)


In [None]:
# Create Iceberg Tables in Google Colab
from pyspark.sql.functions import days

print("🏗️ Creating Iceberg tables in Colab...")

# Create telecom sites table (small dimension table, left unpartitioned)
(
    sites_df.write
    .format("iceberg")
    .mode("overwrite")
    .saveAsTable("local.db.telecom_sites")
)

# Create telecom metrics table partitioned by DAY of the timestamp.
# Partitioning on the raw `timestamp` column would create one partition per
# distinct second (hundreds of tiny partitions/files for this dataset).
# Iceberg's days() transform keeps the partition count low while queries can
# still filter directly on `timestamp`.
(
    telecom_metrics_df.writeTo("local.db.telecom_metrics")
    .using("iceberg")
    .partitionedBy(days("timestamp"))
    .createOrReplace()
)

print("✅ Iceberg tables created in Colab!")

# Verify tables
spark.sql("SHOW TABLES IN local.db").show()

# Quick analytics optimized for Colab
print("\n📊 Colab Telecom Analytics:")
spark.sql("""
    SELECT 
        s.vendor,
        COUNT(DISTINCT s.site_id) as sites,
        ROUND(AVG(m.rssi_dbm), 2) as avg_rssi,
        ROUND(AVG(m.latency_ms), 2) as avg_latency,
        ROUND(AVG(m.drop_rate_percent), 2) as avg_drop_rate
    FROM local.db.telecom_sites s
    JOIN local.db.telecom_metrics m ON s.site_id = m.site_id
    GROUP BY s.vendor
    ORDER BY avg_rssi DESC
""").show()

# Time travel demo for Colab: list the snapshots recorded in the table's
# `snapshots` metadata table, oldest first.
print("\n🕐 Time Travel Demo:")
snapshots = spark.sql("SELECT snapshot_id, committed_at FROM local.db.telecom_metrics.snapshots ORDER BY committed_at").collect()
print(f"📸 Available snapshots: {len(snapshots)}")

for i, snapshot in enumerate(snapshots):
    print(f"   Snapshot {i+1}: {snapshot['snapshot_id']} at {snapshot['committed_at']}")

print("\n✅ Google Colab Iceberg Demo Completed!")
print("🔗 Share this notebook with your team for collaboration!")

# Optional: Stop Spark to free memory
# spark.stop()


In [None]:
# Basic Iceberg Operations - Colab Optimized
print("🔧 Basic Iceberg Operations (Colab)")
print("=" * 40)

# Each step is a (heading, SQL) pair; the loop below prints the heading and
# then shows the query result, in the order listed here:
#   1. Table information
#   2. Quick data exploration
#   3. Performance by vendor (Colab-friendly query)
basic_steps = [
    (
        "\n📋 Table Schema:",
        "DESCRIBE local.db.telecom_metrics",
    ),
    (
        "\n📊 Data Overview:",
        """
    SELECT 
        COUNT(*) as total_records,
        COUNT(DISTINCT site_id) as unique_sites,
        MIN(timestamp) as earliest_record,
        MAX(timestamp) as latest_record
    FROM local.db.telecom_metrics
""",
    ),
    (
        "\n🏢 Vendor Performance Summary:",
        """
    SELECT 
        s.vendor,
        COUNT(DISTINCT s.site_id) as sites,
        ROUND(AVG(m.rssi_dbm), 2) as avg_rssi,
        ROUND(AVG(m.latency_ms), 2) as avg_latency
    FROM local.db.telecom_sites s
    JOIN local.db.telecom_metrics m ON s.site_id = m.site_id
    GROUP BY s.vendor
    ORDER BY avg_rssi DESC
""",
    ),
]

for step_heading, step_sql in basic_steps:
    print(step_heading)
    spark.sql(step_sql).show()

print("✅ Basic operations completed in Colab!")


In [None]:
# Time Travel & Schema Evolution - Colab Demo
print("🕐 Time Travel & Schema Evolution (Colab)")
print("=" * 45)

# 1. Show current snapshots
print("\n📸 Current Snapshots:")
snapshots = spark.sql("SELECT snapshot_id, committed_at FROM local.db.telecom_metrics.snapshots ORDER BY committed_at")
snapshots.show()

# 2. Add new column for Colab demo
print("\n🔄 Schema Evolution Demo:")
# Guard the DDL so re-running this cell does not fail once the column exists.
if "signal_strength_category" not in spark.table("local.db.telecom_metrics").columns:
    spark.sql("ALTER TABLE local.db.telecom_metrics ADD COLUMN signal_strength_category STRING")
    print("   ✅ Added signal_strength_category column")
else:
    print("   ℹ️ signal_strength_category column already exists")

# 3. Insert data with new schema
print("\n📊 Adding categorized data...")
from pyspark.sql.functions import when, col

# Create sample data with categories
new_colab_records = []
for site in sites[:10]:  # Small dataset for Colab
    # float(...) turns numpy scalars into plain Python floats so the rows
    # contain only builtin types when handed to spark.createDataFrame.
    rssi = float(np.random.normal(loc=-75, scale=5))
    # Thresholds: above -70 dBm is "Excellent", above -80 dBm is "Good".
    category = "Excellent" if rssi > -70 else ("Good" if rssi > -80 else "Poor")

    new_colab_records.append({
        "timestamp": datetime.now(),
        "region": site["region"],
        "city": site["city"],
        "site_id": site["site_id"],
        "technology": site["technology"],
        "vendor": site["vendor"],
        "rssi_dbm": round(rssi, 2),
        "latency_ms": round(float(np.random.normal(loc=45, scale=10)), 2),
        "data_volume_mb": round(float(np.random.exponential(scale=5)), 2),
        "drop_rate_percent": round(float(np.random.beta(1, 200) * 100), 2),
        "cpu_usage_percent": round(float(np.clip(np.random.normal(loc=50, scale=10), 0, 100)), 2),
        "signal_strength_category": category
    })

new_colab_df = spark.createDataFrame(new_colab_records)
# Appending commits a new snapshot; rows written before the ALTER TABLE read
# back with NULL in signal_strength_category.
new_colab_df.write.format("iceberg").mode("append").saveAsTable("local.db.telecom_metrics")

print("✅ Categorized data added!")

# 4. Query with new schema
print("\n📊 Signal Strength Analysis:")
spark.sql("""
    SELECT 
        signal_strength_category,
        COUNT(*) as count,
        ROUND(AVG(rssi_dbm), 2) as avg_rssi
    FROM local.db.telecom_metrics 
    WHERE signal_strength_category IS NOT NULL
    GROUP BY signal_strength_category
    ORDER BY avg_rssi DESC
""").show()

# 5. Show updated snapshots
print("\n📸 Updated Snapshots:")
updated_snapshots = spark.sql("SELECT snapshot_id, committed_at FROM local.db.telecom_metrics.snapshots ORDER BY committed_at")
updated_snapshots.show()

print("✅ Time travel & schema evolution demo completed in Colab!")


In [None]:
# Colab Analytics & Best Practices
print("🚀 Colab Analytics & Best Practices")
print("=" * 40)

# --- SQL used in this cell -------------------------------------------------
# Per region/technology aggregates with a coverage label derived from avg RSSI.
network_analysis_sql = """
    SELECT 
        s.region,
        s.technology,
        COUNT(*) as measurements,
        ROUND(AVG(m.rssi_dbm), 2) as avg_signal,
        ROUND(AVG(m.latency_ms), 2) as avg_latency,
        CASE 
            WHEN AVG(m.rssi_dbm) > -75 THEN 'Good Coverage'
            ELSE 'Coverage Issues'
        END as coverage_status
    FROM local.db.telecom_sites s
    JOIN local.db.telecom_metrics m ON s.site_id = m.site_id
    GROUP BY s.region, s.technology
    ORDER BY avg_signal DESC
"""

# Per-vendor averages, ordered by vendor name, used as plotting input.
vendor_viz_sql = """
    SELECT 
        s.vendor,
        ROUND(AVG(m.rssi_dbm), 2) as avg_rssi,
        ROUND(AVG(m.latency_ms), 2) as avg_latency
    FROM local.db.telecom_sites s
    JOIN local.db.telecom_metrics m ON s.site_id = m.site_id
    GROUP BY s.vendor
    ORDER BY s.vendor
"""

best_practices_text = """
🔬 COLAB OPTIMIZATION:
   • Use smaller datasets for faster execution
   • Leverage Colab's free GPU/TPU when available
   • Save intermediate results to avoid re-computation
   • Use %%time magic commands to measure performance

📊 DATA VISUALIZATION:
   • Convert Spark DataFrames to Pandas for plotting
   • Use matplotlib, seaborn, or plotly for visualizations
   • Create interactive plots with widgets
   • Export plots to Google Drive

🤝 COLLABORATION:
   • Share notebooks via Google Drive links
   • Use comments and markdown for documentation
   • Version control with Git integration
   • Export to GitHub for team collaboration

💾 DATA PERSISTENCE:
   • Mount Google Drive for data persistence
   • Use Colab's built-in file system for temporary storage
   • Export results to CSV/Parquet for later use
   • Consider upgrading to Colab Pro for more resources
"""

# 1. Colab-optimized analytics
print("\n📊 Telecom Network Analysis:")
spark.sql(network_analysis_sql).show()

# 2. Simple visualization data prep
print("\n📈 Visualization Data (for plotting in Colab):")
viz_data = spark.sql(vendor_viz_sql)

# Convert to Pandas for easy plotting in Colab
viz_df = viz_data.toPandas()
print("📊 Data ready for matplotlib/seaborn plotting:")
print(viz_df)

# 3. Colab Best Practices
print("\n📚 Google Colab Best Practices:")
print(best_practices_text)

# 4. Final statistics: row count of the metrics table and of its snapshots
#    metadata table.
print("\n📈 Final Colab Demo Statistics:")
records_row = spark.sql("SELECT COUNT(*) as total FROM local.db.telecom_metrics").first()
snapshots_row = spark.sql("SELECT COUNT(*) as snapshots FROM local.db.telecom_metrics.snapshots").first()
total_records = records_row['total']
total_snapshots = snapshots_row['snapshots']

print(f"   📊 Total Records: {total_records:,}")
print(f"   📸 Total Snapshots: {total_snapshots}")
print(f"   🔬 Platform: Google Colab")
print(f"   ⚡ Spark Version: {spark.version}")

print("\n🎉 Google Colab Iceberg Demo Complete!")
print("🔗 Share this notebook with your team for collaborative learning!")

# Optional: Create a simple plot if matplotlib is available
try:
    import matplotlib.pyplot as plt

    # One bar chart per metric, drawn side by side: (column, title, y-label).
    vendor_panels = [
        ('avg_rssi', 'Average RSSI by Vendor', 'RSSI (dBm)'),
        ('avg_latency', 'Average Latency by Vendor', 'Latency (ms)'),
    ]

    plt.figure(figsize=(10, 6))
    for panel_pos, (metric_col, panel_title, y_label) in enumerate(vendor_panels, start=1):
        plt.subplot(1, 2, panel_pos)
        plt.bar(viz_df['vendor'], viz_df[metric_col])
        plt.title(panel_title)
        plt.ylabel(y_label)
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

    print("📊 Visualization created successfully!")

except ImportError:
    # matplotlib is optional; tell the user how to get it instead of failing.
    print("📊 Install matplotlib for visualizations: !pip install matplotlib")
