In [None]:
import os
import subprocess
import sys

print("🍎 macOS Apache Iceberg Setup")
print("=" * 40)

# Check for Java installation on macOS.
# stderr is redirected into the captured stdout so the whole output of
# `java -version` ends up in one stream.  stdout/stderr are spelled out
# explicitly because subprocess.run() raises ValueError when
# `capture_output=True` is combined with an explicit `stderr=` argument.
try:
    java_version = subprocess.run(
        ['java', '-version'],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    if java_version.returncode == 0:
        print("☕ Java is installed")
        # Set JAVA_HOME using macOS java_home utility
        java_home = subprocess.run(['/usr/libexec/java_home'], capture_output=True, text=True)
        if java_home.returncode == 0:
            os.environ["JAVA_HOME"] = java_home.stdout.strip()
            print(f"🏠 JAVA_HOME: {os.environ['JAVA_HOME']}")
        else:
            print("⚠️ Could not determine JAVA_HOME")
    else:
        # The `java` executable exists but exited with a non-zero status.
        print("❌ Java not found!")
        print("💡 Install Java using: brew install openjdk@8")
        print("   Then run: echo 'export PATH=\"/opt/homebrew/opt/openjdk@8/bin:$PATH\"' >> ~/.zshrc")
        sys.exit(1)
except FileNotFoundError:
    # Raised by subprocess.run when an executable cannot be found at all.
    print("❌ Java not found!")
    print("💡 Install Java using: brew install openjdk@8")
    sys.exit(1)

print("✅ Java setup completed!")


In [None]:
# Install Python packages for macOS
print("📦 Installing Python packages for macOS...")

%pip install -q pyspark==3.4.1
%pip install -q pyiceberg[s3fs]==0.5.1  
%pip install -q pandas>=2.0.0
%pip install -q numpy>=1.21.0
%pip install -q matplotlib seaborn

print("✅ Package installation completed!")

# Test imports
try:
    import pandas as pd
    import numpy as np
    import pyspark
    print(f"📊 Pandas: {pd.__version__}")
    print(f"🔢 NumPy: {np.__version__}")
    print(f"⚡ PySpark: {pyspark.__version__}")
except ImportError as e:
    print(f"❌ Import error: {e}")
    
print("🚀 macOS setup completed!")


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, sum, avg, count
import os

# Download Iceberg JAR for macOS
import subprocess

jar_url = "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/1.4.2/iceberg-spark-runtime-3.4_2.12-1.4.2.jar"
jar_path = "./iceberg-spark-runtime.jar"
warehouse_path = "./iceberg-warehouse"

# curl is used instead of `!wget`: wget is not part of a stock macOS install,
# and a failing `!` shell command does not raise, so Spark would start
# without the Iceberg runtime.  `check=True` turns a failed download into a
# CalledProcessError.  The file is written under a temporary name and renamed
# only after curl succeeded, so an interrupted download is never mistaken for
# a usable JAR on the next run.
if os.path.isfile(jar_path) and os.path.getsize(jar_path) > 0:
    print(f"📦 Iceberg JAR already present: {jar_path}")
else:
    print("📥 Downloading Iceberg JAR...")
    partial_jar_path = jar_path + ".part"
    subprocess.run(
        [
            "curl", "--fail", "--silent", "--show-error", "--location",
            "--output", partial_jar_path,
            jar_url,
        ],
        check=True,
    )
    os.replace(partial_jar_path, jar_path)

# Configure Spark with Iceberg for macOS.
# Two catalogs are configured: `spark_catalog` (Iceberg SparkSessionCatalog,
# type hive) and `local` (Iceberg SparkCatalog, type hadoop) whose warehouse
# is the local directory `warehouse_path`.
print("⚡ Initializing Spark with Iceberg on macOS...")
spark = SparkSession.builder \
    .appName("Iceberg Telecom Demo - macOS") \
    .config("spark.jars", jar_path) \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hadoop") \
    .config("spark.sql.catalog.local.warehouse", warehouse_path) \
    .config("spark.sql.warehouse.dir", warehouse_path) \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .getOrCreate()

# Set log level to reduce noise
spark.sparkContext.setLogLevel("WARN")

print(f"✅ Spark {spark.version} with Iceberg initialized!")
print(f"📁 Warehouse: {warehouse_path}")
print(f"☕ Java Home: {os.environ.get('JAVA_HOME', 'Not set')}")


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Telecom data generation for macOS
print("📡 Generating Telecom Data for macOS...")

# Configuration
SEED = 42  # fixed seed so a fresh "Restart & Run All" regenerates the same data
chunk_size_seconds = 60
site_count = 100
start_time = datetime(2023, 1, 1)
demo_chunks = 50

regions = ["North", "South", "East", "West"]
cities = {
    "North": ["Dendam", "Rondburg"],
    "South": ["Schieveste"],
    "East": ["Schipstad", "Dort"],
    "West": ["Damstad"],
}
technologies = ["6G", "7G"]
vendors = ["Ericia", "Noson", "Weihu"]

# Seed both random sources used by this notebook: the stdlib `random` module
# drives the site metadata below, NumPy's global generator drives the metrics.
np.random.seed(SEED)
random.seed(SEED)

# Generate site metadata: one dict per site with a sequential, zero-padded
# site_id and a randomly drawn region, city (within that region), technology
# and vendor.
sites = []
for i in range(site_count):
    region = random.choice(regions)
    city = random.choice(cities[region])
    tech = random.choices(technologies, weights=[0.5, 0.5])[0]
    vendor = random.choices(vendors, weights=[0.4, 0.4, 0.2])[0]
    site_id = f"SITE_{i:05d}"
    sites.append({
        "site_id": site_id,
        "region": region,
        "city": city,
        "technology": tech,
        "vendor": vendor,
    })

def generate_telecom_data_chunk(second_offset, site_list=None, base_time=None):
    """Generate one metrics record per site for a single point in time.

    Args:
        second_offset: Number of seconds added to the base time to obtain the
            timestamp shared by all records of this chunk.
        site_list: Iterable of site dicts with the keys ``site_id``,
            ``region``, ``city``, ``technology`` and ``vendor``.  Defaults to
            the notebook-level ``sites`` list.
        base_time: ``datetime`` the offset is applied to.  Defaults to the
            notebook-level ``start_time``.

    Returns:
        A list of dicts, one per site.  All metric values are native Python
        ``float`` objects rounded to two decimals; NumPy scalars are converted
        explicitly so the records can be passed to ``spark.createDataFrame``
        without relying on NumPy-scalar support in schema inference or
        serialization.
    """
    if site_list is None:
        site_list = sites
    if base_time is None:
        base_time = start_time

    timestamp = base_time + timedelta(seconds=second_offset)
    records = []
    for site in site_list:
        region = site["region"]
        city = site["city"]
        tech = site["technology"]
        vendor = site["vendor"]

        # Signal strength (RSSI): technology sets the baseline, vendor shifts it.
        base_rssi = -75 if tech == "7G" else -85
        if vendor == "Noson":
            base_rssi += 2
        if vendor == "Weihu":
            base_rssi -= 3
        rssi = float(np.random.normal(loc=base_rssi, scale=3))

        # Latency: West / Damstad sites get a fixed high baseline regardless
        # of technology.
        if region == "West" or city in ["Damstad"]:
            base_latency = 80
        else:
            base_latency = 35 if tech == "7G" else 60
        latency = float(np.random.normal(loc=base_latency, scale=8))

        # Data volume (in MB)
        data_volume = float(np.random.exponential(scale=6))

        # CPU usage, clipped to the 0-100 range.
        base_cpu = 48 + (7 if tech == "7G" else 0) + (5 if vendor == "Weihu" else 0)
        cpu_usage = float(np.clip(np.random.normal(loc=base_cpu, scale=9), 0, 100))

        # Drop rate - use built-in min function.  The fraction is capped at
        # 1.0 before being scaled to a percentage.
        city_penalty = 0.04 if city in ["Damstad", "Schieveste"] else 0.01
        drop_fraction = 0.0012 * cpu_usage + city_penalty + float(np.random.beta(1, 180))
        drop_rate = min(1.0, drop_fraction) * 100

        records.append({
            "timestamp": timestamp,
            "region": region,
            "city": city,
            "site_id": site["site_id"],
            "technology": tech,
            "vendor": vendor,
            "rssi_dbm": round(rssi, 2),
            "latency_ms": round(latency, 2),
            "data_volume_mb": round(data_volume, 2),
            "drop_rate_percent": round(drop_rate, 2),
            "cpu_usage_percent": round(cpu_usage, 2),
        })
    return records

# Generate telecom time series data
print("⚡ Generating telecom metrics...")

# One call per second: `demo_chunks` chunks of `chunk_size_seconds` seconds
# each, visited in ascending offset order, with every call contributing one
# record per site.
total_seconds = demo_chunks * chunk_size_seconds
all_records = [
    record
    for offset_seconds in range(total_seconds)
    for record in generate_telecom_data_chunk(offset_seconds)
]

# Convert to Spark DataFrames
telecom_metrics_df = spark.createDataFrame(all_records)
sites_df = spark.createDataFrame(sites)

print("✅ Telecom data generated!")
site_total = sites_df.count()
print(f"   📡 Sites: {site_total:,}")
metric_total = telecom_metrics_df.count()
print(f"   📊 Metrics: {metric_total:,}")

# Show sample data
for sample_df in (telecom_metrics_df, sites_df):
    sample_df.show(5)


In [None]:
# Create Iceberg Tables for macOS
from pyspark.sql.functions import days

print("🏗️ Creating Iceberg tables...")

# Make sure the target namespace exists before writing into it.
spark.sql("CREATE NAMESPACE IF NOT EXISTS local.db")

# Create telecom sites table
sites_df.write \
    .format("iceberg") \
    .mode("overwrite") \
    .saveAsTable("local.db.telecom_sites")

# Create telecom metrics table partitioned by day of `timestamp`.
# The metrics carry one distinct timestamp per generated second, so an
# identity partition on the raw column would create one partition (and at
# least one tiny data file) per second.  The `days` transform groups rows by
# calendar day instead; queries still filter on `timestamp` directly.
telecom_metrics_df.writeTo("local.db.telecom_metrics") \
    .using("iceberg") \
    .partitionedBy(days("timestamp")) \
    .createOrReplace()

print("✅ Iceberg tables created!")

# Verify tables
spark.sql("SHOW TABLES IN local.db").show()

# Basic analytics: site count and average RSSI / latency per region and
# technology.
print("\n📊 Telecom Analytics:")
spark.sql("""
    SELECT
        s.region,
        s.technology,
        COUNT(DISTINCT s.site_id) as sites,
        ROUND(AVG(m.rssi_dbm), 2) as avg_rssi,
        ROUND(AVG(m.latency_ms), 2) as avg_latency
    FROM local.db.telecom_sites s
    JOIN local.db.telecom_metrics m ON s.site_id = m.site_id
    GROUP BY s.region, s.technology
    ORDER BY s.region
""").show()

print("✅ macOS Iceberg Demo Completed!")


In [None]:
# Basic Iceberg Operations for Telecom Data
print("🔧 Basic Iceberg Operations")
print("=" * 40)

# 1. Table Information
print("\n📋 Table Information:")
table_description = spark.sql("DESCRIBE EXTENDED local.db.telecom_metrics")
table_description.show(truncate=False)

# 2. Show table properties
print("\n⚙️ Table Properties:")
table_properties = spark.sql("SHOW TBLPROPERTIES local.db.telecom_metrics")
table_properties.show()

# 3. Count records by partition (timestamp): rows and distinct sites per
#    calendar date, first ten dates only.
records_by_date_sql = """
    SELECT 
        DATE(timestamp) as date,
        COUNT(*) as records,
        COUNT(DISTINCT site_id) as unique_sites
    FROM local.db.telecom_metrics 
    GROUP BY DATE(timestamp)
    ORDER BY date
    LIMIT 10
"""
print("\n📊 Records by Time Period:")
spark.sql(records_by_date_sql).show()

# 4. Basic filtering and aggregation: only measurements with RSSI above
#    -70 dBm are averaged per site; the ten best averages are shown.
strong_signal_sites_sql = """
    SELECT 
        s.site_id,
        s.region,
        s.vendor,
        ROUND(AVG(m.rssi_dbm), 2) as avg_rssi,
        ROUND(AVG(m.latency_ms), 2) as avg_latency
    FROM local.db.telecom_sites s
    JOIN local.db.telecom_metrics m ON s.site_id = m.site_id
    WHERE m.rssi_dbm > -70
    GROUP BY s.site_id, s.region, s.vendor
    ORDER BY avg_rssi DESC
    LIMIT 10
"""
print("\n🔍 High-Performance Sites (RSSI > -70 dBm):")
spark.sql(strong_signal_sites_sql).show()

print("✅ Basic operations completed!")


In [None]:
# Time Travel and Snapshots for Telecom Analysis
print("🕐 Time Travel & Snapshots")
print("=" * 40)

# 1. Show all snapshots
print("\n📸 Available Snapshots:")
snapshots_df = spark.sql("SELECT snapshot_id, committed_at, summary FROM local.db.telecom_metrics.snapshots ORDER BY committed_at")
snapshots_df.show(truncate=False)

# Store snapshot info for later use (taken BEFORE the append below, ordered
# oldest to newest).
snapshots = snapshots_df.collect()
print(f"Total snapshots: {len(snapshots)}")

# 2. Add some new data to create a new snapshot
print("\n➕ Adding new telecom data...")
from datetime import datetime, timedelta
from pyspark.sql.functions import lit

# Create new data with current timestamp.  NumPy results are converted to
# native floats so the records do not depend on NumPy-scalar support in
# createDataFrame.
new_timestamp = datetime.now()
new_records = []
for site in sites[:10]:  # Just first 10 sites for demo
    new_records.append({
        "timestamp": new_timestamp,
        "region": site["region"],
        "city": site["city"],
        "site_id": site["site_id"],
        "technology": site["technology"],
        "vendor": site["vendor"],
        "rssi_dbm": round(float(np.random.normal(loc=-75, scale=3)), 2),
        "latency_ms": round(float(np.random.normal(loc=40, scale=8)), 2),
        "data_volume_mb": round(float(np.random.exponential(scale=6)), 2),
        "drop_rate_percent": round(float(np.random.beta(1, 200)) * 100, 2),
        "cpu_usage_percent": round(float(np.clip(np.random.normal(loc=50, scale=10), 0, 100)), 2),
    })

new_data_df = spark.createDataFrame(new_records)
new_data_df.write.format("iceberg").mode("append").saveAsTable("local.db.telecom_metrics")

print("✅ New data added!")

# 3. Show updated snapshots
print("\n📸 Updated Snapshots:")
updated_snapshots = spark.sql("SELECT snapshot_id, committed_at, summary FROM local.db.telecom_metrics.snapshots ORDER BY committed_at")
updated_snapshots.show(truncate=False)

# 4. Time travel query - compare current vs previous snapshot
if len(snapshots) > 0:
    # `snapshots` is sorted by commit time, so the LAST entry is the table
    # state immediately before the append above.  Using the first entry would
    # compare against the oldest snapshot and overstate "new records" whenever
    # the table already had more than one snapshot (e.g. on a re-run).
    previous_snapshot_id = snapshots[-1]['snapshot_id']

    print(f"\n🔍 Comparing data between snapshots:")
    print(f"Previous snapshot: {previous_snapshot_id}")

    # Query previous snapshot
    previous_count = spark.sql(f"""
        SELECT COUNT(*) as count
        FROM local.db.telecom_metrics
        FOR SYSTEM_VERSION AS OF {previous_snapshot_id}
    """).collect()[0]['count']

    # Query current data
    current_count = spark.sql("SELECT COUNT(*) as count FROM local.db.telecom_metrics").collect()[0]['count']

    print(f"Previous snapshot records: {previous_count:,}")
    print(f"Current records: {current_count:,}")
    print(f"New records added: {current_count - previous_count:,}")

print("\n✅ Time travel operations completed!")


In [None]:
# Schema Evolution for Telecom Networks
print("🔄 Schema Evolution")
print("=" * 40)

# 1. Show current schema
print("\n📋 Current Schema:")
spark.sql("DESCRIBE local.db.telecom_metrics").show()

# 2. Add new columns for enhanced telecom monitoring
print("\n➕ Adding new telecom monitoring columns...")

# Columns to add: network quality score, 5G compatibility flag, throughput.
# Columns that already exist are skipped so that re-running this cell does
# not fail on a duplicate ADD COLUMN.
new_columns = [
    ("network_quality_score", "DOUBLE"),
    ("is_5g_compatible", "BOOLEAN"),
    ("throughput_mbps", "DOUBLE"),
]
existing_columns = set(spark.table("local.db.telecom_metrics").columns)
for column_name, column_type in new_columns:
    if column_name in existing_columns:
        print(f"   ℹ️ {column_name} column already exists")
        continue
    spark.sql(f"ALTER TABLE local.db.telecom_metrics ADD COLUMN {column_name} {column_type}").collect()
    print(f"   ✅ Added {column_name} column")

# 3. Show updated schema
print("\n📋 Updated Schema:")
spark.sql("DESCRIBE local.db.telecom_metrics").show()

# 4. Insert data with new schema
print("\n📊 Inserting data with new schema...")
from pyspark.sql.functions import when, col, lit

# Create enhanced data with new columns.  A single timestamp is taken for the
# whole batch so that all rows of this insert share it.
batch_timestamp = datetime.now()
enhanced_records = []
for site in sites[:20]:  # Use first 20 sites
    # Calculate network quality score based on RSSI and latency
    rssi = float(np.random.normal(loc=-75, scale=3))
    latency = float(np.random.normal(loc=40, scale=8))

    # Quality score algorithm (0-100).  The bounds are float literals so the
    # clamped value is always a float: an int 0 from `max(0, ...)` mixed with
    # float scores makes the column's inferred type inconsistent across rows.
    quality_score = max(0.0, min(100.0, 100.0 - abs(rssi + 50) * 2 - latency))

    # 5G compatibility based on technology and vendor
    is_5g = site["technology"] == "7G" and site["vendor"] in ["Ericia", "Noson"]

    # Throughput based on technology
    throughput = float(np.random.normal(
        loc=150 if site["technology"] == "7G" else 100,
        scale=20
    ))

    enhanced_records.append({
        "timestamp": batch_timestamp,
        "region": site["region"],
        "city": site["city"],
        "site_id": site["site_id"],
        "technology": site["technology"],
        "vendor": site["vendor"],
        "rssi_dbm": round(rssi, 2),
        "latency_ms": round(latency, 2),
        "data_volume_mb": round(float(np.random.exponential(scale=6)), 2),
        "drop_rate_percent": round(float(np.random.beta(1, 200)) * 100, 2),
        "cpu_usage_percent": round(float(np.clip(np.random.normal(loc=50, scale=10), 0, 100)), 2),
        "network_quality_score": round(quality_score, 2),
        "is_5g_compatible": is_5g,
        "throughput_mbps": round(throughput, 2)
    })

enhanced_df = spark.createDataFrame(enhanced_records)
enhanced_df.write.format("iceberg").mode("append").saveAsTable("local.db.telecom_metrics")

print("✅ Enhanced data inserted!")

# 5. Query with new columns (rows written before the schema change have NULL
#    in the new columns and are filtered out).
print("\n📊 Network Quality Analysis:")
spark.sql("""
    SELECT
        s.vendor,
        COUNT(*) as measurements,
        ROUND(AVG(m.network_quality_score), 2) as avg_quality_score,
        ROUND(AVG(m.throughput_mbps), 2) as avg_throughput,
        SUM(CASE WHEN m.is_5g_compatible THEN 1 ELSE 0 END) as compatible_5g_sites
    FROM local.db.telecom_sites s
    JOIN local.db.telecom_metrics m ON s.site_id = m.site_id
    WHERE m.network_quality_score IS NOT NULL
    GROUP BY s.vendor
    ORDER BY avg_quality_score DESC
""").show()

# 6. Show schema history
print("\n📜 Schema History:")
try:
    spark.sql("SELECT * FROM local.db.telecom_metrics.history LIMIT 5").show(truncate=False)
except Exception as e:
    # Best effort: keep the demo running, but report what actually failed
    # instead of swallowing every exception (including KeyboardInterrupt).
    print(f"   Schema history not available in this Iceberg version: {e}")

print("\n✅ Schema evolution completed!")


In [None]:
# Advanced Analytics & Best Practices for Telecom
print("🚀 Advanced Analytics & Best Practices")
print("=" * 50)

# 1. Performance Optimization - Table Maintenance
print("\n⚡ Performance Optimization:")

# Count the rows of the table's `files` metadata table before compaction.
file_count_rows = spark.sql("SELECT COUNT(*) as file_count FROM local.db.telecom_metrics.files").collect()
files_before = file_count_rows[0]['file_count']
print(f"   Files before optimization: {files_before}")

# Optimize table (rewrite small files).  Any failure of the procedure call is
# reported and the notebook carries on.
try:
    compaction_call = spark.sql("CALL local.system.rewrite_data_files('local.db.telecom_metrics')")
    compaction_call.collect()
except Exception as e:
    print(f"   ⚠️ Optimization not available: {str(e)}")
else:
    print("   ✅ Data files optimized")

# 2. Advanced Telecom Analytics
print("\n📊 Advanced Telecom Analytics:")

# Network Performance Trend Analysis: per date and technology, the number of
# distinct sites plus average signal, latency, drop rate and total volume.
performance_trends_sql = """
    SELECT 
        DATE(timestamp) as date,
        s.technology,
        COUNT(DISTINCT s.site_id) as active_sites,
        ROUND(AVG(m.rssi_dbm), 2) as avg_signal_strength,
        ROUND(AVG(m.latency_ms), 2) as avg_latency,
        ROUND(AVG(m.drop_rate_percent), 2) as avg_drop_rate,
        ROUND(SUM(m.data_volume_mb)/1024, 2) as total_data_gb
    FROM local.db.telecom_sites s
    JOIN local.db.telecom_metrics m ON s.site_id = m.site_id
    GROUP BY DATE(timestamp), s.technology
    ORDER BY date, s.technology
    LIMIT 20
"""
print("\n📈 Network Performance Trends:")
spark.sql(performance_trends_sql).show()

# Vendor Performance Comparison: averages per vendor and technology, with a
# rating derived from the average RSSI and latency thresholds in the CASE.
vendor_comparison_sql = """
    SELECT 
        s.vendor,
        s.technology,
        COUNT(DISTINCT s.site_id) as sites,
        ROUND(AVG(m.rssi_dbm), 2) as avg_rssi,
        ROUND(AVG(m.latency_ms), 2) as avg_latency,
        ROUND(AVG(m.cpu_usage_percent), 2) as avg_cpu,
        CASE 
            WHEN AVG(m.rssi_dbm) > -70 AND AVG(m.latency_ms) < 50 THEN 'Excellent'
            WHEN AVG(m.rssi_dbm) > -80 AND AVG(m.latency_ms) < 70 THEN 'Good'
            ELSE 'Needs Improvement'
        END as performance_rating
    FROM local.db.telecom_sites s
    JOIN local.db.telecom_metrics m ON s.site_id = m.site_id
    GROUP BY s.vendor, s.technology
    ORDER BY avg_rssi DESC, avg_latency ASC
"""
print("\n🏢 Vendor Performance Comparison:")
spark.sql(vendor_comparison_sql).show()

# 3. Anomaly Detection: the ten most recent measurements that cross any of
#    the four thresholds; the CASE labels a row with the first threshold it
#    matches, in the order listed.
anomaly_detection_sql = """
    SELECT 
        s.site_id,
        s.region,
        s.vendor,
        m.timestamp,
        m.rssi_dbm,
        m.latency_ms,
        m.drop_rate_percent,
        CASE 
            WHEN m.rssi_dbm < -90 THEN 'Poor Signal'
            WHEN m.latency_ms > 100 THEN 'High Latency'
            WHEN m.drop_rate_percent > 5 THEN 'High Drop Rate'
            WHEN m.cpu_usage_percent > 90 THEN 'High CPU'
            ELSE 'Normal'
        END as anomaly_type
    FROM local.db.telecom_sites s
    JOIN local.db.telecom_metrics m ON s.site_id = m.site_id
    WHERE m.rssi_dbm < -90 
       OR m.latency_ms > 100 
       OR m.drop_rate_percent > 5 
       OR m.cpu_usage_percent > 90
    ORDER BY m.timestamp DESC
    LIMIT 10
"""
print("\n🚨 Network Anomaly Detection:")
spark.sql(anomaly_detection_sql).show()

# 4. Best Practices Summary
# The guidance text is kept in a named constant and printed after its heading.
best_practices_text = """
🎯 PARTITIONING STRATEGY:
   • Partition by timestamp (hourly/daily) for time-series queries
   • Consider region partitioning for geographically distributed analysis
   • Avoid over-partitioning (< 1GB per partition is too small)

📊 SCHEMA DESIGN:
   • Use appropriate data types (DOUBLE for measurements, BOOLEAN for flags)
   • Include metadata columns (site_id, region, vendor) for joins
   • Plan for schema evolution (new KPIs, technologies)

⚡ PERFORMANCE OPTIMIZATION:
   • Regular table maintenance (file compaction)
   • Use column pruning in queries
   • Leverage Iceberg's hidden partitioning
   • Monitor query patterns and optimize accordingly

🔒 DATA GOVERNANCE:
   • Implement time travel for audit trails
   • Use snapshots for backup and recovery
   • Track schema changes for compliance
   • Set up data quality checks

🚀 OPERATIONAL EXCELLENCE:
   • Monitor table growth and file sizes
   • Automate maintenance tasks
   • Use appropriate file formats (Parquet recommended)
   • Implement proper access controls
"""
print("\n📚 Telecom Data Lake Best Practices:")
print(best_practices_text)

# 5. Table Statistics
print("\n📈 Final Table Statistics:")

# Each query is a single-row aggregate; the row is collected first and the
# named field is read from it afterwards.
record_count_rows = spark.sql("SELECT COUNT(*) as total FROM local.db.telecom_metrics").collect()
total_records = record_count_rows[0]['total']

site_count_rows = spark.sql("SELECT COUNT(DISTINCT site_id) as sites FROM local.db.telecom_metrics").collect()
total_sites = site_count_rows[0]['sites']

date_range_rows = spark.sql("SELECT MIN(timestamp) as min_date, MAX(timestamp) as max_date FROM local.db.telecom_metrics").collect()
date_range = date_range_rows[0]

print(f"   📊 Total Records: {total_records:,}")
print(f"   📡 Unique Sites: {total_sites:,}")
earliest, latest = date_range['min_date'], date_range['max_date']
print(f"   📅 Date Range: {earliest} to {latest}")

# Show final snapshots: number of rows in the table's `snapshots` metadata table.
snapshot_count_rows = spark.sql("SELECT COUNT(*) as snapshot_count FROM local.db.telecom_metrics.snapshots").collect()
final_snapshots = snapshot_count_rows[0]['snapshot_count']
print(f"   📸 Total Snapshots: {final_snapshots}")

print("\n🎉 macOS Apache Iceberg Telecom Demo Complete!")
print("   Ready for enterprise telecom data lake deployment!")
