# Apache Iceberg for Telecom Enterprises (Windows)
## A Complete Guide with PySpark & Telecom Time Series Data

This notebook demonstrates how to use Apache Iceberg in telecom enterprise environments using PySpark on **Windows**.

### Key Features:
- **ACID transactions** for reliable telecom data operations
- **Schema evolution** for adapting to new network technologies
- **Time travel** for historical network performance analysis
- **Hidden partitioning** for optimal time-series data queries
- **Data compaction** for efficient storage of large telecom datasets

### Windows Requirements:
- Java 8+ (install via [Oracle JDK](https://www.oracle.com/java/technologies/downloads/) or [OpenJDK](https://adoptium.net/))
- Python 3.8+ (install via [Python.org](https://www.python.org/downloads/windows/) or [Anaconda](https://www.anaconda.com/products/distribution))
- Windows 10/11 or Windows Server 2016+
- PowerShell or Command Prompt access
- Sufficient disk space for demo data

### Windows-Specific Features:
- Automatic Java detection via `JAVA_HOME`, common installation directories, and `PATH`
- PowerShell and CMD compatibility
- Windows path handling (backslashes)
- Conda/pip environment management
- Windows Defender compatibility

**Note:** This notebook is optimized specifically for Windows environments.

## 1. Windows Environment Setup

In [None]:
import os
import subprocess
import sys
import platform

# Banner for the environment-setup cell.
print("🪟 Windows Apache Iceberg Setup")
print("=" * 40)

# Warn (but keep going) when the host operating system is not Windows.
running_on_windows = platform.system() == "Windows"
if not running_on_windows:
    for warning_line in (
        "⚠️ This notebook is optimized for Windows",
        "💡 Consider using the macOS or Linux version for your platform",
    ):
        print(warning_line)

# Report the OS build and interpreter release that the notebook runs on.
os_description = platform.platform()
python_release = platform.python_version()
print(f"🖥️ Windows Version: {os_description}")
print(f"🐍 Python Version: {python_release}")

# Check for Java installation on Windows
def find_java_windows():
    """Locate candidate Java installation directories on Windows.

    Three sources are consulted, in priority order:

    1. The ``JAVA_HOME`` environment variable.
    2. A fixed list of common vendor installation directories.
    3. The first ``java`` executable found on ``PATH``.

    For sources 1 and 2 a directory is only reported when it contains
    ``bin/java.exe``.  For source 3 the reported directory is the
    grandparent of the executable that was found.

    Returns:
        list[str]: Unique candidate Java home directories in the priority
        order above, so that element 0 is the preferred installation.
        The list is empty when nothing was found.
    """
    import shutil  # local import: only this function needs it

    def _has_java_exe(home):
        """Return True when *home* contains ``bin/java.exe``."""
        return os.path.isfile(os.path.join(home, 'bin', 'java.exe'))

    java_paths = []

    # Method 1: Check JAVA_HOME environment variable
    java_home = os.environ.get('JAVA_HOME')
    if java_home and _has_java_exe(java_home):
        java_paths.append(java_home)

    # Method 2: Check common installation paths
    common_paths = [
        r"C:\Program Files\Java",
        r"C:\Program Files (x86)\Java",
        r"C:\Program Files\Eclipse Adoptium",
        r"C:\Program Files\Microsoft\jdk",
        r"C:\Program Files\Zulu\zulu-8",
    ]

    for base_path in common_paths:
        if not os.path.isdir(base_path):
            continue
        # Some entries (the Zulu one) name a JDK home directly rather than a
        # folder of JDKs, so test the directory itself before its children.
        if _has_java_exe(base_path):
            java_paths.append(base_path)
        try:
            # Sorted so the result does not depend on directory listing order.
            children = sorted(os.listdir(base_path))
        except OSError:
            # Unreadable directory (permissions, race with an uninstall):
            # skip it instead of aborting the whole search.
            continue
        for item in children:
            potential_java_home = os.path.join(base_path, item)
            if _has_java_exe(potential_java_home):
                java_paths.append(potential_java_home)

    # Method 3: Try to find java.exe in PATH
    java_exe_path = shutil.which('java')
    if java_exe_path:
        # <home>/bin/java(.exe) -> <home>
        java_paths.append(os.path.dirname(os.path.dirname(java_exe_path)))

    # Remove duplicates while keeping the priority order (a set would make
    # the "first" entry arbitrary).
    return list(dict.fromkeys(java_paths))

# Find and configure Java
print("\n☕ Checking Java installation...")
try:
    # Test if java command works.
    # `java -version` prints its banner on stderr, so stderr is folded into
    # the captured stdout.  capture_output=True must not be combined with an
    # explicit stderr= argument (subprocess.run raises ValueError), hence the
    # explicit stdout/stderr pipes.  The command runs without a shell so that
    # a missing executable raises FileNotFoundError, handled below.
    java_version = subprocess.run(
        ['java', '-version'],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    if java_version.returncode == 0:
        print("✅ Java is accessible via PATH")

        # Find Java Home
        java_homes = find_java_windows()
        if java_homes:
            # Use the first valid Java installation
            selected_java_home = java_homes[0]
            os.environ["JAVA_HOME"] = selected_java_home
            print(f"🏠 JAVA_HOME set to: {selected_java_home}")

            # Show Java version (first two non-empty banner lines)
            version_lines = java_version.stdout.split('\n')
            for line in version_lines[:2]:
                if line.strip():
                    print(f"📋 {line.strip()}")
        else:
            print("⚠️ Could not determine JAVA_HOME automatically")
            print("💡 Please set JAVA_HOME environment variable manually")
    else:
        # The executable was found but exited with an error status.
        print("❌ Java not found in PATH!")
        print("\n💡 Install Java on Windows:")
        print("   Option 1: Download from https://adoptium.net/ (Recommended)")
        print("   Option 2: Download from https://www.oracle.com/java/technologies/downloads/")
        print("   Option 3: Use Chocolatey: choco install openjdk")
        print("   Option 4: Use winget: winget install Microsoft.OpenJDK.11")
        print("\n   After installation, restart your terminal/Jupyter and run this cell again.")
        sys.exit(1)

except FileNotFoundError:
    # No `java` executable could be launched at all.
    print("❌ Java command not found!")
    print("\n💡 Install Java on Windows:")
    print("   1. Download OpenJDK from https://adoptium.net/")
    print("   2. Run the installer (it will set up PATH automatically)")
    print("   3. Restart your terminal/Jupyter")
    print("   4. Run this cell again")
    sys.exit(1)

print("\n✅ Java setup completed for Windows!")

In [None]:
# Install Python packages for Windows
print("📦 Installing Python packages for Windows...")
print("🔄 This may take a few minutes on Windows...")

# Windows-specific pip installation with timeout handling
%pip install -q --timeout=300 pyspark==3.4.1
%pip install -q --timeout=300 pyiceberg[s3fs]==0.5.1  
%pip install -q --timeout=300 pandas>=2.0.0
%pip install -q --timeout=300 numpy>=1.21.0
%pip install -q --timeout=300 matplotlib seaborn

# Windows-specific: Install additional utilities
%pip install -q --timeout=300 requests urllib3

print("✅ Package installation completed!")

# Test imports with Windows-specific error handling
print("\n🧪 Testing package imports...")
try:
    import pandas as pd
    import numpy as np
    import pyspark
    import requests
    
    print(f"📊 Pandas: {pd.__version__}")
    print(f"🔢 NumPy: {np.__version__}")
    print(f"⚡ PySpark: {pyspark.__version__}")
    print(f"🌐 Requests: {requests.__version__}")
    
except ImportError as e:
    print(f"❌ Import error: {e}")
    print("💡 Try restarting the kernel and running the cell again")
    print("💡 On Windows, some packages may require Visual C++ Build Tools")
    
print("\n🚀 Windows package setup completed!")

In [None]:
# Creating Iceberg Tables for Windows
# NOTE: this cell uses `spark`, `sites_df` and `telecom_metrics_df`, which are
# created by the "Spark Configuration with Iceberg" and "Generating Synthetic
# Telecom Time Series Data" cells further down in this notebook - run those
# cells first.
from pyspark.sql.functions import hours

print("🏗️ Creating Iceberg Tables for Windows Environment")
print("=" * 50)

# Create database/namespace in Iceberg
spark.sql("CREATE NAMESPACE IF NOT EXISTS local.db")
print("✅ Created database namespace")

# Create Sites table (reference data)
print("\n🏢 Creating Sites table...")
sites_df.write \
    .mode("overwrite") \
    .format("iceberg") \
    .saveAsTable("local.db.telecom_sites")

print("✅ Sites table created")

# Create Metrics table with hidden, time-based partitioning.
# The table is partitioned by the hour of `timestamp` (a partition transform)
# rather than by the raw timestamp value: the generated data has one distinct
# timestamp per second, so identity partitioning would create one partition
# per second.  The `write.*` settings are table properties, so they are set
# with tableProperty() instead of being passed as per-write options.
print("\n📊 Creating Metrics table with partitioning...")
telecom_metrics_df.writeTo("local.db.telecom_metrics") \
    .using("iceberg") \
    .partitionedBy(hours("timestamp")) \
    .tableProperty("write.spark.fanout.enabled", "true") \
    .tableProperty("write.metadata.compression-codec", "gzip") \
    .createOrReplace()

print("✅ Metrics table created with time partitioning")

# Verify tables
print("\n📋 Available Tables:")
spark.sql("SHOW TABLES IN local.db").show()

# Initial analytics: per-vendor averages over all measurements
print("\n📊 Windows Telecom Analytics:")
spark.sql("""
    SELECT
        s.vendor,
        COUNT(DISTINCT s.site_id) as sites,
        ROUND(AVG(m.rssi_dbm), 2) as avg_rssi,
        ROUND(AVG(m.latency_ms), 2) as avg_latency,
        ROUND(AVG(m.drop_rate_percent), 2) as avg_drop_rate
    FROM local.db.telecom_sites s
    JOIN local.db.telecom_metrics m ON s.site_id = m.site_id
    GROUP BY s.vendor
    ORDER BY avg_rssi DESC
""").show()

print("\n🕐 Time Travel Demo:")
snapshots = spark.sql("SELECT snapshot_id, committed_at FROM local.db.telecom_metrics.snapshots ORDER BY committed_at")
# Collect once and reuse the rows for both the count and the listing, instead
# of running the metadata query twice (count() followed by collect()).
snapshot_rows = snapshots.collect()
snapshot_count = len(snapshot_rows)
print(f"📸 Available snapshots: {snapshot_count}")
for row in snapshot_rows:
    print(f"   Snapshot: {row.snapshot_id} at {row.committed_at}")

print("\n✅ Windows Iceberg setup completed!")


In [None]:
# Basic Iceberg Operations - Windows Optimized
print("🔧 Basic Iceberg Operations (Windows)")
print("=" * 40)

# Steps 1-2: schema and table properties are single-statement commands.
print("\n📋 Table Schema:")
schema_listing = spark.sql("DESCRIBE local.db.telecom_metrics")
schema_listing.show()

print("\n⚙️ Table Properties:")
props = spark.sql("SHOW TBLPROPERTIES local.db.telecom_metrics")
props.show(truncate=False)

# Steps 3-6: (heading, query) pairs, executed and displayed in this order:
# data overview, per-region averages, per-technology comparison, and the
# top 10 site/region groups among measurements with rssi_dbm above -70.
exploration_steps = [
    ("\n📊 Data Overview:", """
    SELECT
        COUNT(*) as total_records,
        COUNT(DISTINCT site_id) as unique_sites,
        MIN(timestamp) as earliest_record,
        MAX(timestamp) as latest_record
    FROM local.db.telecom_metrics
"""),
    ("\n🌍 Regional Performance Analysis:", """
    SELECT
        region,
        COUNT(*) as measurements,
        ROUND(AVG(rssi_dbm), 2) as avg_rssi,
        ROUND(AVG(latency_ms), 2) as avg_latency,
        ROUND(AVG(cpu_usage_percent), 2) as avg_cpu
    FROM local.db.telecom_metrics
    GROUP BY region
    ORDER BY avg_rssi DESC
"""),
    ("\n📡 Technology Performance:", """
    SELECT
        technology,
        COUNT(*) as measurements,
        ROUND(AVG(rssi_dbm), 2) as avg_rssi,
        ROUND(AVG(latency_ms), 2) as avg_latency,
        ROUND(MAX(drop_rate_percent), 2) as max_drop_rate
    FROM local.db.telecom_metrics
    GROUP BY technology
    ORDER BY technology
"""),
    ("\n🔍 Filtering High-Performance Sites:", """
    SELECT site_id, region, AVG(rssi_dbm) as avg_rssi
    FROM local.db.telecom_metrics
    WHERE rssi_dbm > -70
    GROUP BY site_id, region
    ORDER BY avg_rssi DESC
    LIMIT 10
"""),
]

for heading, query in exploration_steps:
    print(heading)
    spark.sql(query).show()

print("✅ Basic operations completed successfully on Windows!")


In [None]:
# Time Travel & Snapshots - Windows Implementation
print("🕐 Time Travel & Snapshots (Windows)")
print("=" * 40)

# Append a small batch so the table gains another snapshot to compare against.
print("\n📊 Adding new data for time travel demo...")
import random
from datetime import datetime, timedelta


def _random_metric_row(site):
    """Build one randomised metrics record for *site*, stamped 1-60 minutes after now."""
    return {
        "timestamp": datetime.now() + timedelta(minutes=random.randint(1, 60)),
        "region": site["region"],
        "city": site["city"],
        "site_id": site["site_id"],
        "technology": site["technology"],
        "vendor": site["vendor"],
        "rssi_dbm": round(random.uniform(-85, -65), 2),
        "latency_ms": round(random.uniform(25, 75), 2),
        "data_volume_mb": round(random.uniform(1, 15), 2),
        "drop_rate_percent": round(random.uniform(0, 2), 2),
        "cpu_usage_percent": round(random.uniform(30, 80), 2),
    }


def _count_rows_at(snapshot_id):
    """Return the row count of the metrics table as of *snapshot_id*."""
    return spark.sql(f"""
        SELECT COUNT(*) as record_count 
        FROM local.db.telecom_metrics 
        VERSION AS OF {snapshot_id}
    """).collect()[0].record_count


# Only the first 20 sites are used (subset for Windows performance).
additional_records = [_random_metric_row(site) for site in sites[:20]]

# Append the batch to the existing Iceberg table.
additional_df = spark.createDataFrame(additional_records)
append_writer = additional_df.write.mode("append").format("iceberg")
append_writer.saveAsTable("local.db.telecom_metrics")

print("✅ Added new data")

# List every snapshot, oldest first.
print("\n📸 Available Snapshots:")
snapshots_df = spark.sql("SELECT snapshot_id, committed_at FROM local.db.telecom_metrics.snapshots ORDER BY committed_at")
snapshots_df.show()

# Compare the oldest and newest snapshots when there are at least two.
snapshots = snapshots_df.collect()
if len(snapshots) >= 2:
    first_snapshot, latest_snapshot = snapshots[0].snapshot_id, snapshots[-1].snapshot_id

    print("\n🔍 Comparing snapshots on Windows:")
    print(f"   First: {first_snapshot}")
    print(f"   Latest: {latest_snapshot}")

    first_count = _count_rows_at(first_snapshot)
    latest_count = _count_rows_at(latest_snapshot)

    print(f"📊 Records in first snapshot: {first_count:,}")
    print(f"📊 Records in latest snapshot: {latest_count:,}")
    print(f"📈 Records added: {latest_count - first_count:,}")

print("✅ Time travel operations completed on Windows!")


In [None]:
# Schema Evolution - Windows Implementation
# Imported here so the cell does not depend on another cell's imports.
import random
from datetime import datetime, timedelta

print("🔄 Schema Evolution (Windows)")
print("=" * 35)

# Add new columns for Windows telecom metrics
print("\n➕ Adding new columns for enhanced Windows monitoring...")

# Columns are only added when missing, so the cell can be re-run without
# ALTER TABLE failing on a column that already exists.
existing_columns = set(spark.table("local.db.telecom_metrics").columns)
new_columns = [
    ("signal_quality_category", "STRING"),       # signal quality category
    ("windows_compatibility_score", "DOUBLE"),   # Windows-specific monitoring metric
]
for column_name, column_type in new_columns:
    if column_name in existing_columns:
        print(f"   ℹ️ {column_name} column already exists - skipping")
        continue
    spark.sql(f"""
        ALTER TABLE local.db.telecom_metrics
        ADD COLUMN {column_name} {column_type}
    """)
    print(f"   ✅ Added {column_name} column")

# Show updated schema
print("\n📋 Updated Schema:")
spark.sql("DESCRIBE local.db.telecom_metrics").show()

# Insert data with new schema
print("\n📊 Inserting data with enhanced Windows metrics...")

enhanced_records = []
for site in sites[:25]:  # Windows-optimized subset
    rssi = round(random.uniform(-85, -65), 2)

    # Calculate signal quality category from the RSSI value
    if rssi > -70:
        quality = "Excellent"
    elif rssi > -80:
        quality = "Good"
    else:
        quality = "Poor"

    # Calculate Windows compatibility score: technology sets the base,
    # vendor adds a bonus, then random jitter is applied.
    base_score = 85 if site["technology"] == "8G" else (75 if site["technology"] == "7G" else 65)
    if site["vendor"] == "Samsong":
        base_score += 10  # Better Windows integration
    elif site["vendor"] == "Noson":
        base_score += 5

    # Clamp with float bounds: with int bounds a clamped score would be the
    # int 100 (or 0), mixing int and float values in one column, which
    # createDataFrame's schema inference cannot merge.
    windows_score = min(100.0, max(0.0, base_score + random.uniform(-10, 10)))

    record = {
        "timestamp": datetime.now() + timedelta(minutes=random.randint(61, 120)),
        "region": site["region"],
        "city": site["city"],
        "site_id": site["site_id"],
        "technology": site["technology"],
        "vendor": site["vendor"],
        "rssi_dbm": rssi,
        "latency_ms": round(random.uniform(25, 75), 2),
        "data_volume_mb": round(random.uniform(1, 15), 2),
        "drop_rate_percent": round(random.uniform(0, 2), 2),
        "cpu_usage_percent": round(random.uniform(30, 80), 2),
        "signal_quality_category": quality,
        "windows_compatibility_score": round(windows_score, 2)
    }
    enhanced_records.append(record)

# Insert enhanced data
enhanced_df = spark.createDataFrame(enhanced_records)
enhanced_df.write \
    .mode("append") \
    .format("iceberg") \
    .saveAsTable("local.db.telecom_metrics")

print("✅ Enhanced data inserted")

# Query with mixed schema data: rows written before the columns were added
# have no value for them, so COUNT(<new column>) is compared with COUNT(*).
print("\n🔍 Querying mixed schema data:")
spark.sql("""
    SELECT
        region,
        COUNT(*) as total_records,
        COUNT(signal_quality_category) as records_with_quality,
        COUNT(windows_compatibility_score) as records_with_windows_score,
        ROUND(AVG(windows_compatibility_score), 2) as avg_windows_score
    FROM local.db.telecom_metrics
    GROUP BY region
    ORDER BY region
""").show()

print("✅ Schema evolution completed successfully on Windows!")


In [None]:
# Advanced Analytics & Windows Best Practices
print("📊 Advanced Analytics & Windows Best Practices")
print("=" * 50)

# Performance optimization queries: averages per hour of day (first 10 hours)
print("\n🚀 Performance Optimization Analysis:")
spark.sql("""
    SELECT
        HOUR(timestamp) as hour,
        COUNT(*) as measurements,
        ROUND(AVG(rssi_dbm), 2) as avg_rssi,
        ROUND(AVG(latency_ms), 2) as avg_latency,
        ROUND(AVG(cpu_usage_percent), 2) as avg_cpu
    FROM local.db.telecom_metrics
    GROUP BY HOUR(timestamp)
    ORDER BY hour
    LIMIT 10
""").show()

# Windows-specific vendor analysis (only rows that carry a score)
print("\n🪟 Windows Vendor Compatibility Analysis:")
spark.sql("""
    SELECT
        vendor,
        technology,
        COUNT(*) as sites,
        ROUND(AVG(windows_compatibility_score), 2) as avg_windows_score,
        ROUND(AVG(rssi_dbm), 2) as avg_rssi
    FROM local.db.telecom_metrics
    WHERE windows_compatibility_score IS NOT NULL
    GROUP BY vendor, technology
    ORDER BY avg_windows_score DESC
""").show()

# Data compaction for Windows
print("\n🗜️ Data Compaction (Windows Optimization):")
print("   Compacting data files for better Windows performance...")
try:
    spark.sql("CALL local.system.rewrite_data_files('db.telecom_metrics')")
    print("   ✅ Data compaction completed")
except Exception as exc:
    # Best-effort step: keep the notebook running, but report the real cause
    # rather than assuming the procedure is unsupported.
    print(f"   ℹ️ Compaction skipped: {exc}")

# Table statistics
print("\n📈 Table Statistics:")
try:
    files_info = spark.sql("""
        SELECT
            COUNT(*) as total_files,
            ROUND(SUM(file_size_in_bytes) / 1024 / 1024, 2) as total_size_mb
        FROM local.db.telecom_metrics.files
    """)
    files_info.show()
except Exception as exc:
    # Best-effort step: report why the metadata query failed.
    print(f"   ℹ️ File statistics not available: {exc}")

print("\n💡 Windows Best Practices Summary:")
print("=" * 40)
print("✅ Windows Environment:")
print("   • Use SSD storage for warehouse directories")
print("   • Configure Windows Defender exclusions for Spark/Java")
print("   • Set appropriate JVM heap sizes for Windows memory")
print("   • Use Windows file system monitoring for data integrity")

print("\n✅ Performance Optimization:")
print("   • Enable adaptive query execution")
print("   • Use columnar file formats (Parquet)")
print("   • Regular compaction for optimal read performance")
print("   • Monitor query plans and execution times")

print("\n✅ Security & Compliance:")
print("   • Integrate with Windows Active Directory")
print("   • Use Windows file permissions for access control")
print("   • Enable audit logging for compliance")
print("   • Implement encryption at rest and in transit")

print("\n🚀 Windows deployment completed successfully!")
print("📊 Your Iceberg telecom data lake is ready for enterprise use!")


## 2. Spark Configuration with Iceberg

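Configure Spark to work with Apache Iceberg. In enterprise environments, you would typically configure this in your cluster settings. Run this cell, and the data-generation cell in section 3, before the table-creation and analytics cells that appear earlier in this notebook: those cells use the `spark` session created here and the DataFrames created in section 3.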

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, sum, avg, count
import os
import requests

# Windows-specific paths (using forward slashes for consistency)
import sys  # used by sys.exit() below; this cell's import list does not include it

jar_url = "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/1.4.2/iceberg-spark-runtime-3.4_2.12-1.4.2.jar"
jar_path = "./iceberg-spark-runtime.jar"
warehouse_path = "./iceberg-warehouse"

# S3 configuration (commented out - uncomment and configure for S3)
# s3_warehouse_path = "s3a://your-iceberg-bucket/your-warehouse-path"
# s3_access_key = "YOUR_AWS_ACCESS_KEY_ID"
# s3_secret_key = "YOUR_AWS_SECRET_ACCESS_KEY"
# s3_hadoop_jar = "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.1/hadoop-aws-3.3.1.jar"
# s3_bundle_jar = "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.11.901/aws-java-sdk-bundle-1.11.901.jar"

# Create warehouse directory if it doesn't exist (Windows-compatible)
os.makedirs(warehouse_path, exist_ok=True)

# Download Iceberg JAR for Windows (using requests for better Windows compatibility).
# The download is skipped when the JAR is already present, and it is streamed
# to a temporary ".part" file that is renamed only after a complete transfer,
# so an interrupted download never leaves a truncated JAR at jar_path.
if os.path.isfile(jar_path) and os.path.getsize(jar_path) > 0:
    print(f"✅ Using previously downloaded JAR: {os.path.abspath(jar_path)}")
else:
    print("📥 Downloading Iceberg JAR for Windows...")
    partial_path = jar_path + ".part"
    try:
        # Use requests instead of wget (which may not be available on Windows)
        with requests.get(jar_url, timeout=300, stream=True) as response:
            response.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)
        os.replace(partial_path, jar_path)

        print(f"✅ Downloaded JAR to: {os.path.abspath(jar_path)}")

    except (requests.RequestException, OSError) as e:
        # Remove the incomplete file so the next run starts cleanly.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"❌ Failed to download JAR: {e}")
        print("💡 Check your internet connection and try again")
        print("💡 You may need to download manually from:")
        print(f"   {jar_url}")
        sys.exit(1)

# Download S3 related JARs if using S3 (uncomment below if using S3)
# if 's3_hadoop_jar' in locals():
#     print("📥 Downloading Hadoop AWS JAR for Windows...")
#     try:
#         s3_response = requests.get(s3_hadoop_jar, timeout=300)
#         s3_response.raise_for_status()
#         with open("./hadoop-aws.jar", 'wb') as f:
#             f.write(s3_response.content)
#         print("✅ Downloaded Hadoop AWS JAR")
#     except Exception as e:
#         print(f"❌ Failed to download Hadoop AWS JAR: {e}")
# 
# if 's3_bundle_jar' in locals():
#     print("📥 Downloading AWS SDK Bundle JAR for Windows...")
#     try:
#         bundle_response = requests.get(s3_bundle_jar, timeout=300)
#         bundle_response.raise_for_status()
#         with open("./aws-java-sdk-bundle.jar", 'wb') as f:
#             f.write(bundle_response.content)
#         print("✅ Downloaded AWS SDK Bundle JAR")
#     except Exception as e:
#         print(f"❌ Failed to download AWS SDK Bundle JAR: {e}")

# Configure Spark with Iceberg for Windows
print("\n⚡ Initializing Spark with Iceberg on Windows...")

# NOTE(review): Spark's local filesystem layer on Windows commonly needs
# HADOOP_HOME to point at a folder containing bin\winutils.exe - confirm for
# the target machines.  This is only a hint; execution continues either way.
if os.name == "nt" and not os.environ.get("HADOOP_HOME"):
    print("⚠️ HADOOP_HOME is not set - writes to the local warehouse may fail without winutils.exe")

# Windows-specific Spark configuration: the Iceberg SQL extensions, a
# session catalog wrapper, and a Hadoop-type catalog named `local`.
spark_builder = SparkSession.builder \
    .appName("Iceberg Telecom Demo - Windows") \
    .config("spark.jars", jar_path) \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hadoop") \
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    .config("spark.sql.adaptive.enabled", "true") \
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true") \
    .config("spark.driver.memory", "3g") \
    .config("spark.executor.memory", "2g")

# Configure for local warehouse
spark_builder = spark_builder.config("spark.sql.catalog.local.warehouse", warehouse_path) \
                             .config("spark.sql.warehouse.dir", warehouse_path)

# Configure for S3 warehouse (commented out - uncomment and configure for S3)
# spark_builder = spark_builder.config("spark.sql.catalog.local.warehouse", s3_warehouse_path) \
#                              .config("spark.sql.warehouse.dir", s3_warehouse_path) \
#                              .config("spark.jars", f"{jar_path},./hadoop-aws.jar,./aws-java-sdk-bundle.jar") \
#                              .config("spark.hadoop.fs.s3a.access.key", s3_access_key) \
#                              .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key) \
#                              .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

# Windows-specific: Handle potential firewall/antivirus issues by keeping the
# driver on the loopback interface.
spark_builder = spark_builder.config("spark.driver.host", "localhost")
spark_builder = spark_builder.config("spark.driver.bindAddress", "127.0.0.1")

try:
    spark = spark_builder.getOrCreate()

    # Set log level to reduce noise
    spark.sparkContext.setLogLevel("WARN")

    print(f"✅ Spark {spark.version} with Iceberg initialized on Windows!")
    print(f"📁 Warehouse: {os.path.abspath(warehouse_path)}")
    print(f"☕ Java Home: {os.environ.get('JAVA_HOME', 'Not set')}")
    print(f"🖥️ Spark UI: http://localhost:4040")

except Exception as e:
    # Print troubleshooting hints, then re-raise so the failure stays visible.
    print(f"❌ Failed to initialize Spark: {e}")
    print("\n💡 Windows troubleshooting:")
    print("   1. Check Windows Defender/Antivirus settings")
    print("   2. Ensure Java is properly installed")
    print("   3. Try running as Administrator")
    print("   4. Check firewall settings for Java/Python")
    raise

## 3. Generating Synthetic Telecom Time Series Data

Let's generate realistic telecom network performance data that represents typical enterprise telecom monitoring scenarios.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Telecom data generation for Windows
print("📡 Generating Telecom Data for Windows...")

# Windows-optimized configuration
start_time = datetime(2023, 1, 1)   # timestamp of the first generated sample
chunk_size_seconds = 60             # samples (one per second) per chunk
demo_chunks = 75                    # Good balance for Windows performance
site_count = 150                    # Moderate size for Windows

# Region -> cities lookup; the region list follows the mapping's key order.
cities = {
    "North": ["Dendam", "Rondburg", "Nordville"],
    "South": ["Schieveste", "Southpark"],
    "East": ["Schipstad", "Dort", "Eastport"],
    "West": ["Damstad", "Westfield"],
    "Central": ["Centrum", "Midtown"]
}
regions = list(cities)

# Technology and vendor names, each paired with its sampling weight.
technology_weights = {"6G": 0.3, "7G": 0.5, "8G": 0.2}
vendor_weights = {"Ericia": 0.3, "Noson": 0.3, "Weihu": 0.2, "Samsong": 0.2}
technologies = list(technology_weights)
vendors = list(vendor_weights)


def _build_site(index):
    """Draw region, city, technology and vendor (in that order) for site *index*."""
    region = random.choice(regions)
    city = random.choice(cities[region])
    tech = random.choices(technologies, weights=list(technology_weights.values()))[0]
    vendor = random.choices(vendors, weights=list(vendor_weights.values()))[0]
    return {
        "site_id": f"SITE_{index:05d}",
        "region": region,
        "city": city,
        "technology": tech,
        "vendor": vendor,
    }


# Generate site metadata
print("🏗️ Creating site infrastructure...")
sites = [_build_site(index) for index in range(site_count)]

def generate_telecom_data_chunk(second_offset, site_list=None, base_time=None):
    """Generate one metrics record per site for a single point in time.

    Args:
        second_offset: Number of seconds after the base time for which the
            records are generated.
        site_list: Iterable of site dicts with the keys ``site_id``,
            ``region``, ``city``, ``technology`` and ``vendor``.  Defaults to
            the module-level ``sites`` list.
        base_time: ``datetime`` that ``second_offset`` is added to.  Defaults
            to the module-level ``start_time``.

    Returns:
        list[dict]: One record per site with the keys ``timestamp``,
        ``region``, ``city``, ``site_id``, ``technology``, ``vendor``,
        ``rssi_dbm``, ``latency_ms``, ``data_volume_mb``,
        ``drop_rate_percent`` and ``cpu_usage_percent``.  All metric values
        are plain Python floats rounded to two decimals.
    """
    if site_list is None:
        site_list = sites
    if base_time is None:
        base_time = start_time

    timestamp = base_time + timedelta(seconds=second_offset)
    records = []

    for site in site_list:
        region = site["region"]
        city = site["city"]
        tech = site["technology"]
        vendor = site["vendor"]

        # Enhanced signal strength (RSSI) modeling: technology sets the base
        # level, vendor and region shift it.
        base_rssi = -70 if tech == "8G" else (-75 if tech == "7G" else -85)
        if vendor == "Noson":
            base_rssi += 3
        elif vendor == "Weihu":
            base_rssi -= 2
        elif vendor == "Samsong":
            base_rssi += 1

        # Add regional variations
        if region == "Central":
            base_rssi += 2  # Better infrastructure in central areas

        # NumPy draws are converted to built-in floats so the records hold
        # only plain Python types; Spark schema inference over dicts is not
        # guaranteed to accept NumPy scalar types.
        rssi = float(np.random.normal(loc=base_rssi, scale=3))

        # Latency modeling: the Central region gets a lower base latency.
        if region == "Central":
            base_latency = 25 if tech == "8G" else (35 if tech == "7G" else 60)
        else:
            base_latency = 30 if tech == "8G" else (40 if tech == "7G" else 65)

        # A normal draw can fall below zero; latency cannot, so clamp at 0.
        latency = max(0.0, float(np.random.normal(loc=base_latency, scale=8)))

        # Data volume (in MB)
        data_volume = float(np.random.exponential(scale=7))

        # CPU usage with vendor-specific characteristics, clipped to 0-100 %
        base_cpu = 45 + (5 if tech == "8G" else (7 if tech == "7G" else 0))
        if vendor == "Weihu":
            base_cpu += 8
        elif vendor == "Samsong":
            base_cpu += 3
        cpu_usage = float(np.clip(np.random.normal(loc=base_cpu, scale=10), 0, 100))

        # Drop rate: a fraction capped at 1.0, then expressed as a percentage
        city_penalty = 0.03 if city in ["Damstad", "Schieveste"] else 0.008
        vendor_penalty = 0.015 if vendor == "Weihu" else 0.005
        drop_fraction = (
            0.001 * cpu_usage + city_penalty + vendor_penalty + float(np.random.beta(1, 200))
        )
        drop_rate = min(1.0, drop_fraction) * 100

        records.append({
            "timestamp": timestamp,
            "region": region,
            "city": city,
            "site_id": site["site_id"],
            "technology": tech,
            "vendor": vendor,
            "rssi_dbm": round(rssi, 2),
            "latency_ms": round(latency, 2),
            "data_volume_mb": round(data_volume, 2),
            "drop_rate_percent": round(drop_rate, 2),
            "cpu_usage_percent": round(cpu_usage, 2),
        })
    return records

# Generate telecom time series data with Windows progress indication.
# One record per site is produced for every second of every chunk.
print("⚡ Generating comprehensive telecom metrics for Windows...")
print("📊 Progress: ", end="", flush=True)

all_records = []
# At least 1, so the modulo below cannot divide by zero when demo_chunks < 10.
progress_step = max(1, demo_chunks // 10)

for chunk_idx in range(demo_chunks):
    chunk_start_second = chunk_idx * chunk_size_seconds
    for second in range(chunk_size_seconds):
        # Offset in seconds from the start of the generated time range.
        second_offset = chunk_start_second + second
        all_records.extend(generate_telecom_data_chunk(second_offset))

    # Show progress
    if chunk_idx % progress_step == 0:
        print("█", end="", flush=True)

print(" ✅")

# Convert to Spark DataFrames
print("🔄 Converting to Spark DataFrames...")
telecom_metrics_df = spark.createDataFrame(all_records)
sites_df = spark.createDataFrame(sites)

print("✅ Windows telecom data generated successfully!")
print(f"   📡 Sites: {sites_df.count():,}")
print(f"   📊 Metrics: {telecom_metrics_df.count():,}")
# Rough estimate only: assumes about 200 bytes per record.
print(f"   💾 Memory usage: ~{len(all_records) * 200 / 1024 / 1024:.1f} MB")

# Show sample data
print("\n📊 Sample Telecom Data:")
telecom_metrics_df.show(10)
print("\n🏢 Sample Site Data:")
sites_df.show(10)