# IoT Sensor Data Demo with VARIANT Type

This notebook demonstrates working with VARIANT columns in Databricks, specifically:
- Creating tables with VARIANT columns
- Generating synthetic IoT sensor data
- Storing nested metadata as VARIANT
- Querying and analyzing VARIANT data

## Prerequisites
- Databricks Connect (the `databricks-connect` Python package) installed and configured locally
- Environment variables set: DATABRICKS_HOST, DATABRICKS_TOKEN, DATABRICKS_CLUSTER_ID


In [1]:
# NOTE(review): because of the leading "# MAGIC", this line is an ordinary Python
# comment when the cell is executed by a Jupyter/IPython kernel, so running the cell
# installs nothing. If dbldatagen is not already available in the kernel environment,
# run the install as a real magic in its own cell: %pip install dbldatagen
# MAGIC %pip install dbldatagen


In [2]:
# Initialize Databricks Connect
def _print_runtime_version(session):
    """Best-effort report of what current_version() returns on the cluster.

    Any failure is reported as a truncated warning and never propagated, so a
    missing current_version() does not abort the connection cell.
    """
    try:
        first_row = session.sql("SELECT current_version() as version").collect()[0]
        print(f"🏢 Databricks Runtime: {first_row.version}")
        print("✅ CONFIRMED: Running on DATABRICKS CLUSTER")
    except Exception as version_error:
        print(f"⚠️  Could not get runtime version: {str(version_error)[:50]}...")


print("🔗 Initializing Databricks Connect...")
try:
    from databricks.connect import DatabricksSession

    # `spark` is the session every later cell in the notebook relies on.
    spark = DatabricksSession.builder.getOrCreate()
    print("✅ Connected to Databricks cluster via Databricks Connect")

    # Smoke test: a trivial remote job proves the session really reaches the cluster.
    print(f"✅ Spark version: {spark.version}")
    test_count = spark.range(3).count()
    print(f"✅ Connected to remote cluster: {test_count} test rows")

    _print_runtime_version(spark)
except Exception as connect_error:
    # Show a hint, then re-raise so the failing cell stops a "Run All".
    print(f"❌ Connection failed: {connect_error}")
    print("💡 Try restarting the kernel and running again")
    raise


🔗 Initializing Databricks Connect...
❌ Connection failed: No module named 'databricks'
💡 Try restarting the kernel and running again


ModuleNotFoundError: No module named 'databricks'

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType, IntegerType
from pyspark.sql.functions import expr, when, col, concat, lit, to_json, struct, current_timestamp, make_interval, parse_json, floor, rand
import dbldatagen as dg
import uuid

print("✅ Required imports completed")


In [None]:
# Configuration for high-throughput streaming
PARTITIONS = 12  # Increased for better streaming performance
ROWS_PER_SECOND = 100000  # 100K rows/sec (adjust based on your cluster capacity)

# Checkpoint location for streaming; the random suffix gives every execution of
# this cell a fresh checkpoint directory.
CHECKPOINT_BASE = "s3://test-external-volume-bucket-2/test-folder"
CHECKPOINT_PATH = f"{CHECKPOINT_BASE}/iot_variant_stream_checkpoint-{uuid.uuid4()}"

# Base schema for generating data: every generated column is declared non-nullable.
base_schema = StructType([
    StructField(column_name, column_type(), False)
    for column_name, column_type in (
        ("sensor_id", StringType),
        ("location", StringType),
        ("temperature", DoubleType),
        ("humidity", IntegerType),
        ("pressure", DoubleType),
        ("battery_level", IntegerType),
        ("signal_strength", IntegerType),
        ("fault_code", StringType),
    )
])

print(
    "🔧 Streaming Configuration:",
    f"   • Partitions: {PARTITIONS}",
    f"   • Rows/second: {ROWS_PER_SECOND:,}",
    f"   • Checkpoint: {CHECKPOINT_PATH}",
    "✅ Schema and configuration set",
    sep="\n",
)


In [None]:
# Create a table with VARIANT column for metadata and flat columns for readings
print("🏗️ Creating table with VARIANT and flat columns...")

# DDL kept in a named variable so it can be inspected or printed before it is executed.
create_table_sql = """
CREATE TABLE IF NOT EXISTS soni.default.iot_sensor_variant_data (
    sensor_id STRING,
    location STRING,
    sensor_metadata VARIANT,
    temperature DOUBLE,
    humidity INTEGER,
    pressure DOUBLE,
    reading_timestamp TIMESTAMP,
    temp_status STRING,
    sample_count INTEGER,
    event_timestamp TIMESTAMP
)
"""
spark.sql(create_table_sql)
print("✅ Table created successfully")


In [None]:
# Build streaming DataFrame with optimizations
#
# NOTE: `dataspec` is created by the "Data generator for base IoT data" cell, which
# appears further down in this notebook. That cell (and the configuration cell it
# depends on) has to be executed before this one, otherwise `dataspec` is undefined.
print("🔄 Setting up streaming data generator...")
streaming_df = (
    dataspec.build(
        withStreaming=True,
        options={
            'rowsPerSecond': ROWS_PER_SECOND,
            'numPartitions': PARTITIONS,
        }
    )
    # Serialize the nested metadata to a JSON string first; it is parsed into a
    # VARIANT value at the end of the chain.
    .withColumn("sensor_metadata",
                to_json(struct(
                    col("battery_level").alias("battery_level"),  # INTEGER
                    col("signal_strength").alias("signal_strength"),  # INTEGER
                    col("fault_code").alias("status"),  # STRING
                    current_timestamp().alias("last_maintenance"),  # TIMESTAMP
                    current_timestamp().alias("installation_date"),  # TIMESTAMP
                    lit("v2.1.4").alias("firmware_version"),  # STRING
                    (floor(rand() * 1000) + 1).alias("calibration_count")  # whole number 1..1000
                )))
    # Flat sensor reading columns
    .withColumn("reading_timestamp", current_timestamp())
    .withColumn("temp_status", when(col("temperature") > 35, "HIGH_TEMP")
                              .when(col("temperature") < 5, "LOW_TEMP")
                              .otherwise("NORMAL"))
    # floor() yields BIGINT, but the target table declares sample_count as INTEGER;
    # cast explicitly so the written column type matches the table definition.
    .withColumn("sample_count", (floor(rand() * 100) + 1).cast("int"))
    .withColumn("event_timestamp", current_timestamp())
    # Value derived deterministically from sensor_id: abs(hash) modulo 1000, plus 1.
    # This column is not declared in the CREATE TABLE statement.
    .withColumn("tracker_row", expr("abs(hash(sensor_id)) % 1000 + 1"))
    # Parse the JSON string into a VARIANT, then swap it in under the original name.
    .withColumn("sensor_metadata_variant", parse_json(col("sensor_metadata")))
    .drop("sensor_metadata")
    .withColumnRenamed("sensor_metadata_variant", "sensor_metadata")
)

print("✅ Streaming DataFrame configured with optimizations:")
print("   • Streaming mode enabled")
print("   • Deterministic tracking for compression")
print("   • Efficient timestamp generation")
print("   • Optimized column expressions")


In [None]:
# Write the streaming data to Delta table with VARIANT columns
#
# NOTE(review): this cell and the cell that follows it both start a query named
# "iot_variant_sensor_stream" against the same CHECKPOINT_PATH and target table.
# Run only one of the two. This variant does not keep the returned query handle and
# does not set the "mergeSchema" option, although streaming_df carries columns
# (for example tracker_row) that the CREATE TABLE statement does not declare —
# presumably the next cell is the intended one; confirm before running both.
print("🚀 Starting streaming write operation...")

(
    streaming_df.writeStream
        .queryName("iot_variant_sensor_stream")
        .outputMode("append")
        .format("delta")
        .option("checkpointLocation", CHECKPOINT_PATH)
        .toTable("soni.default.iot_sensor_variant_data")
)

# Printed as soon as toTable() returns; the stream itself keeps running in the background.
print("✅ Streaming write started to soni.default.iot_sensor_variant_data")


In [None]:
# Write the streaming data to Delta table with VARIANT columns
print("🚀 Starting streaming write operation...")

# A query name must be unique among the active queries of a session. Stop any query
# that is already running under this name (an earlier start, or a re-run of this
# cell) so that starting it again does not fail.
for running_query in spark.streams.active:
    if running_query.name == "iot_variant_sensor_stream":
        print("♻️  Stopping already-running query 'iot_variant_sensor_stream' before restart...")
        running_query.stop()

streaming_query = (
    streaming_df.writeStream
        .queryName("iot_variant_sensor_stream")
        .outputMode("append")
        .format("delta")
        .option("checkpointLocation", CHECKPOINT_PATH)
        .option("mergeSchema", "true")  # Let columns missing from the table definition be added
        .toTable("soni.default.iot_sensor_variant_data")
)

print("✅ Streaming write configured:")
print(f"   • Query Name: {streaming_query.name}")
print(f"   • Status: {streaming_query.status}")
print("   • Mode: append")
print("   • Format: delta")
print("   • Schema evolution: enabled")

# Optional: Show current streaming status
print("\n📊 Active Streams:")
for stream in spark.streams.active:
    print(f"   • {stream.name}: {stream.status['message']}")


In [None]:
# Monitor streaming metrics
def monitor_streaming_metrics():
    """Print status and latest progress metrics for every active streaming query.

    Reads the active queries from the notebook-level ``spark`` session. For each
    query it prints the name and status message and, when at least one progress
    entry exists, the input rate, processing rate, batch duration and memory use
    taken from the most recent entry, followed by per-state-operator details.

    Returns:
        None. All information is written to standard output.
    """
    active_queries = spark.streams.active
    if not active_queries:
        print("❌ No active streaming queries")
        return

    print("📊 Streaming Metrics:")
    for query in active_queries:
        print(f"\nQuery: {query.name}")
        print(f"Status: {query.status['message']}")

        # recentProgress is ordered oldest -> newest; only the newest entry is shown.
        recent_progress = query.recentProgress
        if not recent_progress:
            print("No metrics available yet")
            continue

        latest = recent_progress[-1]
        # `or` guards against keys that are present but hold None.
        state_operators = latest.get('stateOperators') or []

        print("\nLatest Metrics:")
        print(f"• Input rate: {(latest.get('inputRowsPerSecond') or 0):.0f} rows/second")
        print(f"• Processing rate: {(latest.get('processedRowsPerSecond') or 0):.0f} rows/second")
        print(f"• Batch duration: {(latest.get('batchDuration') or 0):.2f} ms")

        # Memory is reported per state operator rather than at the top level of a
        # progress entry, so fall back to the sum over all state operators when no
        # top-level value exists (previously this line always showed 0.00 MB).
        mem_bytes = latest.get('memoryUsedBytes')
        if mem_bytes is None:
            mem_bytes = sum((op.get('memoryUsedBytes') or 0) for op in state_operators)
        mem_used = mem_bytes / 1024 / 1024  # Convert to MB
        print(f"• Memory used: {mem_used:.2f} MB")

        # State metrics if available (stateless queries have no state operators)
        if state_operators:
            print("\nState Store Metrics:")
            for op in state_operators:
                print(f"• Rows: {(op.get('numRowsTotal') or 0):,}")
                print(f"• Memory: {(op.get('memoryUsedBytes') or 0) / 1024 / 1024:.2f} MB")

# Run initial monitoring: prints the metrics of whatever streaming queries are active
# at this moment, or the "No active streaming queries" message when there are none.
monitor_streaming_metrics()


In [None]:
# Streaming control functions
def stop_streaming():
    """Call stop() on each query currently listed in spark.streams.active.

    Uses the notebook-level ``spark`` session and prints a message before and
    after the queries are stopped. Returns None.
    """
    print("🛑 Stopping all streaming queries...")
    running_queries = spark.streams.active
    for running_query in running_queries:
        running_query.stop()
    print("✅ All streaming queries stopped")

def cleanup_streaming():
    """Stop all streaming queries, then delete every row of the demo table.

    The table itself is kept; only its rows are removed. The streaming checkpoint
    directory is NOT touched by this function and has to be removed separately
    if it is no longer needed.

    Returns:
        None. Progress is written to standard output.
    """
    print("🧹 Cleaning up streaming resources...")

    # Stop all streams first so nothing keeps appending while rows are deleted.
    stop_streaming()

    # The message now states what actually happens: the statement below empties the
    # table, it does not remove any checkpoint files.
    print("Deleting all rows from soni.default.iot_sensor_variant_data...")
    spark.sql("DELETE FROM soni.default.iot_sensor_variant_data WHERE 1=1")
    print("✅ Cleanup completed")

# Example usage: list the helper functions defined in the notebook, one per line.
print(
    "Available commands:",
    "• monitor_streaming_metrics() - Show current metrics",
    "• stop_streaming() - Stop all streams",
    "• cleanup_streaming() - Stop streams and clean up",
    sep="\n",
)


In [None]:
# Configuration for data generation
# NOTE: when run after the streaming configuration cell, these assignments rebind
# PARTITIONS, ROWS_PER_SECOND and base_schema that were set there.
PARTITIONS, ROWS_PER_SECOND = 8, 10000

# Base schema for generating data; all fields are declared non-nullable.
base_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in [
        ("sensor_id", StringType()),
        ("location", StringType()),
        ("temperature", DoubleType()),
        ("humidity", IntegerType()),
        ("pressure", DoubleType()),
        ("battery_level", IntegerType()),
        ("signal_strength", IntegerType()),
        ("fault_code", StringType()),
    ]
])

print("✅ Schema and configuration set")


In [None]:
# Data generator for base IoT data
print("🔄 Setting up data generator...")

# Generation rule for each schema column, applied in the order listed.
_column_rules = [
    ("sensor_id", dict(minValue=1000000, maxValue=4000000, prefix="SENSOR_", random=True)),
    ("location", dict(
        values=["Building_A_Floor_1", "Building_A_Floor_2", "Building_B_Floor_1", "Building_C_Roof"],
        weights=[0.3, 0.3, 0.2, 0.2],
        random=True,
    )),
    ("temperature", dict(minValue=-10.0, maxValue=40.0, random=True)),
    ("humidity", dict(minValue=20, maxValue=90, random=True)),
    ("pressure", dict(minValue=980.0, maxValue=1020.0, random=True)),
    ("battery_level", dict(minValue=0, maxValue=100, random=True)),
    ("signal_strength", dict(minValue=-100, maxValue=0, random=True)),
    ("fault_code", dict(
        values=["OK","E01_sensor_fail","E02_battery_low","E03_comm_loss"],
        weights=[0.85,0.05,0.05,0.05],
        random=True,
    )),
]

dataspec = dg.DataGenerator(spark, name="iot_variant_data", partitions=PARTITIONS).withSchema(base_schema)
for _column_name, _rule in _column_rules:
    dataspec = dataspec.withColumnSpec(_column_name, **_rule)

print("✅ Data generator configured")


In [None]:
# Build batch DataFrame with VARIANT metadata and flat readings for demonstration
print("🏗️ Building DataFrame with VARIANT metadata...")
batch_df = (
    dataspec.build(
        withStreaming=False,
        options={
            'rowsPerSecond': ROWS_PER_SECOND,
            'numPartitions': PARTITIONS,
        }
    )
    # Serialize the nested metadata to a JSON string first; it is parsed into a
    # VARIANT value at the end of the chain.
    .withColumn("sensor_metadata",
                to_json(struct(
                    col("battery_level").alias("battery_level"),  # INTEGER
                    col("signal_strength").alias("signal_strength"),  # INTEGER
                    col("fault_code").alias("status"),  # STRING
                    current_timestamp().alias("last_maintenance"),  # TIMESTAMP
                    current_timestamp().alias("installation_date"),  # TIMESTAMP
                    lit("v2.1.4").alias("firmware_version"),  # STRING
                    (floor(rand() * 1000) + 1).alias("calibration_count")  # whole number 1..1000
                )))
    # Create flat sensor reading columns
    .withColumn("reading_timestamp", current_timestamp())
    .withColumn("temp_status", when(col("temperature") > 35, "HIGH_TEMP")
                              .when(col("temperature") < 5, "LOW_TEMP")
                              .otherwise("NORMAL"))
    # floor() yields BIGINT, but the target table declares sample_count as INTEGER;
    # cast explicitly so the written column type matches the table definition.
    .withColumn("sample_count", (floor(rand() * 100) + 1).cast("int"))
    .withColumn("event_timestamp", current_timestamp())
    # Parse JSON string to VARIANT type for metadata only
    .withColumn("sensor_metadata_variant", parse_json(col("sensor_metadata")))
    .drop("sensor_metadata")  # Drop the JSON string column
    .withColumnRenamed("sensor_metadata_variant", "sensor_metadata")
)
print("✅ DataFrame built successfully")


In [None]:
# Display sample data to see the VARIANT structure (first five rows, untruncated)
print("=== Sample Data Preview ===")
preview_rows = batch_df.limit(5)
preview_rows.show(truncate=False)


In [None]:
# Write the batch data to the table with VARIANT columns
print("=== Writing data to table ===")
(
    batch_df.write
    .mode("overwrite")
    # batch_df still carries the generator columns battery_level, signal_strength and
    # fault_code, which the CREATE TABLE statement does not declare. Overwriting data
    # alone keeps the existing table schema, so allow the overwrite to replace the
    # schema as well instead of failing on the mismatch.
    .option("overwriteSchema", "true")
    .saveAsTable("soni.default.iot_sensor_variant_data")
)
print("✅ Data written successfully!")


In [None]:
# Query examples to demonstrate VARIANT column usage

# 1. Query sensor metadata from VARIANT column
#    ':' extracts a field from the VARIANT value; '::INT' casts it for the comparison.
print("=== Sensor Metadata Query ===")
low_battery_sql = """
SELECT 
    sensor_id,
    location,
    sensor_metadata:battery_level as battery_level,
    sensor_metadata:status as sensor_status,
    sensor_metadata:firmware_version as firmware,
    sensor_metadata:calibration_count as calibrations,
    sensor_metadata:last_maintenance as last_maintenance
FROM soni.default.iot_sensor_variant_data
WHERE sensor_metadata:battery_level::INT < 30
LIMIT 10
"""
spark.sql(low_battery_sql).show(truncate=False)


In [None]:
# 2. Query flat sensor readings columns (no VARIANT access involved)
print("=== Sensor Readings Query ===")
warm_readings_sql = """
SELECT 
    sensor_id,
    location,
    temperature as current_temp,
    humidity as current_humidity,
    pressure as current_pressure,
    temp_status as temperature_status,
    reading_timestamp as reading_time,
    sample_count as samples
FROM soni.default.iot_sensor_variant_data
WHERE temperature > 30
LIMIT 10
"""
spark.sql(warm_readings_sql).show(truncate=False)


In [None]:
# 3. Complex query combining VARIANT metadata with flat readings
#    The CASE branches are evaluated top to bottom, so a critical battery takes
#    precedence over a high temperature, which takes precedence over a fault status.
print("=== Complex VARIANT Query ===")
alert_sql = """
SELECT 
    sensor_id,
    location,
    sensor_metadata:battery_level::INT as battery,
    sensor_metadata:status as status,
    temperature as temp,
    temp_status as temp_status,
    reading_timestamp as reading_time,
    sensor_metadata:last_maintenance as last_maintenance,
    CASE 
        WHEN sensor_metadata:battery_level::INT < 20 THEN 'CRITICAL_BATTERY'
        WHEN temperature > 35 THEN 'HIGH_TEMP_ALERT'
        WHEN sensor_metadata:status::STRING != 'OK' THEN 'SENSOR_FAULT'
        ELSE 'NORMAL'
    END as alert_level
FROM soni.default.iot_sensor_variant_data
WHERE sensor_metadata:battery_level::INT < 30 OR temperature > 30
ORDER BY reading_timestamp DESC
LIMIT 15
"""
spark.sql(alert_sql).show(truncate=False)


In [None]:
# 4. Aggregation query using VARIANT metadata and flat readings
#    All counts are per row of the table, grouped by location.
print("=== VARIANT Aggregation Query ===")
per_location_sql = """
SELECT 
    location,
    COUNT(*) as total_sensors,
    AVG(temperature) as avg_temperature,
    AVG(humidity) as avg_humidity,
    AVG(sensor_metadata:battery_level::INT) as avg_battery,
    COUNT(CASE WHEN sensor_metadata:status::STRING != 'OK' THEN 1 END) as faulty_sensors,
    COUNT(CASE WHEN temp_status = 'HIGH_TEMP' THEN 1 END) as high_temp_alerts
FROM soni.default.iot_sensor_variant_data
GROUP BY location
ORDER BY avg_battery ASC
"""
spark.sql(per_location_sql).show(truncate=False)


In [None]:
# 5. Time-based analysis using flat timestamp fields
#    Restricted to readings from the last day, bucketed by calendar date and hour.
print("=== Time-based Analysis ===")
hourly_sql = """
SELECT 
    DATE(reading_timestamp) as reading_date,
    HOUR(reading_timestamp) as reading_hour,
    COUNT(*) as readings_count,
    AVG(temperature) as avg_temp,
    AVG(humidity) as avg_humidity,
    COUNT(CASE WHEN temp_status = 'HIGH_TEMP' THEN 1 END) as high_temp_count
FROM soni.default.iot_sensor_variant_data
WHERE reading_timestamp >= current_timestamp() - INTERVAL 1 DAY
GROUP BY DATE(reading_timestamp), HOUR(reading_timestamp)
ORDER BY reading_date DESC, reading_hour DESC
LIMIT 24
"""
spark.sql(hourly_sql).show(truncate=False)


In [None]:
# 6. Test VARIANT functions on a few rows of the metadata column
print("=== VARIANT Functions Test ===")
variant_functions_sql = """
SELECT 
    sensor_id,
    schema_of_variant(sensor_metadata) as metadata_schema,
    to_json(sensor_metadata) as metadata_json,
    is_variant_null(sensor_metadata) as is_null
FROM soni.default.iot_sensor_variant_data
LIMIT 3
"""
spark.sql(variant_functions_sql).show(truncate=False)

# Closing summary of the techniques used by the query cells.
print(
    "\n🎉 VARIANT IoT Sensor Demo Completed Successfully!",
    "📚 Key Learnings Applied:",
    "1. ✅ Used ':' operator for VARIANT field access",
    "2. ✅ Used '::' operator for casting VARIANT fields to specific types",
    "3. ✅ Combined VARIANT metadata with flat sensor readings",
    "4. ✅ Used VARIANT functions like schema_of_variant() and to_json()",
    sep="\n",
)


## Cleanup (Optional)

Uncomment and run the following cell to clean up the demo resources:


In [None]:
# Clean up example
# spark.sql("DROP TABLE IF EXISTS soni.default.iot_sensor_variant_data")
# print("✅ Cleanup completed")
