# IoT Sensor Data Demo with VARIANT Type

This notebook demonstrates IoT sensor data streaming with VARIANT columns in Databricks:

## Features
- **VARIANT Column Support**: Store complex nested JSON metadata
- **Realistic Data Generation**: Uses dbldatagen for realistic IoT sensor data
- **Streaming Processing**: Real-time data ingestion and processing
- **Simple Implementation**: Clean, focused code demonstrating core functionality
- **Databricks Cluster Optimized**: Designed for remote cluster execution

## Prerequisites
- Databricks Runtime 13.3 LTS or higher
- Unity Catalog enabled workspace with volume access
- Cluster with appropriate permissions for streaming and Delta operations
- dbldatagen library for realistic data generation

## Architecture
- dbldatagen → Streaming source → Delta table with VARIANT columns → Real-time analytics
- Realistic IoT sensor data with weighted distributions and proper data types

In [None]:
# Install packages and restart Python runtime
%pip install dbldatagen jmespath pyparsing
dbutils.library.restartPython()

In [None]:
import uuid
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType, IntegerType
from pyspark.sql.functions import *
from pyspark.sql.streaming import StreamingQuery
import dbldatagen as dg

# Use existing Spark session in Databricks
spark = SparkSession.getActiveSession()

print(f"✅ Spark version: {spark.version}")
print(f"📦 dbldatagen imported successfully")

In [None]:
# Simple configuration
table_name = f"soni.default.iot_variant_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
checkpoint_path = f"/Volumes/soni/default/checkpoints/iot_{uuid.uuid4()}"

# Create table
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {table_name} (
    sensor_id STRING,
    location STRING,
    temperature DOUBLE,
    humidity INTEGER,
    sensor_metadata VARIANT,
    reading_timestamp TIMESTAMP
) USING DELTA
""")

print(f"✅ Table created: {table_name}")
print(f"📍 Checkpoint: {checkpoint_path}")

In [None]:
# Create realistic IoT streaming data with dbldatagen
print("🔧 Creating realistic IoT streaming data with dbldatagen...")

# Define IoT sensor schema
iot_schema = StructType([
    StructField("sensor_id", StringType(), False),
    StructField("location", StringType(), False),
    StructField("temperature", DoubleType(), False),
    StructField("humidity", IntegerType(), False),
    StructField("battery_level", IntegerType(), False),
    StructField("signal_strength", IntegerType(), False),
    StructField("status", StringType(), False),
    StructField("firmware_version", StringType(), False)
])

# Create dbldatagen specification
dataspec = (
    dg.DataGenerator(spark, name="iot_sensors", partitions=8)
    .withSchema(iot_schema)
    .withColumnSpec("sensor_id", minValue=1, maxValue=100, prefix="SENSOR_", random=True)
    .withColumnSpec("location", values=["Building_A", "Building_B", "Building_C", "Warehouse", "DataCenter"], 
                   weights=[0.25, 0.25, 0.25, 0.15, 0.1], random=True)
    .withColumnSpec("temperature", minValue=-10.0, maxValue=50.0, random=True)
    .withColumnSpec("humidity", minValue=30, maxValue=90, random=True)
    .withColumnSpec("battery_level", minValue=1, maxValue=100, random=True)
    .withColumnSpec("signal_strength", minValue=-100, maxValue=-20, random=True)
    .withColumnSpec("status", values=["OK", "SENSOR_FAIL", "BATTERY_LOW", "COMM_LOSS"],
                   weights=[0.8, 0.05, 0.1, 0.05], random=True)
    .withColumnSpec("firmware_version", values=["v1.0", "v2.0", "v2.1"], 
                   weights=[0.2, 0.3, 0.5], random=True)
)

# Build streaming DataFrame with VARIANT metadata
streaming_df = (
    dataspec.build(
        withStreaming=True,
        options={
            'rowsPerSecond': 1000,
            'numPartitions': 8
        }
    )
    .withColumn("reading_timestamp", current_timestamp())
    
    # Create complex VARIANT metadata from the generated columns
    .withColumn("sensor_metadata", 
        parse_json(to_json(struct(
            col("battery_level").alias("battery_level"),
            col("signal_strength").alias("signal_strength"),
            col("status").alias("status"),
            col("firmware_version").alias("firmware_version"),
            current_timestamp().alias("last_maintenance"),
            
            # Device information
            struct(
                lit("Acme Corp").alias("manufacturer"),
                lit("TempSense Pro").alias("model"),
                lit("2023").alias("year"),
                concat(lit("TS-"), col("sensor_id")).alias("part_number")
            ).alias("device_info"),
            
            # Network connectivity  
            struct(
                lit("WiFi").alias("connection_type"),
                (col("sensor_id").cast("int") % 10 + 1).alias("network_id"),
                concat(lit("192.168.1."), (col("sensor_id").cast("int") % 254 + 1).cast("string")).alias("ip_address")
            ).alias("network"),
            
            # Environmental conditions
            struct(
                col("temperature").alias("ambient_temp"),
                col("humidity").alias("ambient_humidity"),
                (rand() * 200 + 800).alias("pressure_hpa")
            ).alias("environment")
        )))
    )
    
    # Keep only the final columns we want
    .select("sensor_id", "location", "temperature", "humidity", "sensor_metadata", "reading_timestamp")
)

print("✅ Realistic IoT streaming DataFrame created with dbldatagen")
print("📊 VARIANT metadata includes: battery, signal, status, device_info, network, environment")
print("🎯 Data generation: 1000 rows/second with realistic distributions")

In [None]:
# Start streaming with trigger(once=True) for testing
query = (
    streaming_df.writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    .trigger(once=True)
    .toTable(table_name)
)

query.awaitTermination()
print("✅ Initial data loaded")

# Start continuous streaming
streaming_query = (
    streaming_df.writeStream
    .format("delta")
    .option("checkpointLocation", f"{checkpoint_path}_continuous")
    .trigger(processingTime="10 seconds")
    .toTable(table_name)
)

print("🚀 Streaming started")

In [None]:
# Test VARIANT column parsing
import time
time.sleep(30)  # Let streaming run

# Stop streaming for tests
if streaming_query.isActive:
    streaming_query.stop()

# Test 1: Basic VARIANT extraction
spark.sql(f"""
SELECT 
    sensor_id,
    sensor_metadata:battery_level::INT as battery,
    sensor_metadata:status::STRING as status
FROM {table_name} LIMIT 3
""").show()

# Test 2: Nested VARIANT access
spark.sql(f"""
SELECT 
    sensor_id,
    sensor_metadata:device_info.model::STRING as model,
    sensor_metadata:device_info.version::STRING as version
FROM {table_name} LIMIT 3
""").show()

print("✅ VARIANT tests completed")

In [None]:
# Summary
row_count = spark.sql(f"SELECT COUNT(*) as count FROM {table_name}").collect()[0].count
print(f"📊 Final table contains {row_count:,} rows")
print(f"✅ VARIANT streaming demo completed")
print(f"🎯 Table: {table_name}")

In [2]:
# Install missing dependency
import subprocess
import sys

print("🔧 Installing missing jmespath dependency...")
subprocess.check_call([sys.executable, "-m", "pip", "install", "jmespath"])

# Now test dbldatagen
import dbldatagen as dg
print("✅ dbldatagen imported successfully after installing jmespath")

# Test with Databricks Connect
from databricks.connect import DatabricksSession
spark = DatabricksSession.builder.getOrCreate()

# Test basic dbldatagen functionality
test_spec = dg.DataGenerator(spark, name="test", partitions=1)
print("✅ dbldatagen DataGenerator works on Databricks cluster")

print("🎯 Notebook should work now - dbldatagen is properly functional")

🔧 Installing missing jmespath dependency...
Collecting jmespath
  Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Using cached jmespath-1.0.1-py3-none-any.whl (20 kB)
Installing collected packages: jmespath
Successfully installed jmespath-1.0.1


ModuleNotFoundError: No module named 'pyparsing'

In [None]:
# Install all dependencies and test
import subprocess
import sys

# Install all missing dependencies
dependencies = ["jmespath", "pyparsing"]
for dep in dependencies:
    print(f"Installing {dep}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", dep])

# Test dbldatagen import
import dbldatagen as dg
print("✅ dbldatagen imported successfully!")

# Test with Databricks Connect
from databricks.connect import DatabricksSession
spark = DatabricksSession.builder.getOrCreate()

# Test basic functionality
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

test_schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("name", StringType(), False)
])

test_spec = (
    dg.DataGenerator(spark, name="test", partitions=1)
    .withSchema(test_schema)
    .withColumnSpec("id", minValue=1, maxValue=100)
    .withColumnSpec("name", values=["Alice", "Bob"], random=True)
)

print("✅ dbldatagen DataGenerator configuration successful")

# Test streaming build
test_df = test_spec.build(withStreaming=True, options={'rowsPerSecond': 10})
print("✅ dbldatagen streaming build successful")

print("🎯 All dependencies work - notebook is ready for testing!")