# IoT Sensor Data Demo with VARIANT Type

This notebook demonstrates working with VARIANT columns in Databricks, specifically:
- Creating tables with VARIANT columns
- Generating synthetic IoT sensor data with `dbldatagen` and streaming it into a Delta table
- Storing nested metadata as VARIANT
- Querying and analyzing VARIANT data

## Prerequisites
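- Databricks Connect configured locally, with a `spark` session available to the notebook (the code cells below use `spark` without creating it)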
- Environment variables set: DATABRICKS_HOST, DATABRICKS_TOKEN, DATABRICKS_CLUSTER_ID


In [0]:
%pip install dbldatagen

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType, IntegerType
from pyspark.sql.functions import expr, when, col, concat, lit, to_json, struct, current_timestamp, make_interval, parse_json, floor, rand
import dbldatagen as dg
import uuid

print("✅ Required imports completed")


In [0]:
# Configuration: generator parallelism, target throughput, and a checkpoint
# directory made unique per run by a random UUID suffix.
PARTITIONS = 8
ROWS_PER_SECOND = 100_000
CHECKPOINT_PATH = "/Volumes/soni/default/checkpoints/_" + str(uuid.uuid4())


# DDL for the demo table; sensor_metadata is the VARIANT column, the remaining
# columns are ordinary scalar types.
create_table_ddl = """
CREATE OR REPLACE TABLE soni.default.iot_sensor_variant_demo (
    sensor_id STRING,
    location STRING,
    temperature DOUBLE,
    humidity INTEGER,
    sensor_metadata VARIANT,
    reading_timestamp TIMESTAMP
)
"""

# CREATE OR REPLACE drops any existing table of the same name and recreates it
spark.sql(create_table_ddl)

print("✅ Table with VARIANT column created/replaced")


In [0]:
# Schema for the generated rows. Every field is declared non-nullable
# (third StructField argument is False).
field_types = [
    ("sensor_id", StringType()),
    ("location", StringType()),
    ("temperature", DoubleType()),
    ("humidity", IntegerType()),
    ("battery_level", IntegerType()),
    ("signal_strength", IntegerType()),
    ("fault_code", StringType()),
]
schema = StructType([StructField(field_name, field_type, False) for field_name, field_type in field_types])

# Per-column generation rules, applied in insertion order below.
column_specs = {
    # String id built from a random number in [1000, 9999] with a "SENSOR_" prefix
    "sensor_id": dict(minValue=1000, maxValue=9999, prefix="SENSOR_", random=True),
    "location": dict(values=["Building_A", "Building_B", "Building_C", "Warehouse"], random=True),
    "temperature": dict(minValue=-10.0, maxValue=45.0, random=True),
    "humidity": dict(minValue=20, maxValue=90, random=True),
    "battery_level": dict(minValue=0, maxValue=100, random=True),
    "signal_strength": dict(minValue=-100, maxValue=0, random=True),
    # Fault codes are weighted so that "OK" is by far the most common value
    "fault_code": dict(
        values=["OK", "SENSOR_FAIL", "BATTERY_LOW", "COMM_LOSS"],
        weights=[0.8, 0.1, 0.05, 0.05],
        random=True,
    ),
}

# Create the data generator from the schema, then attach each column rule
dataspec = dg.DataGenerator(spark, name="iot_variant_data", partitions=PARTITIONS).withSchema(schema)
for column_name, spec_options in column_specs.items():
    dataspec = dataspec.withColumnSpec(column_name, **spec_options)

print("✅ Data generator configured")


In [0]:
# Build the streaming DataFrame in three steps: describe the nested metadata,
# start the generated stream, then attach the metadata as a VARIANT column.

# Constant device description nested inside the metadata payload
device_info_struct = struct(
    lit("Acme Corp").alias("manufacturer"),
    lit("TempSense Pro").alias("model"),
    lit("2023").alias("year"),
).alias("device_info")

# Full metadata payload: three generated columns plus constant / computed extras
sensor_metadata_struct = struct(
    col("battery_level").alias("battery_level"),
    col("signal_strength").alias("signal_strength"),
    col("fault_code").alias("status"),  # exposed under the key "status"
    current_timestamp().alias("last_maintenance"),
    lit("v2.1.5").alias("firmware_version"),
    (floor(rand() * 1000) + 1).alias("calibration_count"),  # random value in 1..1000
    device_info_struct,
)

# Streaming source produced by the dbldatagen spec
generated_stream = dataspec.build(
    withStreaming=True,
    options=dict(rowsPerSecond=ROWS_PER_SECOND, numPartitions=PARTITIONS),
)

# Columns written to the target table; the raw battery/signal/fault columns
# survive only inside sensor_metadata.
output_columns = ["sensor_id", "location", "temperature", "humidity", "sensor_metadata", "reading_timestamp"]

streaming_df = (
    generated_stream
    # struct -> JSON string -> VARIANT
    .withColumn("sensor_metadata", parse_json(to_json(sensor_metadata_struct)))
    .withColumn("reading_timestamp", current_timestamp())
    .select(*output_columns)
)

print("✅ Streaming DataFrame with VARIANT column created")

# COMMAND ----------

# Start streaming write to Delta table
STREAM_QUERY_NAME = "iot_variant_sensor_stream"

# A query name must be unique among the active queries of a session, so
# re-running this cell while the earlier stream is still alive would raise.
# Stop any leftover query with the same name to keep the cell re-runnable.
for active_query in spark.streams.active:
    if active_query.name == STREAM_QUERY_NAME:
        active_query.stop()

# Append each micro-batch to the demo table, tracking progress in the
# per-run checkpoint directory. toTable() starts the query and returns its handle.
streaming_query = (
    streaming_df.writeStream
    .queryName(STREAM_QUERY_NAME)
    .outputMode("append")
    .format("delta")
    .option("checkpointLocation", CHECKPOINT_PATH)
    .option("mergeSchema", "true")
    .toTable("soni.default.iot_sensor_variant_demo")
)

print("🚀 Streaming with VARIANT data started!")
print(f"Query name: {streaming_query.name}")

In [0]:
# Wait (bounded) until the stream reports at least one non-empty micro-batch,
# so the query below does not run against a still-empty table on "Run All".
MAX_WAIT_SECONDS = 60
for _ in range(MAX_WAIT_SECONDS):
    if any(progress["numInputRows"] > 0 for progress in streaming_query.recentProgress):
        break
    # awaitTermination(timeout=1) blocks for up to one second; it returns True
    # if the stream has stopped and re-raises the error if the stream failed.
    if streaming_query.awaitTermination(timeout=1):
        break

# Query VARIANT columns: `:` extracts a field from the VARIANT value and
# `::TYPE` casts the extracted value to a SQL type.
print("=== Basic VARIANT Column Query ===")
spark.sql("""
SELECT 
    sensor_id,
    location,
    temperature,
    sensor_metadata:battery_level::INT as battery_level,
    sensor_metadata:status::STRING as sensor_status,
    sensor_metadata:firmware_version::STRING as firmware,
    reading_timestamp
FROM soni.default.iot_sensor_variant_demo
LIMIT 5
""").display()
