In [0]:
from pyspark.sql.types import (
    StructType, StructField, StringType, LongType
)

# Explicit schema for the incoming JSON records. Every field is declared
# nullable; all leaf fields are strings except Body.timestamp (long).

# Body: the message payload.
body_struct = (
    StructType()
    .add("code", StringType(), True)
    .add("mnsn", StringType(), True)
    .add("scheduleName", StringType(), True)
    .add("timestamp", LongType(), True)
    .add("type", StringType(), True)
)

# Properties: application-level properties attached to the message.
properties_struct = (
    StructType()
    .add("appTopic", StringType(), True)
    .add("customerId", StringType(), True)
    .add("dealerId", StringType(), True)
    .add("relatedGroupId", StringType(), True)
)

# SystemProperties: connection / content metadata attached to the message.
system_properties_struct = (
    StructType()
    .add("connectionAuthMethod", StringType(), True)
    .add("connectionDeviceGenerationId", StringType(), True)
    .add("connectionDeviceId", StringType(), True)
    .add("contentEncoding", StringType(), True)
    .add("contentType", StringType(), True)
    .add("enqueuedTime", StringType(), True)
)

# Top-level record: three nested structs plus the enqueue time string.
json_schema = (
    StructType()
    .add("Body", body_struct, True)
    .add("EnqueuedTimeUtc", StringType(), True)
    .add("Properties", properties_struct, True)
    .add("SystemProperties", system_properties_struct, True)
)

# Show the schema object that is applied on read.
display(json_schema)

# Load the JSON files from the volume folder using the explicit schema.
source_path = '/Volumes/workspace/default/json_data/15/'
json_reader = spark.read.format("json").schema(json_schema)
df = json_reader.load(source_path)

# Render the loaded (still nested) records.
display(df)

In [0]:
# DataFrame.printSchema() only prints the schema tree and returns None, so
# assigning its result left `schema` as None. Keep the actual StructType in
# `schema` and call printSchema() purely for its printed output.
schema = df.schema
df.printSchema()

In [0]:
from pyspark.sql.functions import col, to_timestamp

# Flatten the nested structs of `df` into top-level snake_case columns.
flat_df = df.select(
    # Body.*
    col("Body.code").alias("code"),
    col("Body.mnsn").alias("mnsn"),
    col("Body.scheduleName").alias("schedule_name"),
    col("Body.type").alias("event_type"),
    col("Body.timestamp").alias("event_timestamp"),

    # Top-level enqueue time (kept as the string it was read as).
    col("EnqueuedTimeUtc").alias("enqueued_time_utc"),

    # Properties.*
    col("Properties.appTopic").alias("app_topic"),
    col("Properties.customerId").alias("customer_id"),
    col("Properties.dealerId").alias("dealer_id"),
    col("Properties.relatedGroupId").alias("related_group_id"),

    # SystemProperties.*
    col("SystemProperties.connectionAuthMethod").alias("connection_auth_method"),
    col("SystemProperties.connectionDeviceGenerationId").alias("connection_device_generation_id"),
    col("SystemProperties.connectionDeviceId").alias("connection_device_id"),
    col("SystemProperties.contentEncoding").alias("content_encoding"),
    col("SystemProperties.contentType").alias("content_type"),
    col("SystemProperties.enqueuedTime").alias("system_enqueued_time")
)

# Convert event_timestamp (epoch millis -> timestamp) into a new trailing
# column `event_time`.
# The previous form, to_timestamp(col / 1000), went through a double and a
# truncating double->timestamp cast, which can yield values one microsecond
# low (e.g. .122999 instead of .123). The SQL function timestamp_millis works
# on the integer value directly, so the millisecond is preserved exactly.
flat_df = flat_df.selectExpr(
    "*",
    "timestamp_millis(event_timestamp) AS event_time",
)
display(flat_df)

In [0]:
from pyspark.sql.functions import to_timestamp, col
# Append the flattened rows to the Unity Catalog managed Delta table,
# with the mergeSchema write option turned on.
delta_writer = (
    flat_df.write
    .format("delta")
    .option('mergeSchema', 'true')
    .mode("append")
)
delta_writer.saveAsTable("workspace.default.json_data")