In [0]:
from pyspark.sql.types import (
    StructType, StructField, StringType, LongType
)

from pyspark.sql.types import *

# Explicit schema for the device-telemetry JSON records loaded in this notebook.
# The repeated sub-structures (the "amount" struct, the marker entries and the
# per-colour toner counters) are built by small helpers instead of being
# copy-pasted, so a change to one of them only has to be made in one place.
# Every field is declared nullable, and field order is kept exactly as in the
# hand-written schema.


def _string_fields(*names):
    """Build one nullable ``StringType`` field per name, preserving the order given."""
    return [StructField(name, StringType(), True) for name in names]


def _struct_field(name, fields):
    """Build a nullable field called ``name`` whose type is a struct of ``fields``."""
    return StructField(name, StructType(fields), True)


def _struct_array_field(name, fields):
    """Build a nullable field called ``name`` holding an array of structs of ``fields``."""
    return StructField(name, ArrayType(StructType(fields)), True)


def _amount_field():
    """Build the ``amount`` struct that opens every tray and marker entry.

    A new object is returned on each call so no field instance is shared
    between sibling structs.
    """
    return _struct_field(
        "amount", _string_fields("capacity", "state", "typical", "unit")
    )


def _marker_fields():
    """Build the entry layout used by both ``markerList`` and ``disposalMarkerList``."""
    return [_amount_field(), *_string_fields("color", "description", "type")]


# Entry layout of Body.smsProperties.device.inTrayList.
_in_tray_fields = [
    _amount_field(),
    *_string_fields(
        "description",
        "id",
        "inserter",
        "manual",
        "mediaDimFeed",
        "mediaDimXFeed",
        "mediaName",
        "mediaSize",
        "mediaSizeHeight",
        "mediaSizeName",
        "mediaSizeUnit",
        "mediaSizeWidth",
        "mediaType",
        "modelName",
        "name",
        "virtual",
    ),
]

# Entry layout of Body.smsProperties.device.outTrayList.
_out_tray_fields = [
    _amount_field(),
    *_string_fields(
        "deliveryOrientation",
        "description",
        "id",
        "modelName",
        "name",
        "stackingOrder",
    ),
]

# suppliesCounter.TYPE.toner.large.end.<colour>.value, where value is a long.
_toner_end_field = _struct_field(
    "end",
    [
        _struct_field(colour, [StructField("value", LongType(), True)])
        for colour in ("black", "cyan", "magenta", "yellow")
    ],
)
_supplies_counter_field = _struct_field(
    "suppliesCounter",
    [
        _struct_field(
            "TYPE",
            [_struct_field("toner", [_struct_field("large", [_toner_end_field])])],
        )
    ],
)

# Body.smsProperties.device
_device_field = _struct_field(
    "device",
    [
        *_string_fields("address"),
        _struct_array_field("disposalMarkerList", _marker_fields()),
        *_string_fields("familyName", "friendlyName"),
        _struct_array_field("inTrayList", _in_tray_fields),
        _struct_field("location", _string_fields("address")),
        _struct_array_field("markerList", _marker_fields()),
        *_string_fields("modelName"),
        _struct_array_field("outTrayList", _out_tray_fields),
        *_string_fields("serialId", "statusRawValue"),
        _supplies_counter_field,
    ],
)

# Body.interface
_interface_field = _struct_field(
    "interface",
    [
        _struct_array_field(
            "ethernetList", _string_fields("address", "id", "type")
        ),
        _struct_array_field(
            "ipList",
            _string_fields("address", "defaultRoute", "ethernetId", "subnetMask"),
        ),
    ],
)

json_schema = StructType([
    _struct_field(
        "Body",
        [
            *_string_fields("mnsn", "scheduleName"),
            _struct_field("smsProperties", [_device_field]),
            _interface_field,
            StructField("timestamp", LongType(), True),
            StructField("type", StringType(), True),
        ],
    ),
    StructField("EnqueuedTimeUtc", StringType(), True),
    _struct_field(
        "Properties",
        _string_fields("appTopic", "customerId", "dealerId", "relatedGroupId"),
    ),
    _struct_field(
        "SystemProperties",
        _string_fields(
            "connectionAuthMethod",
            "connectionDeviceGenerationId",
            "connectionDeviceId",
            "contentEncoding",
            "contentType",
            "enqueuedTime",
        ),
    ),
])


# Show the hand-written schema before it is used for loading.
display(json_schema)

# Load the JSON data at the path below with the explicit schema applied,
# rather than letting Spark infer one.
json_reader = spark.read.format("json").schema(json_schema)
df = json_reader.load('/Volumes/workspace/default/json_data/14/')
# Alternative kept from the original cell: read one file without a schema.
# df = spark.read.json('/Volumes/workspace/default/json_data/15/02-28.json')

display(df)

In [0]:
# printSchema() only prints the schema tree and returns None, so binding its
# result left `schema` set to None. Print the tree for the reader, then keep
# the DataFrame's actual StructType in `schema`.
df.printSchema()
schema = df.schema

In [0]:
from pyspark.sql.functions import col, to_timestamp

# Flatten the nested record: each pair is (path inside `df`, output column name).
# The list order is the column order of `flat_df`.
_device_path = "Body.smsProperties.device"
_toner_end_path = f"{_device_path}.suppliesCounter.TYPE.toner.large.end"

flat_column_map = [
    # event envelope
    ("Body.mnsn", "mnsn"),
    ("Body.scheduleName", "schedule_name"),
    ("Body.type", "event_type"),
    ("Body.timestamp", "event_timestamp"),
    # scalar device attributes
    (f"{_device_path}.serialId", "device_serial_id"),
    (f"{_device_path}.modelName", "device_model_name"),
    (f"{_device_path}.familyName", "device_family_name"),
    (f"{_device_path}.friendlyName", "device_friendly_name"),
    (f"{_device_path}.statusRawValue", "device_status"),
    (f"{_device_path}.address", "device_address"),
    (f"{_device_path}.location.address", "device_location_address"),
    # one toner counter column per colour
    *[
        (f"{_toner_end_path}.{colour}.value", f"toner_{colour}_end")
        for colour in ("black", "cyan", "magenta", "yellow")
    ],
    # array columns are carried over as-is (not exploded)
    (f"{_device_path}.inTrayList", "in_tray_list"),
    (f"{_device_path}.outTrayList", "out_tray_list"),
    (f"{_device_path}.markerList", "marker_list"),
    (f"{_device_path}.disposalMarkerList", "disposal_marker_list"),
    ("Body.interface.ethernetList", "ethernet_list"),
    ("Body.interface.ipList", "ip_list"),
    # message metadata
    ("EnqueuedTimeUtc", "enqueued_time_utc"),
    ("Properties.appTopic", "app_topic"),
    ("Properties.customerId", "customer_id"),
    ("Properties.dealerId", "dealer_id"),
    ("Properties.relatedGroupId", "related_group_id"),
    ("SystemProperties.connectionAuthMethod", "connection_auth_method"),
    (
        "SystemProperties.connectionDeviceGenerationId",
        "connection_device_generation_id",
    ),
    ("SystemProperties.connectionDeviceId", "connection_device_id"),
    ("SystemProperties.contentEncoding", "content_encoding"),
    ("SystemProperties.contentType", "content_type"),
    ("SystemProperties.enqueuedTime", "system_enqueued_time"),
]

# Optional extra column: the original note describes event_timestamp as epoch
# milliseconds, so it is divided by 1000 before conversion (millis -> timestamp).
flat_df = df.select(
    *[col(source_path).alias(flat_name) for source_path, flat_name in flat_column_map]
).withColumn(
    "event_time",
    to_timestamp(col("event_timestamp") / 1000),
)
display(flat_df)

In [0]:
from pyspark.sql.functions import to_timestamp, col
# Persist the flattened rows to the Unity Catalog managed Delta table
# workspace.default.json_data. Rows are appended, and the mergeSchema option
# is switched on for the write.
(
    flat_df.write
    .format("delta")
    .option('mergeSchema', 'true')
    .mode("append")
    .saveAsTable("workspace.default.json_data")
)

In [0]:
# NOTE(review): withColumnRenamed is called with no arguments here, while the
# PySpark API takes the existing column name and the new column name, so this
# cell presumably raises TypeError as written. The intended rename is not
# visible in this notebook — TODO supply both names or remove the cell.
renamed = df.withColumnRenamed()