# Event Hub Avro deserialization

hyssh@microsoft.com


## Reference code sample 

Ref 1: https://stackoverflow.com/questions/72417454/azure-schema-registry-spark-structured-streaming-kafka-eventhub-compatibilit

Ref 2: https://spark.apache.org/docs/latest/sql-data-sources-avro.html#to_avro-and-from_avro

In [1]:
import os

from azure.identity import DefaultAzureCredential

# from azure.eventhub import EventHubConsumerClient
from azure.schemaregistry import SchemaRegistryClient
from azure.schemaregistry.serializer.avroserializer import AvroSerializer


# from pyspark.sql.avro.functions import from_avro, to_avro


StatementMeta(sparkpool3, 16, 1, Finished, Available)

## Get Environment variables

Get the variables from [Spark configuration](https://learn.microsoft.com/en-us/azure/synapse-analytics/spark/apache-spark-azure-create-spark-configuration)

In [2]:
# Settings copied from the Spark pool configuration into this process's
# environment. The AZURE_* entries are the service-principal variables that
# azure-identity's environment-based credential looks up in os.environ.
_REQUIRED_ENV_VARS = (
    "AZURE_CLIENT_ID",
    "AZURE_TENANT_ID",
    "AZURE_CLIENT_SECRET",
    "EVENT_HUB_CONN_STR",
    "EVENT_HUB_CONN_STR_ENT",
    "EVENT_HUB_CONN_STR_LISTEN",
)

_spark_env = spark.sparkContext.environment

# os.environ only accepts str values, so assigning a missing (None) setting
# fails with an opaque "TypeError: str expected, not NoneType". Check every
# name up front and report all missing ones in a single, readable error.
_missing = [name for name in _REQUIRED_ENV_VARS if _spark_env.get(name) is None]
if _missing:
    raise RuntimeError(
        "Missing Spark configuration environment variable(s): "
        + ", ".join(_missing)
    )

for _name in _REQUIRED_ENV_VARS:
    os.environ[_name] = _spark_env[_name]

# Event Hubs namespace that also hosts the Schema Registry, and the hub name.
SCHEMAREGISTRY_FULLY_QUALIFIED_NAMESPACE = "shin-eventhub-ns.servicebus.windows.net"
EVENTHUB_NAME = "transactions"


StatementMeta(sparkpool3, 16, 2, Finished, Available)

## Define connection string

In [3]:
# Options handed to the "eventhubs" Spark data source.
ehConf = {
    # Connector 2.3.15 and above requires the connection string to be
    # encrypted; earlier versions take the plain string instead
    # (e.g. os.environ["EVENT_HUB_CONN_STR"]).
    "eventhubs.connectionString": sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
        os.environ["EVENT_HUB_CONN_STR_ENT"]
    ),
    # Confirm that this consumer group exists on the Event Hub:
    # https://learn.microsoft.com/en-us/azure/event-hubs/event-hubs-features#consumer-groups
    "eventhubs.consumerGroup": "spark",
}

StatementMeta(sparkpool3, 16, 3, Finished, Available)

## Create UDF

Create a UDF that deserializes an Avro message using the Avro serializer backed by the Schema Registry.

In [4]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

# Schema Registry endpoint and schema group used to decode message bodies.
# The namespace is the same value as SCHEMAREGISTRY_FULLY_QUALIFIED_NAMESPACE.
_SCHEMA_REGISTRY_NAMESPACE = "shin-eventhub-ns.servicebus.windows.net"
_SCHEMA_GROUP_NAME = "tranxs"

# Lazily created serializer, shared by every call that sees this global.
# NOTE(review): it must still be None when Spark pickles the UDF, so do not
# call deserializeBody() directly on the driver before using the UDF in a
# query -- the credential/client objects are presumably not picklable.
_avro_serializer = None


def _get_avro_serializer():
    """Return the shared AvroSerializer, building it on first use."""
    global _avro_serializer
    if _avro_serializer is None:
        token_credential = DefaultAzureCredential()
        schema_registry_client = SchemaRegistryClient(
            _SCHEMA_REGISTRY_NAMESPACE, token_credential
        )
        _avro_serializer = AvroSerializer(
            client=schema_registry_client, group_name=_SCHEMA_GROUP_NAME
        )
    return _avro_serializer


def deserializeBody(encodedBody):
    """Decode one Avro-encoded Event Hubs message body.

    The credential, registry client and serializer are built once and then
    reused, instead of being rebuilt (and re-authenticated) for every row.

    Args:
        encodedBody: Raw bytes of the message body, or None for a null body.

    Returns:
        Whatever ``AvroSerializer.deserialize`` returns for the payload, or
        None when ``encodedBody`` is None.
    """
    # The Event Hubs "body" column is nullable; pass nulls through rather than
    # handing None to the serializer.
    if encodedBody is None:
        return None

    return _get_avro_serializer().deserialize(encodedBody)

# Registered with a string return type; Spark renders the decoded record as
# text in the resulting column.
deserializedBody_udf = udf(deserializeBody, StringType())

StatementMeta(sparkpool3, 16, 4, Finished, Available)

## (Optional test) Get schema from the Schema Registry


In [5]:
# ID of the schema to look up in the Schema Registry.
SCHEMA_ID = "71e03f67bd694b30b6dad6dae1fb8d86"

# Authenticate, fetch the schema by ID and print the returned object.
token_credential = DefaultAzureCredential()
schema_registry_client = SchemaRegistryClient(
    SCHEMAREGISTRY_FULLY_QUALIFIED_NAMESPACE,
    token_credential,
)
jsonFormatSchema = schema_registry_client.get_schema(SCHEMA_ID)
print(jsonFormatSchema)

# bytes_payload = b"".join(b for b in event.body)
# deserialized_data = avro_serializer.deserialize(bytes_payload)

StatementMeta(sparkpool3, 16, 5, Finished, Available)

Schema(definition={"namespace":"example.avro","type":"record","name":"User","fields":[{"name":"name","type":"string"},{"name":"favorite_number","type":["int","null"]},{"name":"favorite_color","type":["string","null"]}]}, properties=SchemaProperties(id=71e03f67bd694b30b6dad6dae1fb8d86, format=Avro, group_name=tranxs, name=example.avro.User, version=1))


## Read streaming data from Event Hubs

In [6]:
# Open a streaming (not batch) read from Event Hubs using the options in ehConf.
df = (
    spark.readStream
    .format("eventhubs")
    .options(**ehConf)
    .load()
)

# Show the columns the Event Hubs source exposes.
df.printSchema()

StatementMeta(sparkpool3, 16, 6, Finished, Available)

root
 |-- body: binary (nullable = true)
 |-- partition: string (nullable = true)
 |-- offset: string (nullable = true)
 |-- sequenceNumber: long (nullable = true)
 |-- enqueuedTime: timestamp (nullable = true)
 |-- publisher: string (nullable = true)
 |-- partitionKey: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- systemProperties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



## Write streaming data to ADLS as Parquet

In [7]:
# ADLS locations for the raw stream output and its checkpoint.
raw_output_path = "abfss://dev-synapse@hyundevsynapsestorage.dfs.core.windows.net/streaming"
raw_checkpoint_path = "abfss://dev-synapse@hyundevsynapsestorage.dfs.core.windows.net/streaming_checkpoint"

# Persist the stream as Parquet, triggering a micro-batch every 10 seconds.
ds1 = (
    df.writeStream
    .format("parquet")
    .option("path", raw_output_path)
    .option("checkpointLocation", raw_checkpoint_path)
    .trigger(processingTime="10 seconds")
    .start()
)

# Also send the body column to the console sink.
# The result won't be seen in the output of this cell.
ds2 = (
    df.select("body")
    .writeStream
    .outputMode("append")
    .format("console")
    .start()
)

# ds.start().awaitTermination()
#.writeStream.outputMode("append").format("console").start().awaitTermination()


StatementMeta(sparkpool3, 16, 7, Finished, Available)

In [8]:
# Report the current status of both streaming queries.
for query in (ds1, ds2):
    print(query.status)

StatementMeta(sparkpool3, 16, 8, Finished, Available)

{'message': 'Processing new data', 'isDataAvailable': True, 'isTriggerActive': True}
{'message': 'Writing offsets to log', 'isDataAvailable': False, 'isTriggerActive': True}


In [9]:
# Stop both streaming queries, printing what each stop() call returns.
for query in (ds1, ds2):
    print(query.stop())

StatementMeta(sparkpool3, 16, 9, Finished, Available)

None
None


In [10]:
# Read back the Parquet files written by the raw stream and count the rows.
raw_parquet_glob = "abfss://dev-synapse@hyundevsynapsestorage.dfs.core.windows.net/streaming/*.parquet"
filedf = spark.read.format("parquet").load(raw_parquet_glob)
print(filedf.count())

StatementMeta(sparkpool3, 16, 10, Finished, Available)

110


## The messages are saved

The `body` column contains the raw, Avro-encoded bytes, which are not human-readable.

In [11]:
# Display the first rows of the saved data; body is still the raw binary payload.
filedf.show()

StatementMeta(sparkpool3, 16, 11, Finished, Available)

+--------------------+---------+------------+--------------+--------------------+---------+------------+----------+----------------+
|                body|partition|      offset|sequenceNumber|        enqueuedTime|publisher|partitionKey|properties|systemProperties|
+--------------------+---------+------------+--------------+--------------------+---------+------------+----------+----------------+
|[00 00 00 00 37 3...|        0|167504141520|       6072574|2022-10-13 20:34:...|     null|        null|        {}|              {}|
|[00 00 00 00 37 3...|        0|167504141632|       6072575|2022-10-13 20:34:...|     null|        null|        {}|              {}|
|[00 00 00 00 37 3...|        0|167504141744|       6072576|2022-10-13 20:34:...|     null|        null|        {}|              {}|
|[00 00 00 00 37 3...|        0|167504141856|       6072577|2022-10-13 20:34:...|     null|        null|        {}|              {}|
|[00 00 00 00 37 3...|        0|167504141968|       6072578|2022-10-1

## Test UDF 1

Use UDF to deserialize body in the file

In [12]:
# Run the UDF over the stored body column and attach the result as decodedBody.
decoded_column = deserializedBody_udf(filedf["body"])
filedf = filedf.withColumn("decodedBody", decoded_column)
filedf.show()

StatementMeta(sparkpool3, 16, 12, Finished, Available)

+--------------------+---------+------------+--------------+--------------------+---------+------------+----------+----------------+--------------------+
|                body|partition|      offset|sequenceNumber|        enqueuedTime|publisher|partitionKey|properties|systemProperties|         decodedBody|
+--------------------+---------+------------+--------------+--------------------+---------+------------+----------+----------------+--------------------+
|[00 00 00 00 37 3...|        0|167504141520|       6072574|2022-10-13 20:34:...|     null|        null|        {}|              {}|{name=Bob, favori...|
|[00 00 00 00 37 3...|        0|167504141632|       6072575|2022-10-13 20:34:...|     null|        null|        {}|              {}|{name=Bob, favori...|
|[00 00 00 00 37 3...|        0|167504141744|       6072576|2022-10-13 20:34:...|     null|        null|        {}|              {}|{name=Bob, favori...|
|[00 00 00 00 37 3...|        0|167504141856|       6072577|2022-10-13 20:34

## Test 2. Use the UDF before writing data to ADLS

Instead of deserializing the body after reading the data saved in ADLS, we can deserialize the body right before it is written to ADLS.

In [13]:
# see result in the console
# output = df.select("value")
# output.printSchema()

# df.writeStream.outputMode("append").format("console").option("truncate", False).start().awaitTermination()

StatementMeta(sparkpool3, 16, 13, Finished, Available)

In [14]:
# ds3 = df.select("body")#writeStream.outputMode("append").format("console").start().awaitTermination(10)
# ds3 = ds3.withColumn("decodedBody", deserializedBody_udf(ds3["body"]))
# ds3.writeStream.outputMode("append").format("console").start().awaitTermination(300)

StatementMeta(sparkpool3, 16, 14, Finished, Available)

Use `deserializedBody_udf` to deserialize the body and save the result as Parquet

In [15]:
# ADLS locations for the decoded stream output and its checkpoint.
decoded_output_path = "abfss://dev-synapse@hyundevsynapsestorage.dfs.core.windows.net/streamingAfterdecode"
decoded_checkpoint_path = "abfss://dev-synapse@hyundevsynapsestorage.dfs.core.windows.net/streamingAfterdecode_checkpoint"

# Add the decoded body to each streamed row before it is written out.
decoded_stream = df.withColumn("decodedBody", deserializedBody_udf(df["body"]))

# Write the decoded stream as Parquet. No explicit trigger is set here; add
# .trigger(processingTime='10 seconds') before .start() to set one.
ds4 = (
    decoded_stream.writeStream
    .format("parquet")
    .option("path", decoded_output_path)
    .option("checkpointLocation", decoded_checkpoint_path)
    .start()
)


StatementMeta(sparkpool3, 16, 15, Finished, Available)

In [16]:
# Print the current status of the decoded-stream query started above.
print(ds4.status)

StatementMeta(sparkpool3, 16, 16, Finished, Available)

{'message': 'Initializing sources', 'isDataAvailable': False, 'isTriggerActive': True}


### See the result

In [17]:
# Read back the Parquet files written by the decoded stream and count the rows.
decoded_parquet_glob = "abfss://dev-synapse@hyundevsynapsestorage.dfs.core.windows.net/streamingAfterdecode/*.parquet"
afterDecodedf = spark.read.format("parquet").load(decoded_parquet_glob)
print(afterDecodedf.count())

StatementMeta(sparkpool3, 16, 17, Finished, Available)

98


In [18]:
# Row count of the decoded data again, shown as the cell's result value.
afterDecodedf.count()

StatementMeta(sparkpool3, 16, 18, Finished, Available)

98

In [19]:
# Compare the raw body with its decoded form side by side.
afterDecodedf.select("body", "decodedBody").show()

StatementMeta(sparkpool3, 16, 19, Finished, Available)

+--------------------+--------------------+
|                body|         decodedBody|
+--------------------+--------------------+
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob, favori...|
|[00 00 00 00 37 3...|{name=Bob,

In [20]:
# Check the decoded-stream query's status again (shown as the cell's result value).
ds4.status

StatementMeta(sparkpool3, 16, 20, Finished, Available)

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

In [21]:
# Stop the decoded-stream query.
ds4.stop()

StatementMeta(sparkpool3, 16, 21, Finished, Available)

## End of Notebook