In [1]:
import configargparse
from pathlib import Path
from spark_utils.streaming_utils import EventHubStreamer

p = configargparse.ArgParser(prog='streaming.py',
                             description='Streaming Job Sample',
                             default_config_files=[Path().joinpath('configuration/run_args_data_generator.conf').resolve().as_posix()],
                             formatter_class=configargparse.ArgumentDefaultsHelpFormatter)
p.add('--output-eh-connection-string', type=str, required=True,
      help='Output Event Hub connection string', env_var='GENERATOR_OUTPUT_EH_CONNECTION_STRING')

args, unknown_args = p.parse_known_args()

if unknown_args:
    print("Unknown args:")
    _ = [print(arg) for arg in unknown_args]


Unknown args:
--ip=127.0.0.1
--stdin=9068
--control=9066
--hb=9065
--Session.signature_scheme="hmac-sha256"
--Session.key=b"f039a383-d7c3-49ca-9f77-0d829506d42d"
--shell=9067
--transport="tcp"
--iopub=9069
--f=/tmp/tmp-179C5G5YsfqXA7z.json


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

spark_conf = SparkConf(loadDefaults=True)

spark = SparkSession\
    .builder\
    .config(conf=spark_conf)\
    .getOrCreate()

sc = spark.sparkContext
print("Spark Configuration:")
_ = [print(k + '=' + v) for k, v in sc.getConf().getAll()]


Spark Configuration:
spark.app.id=local-1610737526762
spark.submit.pyFiles=/home/jovyan/.ivy2/jars/com.microsoft.azure_azure-eventhubs-spark_2.12-2.3.17.jar,/home/jovyan/.ivy2/jars/org.apache.hadoop_hadoop-azure-3.3.0.jar,/home/jovyan/.ivy2/jars/io.delta_delta-core_2.12-0.7.0.jar,/home/jovyan/.ivy2/jars/com.microsoft.azure_azure-eventhubs-3.2.0.jar,/home/jovyan/.ivy2/jars/org.scala-lang.modules_scala-java8-compat_2.12-0.9.0.jar,/home/jovyan/.ivy2/jars/org.apache.qpid_proton-j-0.33.4.jar,/home/jovyan/.ivy2/jars/com.microsoft.azure_qpid-proton-j-extensions-1.2.3.jar,/home/jovyan/.ivy2/jars/org.slf4j_slf4j-api-1.7.28.jar,/home/jovyan/.ivy2/jars/com.microsoft.azure_azure-client-authentication-1.7.3.jar,/home/jovyan/.ivy2/jars/com.nimbusds_nimbus-jose-jwt-6.0.1.jar,/home/jovyan/.ivy2/jars/com.microsoft.azure_azure-client-runtime-1.7.3.jar,/home/jovyan/.ivy2/jars/commons-codec_commons-codec-1.11.jar,/home/jovyan/.ivy2/jars/com.microsoft.azure_adal4j-1.6.4.jar,/home/jovyan/.ivy2/jars/com.micr

In [4]:

rateStream = spark \
  .readStream \
  .format("rate") \
  .option("rowsPerSecond", 10) \
  .load()

EventHubStreamer.preview_stream(rateStream, await_seconds=3)

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)

+---------+-----+
|timestamp|value|
+---------+-----+
+---------+-----+

+--------------------+-----+
|           timestamp|value|
+--------------------+-----+
|2021-01-15 19:06:...|    0|
|2021-01-15 19:06:...|    4|
|2021-01-15 19:06:...|    8|
|2021-01-15 19:06:...|   12|
|2021-01-15 19:06:...|   16|
|2021-01-15 19:06:...|    1|
|2021-01-15 19:06:...|    5|
|2021-01-15 19:06:...|    9|
|2021-01-15 19:06:...|   13|
|2021-01-15 19:06:...|   17|
|2021-01-15 19:06:...|    2|
|2021-01-15 19:06:...|    6|
|2021-01-15 19:06:...|   10|
|2021-01-15 19:06:...|   14|
|2021-01-15 19:06:...|   18|
|2021-01-15 19:06:...|    3|
|2021-01-15 19:06:...|    7|
|2021-01-15 19:06:...|   11|
|2021-01-15 19:06:...|   15|
|2021-01-15 19:06:...|   19|
+--------------------+-----+



In [5]:
from pyspark.sql.functions import col, lit, struct

generatedData = rateStream \
    .withColumn("value", col("value") * 3019) \
    .withColumnRenamed("timestamp", "ObservationTime") \
    .withColumn("MeterId", col("value") % lit(127)) \
    .withColumn("SupplierId", col("value") % lit(733)) \
    .withColumn("Measurement", struct(
        (col("value") % lit(59)).alias("Value"),
        lit("kWH").alias("Unit")
    )) \
    .drop("value")


EventHubStreamer.preview_stream(rateStream, await_seconds=3)

root
 |-- timestamp: timestamp (nullable = true)
 |-- value: long (nullable = true)

+---------+-----+
|timestamp|value|
+---------+-----+
+---------+-----+

+--------------------+-----+
|           timestamp|value|
+--------------------+-----+
|2021-01-15 19:06:...|    0|
|2021-01-15 19:06:...|    4|
|2021-01-15 19:06:...|    8|
|2021-01-15 19:06:...|    1|
|2021-01-15 19:06:...|    5|
|2021-01-15 19:06:...|    9|
|2021-01-15 19:06:...|    2|
|2021-01-15 19:06:...|    6|
|2021-01-15 19:06:...|    3|
|2021-01-15 19:06:...|    7|
+--------------------+-----+

+--------------------+-----+
|           timestamp|value|
+--------------------+-----+
|2021-01-15 19:06:...|   10|
|2021-01-15 19:06:...|   14|
|2021-01-15 19:06:...|   18|
|2021-01-15 19:06:...|   11|
|2021-01-15 19:06:...|   15|
|2021-01-15 19:06:...|   19|
|2021-01-15 19:06:...|   12|
|2021-01-15 19:06:...|   16|
|2021-01-15 19:06:...|   13|
|2021-01-15 19:06:...|   17|
+--------------------+-----+



In [6]:
from pyspark.sql.functions import to_json

jsonData = generatedData \
    .select(to_json(struct(col("*"))).cast("string").alias("body"))

EventHubStreamer.preview_stream(jsonData, await_seconds=3)

root
 |-- body: string (nullable = true)

+----+
|body|
+----+
+----+

+--------------------+
|                body|
+--------------------+
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
+--------------------+

+--------------------+
|                body|
+--------------------+
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
|{"ObservationTime...|
+--------------------+



In [1]:
from pyspark.sql.functions import to_json

eh_conf = {
    'eventhubs.connectionString':
    sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(args.output_eh_connection_string)
}

exec = jsonData \
    .writeStream \
    .format("eventhubs") \
    .options(**eh_conf) \
    .option("checkpointLocation", '.checkpoint/data-generator') \
    .start()

exec.awaitTermination(600)
exec.stop()