In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import os

# Maven coordinates handed to spark.jars.packages when the session starts.
delta_package = "io.delta:delta-spark_2.12:3.0.0"  # Replace with the correct Delta version
xml_package = "com.databricks:spark-xml_2.12:0.14.0"

# Initialize Spark Session
spark = (
    SparkSession.builder
    .appName("MergeToProcessed")
    .master('spark://spark-test1:7077')
    .config("spark.jars.packages", f"{delta_package},{xml_package}")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.cores.max", "1")
    .getOrCreate()
)

# These locations are HDFS URIs, not local filesystem paths, so the segments are
# joined with "/" explicitly. os.path.join would use the separator of whatever
# OS the driver runs on, which yields a broken URI on Windows.
hdfs_path = "hdfs://spark-test1:9000"
raw = f"{hdfs_path}/raw/transactions"
processed = f"{hdfs_path}/processed/transactions"
checkpoint = f"{hdfs_path}/checkpoint/processed/transactions"
dlq = f"{hdfs_path}/dlq/processed/transactions"


:: loading settings :: url = jar:file:/home/spark/.local/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/spark/.ivy2/cache
The jars for the packages stored in: /home/spark/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
com.databricks#spark-xml_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a402e44f-f946-480d-b7aa-81e66c09e4e1;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.0.0 in central
	found io.delta#delta-storage;3.0.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
	found com.databricks#spark-xml_2.12;0.14.0 in central
	found commons-io#commons-io;2.8.0 in central
	found org.glassfish.jaxb#txw2;2.3.4 in central
	found org.apache.ws.xmlschema#xmlschema-core;2.2.5 in central
:: resolution report :: resolve 218ms :: artifacts dl 20ms
	:: modules in use:
	com.databricks#spark-xml_2.12;0.14.0 from central in [default]
	commons-io#commons-io;2.8.0 from central in [default]
	io.delta#delta-spark_2.12;3.0.0 from central in [default]
	io.delta#delta-storage;3.0.0 from central in [def

In [2]:
from pyspark.sql.column import Column, _to_java_column
from pyspark.sql.types import _parse_datatype_json_string
from delta.tables import DeltaTable

def ext_from_xml(xml_column, schema, options={}):
    """Parse an XML string column through spark-xml's JVM ``from_xml`` function.

    Args:
        xml_column: Column holding the XML text; it is cast to string first.
        schema: Spark data type whose JSON form is handed to the JVM parser.
        options: Dict of parser options, converted to a Scala map.

    Returns:
        A ``Column`` wrapping the JVM ``from_xml`` expression.
    """
    jvm = spark._jvm
    string_column = _to_java_column(xml_column.cast('string'))
    jvm_schema = spark._jsparkSession.parseDataType(schema.json())
    jvm_options = jvm.org.apache.spark.api.python.PythonUtils.toScalaMap(options)
    return Column(
        jvm.com.databricks.spark.xml.functions.from_xml(
            string_column, jvm_schema, jvm_options
        )
    )

def ext_schema_of_xml_df(df, options={}):
    """Infer a schema from a single-column DataFrame via spark-xml.

    Args:
        df: DataFrame that must contain exactly one column (asserted).
        options: Dict of inference options, converted to a Scala map.

    Returns:
        The data type parsed from the JSON of the schema the JVM returns.
    """
    assert len(df.columns) == 1

    jvm_options = spark._jvm.PythonUtils.toScalaMap(options)
    # Reach spark-xml's package object through its `package$` class and the
    # `MODULE$` attribute on it.
    xml_package_class = getattr(spark._jvm.com.databricks.spark.xml, "package$")
    xml_package_object = getattr(xml_package_class, "MODULE$")
    inferred_schema = xml_package_object.schema_of_xml_df(df._jdf, jvm_options)
    return _parse_datatype_json_string(inferred_schema.json())

In [3]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Schema of the raw layer; every field keeps the default nullable=True.
raw_schema = (
    StructType()
    .add("key", StringType())
    .add("value", StringType())
    .add("topic", StringType())
    .add("partition", StringType())
    .add("offset", StringType())
    .add("timestamp", StringType())
    .add("timestampType", StringType())
    .add("_raw_insert_timestamp", TimestampType())
    .add("_raw_insert_date", StringType())
    .add("_raw_insert_hour", IntegerType())
)

# Stream the raw layer's parquet files with structured streaming, using the
# explicit schema above.
raw_df = (
    spark.readStream
    .format("parquet")
    .schema(raw_schema)
    .option("path", raw)
    .load()
)


# Function to process each batch
def process_batch(batch_df, batch_id):
    """Decode, flatten and upsert one micro-batch into the processed Delta table.

    Pipeline for a non-empty batch:
      1. Parse the ``value`` column as JSON and base64-decode its ``content``
         field into an XML string.
      2. Parse the XML with ``ext_from_xml`` and explode the ``Transaction``
         array into one row per transaction.
      3. Keep, per ``TransactionId``, only the row with the greatest
         ``_raw_insert_timestamp`` in this batch.
      4. Merge on ``TransactionId`` into the Delta table at ``processed``, or
         create that table (partitioned by ``TransactionDate``) if it does not
         exist yet.

    Args:
        batch_df: Micro-batch DataFrame handed over by ``foreachBatch``; the
            columns ``value`` and ``_raw_insert_timestamp`` are read from it.
        batch_id: Micro-batch identifier supplied by ``foreachBatch`` (unused).

    Returns:
        None. An empty batch only prints ``Empty batch``.
    """
    # DataFrame.isEmpty() answers this without first converting the batch to a
    # Python RDD, which `batch_df.rdd.isEmpty()` had to do.
    if batch_df.isEmpty():
        print("Empty batch")
        return

    json_schema = StructType([
        StructField("path", StringType()),
        StructField("modificationTime", StringType()),
        StructField("length", StringType()),
        StructField("content", StringType())
    ])
    parsed_json_df = batch_df.withColumn("json_data", from_json(col("value"), json_schema))

    # Extract and decode the base64 content
    decoded_df = parsed_json_df.withColumn("decoded_content", unbase64(col("json_data.content"))) \
        .withColumn("xml_content", expr("CAST(decoded_content AS STRING)"))

    # For debugging, the XML schema can be inferred with
    # ext_schema_of_xml_df(decoded_df.select("xml_content")).
    transaction_type = StructType([
        StructField('TransactionId', LongType(), True),
        StructField('Amount', FloatType(), True),
        StructField('CustomerId', LongType(), True),
        StructField('DateTime', TimestampType(), True),
        StructField('Location', StringType(), True),
        StructField('Result', StringType(), True)
    ])
    xml_schema = StructType([
        StructField('Transaction', ArrayType(transaction_type, True), True)
    ])

    # NOTE(review): with mode FAILFAST a malformed document presumably fails the
    # whole batch (and the stream); the `dlq` path defined in this notebook is
    # not used here - confirm that this is intended.
    xml_df = decoded_df.withColumn(
        "parsed",
        ext_from_xml(
            xml_column=col("xml_content"),
            schema=xml_schema,
            options={"mode": "FAILFAST"}
        )
    )

    # Newest raw insert first within each TransactionId; row 1 is the one kept.
    windowSpec = Window.partitionBy("TransactionId").orderBy(col("_raw_insert_timestamp").desc())

    # Flatten the DataFrame: one row per element of the Transaction array
    exploded_df = xml_df.select(
        explode(col("parsed.Transaction")).alias("Transaction"),
        col('_raw_insert_timestamp').alias('_raw_insert_timestamp')
    )
    flattened_df = exploded_df.select(
        col("Transaction.TransactionId").alias("TransactionId"),
        col("Transaction.Amount").alias("Amount"),
        col("Transaction.CustomerId").alias("CustomerId"),
        col("Transaction.DateTime").alias("TransactionDateTime"),
        to_date(col("Transaction.DateTime")).alias("TransactionDate"),
        upper(trim(col("Transaction.Location"))).alias("Location"),
        upper(trim(col("Transaction.Result"))).alias("Result"),
        current_timestamp().alias("_processed_insert_timestamp"),
        col('_raw_insert_timestamp').alias('_raw_insert_timestamp')
    ).withColumn("row_rank", row_number().over(windowSpec)) \
        .filter(col("row_rank") == 1) \
        .drop("row_rank")

    # Check for the existence of the Delta table
    if DeltaTable.isDeltaTable(spark, processed):
        # The table exists: upsert the batch, matching rows on TransactionId
        delta_table = DeltaTable.forPath(spark, processed)
        delta_table.alias("target").merge(
            flattened_df.alias("source"),
            "target.TransactionId = source.TransactionId"
        ).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()
    else:
        # No Delta table yet: create one from this batch
        flattened_df.write.format("delta").partitionBy("TransactionDate").save(processed)

# Write the transformed data to the processed layer.
# The StreamingQuery returned by start() is kept in `query`; chaining
# .awaitTermination() onto the assignment stored its None result instead,
# leaving no handle to inspect or stop the stream.
query = (
    raw_df.writeStream
    .foreachBatch(process_batch)
    .option("checkpointLocation", checkpoint)
    .start()
)
# Block until the stream stops or fails.
query.awaitTermination()

# query = flattened_df \
#     .writeStream \
#     .outputMode("append") \
#     .format("console") \
#     .start()

23/12/10 07:10:28 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
23/12/10 07:10:28 WARN HadoopFSUtils: The directory hdfs://spark-test1:9000/raw/transactions/_raw_insert_date=2023-12-10/_raw_insert_hour=06/part-00000-46bb570b-d10b-489a-baec-29c50ce541fe.c000.snappy.parquet was not found. Was it deleted very recently?
23/12/10 07:10:28 WARN HadoopFSUtils: The directory hdfs://spark-test1:9000/raw/transactions/_raw_insert_date=2023-12-10/_raw_insert_hour=06/part-00000-dda1102e-2a59-4fca-b5b4-4ca8bb8830b6.c000.snappy.parquet was not found. Was it deleted very recently?
23/12/10 07:10:28 WARN HadoopFSUtils: The directory hdfs://spark-test1:9000/raw/transactions/_raw_insert_date=2023-12-10/_raw_insert_hour=06/part-00000-a43495e1-7682-4970-9ae0-d1b32ad259bd.c000.snappy.parquet was not found. Was it deleted very recently?


Empty batch


23/12/10 07:11:21 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
----------------------------------------                                        
Exception occurred during processing of request from ('127.0.0.1', 56128)
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/spark/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/spark/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/spark/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 539, 

Py4JError: An error occurred while calling o52.awaitTermination

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/home/spark/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/spark/.local/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/spark/.local/lib/python3.10/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
