In [0]:
from delta.tables import DeltaTable
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType
import time
import os

# Explicit schema for the transaction files. Every column is declared
# nullable, so a missing value in the source does not fail the read.
schema_transacoes = (
    StructType()
    .add("transacao_id", StringType(), True)
    .add("timestamp", TimestampType(), True)
    .add("cliente_id", StringType(), True)
    .add("valor", DoubleType(), True)
    .add("status", StringType(), True)
)

# Input directory holding the transaction parquet files, and the bronze
# output location for the same entity.
parquet_path = "/Volumes/workspace/bravium/bravium_db/transacao/"
bronze_path = "/Volumes/workspace/bravium/bronze/transacao/"

In [0]:
# Incrementally read the transaction parquet files as a stream, using the
# explicit schema declared above instead of relying on inference.
transacoes_raw_stream = (
    spark.readStream
    .format("parquet")
    .schema(schema_transacoes)
    .load(parquet_path)
)

# The watermark must be declared BEFORE the stateful deduplication. When it
# is applied afterwards the dedup operator never sees it and has to keep
# every transacao_id it has ever seen in the state store.
#
# dropDuplicatesWithinWatermark deduplicates on transacao_id alone and lets
# Spark expire keys once the watermark has passed, so state stays bounded.
# NOTE(review): requires Spark 3.5+ / DBR 13.3 LTS+ (presumably satisfied,
# since the notebook already uses Unity Catalog Volumes paths) - confirm.
# NOTE(review): with an effective watermark, rows whose timestamp is more
# than 10 minutes behind the newest event time already seen are dropped as
# too late. Confirm that 10 minutes is an acceptable lateness bound for this
# source, or widen it.
df_stream = (
    transacoes_raw_stream
    .withWatermark("timestamp", "10 minutes")
    .dropDuplicatesWithinWatermark(["transacao_id"])
)

In [0]:
def upsert_to_delta(batch_df, batch_id):
    """Upsert one micro-batch of transactions into the bronze Delta table.

    Intended to be passed to ``writeStream.foreachBatch``. Rows are matched
    against the table at ``bronze_path`` on ``transacao_id``: matching rows
    are overwritten with the incoming values, new rows are inserted.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch. Unused, but part of the
            callback signature expected by ``foreachBatch``.
    """
    # Use the session the micro-batch is bound to rather than the
    # notebook-global ``spark``, which is not guaranteed to be the same
    # session inside a foreachBatch callback.
    session = batch_df.sparkSession

    # First run: there is no Delta table to merge into yet, and
    # DeltaTable.forPath would raise. Bootstrap the table from this batch.
    if not DeltaTable.isDeltaTable(session, bronze_path):
        batch_df.write.format("delta").mode("append").save(bronze_path)
        return

    delta_table = DeltaTable.forPath(session, bronze_path)

    (
        delta_table.alias("t")
        .merge(
            batch_df.alias("s"),
            "t.transacao_id = s.transacao_id"
        )
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute()
    )

# Launch the upsert stream. The availableNow trigger processes whatever
# input is currently pending and then stops, so the cell blocks only until
# that backlog has been written.
checkpoint_dir = "/Volumes/workspace/bravium/bronze/checkpoints/transacao"

stream_writer = df_stream.writeStream.foreachBatch(upsert_to_delta)
stream_writer = stream_writer.option("checkpointLocation", checkpoint_dir)
stream_writer = stream_writer.outputMode("update")
stream_writer = stream_writer.trigger(availableNow=True)

transacao_query = stream_writer.start()
transacao_query.awaitTermination()