In [0]:
import pyspark.sql.functions as F
from delta.tables import DeltaTable

In [0]:
# Fully-qualified table names used by this notebook.
# Source table that the load reads from.
table_cad001p002_bronze = "data_intelligence.bronze.cad001p002"
# Target table that receives the merged (versioned) rows.
table_cad001p002_silver = "data_intelligence.silver.cad001p002"
# Log of processed `created_at` values; its MAX acts as the load watermark.
table_cad001p002_control = "data_intelligence.silver.cad001p002_control"

In [0]:
def get_last_created_at_control():
    """Return the most recent `created_at` stored in the control table.

    Runs a `MAX(created_at)` aggregate against `table_cad001p002_control`
    using the notebook's `spark` session and returns the value of the
    `last_digtao_ptcao` column from the first result row.

    Returns:
        The maximum `created_at` value, or `None` when the aggregate yields
        NULL. The caller treats `None` as the signal to run a full load.
    """
    watermark_query = f"SELECT MAX(created_at) AS last_digtao_ptcao FROM {table_cad001p002_control}"
    watermark_row = spark.sql(watermark_query).first()
    return watermark_row['last_digtao_ptcao']

In [0]:
# Bronze -> silver load with SCD2-style versioning, followed by a control-table update.
#
# Steps performed by this cell:
#   1. Read the watermark (latest 'created_at' already logged in the control table).
#   2. Read bronze (everything on a full load, only rows newer than the watermark otherwise),
#      add 'is_active' / 'hash_columns' and de-duplicate on (id, hash_columns).
#   3. MERGE-insert into silver every bronze row without a silver row of the same id + hash.
#   4. MERGE-update silver to close older versions of each id (is_active = False, updated_at set).
#   5. Append one row per processed 'created_at' to the control table and display merge metrics.

# Get the latest value of 'created_at' from the control table (None -> nothing logged yet)
last_created_at_control = get_last_created_at_control()

# Human-readable load mode; not read again in this cell (the branches below re-test the watermark).
type_of_load = "incremental" if last_created_at_control is not None else "full"

# Define extra columns to add to the DataFrame
# NOTE(review): concat_ws skips NULL inputs, so (NULL, 'x') and ('x', NULL) produce the same
# hash — confirm this is acceptable for 'endereco' / 'salario'.
dict_extra_columns = {
    "is_active"    : F.lit(True), # Every incoming row starts as the active version
    "hash_columns" : F.sha2(F.concat_ws("||", F.col("endereco"), F.col("salario")), 256)  # SHA-256 of 'endereco' and 'salario' joined by "||"
}

try:
    if last_created_at_control is None:
        print("FULL")

        # Read all data from bronze table, add extra columns and keep one row per (id, hash_columns).
        # Which of several duplicate rows is kept is not controlled here.
        df_bronze = spark.table(table_cad001p002_bronze).withColumns(dict_extra_columns).dropDuplicates(["id", "hash_columns"])
    else:
        print("Incremental")

        # Same as the full branch, but restricted to bronze rows strictly newer than the watermark
        df_bronze = (
            spark.table(table_cad001p002_bronze)
                .filter(F.col("created_at") > F.lit(last_created_at_control))
                .withColumns(dict_extra_columns)
                .dropDuplicates(["id", "hash_columns"])
        )

    # Handle to the silver Delta table, aliased so merge conditions can refer to "silver.<col>"
    delta_silver = DeltaTable.forName(spark, table_cad001p002_silver).alias("silver")
    
    # Insert every bronze row for which silver has NO row with the same id AND the same
    # hash_columns, i.e. brand-new ids or ids whose hashed attributes changed.
    # Rows that do match are left untouched (there is no whenMatched clause).
    # NOTE(review): the match runs against all silver rows, including versions already closed
    # by the SCD2 step below, so a value that reverts to a previously stored hash for the same
    # id is treated as already present — confirm this is intended.
    delta_silver_insert = delta_silver.merge(
        df_bronze.alias("bronze"),
        "silver.id = bronze.id AND silver.hash_columns = bronze.hash_columns"
    ).whenNotMatchedInsert(values={f"{c}": f"bronze.{c}" for c in df_bronze.columns}).execute()

    # Rows considered "processed by this run". Full load: the de-duplicated bronze batch itself.
    df_full_or_incr = df_bronze

    # Incremental load: use the silver rows newer than the watermark instead
    if last_created_at_control is not None:
        df_full_or_incr = spark.table(table_cad001p002_silver).filter(F.col("created_at") > F.lit(last_created_at_control))

    # SCD 2
    # Close superseded versions in the silver table. A silver row is updated when:
    # - its 'id' matches an id in the source
    # - its 'created_at' is older than the latest 'created_at' for that id in the source
    # - its 'updated_at' is still NULL (it has not been closed before)
    # Matching rows get is_active = False and updated_at = that latest 'created_at'.
    delta_silver_update = delta_silver.merge(
        # For each id, get the max created_at from the processed rows
        df_full_or_incr.select("id", "created_at").groupBy("id").agg(F.max("created_at").alias("created_at")).alias("source"),
        "silver.id = source.id AND silver.created_at < source.created_at AND silver.updated_at IS NULL"
    ).whenMatchedUpdate(set={
        "is_active" : F.lit(False),
        "updated_at": "source.created_at"
    }).execute()

except Exception as e:
    # Print only the first 300 characters of the error and use None as a sentinel so the
    # control-table update below is skipped.
    # NOTE(review): the exception is not re-raised, so this cell completes without error even
    # when a merge failed — confirm the job is monitored some other way.
    print(str(e)[:300])
    df_full_or_incr = None

if df_full_or_incr is not None:
    # Both merges ran: log one row per distinct 'created_at' with its row count and append it
    # to the control table (mergeSchema lets the write add columns the table does not have yet).
    # On a full load the counts come from the bronze batch; on an incremental load, from silver.
    (df_full_or_incr
        .groupBy(F.col("created_at"))
        .agg(F.count("*").cast("int").alias("rows_count"))
        .write.option("mergeSchema", "true")
        .mode("append")
        .saveAsTable(table_cad001p002_control))
    
    # Show the metrics of both merges side by side, tagged by operation.
    # Assumes execute() returns a metrics DataFrame with these columns in this runtime — TODO confirm.
    display(
        delta_silver_insert
            .withColumn("operation", F.lit("insert"))
            .unionByName(delta_silver_update.withColumn("operation", F.lit("update")))
            .select(*["operation", "num_affected_rows", "num_updated_rows", "num_inserted_rows"])
    )