# In [0]:
from pyspark.sql import functions as F

# source_df: latest snapshot/incremental data
# target_df: existing dimension
# key_cols: list of business keys, e.g. ["customer_id"]

key_cols = ["customer_id"]

# 1) Find rows to update (matched on key, at least one non-key column changed).
# The key columns are referenced through the "s"/"t" aliases so the condition
# stays unambiguous even if source_df and target_df share lineage.
join_cond = [F.col(f"s.{k}") == F.col(f"t.{k}") for k in key_cols]
joined = target_df.alias("t").join(source_df.alias("s"), join_cond, "inner")

# Non-key columns of the source drive change detection. Each one is also read
# from the target side below, so it must exist in target_df under the same name.
non_key_cols = [c for c in source_df.columns if c not in key_cols]

# Compare column by column with null-safe equality. Concatenating the values
# into one string is unreliable: concat_ws skips NULLs (so a value moving from
# one column to another can go unnoticed) and values containing the separator
# can collide. With no non-key columns the predicate stays False, i.e. nothing
# is reported as changed.
row_changed = F.lit(False)
for c in non_key_cols:
    row_changed = row_changed | ~F.col(f"s.{c}").eqNullSafe(F.col(f"t.{c}"))

changed_rows = joined.filter(row_changed).select("s.*")

# 2) Rows to keep as-is: every target row that is NOT being overwritten, i.e.
# rows with no match in source plus matched rows whose values are unchanged.
# Anti-joining against the whole source would drop the unchanged rows from the
# dimension, so only the keys of the changed rows are excluded.
to_keep = target_df.join(changed_rows.select(*key_cols), key_cols, "left_anti")

# 3) New rows from source (no match in target)
new_rows = source_df.join(target_df, key_cols, "left_anti")

# 4) Final SCD1 dimension (keep + changed become latest + new)
dim_scd1 = (
    to_keep
    .unionByName(changed_rows)
    .unionByName(new_rows)
)
