# SCD Upsert Demo

Demonstrates `scd1_upsert` and `scd2_upsert` utilities for managing Slowly Changing Dimensions with Delta Lake.

In [8]:
from pyspark.sql import Row
from spark_fuse.spark import create_session
from spark_fuse.utils.scd import scd1_upsert, scd2_upsert, SCDMode, apply_scd

spark = create_session(
    app_name="spark-fuse-scd-demo",
    master="local[2]",
    extra_configs={
        "spark.ui.enabled": "false",
        "spark.driver.bindAddress": "127.0.0.1",
        "spark.driver.host": "localhost",
    },
)
spark

## SCD Type 1 Upsert

In [9]:
from pyspark.sql import functions as F

initial = spark.createDataFrame(
    [Row(id=1, val="a", ts=1), Row(id=1, val="b", ts=2), Row(id=2, val="x", ts=5)]
)

target_path = "/tmp/scd1_demo"
scd1_upsert(
    spark,
    initial,
    target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
)

updates = spark.createDataFrame([Row(id=1, val="c", ts=3), Row(id=3, val="z", ts=1)])
scd1_upsert(
    spark,
    updates,
    target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
)

spark.read.format("delta").load(target_path).orderBy("id").show()

25/11/18 11:47:10 WARN MapPartitionsRDD: RDD 19 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/18 11:47:11 WARN MapPartitionsRDD: RDD 101 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+---+---+--------------------+
| id|val| ts|            row_hash|
+---+---+---+--------------------+
|  1|  c|  3|2e7d2c03a9507ae26...|
|  2|  x|  5|2d711642b726b0440...|
|  3|  z|  1|594e519ae499312b2...|
+---+---+---+--------------------+



### SCD1 schema evolution

Enable Delta schema evolution when new attributes appear in later batches.


In [10]:
scd1_evolution_target = "/tmp/scd1_demo_schema"

initial_scd1 = spark.createDataFrame([Row(id=1, val="a")])
scd1_upsert(
    spark,
    initial_scd1,
    scd1_evolution_target,
    business_keys=["id"],
    tracked_columns=["val"],
    allow_schema_evolution=True,
)

scd1_upsert(
    spark,
    spark.createDataFrame([Row(id=1, val="b", color="red")]),
    scd1_evolution_target,
    business_keys=["id"],
    tracked_columns=["val", "color"],
    allow_schema_evolution=True,
)

spark.read.format("delta").load(scd1_evolution_target).show()


25/11/18 11:47:14 WARN MapPartitionsRDD: RDD 184 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/18 11:47:15 WARN MapPartitionsRDD: RDD 267 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+---+--------------------+-----+
| id|val|            row_hash|color|
+---+---+--------------------+-----+
|  1|  b|552aaee99f0c5a1d8...|  red|
+---+---+--------------------+-----+



## SCD Type 2 Upsert

In [11]:
scd2_target_path = "/tmp/scd2_demo"

source_scd2 = spark.createDataFrame(
    [Row(id=1, val="a", ts=1), Row(id=1, val="b", ts=2), Row(id=2, val="x", ts=5)]
)

scd2_upsert(
    spark,
    source_scd2,
    scd2_target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
    load_ts_expr="to_timestamp('2024-01-01 00:00:00')",
)

changes = spark.createDataFrame([Row(id=1, val="c", ts=3), Row(id=3, val="z", ts=1)])

scd2_upsert(
    spark,
    changes,
    scd2_target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
    load_ts_expr="to_timestamp('2024-01-02 00:00:00')",
)

spark.read.format("delta").load(scd2_target_path).orderBy("id", "version").show()

25/11/18 11:47:17 WARN MapPartitionsRDD: RDD 366 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/18 11:47:19 WARN MapPartitionsRDD: RDD 470 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/18 11:47:22 WARN MapPartitionsRDD: RDD 590 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+---+---+--------------------+-------------------+-------------------+----------+-------+
| id|val| ts|            row_hash| effective_start_ts|   effective_end_ts|is_current|version|
+---+---+---+--------------------+-------------------+-------------------+----------+-------+
|  1|  a|  1|ca978112ca1bbdcaf...|2024-01-01 00:00:00|2024-01-01 00:00:00|     false|      1|
|  1|  b|  2|3e23e8160039594a3...|2024-01-01 00:00:00|2024-01-02 00:00:00|     false|      2|
|  1|  c|  3|2e7d2c03a9507ae26...|2024-01-02 00:00:00|2024-01-01 00:00:00|     false|      3|
|  1|  a|  1|ca978112ca1bbdcaf...|2024-01-01 00:00:00|2024-01-01 00:00:00|     false|      4|
|  1|  b|  2|3e23e8160039594a3...|2024-01-01 00:00:00|2024-01-02 00:00:00|     false|      5|
|  1|  c|  3|2e7d2c03a9507ae26...|2024-01-02 00:00:00|2024-01-01 00:00:00|     false|      6|
|  1|  a|  1|ca978112ca1bbdcaf...|2024-01-01 00:00:00|2024-01-01 00:00:00|     false|      7|
|  1|  b|  2|3e23e8160039594a3...|2024-01-01 00:00:00|2024-0

### Multiple updates per batch

When several changes for the same business key arrive in a single feed, `scd2_upsert` processes them in chronological order so every intermediate version is preserved.


In [12]:
multi_batch_target = "/tmp/scd2_multi_batch"

multi_batch = spark.createDataFrame(
    [
        Row(id=10, val="bronze", ts=1),
        Row(id=10, val="silver", ts=2),
        Row(id=10, val="gold", ts=3),
    ]
)

scd2_upsert(
    spark,
    multi_batch,
    multi_batch_target,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
    load_ts_expr="to_timestamp('2024-01-03 00:00:00')",
)

spark.read.format("delta").load(multi_batch_target).orderBy("version").show()


25/11/18 11:47:25 WARN MapPartitionsRDD: RDD 723 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/18 11:47:27 WARN MapPartitionsRDD: RDD 827 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/18 11:47:29 WARN MapPartitionsRDD: RDD 920 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+------+---+--------------------+-------------------+-------------------+----------+-------+
| id|   val| ts|            row_hash| effective_start_ts|   effective_end_ts|is_current|version|
+---+------+---+--------------------+-------------------+-------------------+----------+-------+
| 10|bronze|  1|fcbbcd5a59bf48a18...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      1|
| 10|silver|  2|78cde64c3e47f2cbf...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      2|
| 10|  gold|  3|24d7f03d8dc3c3666...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      3|
| 10|bronze|  1|fcbbcd5a59bf48a18...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      4|
| 10|silver|  2|78cde64c3e47f2cbf...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      5|
| 10|  gold|  3|24d7f03d8dc3c3666...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      6|
| 10|bronze|  1|fcbbcd5a59bf48a18...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      7|
| 10|silver|  2|78cde64c3e47f2

### SCD2 schema evolution

Enable Delta schema evolution to automatically add new columns when later batches introduce them.


In [13]:
scd2_evolution_target = "/tmp/scd2_demo_schema"

initial_scd2 = spark.createDataFrame([Row(id=42, val="alpha")])
scd2_upsert(
    spark,
    initial_scd2,
    scd2_evolution_target,
    business_keys=["id"],
    tracked_columns=["val"],
    allow_schema_evolution=True,
)

scd2_upsert(
    spark,
    spark.createDataFrame([Row(id=42, val="beta", color="green")]),
    scd2_evolution_target,
    business_keys=["id"],
    tracked_columns=["val", "color"],
    allow_schema_evolution=True,
)

spark.read.format("delta").load(scd2_evolution_target).orderBy("version").show()


25/11/18 11:47:33 WARN MapPartitionsRDD: RDD 1084 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
                                                                                

+---+-----+--------------------+--------------------+--------------------+----------+-------+-----+
| id|  val|            row_hash|  effective_start_ts|    effective_end_ts|is_current|version|color|
+---+-----+--------------------+--------------------+--------------------+----------+-------+-----+
| 42|alpha|8ed3f6ad685b959ea...|2025-11-18 11:47:...|2025-11-18 11:47:...|     false|      1| NULL|
| 42| beta|016e3c8f35d9c830b...|2025-11-18 11:47:...|                NULL|      true|      2|green|
+---+-----+--------------------+--------------------+--------------------+----------+-------+-----+



## Unified dispatcher

In [14]:
dispatcher_target = "/tmp/apply_scd_demo"
apply_scd(
    spark,
    spark.createDataFrame([Row(id=1, val="a"), Row(id=2, val="b")]),
    dispatcher_target,
    scd_mode=SCDMode.SCD2,
    business_keys=["id"],
    tracked_columns=["val"],
    load_ts_expr="current_timestamp()",
)
spark.read.format("delta").load(dispatcher_target).show()

25/11/18 11:47:37 WARN MapPartitionsRDD: RDD 1206 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+---+--------------------+--------------------+----------------+----------+-------+
| id|val|            row_hash|  effective_start_ts|effective_end_ts|is_current|version|
+---+---+--------------------+--------------------+----------------+----------+-------+
|  1|  a|ca978112ca1bbdcaf...|2025-11-18 11:25:...|            NULL|      true|      1|
|  2|  b|3e23e8160039594a3...|2025-11-18 11:25:...|            NULL|      true|      1|
+---+---+--------------------+--------------------+----------------+----------+-------+



In [15]:
spark.stop()