# SCD Upsert Demo

Demonstrates the `scd1_upsert` and `scd2_upsert` utilities, plus the unified `apply_scd` dispatcher, for managing Slowly Changing Dimensions (SCD) with Delta Lake.

In [1]:
import shutil

from pyspark.sql import Row
from spark_fuse.spark import create_session
from spark_fuse.utils.scd import scd1_upsert, scd2_upsert, SCDMode, apply_scd

# Settings for a small, self-contained local session: the Spark web UI is
# switched off and the driver is pinned to the loopback interface.
local_session_configs = {
    "spark.ui.enabled": "false",
    "spark.driver.bindAddress": "127.0.0.1",
    "spark.driver.host": "localhost",
}

# `local[2]` runs Spark in-process with two worker threads.
spark = create_session(
    app_name="spark-fuse-scd-demo",
    master="local[2]",
    extra_configs=local_session_configs,
)

# Bare last expression so the notebook renders the session summary.
spark

:: loading settings :: url = jar:file:/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/kevin/.ivy2.5.2/cache
The jars for the packages stored in: /Users/kevin/.ivy2.5.2/jars
io.delta#delta-spark_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-1cee8401-1655-4b9a-9068-edeac2aab143;1.0
	confs: [default]
	found io.delta#delta-spark_2.13;4.0.0 in central
	found io.delta#delta-storage;4.0.0 in central
	found org.antlr#antlr4-runtime;4.13.1 in central
:: resolution report :: resolve 148ms :: artifacts dl 18ms
	:: modules in use:
	io.delta#delta-spark_2.13;4.0.0 from central in [default]
	io.delta#delta-storage;4.0.0 from central in [default]
	org.antlr#antlr4-runtime;4.13.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules     

## SCD Type 1 Upsert

In [2]:
from pyspark.sql import functions as F

target_path = "/tmp/scd1_demo"

# Start from an empty target so that Restart & Run All never upserts into a
# table left behind by a previous run of this notebook.
# NOTE: assumes the path resolves to the local filesystem (local-mode Spark).
shutil.rmtree(target_path, ignore_errors=True)

# Initial load. id=1 appears twice in the same batch (ts=1 and ts=2).
initial = spark.createDataFrame(
    [Row(id=1, val="a", ts=1), Row(id=1, val="b", ts=2), Row(id=2, val="x", ts=5)]
)
scd1_upsert(
    spark,
    initial,
    target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
)

# Follow-up batch: a new value for the existing key id=1 and a new key id=3.
updates = spark.createDataFrame([Row(id=1, val="c", ts=3), Row(id=3, val="z", ts=1)])
scd1_upsert(
    spark,
    updates,
    target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
)

# The resulting table holds a single row per id, carrying the latest value.
spark.read.format("delta").load(target_path).orderBy("id").show()

25/11/18 11:25:39 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
25/11/18 11:25:43 WARN MapPartitionsRDD: RDD 55 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+---+---+--------------------+
| id|val| ts|            row_hash|
+---+---+---+--------------------+
|  1|  c|  3|2e7d2c03a9507ae26...|
|  2|  x|  5|2d711642b726b0440...|
|  3|  z|  1|594e519ae499312b2...|
+---+---+---+--------------------+



## SCD Type 2 Upsert

In [3]:
scd2_target_path = "/tmp/scd2_demo"

# SCD2 keeps history, so re-running against a leftover table would add extra
# versions and change the output. Clear the target to keep the cell repeatable.
# NOTE: assumes the path resolves to the local filesystem (local-mode Spark).
shutil.rmtree(scd2_target_path, ignore_errors=True)

# Initial load. id=1 appears twice in the same batch (ts=1 and ts=2).
source_scd2 = spark.createDataFrame(
    [Row(id=1, val="a", ts=1), Row(id=1, val="b", ts=2), Row(id=2, val="x", ts=5)]
)

# A literal load timestamp keeps the effective_* columns deterministic.
scd2_upsert(
    spark,
    source_scd2,
    scd2_target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
    load_ts_expr="to_timestamp('2024-01-01 00:00:00')",
)

# Follow-up batch: a new value for the existing key id=1 and a new key id=3,
# loaded one day later.
changes = spark.createDataFrame([Row(id=1, val="c", ts=3), Row(id=3, val="z", ts=1)])

scd2_upsert(
    spark,
    changes,
    scd2_target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
    load_ts_expr="to_timestamp('2024-01-02 00:00:00')",
)

# Show the full history: one row per (id, version), current rows flagged.
spark.read.format("delta").load(scd2_target_path).orderBy("id", "version").show()

25/11/18 11:25:46 WARN MapPartitionsRDD: RDD 172 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/18 11:25:49 WARN MapPartitionsRDD: RDD 284 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+---+---+--------------------+-------------------+-------------------+----------+-------+
| id|val| ts|            row_hash| effective_start_ts|   effective_end_ts|is_current|version|
+---+---+---+--------------------+-------------------+-------------------+----------+-------+
|  1|  a|  1|ca978112ca1bbdcaf...|2024-01-01 00:00:00|2024-01-01 00:00:00|     false|      1|
|  1|  b|  2|3e23e8160039594a3...|2024-01-01 00:00:00|2024-01-02 00:00:00|     false|      2|
|  1|  c|  3|2e7d2c03a9507ae26...|2024-01-02 00:00:00|               NULL|      true|      3|
|  2|  x|  5|2d711642b726b0440...|2024-01-01 00:00:00|               NULL|      true|      1|
|  3|  z|  1|594e519ae499312b2...|2024-01-02 00:00:00|               NULL|      true|      1|
+---+---+---+--------------------+-------------------+-------------------+----------+-------+



### Multiple updates per batch

When several changes for the same business key arrive in a single batch, `scd2_upsert` applies them in `order_by` order (here, ascending `ts`), so every intermediate version is preserved rather than collapsed into the latest row.


In [4]:
multi_batch_target = "/tmp/scd2_multi_batch"

# Clear any table from a previous run; otherwise the same three rows would be
# stacked on top of the existing history as additional versions.
# NOTE: assumes the path resolves to the local filesystem (local-mode Spark).
shutil.rmtree(multi_batch_target, ignore_errors=True)

# Three successive values for the same business key, delivered in one batch.
multi_batch = spark.createDataFrame(
    [
        Row(id=10, val="bronze", ts=1),
        Row(id=10, val="silver", ts=2),
        Row(id=10, val="gold", ts=3),
    ]
)

# A literal load timestamp keeps the effective_* columns deterministic.
scd2_upsert(
    spark,
    multi_batch,
    multi_batch_target,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
    load_ts_expr="to_timestamp('2024-01-03 00:00:00')",
)

# Each input row becomes its own version; only the last one is current.
spark.read.format("delta").load(multi_batch_target).orderBy("version").show()


25/11/18 11:25:53 WARN MapPartitionsRDD: RDD 430 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/18 11:25:55 WARN MapPartitionsRDD: RDD 515 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+------+---+--------------------+-------------------+-------------------+----------+-------+
| id|   val| ts|            row_hash| effective_start_ts|   effective_end_ts|is_current|version|
+---+------+---+--------------------+-------------------+-------------------+----------+-------+
| 10|bronze|  1|fcbbcd5a59bf48a18...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      1|
| 10|silver|  2|78cde64c3e47f2cbf...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      2|
| 10|  gold|  3|24d7f03d8dc3c3666...|2024-01-03 00:00:00|               NULL|      true|      3|
+---+------+---+--------------------+-------------------+-------------------+----------+-------+



## Unified dispatcher

In [5]:
dispatcher_target = "/tmp/apply_scd_demo"

# Clear any table from a previous run so the output reflects this load only.
# NOTE: assumes the path resolves to the local filesystem (local-mode Spark).
shutil.rmtree(dispatcher_target, ignore_errors=True)

# `apply_scd` picks the strategy from `scd_mode`; here it is asked for SCD2.
# No `order_by` is passed because each key appears once in the batch.
apply_scd(
    spark,
    spark.createDataFrame([Row(id=1, val="a"), Row(id=2, val="b")]),
    dispatcher_target,
    scd_mode=SCDMode.SCD2,
    business_keys=["id"],
    tracked_columns=["val"],
    # Wall-clock load time: effective_start_ts will differ from run to run.
    load_ts_expr="current_timestamp()",
)
spark.read.format("delta").load(dispatcher_target).show()

+---+---+--------------------+--------------------+----------------+----------+-------+
| id|val|            row_hash|  effective_start_ts|effective_end_ts|is_current|version|
+---+---+--------------------+--------------------+----------------+----------+-------+
|  1|  a|ca978112ca1bbdcaf...|2025-11-18 11:25:...|            NULL|      true|      1|
|  2|  b|3e23e8160039594a3...|2025-11-18 11:25:...|            NULL|      true|      1|
+---+---+--------------------+--------------------+----------------+----------+-------+



In [6]:
# Shut down the demo's Spark session now that all examples have run.
spark.stop()