# Change Tracking Demo

Demonstrates `current_only_upsert` and `track_history_upsert` utilities for managing Slowly Changing Dimensions with Delta Lake.

In [9]:
from pyspark.sql import Row
from spark_fuse.spark import create_session
from spark_fuse.utils.change_tracking import current_only_upsert, track_history_upsert, ChangeTrackingMode, apply_change_tracking

spark = create_session(
    app_name="spark-fuse-change-tracking-demo",
    master="local[2]",
    extra_configs={
        "spark.ui.enabled": "false",
        "spark.driver.bindAddress": "127.0.0.1",
        "spark.driver.host": "localhost",
    },
)
spark

## Current-only merges Upsert

In [10]:
from pyspark.sql import functions as F

initial = spark.createDataFrame(
    [Row(id=1, val="a", ts=1), Row(id=1, val="b", ts=2), Row(id=2, val="x", ts=5)]
)

target_path = "/tmp/current_only_demo"
current_only_upsert(
    spark,
    initial,
    target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
)

updates = spark.createDataFrame([Row(id=1, val="c", ts=3), Row(id=3, val="z", ts=1)])
current_only_upsert(
    spark,
    updates,
    target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
)

spark.read.format("delta").load(target_path).orderBy("id").show()

25/11/20 13:19:49 WARN MapPartitionsRDD: RDD 19 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/20 13:19:51 WARN MapPartitionsRDD: RDD 101 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+---+---+--------------------+
| id|val| ts|            row_hash|
+---+---+---+--------------------+
|  1|  c|  3|2e7d2c03a9507ae26...|
|  2|  x|  5|2d711642b726b0440...|
|  3|  z|  1|594e519ae499312b2...|
+---+---+---+--------------------+



### current-only schema evolution

Enable Delta schema evolution when new attributes appear in later batches.


In [11]:
current_only_evolution_target = "/tmp/current_only_demo_schema"

initial_current_only = spark.createDataFrame([Row(id=1, val="a")])
current_only_upsert(
    spark,
    initial_current_only,
    current_only_evolution_target,
    business_keys=["id"],
    tracked_columns=["val"],
    allow_schema_evolution=True,
)

current_only_upsert(
    spark,
    spark.createDataFrame([Row(id=1, val="b", color="red")]),
    current_only_evolution_target,
    business_keys=["id"],
    tracked_columns=["val", "color"],
    allow_schema_evolution=True,
)

spark.read.format("delta").load(current_only_evolution_target).show()


25/11/20 13:19:53 WARN MapPartitionsRDD: RDD 184 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/20 13:19:54 WARN MapPartitionsRDD: RDD 267 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+---+--------------------+-----+
| id|val|            row_hash|color|
+---+---+--------------------+-----+
|  1|  b|552aaee99f0c5a1d8...|  red|
+---+---+--------------------+-----+



## Track history Upsert

In [12]:
track_history_target_path = "/tmp/track_history_demo"

source_track_history = spark.createDataFrame(
    [Row(id=1, val="a", ts=1), Row(id=1, val="b", ts=2), Row(id=2, val="x", ts=5)]
)

track_history_upsert(
    spark,
    source_track_history,
    track_history_target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
    load_ts_expr="to_timestamp('2024-01-01 00:00:00')",
)

changes = spark.createDataFrame([Row(id=1, val="c", ts=3), Row(id=3, val="z", ts=1)])

track_history_upsert(
    spark,
    changes,
    track_history_target_path,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
    load_ts_expr="to_timestamp('2024-01-02 00:00:00')",
)

spark.read.format("delta").load(track_history_target_path).orderBy("id", "version").show()

25/11/20 13:19:56 WARN MapPartitionsRDD: RDD 366 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/20 13:19:58 WARN MapPartitionsRDD: RDD 462 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/20 13:20:01 WARN MapPartitionsRDD: RDD 574 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+---+---+--------------------+-------------------+-------------------+----------+-------+
| id|val| ts|            row_hash| effective_start_ts|   effective_end_ts|is_current|version|
+---+---+---+--------------------+-------------------+-------------------+----------+-------+
|  1|  a|  1|ca978112ca1bbdcaf...|2024-01-01 00:00:00|2024-01-01 00:00:00|     false|      1|
|  1|  b|  2|3e23e8160039594a3...|2024-01-01 00:00:00|2024-01-02 00:00:00|     false|      2|
|  1|  c|  3|2e7d2c03a9507ae26...|2024-01-02 00:00:00|2024-01-01 00:00:00|     false|      3|
|  1|  a|  1|ca978112ca1bbdcaf...|2024-01-01 00:00:00|2024-01-01 00:00:00|     false|      4|
|  1|  b|  2|3e23e8160039594a3...|2024-01-01 00:00:00|2024-01-02 00:00:00|     false|      5|
|  1|  c|  3|2e7d2c03a9507ae26...|2024-01-02 00:00:00|               NULL|      true|      6|
|  2|  x|  5|2d711642b726b0440...|2024-01-01 00:00:00|               NULL|      true|      1|
|  3|  z|  1|594e519ae499312b2...|2024-01-02 00:00:00|      

### Multiple updates per batch

When several changes for the same business key arrive in a single feed, `track_history_upsert` processes them in chronological order so every intermediate version is preserved.


In [13]:
multi_batch_target = "/tmp/track_history_multi_batch"

multi_batch = spark.createDataFrame(
    [
        Row(id=10, val="bronze", ts=1),
        Row(id=10, val="silver", ts=2),
        Row(id=10, val="gold", ts=3),
    ]
)

track_history_upsert(
    spark,
    multi_batch,
    multi_batch_target,
    business_keys=["id"],
    tracked_columns=["val"],
    order_by=["ts"],
    load_ts_expr="to_timestamp('2024-01-03 00:00:00')",
)

spark.read.format("delta").load(multi_batch_target).orderBy("version").show()


25/11/20 13:20:05 WARN MapPartitionsRDD: RDD 721 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/20 13:20:07 WARN MapPartitionsRDD: RDD 817 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/20 13:20:09 WARN MapPartitionsRDD: RDD 902 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+------+---+--------------------+-------------------+-------------------+----------+-------+
| id|   val| ts|            row_hash| effective_start_ts|   effective_end_ts|is_current|version|
+---+------+---+--------------------+-------------------+-------------------+----------+-------+
| 10|bronze|  1|fcbbcd5a59bf48a18...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      1|
| 10|silver|  2|78cde64c3e47f2cbf...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      2|
| 10|  gold|  3|24d7f03d8dc3c3666...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      3|
| 10|bronze|  1|fcbbcd5a59bf48a18...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      4|
| 10|silver|  2|78cde64c3e47f2cbf...|2024-01-03 00:00:00|2024-01-03 00:00:00|     false|      5|
| 10|  gold|  3|24d7f03d8dc3c3666...|2024-01-03 00:00:00|               NULL|      true|      6|
+---+------+---+--------------------+-------------------+-------------------+----------+-------+



### track-history schema evolution

Enable Delta schema evolution to automatically add new columns when later batches introduce them.


In [14]:
track_history_evolution_target = "/tmp/track_history_demo_schema"

initial_track_history = spark.createDataFrame([Row(id=42, val="alpha")])
track_history_upsert(
    spark,
    initial_track_history,
    track_history_evolution_target,
    business_keys=["id"],
    tracked_columns=["val"],
    allow_schema_evolution=True,
)

track_history_upsert(
    spark,
    spark.createDataFrame([Row(id=42, val="beta", color="green")]),
    track_history_evolution_target,
    business_keys=["id"],
    tracked_columns=["val", "color"],
    allow_schema_evolution=True,
)

spark.read.format("delta").load(track_history_evolution_target).orderBy("version").show()


25/11/20 13:20:14 WARN MapPartitionsRDD: RDD 1040 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting
25/11/20 13:20:16 WARN MapPartitionsRDD: RDD 1164 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+-----+--------------------+--------------------+--------------------+----------+-------+-----+
| id|  val|            row_hash|  effective_start_ts|    effective_end_ts|is_current|version|color|
+---+-----+--------------------+--------------------+--------------------+----------+-------+-----+
| 42|alpha|8ed3f6ad685b959ea...|2025-11-20 13:13:...|2025-11-20 13:13:...|     false|      1| NULL|
| 42| beta|016e3c8f35d9c830b...|2025-11-20 13:13:...|2025-11-20 13:20:...|     false|      2|green|
| 42|alpha|8ed3f6ad685b959ea...|2025-11-20 13:20:...|2025-11-20 13:20:...|     false|      3| NULL|
| 42| beta|016e3c8f35d9c830b...|2025-11-20 13:20:...|                NULL|      true|      4|green|
+---+-----+--------------------+--------------------+--------------------+----------+-------+-----+



## Unified dispatcher

In [15]:
dispatcher_target = "/tmp/apply_change_tracking_demo"
apply_change_tracking(
    spark,
    spark.createDataFrame([Row(id=1, val="a"), Row(id=2, val="b")]),
    dispatcher_target,
    change_tracking_mode=ChangeTrackingMode.TRACK_HISTORY,
    business_keys=["id"],
    tracked_columns=["val"],
    load_ts_expr="current_timestamp()",
)
spark.read.format("delta").load(dispatcher_target).show()

25/11/20 13:20:19 WARN MapPartitionsRDD: RDD 1288 was locally checkpointed, its lineage has been truncated and cannot be recomputed after unpersisting


+---+---+--------------------+--------------------+----------------+----------+-------+
| id|val|            row_hash|  effective_start_ts|effective_end_ts|is_current|version|
+---+---+--------------------+--------------------+----------------+----------+-------+
|  1|  a|ca978112ca1bbdcaf...|2025-11-20 13:13:...|            NULL|      true|      1|
|  2|  b|3e23e8160039594a3...|2025-11-20 13:13:...|            NULL|      true|      1|
+---+---+--------------------+--------------------+----------------+----------+-------+



In [16]:
writer_target = "/tmp/change_tracking_writer_demo"
spark.range(2).toDF("id").write.change_tracking.options(
    change_tracking_mode="track_history",
    change_tracking_options={
        "business_keys": ["id"],
        "tracked_columns": ["id"],
        "load_ts_expr": "current_timestamp()",
    },
).table(writer_target)
spark.read.format("delta").load(writer_target).show()


+---+--------------------+--------------------+----------------+----------+-------+
| id|            row_hash|  effective_start_ts|effective_end_ts|is_current|version|
+---+--------------------+--------------------+----------------+----------+-------+
|  1|6b86b273ff34fce19...|2025-11-20 13:20:...|            NULL|      true|      1|
|  0|5feceb66ffc86f38d...|2025-11-20 13:20:...|            NULL|      true|      1|
+---+--------------------+--------------------+----------------+----------+-------+



In [17]:
spark.stop()