# SCD Upsert Demo

Demonstrates the `scd1_upsert` and `scd2_upsert` utilities, plus the unified `apply_scd` dispatcher, for managing Slowly Changing Dimensions (SCD) with Delta Lake.

In [None]:
from pyspark.sql import Row
from spark_fuse.spark import create_session
from spark_fuse.utils.scd import scd1_upsert, scd2_upsert, SCDMode, apply_scd

# Session settings for a self-contained local run: the Spark web UI is
# switched off and the driver is pinned to the loopback interface.
local_session_configs = {
    "spark.ui.enabled": "false",
    "spark.driver.bindAddress": "127.0.0.1",
    "spark.driver.host": "localhost",
}

# `local[2]` runs Spark in-process with two worker threads.
spark = create_session(
    app_name="spark-fuse-scd-demo",
    master="local[2]",
    extra_configs=local_session_configs,
)

# Bare expression so the notebook renders the session object as cell output.
spark

## SCD Type 1 Upsert

In [None]:
from pyspark.sql import functions as F

# Delta table location that both SCD1 upserts write to.
target_path = "/tmp/scd1_demo"

# Seed batch: note that id=1 appears twice (ts=1 and ts=2).
initial = spark.createDataFrame(
    [
        Row(id=1, val="a", ts=1),
        Row(id=1, val="b", ts=2),
        Row(id=2, val="x", ts=5),
    ]
)

# Follow-up batch: a new value for the existing id=1 and a brand-new id=3.
updates = spark.createDataFrame(
    [
        Row(id=1, val="c", ts=3),
        Row(id=3, val="z", ts=1),
    ]
)

# Apply the batches in order with identical key / tracking / ordering options.
# NOTE(review): `order_by` presumably decides which row wins when a batch
# holds several rows for one business key — confirm against scd1_upsert docs.
for scd1_batch in (initial, updates):
    scd1_upsert(
        spark,
        scd1_batch,
        target_path,
        business_keys=["id"],
        tracked_columns=["val"],
        order_by=["ts"],
    )

# Read the Delta table back and display it sorted by business key.
scd1_result = spark.read.format("delta").load(target_path)
scd1_result.orderBy("id").show()

## SCD Type 2 Upsert

In [None]:
# Delta table location that both SCD2 upserts write to.
scd2_target_path = "/tmp/scd2_demo"

# First load: id=1 appears twice (ts=1 and ts=2), id=2 once.
source_scd2 = spark.createDataFrame(
    [
        Row(id=1, val="a", ts=1),
        Row(id=1, val="b", ts=2),
        Row(id=2, val="x", ts=5),
    ]
)

# Second load: a changed value for id=1 and a brand-new id=3.
changes = spark.createDataFrame(
    [
        Row(id=1, val="c", ts=3),
        Row(id=3, val="z", ts=1),
    ]
)

# Each batch is paired with a fixed load-timestamp SQL expression so the two
# loads are stamped one day apart.
scd2_loads = [
    (source_scd2, "to_timestamp('2024-01-01 00:00:00')"),
    (changes, "to_timestamp('2024-01-02 00:00:00')"),
]

for scd2_batch, scd2_load_ts in scd2_loads:
    scd2_upsert(
        spark,
        scd2_batch,
        scd2_target_path,
        business_keys=["id"],
        tracked_columns=["val"],
        order_by=["ts"],
        load_ts_expr=scd2_load_ts,
    )

# Read the table back and display it ordered by key and version.
# NOTE(review): the `version` column is not in the source rows, so it is
# presumably added by scd2_upsert — confirm against its documentation.
scd2_history = spark.read.format("delta").load(scd2_target_path)
scd2_history.orderBy("id", "version").show()

## Unified Dispatcher (`apply_scd`)

In [None]:
# Delta table location for the dispatcher example.
dispatcher_target = "/tmp/apply_scd_demo"

# Two-row source with no ordering column; no `order_by` is passed below.
dispatcher_source = spark.createDataFrame(
    [
        Row(id=1, val="a"),
        Row(id=2, val="b"),
    ]
)

# `apply_scd` takes the SCD flavour as an argument (`SCDMode.SCD2` here)
# instead of the caller picking scd1_upsert / scd2_upsert directly.
apply_scd(
    spark,
    dispatcher_source,
    dispatcher_target,
    scd_mode=SCDMode.SCD2,
    business_keys=["id"],
    tracked_columns=["val"],
    load_ts_expr="current_timestamp()",
)

# Read the resulting Delta table back and display it.
dispatcher_result = spark.read.format("delta").load(dispatcher_target)
dispatcher_result.show()

In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()