# Setup

In [13]:
# `spark` is expected to be a SparkSession already provided by the notebook environment
# (it is not created anywhere in this notebook); evaluating it displays the session.
spark

# DDL

## Create raw customer upstream table

In [14]:
# Rebuild the upstream customer table from scratch so this cell can be re-run safely.
drop_customer_sql = "DROP TABLE IF EXISTS prod.db.customer"

# Iceberg (format-version 2) table holding the raw upstream customer records.
create_customer_sql = """
CREATE TABLE IF NOT EXISTS prod.db.customer (
    customer_id INT,
    email STRING,
    first_name STRING,
    datetime_updated TIMESTAMP
) USING iceberg
TBLPROPERTIES (
    'format-version' = '2'
)"""

# Three sample customers used to seed the upstream table.
seed_customers_sql = """
-- Insert sample customers
INSERT INTO prod.db.customer VALUES
  (1, 'john.doe@example.com', 'John', TIMESTAMP '2023-01-15 08:30:00'),
  (2, 'jane.smith@example.com', 'Jane', TIMESTAMP '2023-03-18 09:10:30'),
  (3, 'robert.brown@example.com', 'Robert', TIMESTAMP '2023-02-10 11:05:45');
"""

# Drop first, then create: the order matters.
for ddl_statement in (drop_customer_sql, create_customer_sql):
    spark.sql(ddl_statement)

# Kept as the last expression so the cell still displays the INSERT's result.
spark.sql(seed_customers_sql)

                                                                                

DataFrame[]

## Create dim_customer dimension table

In [15]:
# Rebuild the SCD2 dimension from scratch so this cell can be re-run safely.
spark.sql("DROP TABLE IF EXISTS prod.db.dim_customer")

# dim_customer carries the upstream customer columns plus SCD2 bookkeeping columns.
# The table is partitioned by the *day* of datetime_updated (Iceberg partition transform)
# instead of the raw TIMESTAMP: identity-partitioning on a timestamp creates one partition
# per distinct value, i.e. roughly one tiny partition per row version.
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.dim_customer (
    customer_id INT,
    email STRING,
    first_name STRING,
    datetime_updated TIMESTAMP,
    -- scd2 columns
    valid_from TIMESTAMP, -- set to the row's datetime_updated when the version is inserted
    valid_to TIMESTAMP,   -- NULL until a newer version supersedes this row
    is_current BOOLEAN,   -- true for the latest version of a customer
    is_active BOOLEAN     -- set to false when the customer is no longer present upstream
) USING iceberg
PARTITIONED BY (days(datetime_updated))
TBLPROPERTIES (
    'format-version' = '2'
);""")

DataFrame[]

# Merge Into

![MERGE INTO](./assets/images/merge_into.jpg)

## Preview data

In [16]:
%%sql
SELECT * -- preview the upstream customer rows
FROM prod.db.customer

customer_id,email,first_name,datetime_updated
1,john.doe@example.com,John,2023-01-15 08:30:00
2,jane.smith@example.com,Jane,2023-03-18 09:10:30
3,robert.brown@example.com,Robert,2023-02-10 11:05:45


In [17]:
%%sql
SELECT * -- preview the dimension table before the first MERGE
FROM prod.db.dim_customer

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active


## MERGE INTO logic

In [18]:
%%sql
WITH customers_with_updates AS (
    -- Upstream customers that already exist in dim_customer and whose upstream row is
    -- newer than the current dimension version; each of them needs a new version row.
    SELECT c.*
    FROM prod.db.customer c
    JOIN prod.db.dim_customer dc
        ON c.customer_id = dc.customer_id -- customer exists in dim_customer
    WHERE c.datetime_updated > dc.datetime_updated -- upstream update is newer than the dimension row
        AND dc.is_current = true -- only compare against the current version in dim_customer
)
MERGE INTO prod.db.dim_customer t -- target dim_customer to update
USING (
    -- join_key = customer_id: matches existing dimension rows (so they can be closed) or,
    -- for brand-new customers, matches nothing and falls through to the INSERT.
    SELECT customer_id AS join_key, * FROM prod.db.customer
    UNION ALL
    -- join_key = NULL never equals t.customer_id, so these rows always take the
    -- NOT MATCHED branch and are inserted as the new current version.
    SELECT NULL AS join_key, * FROM customers_with_updates
    ) s
ON t.customer_id = s.join_key -- natural key for customer

    -- Close the current version of a customer that has a newer upstream row.
    -- is_current is qualified with the target alias so the reference stays unambiguous.
    WHEN MATCHED AND t.is_current = true AND s.datetime_updated > t.datetime_updated
    THEN UPDATE SET is_current = false, valid_to = s.datetime_updated

    -- Insert new customers and the new versions of updated customers.
    WHEN NOT MATCHED
    THEN INSERT (customer_id,email,first_name,datetime_updated,valid_from,is_current,is_active)
    VALUES (s.customer_id,s.email,s.first_name,s.datetime_updated,s.datetime_updated,true,true)

    -- Customers no longer present upstream are flagged inactive. The guard skips rows that
    -- are already flagged, so re-running the MERGE does not rewrite them again
    -- (IS DISTINCT FROM keeps rows with a NULL is_active eligible for the update).
    WHEN NOT MATCHED BY SOURCE AND t.is_active IS DISTINCT FROM false
    THEN UPDATE SET is_active = false

## Check output SCD2 data

In [21]:
%%sql
SELECT * -- inspect the dimension rows produced by the first MERGE
FROM prod.db.dim_customer

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-03-18 09:10:30,,True,True
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-01-15 08:30:00,,True,True
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-02-10 11:05:45,,True,True


## Simulate inserts and updates in upstream

In [22]:
# Simulate new rows arriving upstream: customer_id 20 and 30 are added to prod.db.customer.
new_customers_sql = """
-- Insert sample customers
INSERT INTO prod.db.customer VALUES
  (20, 'jane.smith@example.com', 'Jane', TIMESTAMP '2023-04-18 09:10:30'),
  (30, 'robert.brown@example.com', 'Robert', TIMESTAMP '2023-03-10 11:05:45');
"""

spark.sql(new_customers_sql)


DataFrame[]

In [23]:

# Simulate an upstream change: customer 1 gets a new email and a later datetime_updated.
update_customer_sql = """
UPDATE prod.db.customer SET email =  'john.doe_new_email@example.com', datetime_updated = TIMESTAMP '2023-03-30 08:30:00' 
WHERE customer_id = 1
"""

spark.sql(update_customer_sql)

DataFrame[]

## Preview data

In [27]:
%%sql
SELECT * -- upstream rows after the simulated inserts and update
FROM prod.db.customer
ORDER BY datetime_updated

customer_id,email,first_name,datetime_updated
3,robert.brown@example.com,Robert,2023-02-10 11:05:45
30,robert.brown@example.com,Robert,2023-03-10 11:05:45
2,jane.smith@example.com,Jane,2023-03-18 09:10:30
1,john.doe_new_email@example.com,John,2023-03-30 08:30:00
20,jane.smith@example.com,Jane,2023-04-18 09:10:30


In [30]:
%%sql --show
SELECT * -- dimension rows before the second MERGE is applied
FROM prod.db.dim_customer
ORDER BY datetime_updated

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-01-15 08:30:00,,True,True
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-02-10 11:05:45,,True,True
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-03-18 09:10:30,,True,True


In [31]:
%%sql --show
WITH customers_with_updates AS (
    -- Upstream rows that are newer than the current version held in dim_customer
    SELECT upstream.*
    FROM prod.db.customer AS upstream
    INNER JOIN prod.db.dim_customer AS dim
        ON upstream.customer_id = dim.customer_id -- customer is already present in dim_customer
    WHERE dim.is_current = true -- compare against the current dimension version only
        AND upstream.datetime_updated > dim.datetime_updated -- upstream change is more recent
)
SELECT *
FROM customers_with_updates

customer_id,email,first_name,datetime_updated
1,john.doe_new_email@example.com,John,2023-03-30 08:30:00


## MERGE INTO logic

In [32]:
%%sql
WITH customers_with_updates AS (
    -- Upstream customers that already exist in dim_customer and whose upstream row is
    -- newer than the current dimension version; each of them needs a new version row.
    SELECT c.*
    FROM prod.db.customer c
    JOIN prod.db.dim_customer dc
        ON c.customer_id = dc.customer_id -- customer exists in dim_customer
    WHERE c.datetime_updated > dc.datetime_updated -- upstream update is newer than the dimension row
        AND dc.is_current = true -- only compare against the current version in dim_customer
)
MERGE INTO prod.db.dim_customer t -- target dim_customer to update
USING (
    -- join_key = customer_id: matches existing dimension rows (so they can be closed) or,
    -- for brand-new customers, matches nothing and falls through to the INSERT.
    SELECT customer_id AS join_key, * FROM prod.db.customer
    UNION ALL
    -- join_key = NULL never equals t.customer_id, so these rows always take the
    -- NOT MATCHED branch and are inserted as the new current version.
    SELECT NULL AS join_key, * FROM customers_with_updates
    ) s
ON t.customer_id = s.join_key -- natural key for customer

    -- Close the current version of a customer that has a newer upstream row.
    -- is_current is qualified with the target alias so the reference stays unambiguous.
    WHEN MATCHED AND t.is_current = true AND s.datetime_updated > t.datetime_updated
    THEN UPDATE SET is_current = false, valid_to = s.datetime_updated

    -- Insert new customers and the new versions of updated customers.
    WHEN NOT MATCHED
    THEN INSERT (customer_id,email,first_name,datetime_updated,valid_from,is_current,is_active)
    VALUES (s.customer_id,s.email,s.first_name,s.datetime_updated,s.datetime_updated,true,true)

    -- Customers no longer present upstream are flagged inactive. The guard skips rows that
    -- are already flagged, so re-running the MERGE does not rewrite them again
    -- (IS DISTINCT FROM keeps rows with a NULL is_active eligible for the update).
    WHEN NOT MATCHED BY SOURCE AND t.is_active IS DISTINCT FROM false
    THEN UPDATE SET is_active = false

## Check output SCD2 data

In [34]:
%%sql
SELECT * -- inspect the SCD2 history after the second MERGE
FROM prod.db.dim_customer
ORDER BY datetime_updated

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-01-15 08:30:00,2023-03-30 08:30:00,False,True
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-02-10 11:05:45,,True,True
30,robert.brown@example.com,Robert,2023-03-10 11:05:45,2023-03-10 11:05:45,,True,True
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-03-18 09:10:30,,True,True
1,john.doe_new_email@example.com,John,2023-03-30 08:30:00,2023-03-30 08:30:00,,True,True
20,jane.smith@example.com,Jane,2023-04-18 09:10:30,2023-04-18 09:10:30,,True,True
