# Setup

In [56]:
# The notebook relies on a `spark` SparkSession that is expected to be provided
# by the environment (presumably pre-created by the kernel; confirm for your setup).
# Evaluating it here displays the session and raises NameError early if it is missing.
spark

# DDL

## Create raw customer upstream table

In [57]:
# Drop any previous copy so re-running this cell always starts from a clean table.
spark.sql("DROP TABLE IF EXISTS prod.db.customer")

# Upstream (source) customer table: one row per customer, with
# datetime_updated recording when that row was last changed.
# Stored as an Iceberg table using format version 2.
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.customer (
    customer_id INT,
    email STRING,
    first_name STRING,
    datetime_updated TIMESTAMP
) USING iceberg
TBLPROPERTIES (
    'format-version' = '2'
)""")

# Seed the upstream table with three sample customers (ids 1, 2 and 3).
# These rows are what the first merge loads into the dimension.
spark.sql("""
-- Insert sample customers
INSERT INTO prod.db.customer VALUES
  (1, 'john.doe@example.com', 'John', TIMESTAMP '2023-01-15 08:30:00'),
  (2, 'jane.smith@example.com', 'Jane', TIMESTAMP '2023-03-18 09:10:30'),
  (3, 'robert.brown@example.com', 'Robert', TIMESTAMP '2023-02-10 11:05:45');
""")

DataFrame[]

## Create dim_customer dimension table

In [68]:
# Drop any previous copy so re-running this cell always starts from an empty dimension.
spark.sql("DROP TABLE IF EXISTS prod.db.dim_customer")

# SCD2 customer dimension: the upstream columns plus the versioning columns
#   valid_from / valid_to : time range during which this version was the current one
#   is_current            : true for the latest version of a customer
#   is_active             : set to false once the customer no longer exists upstream
#
# Partition by the *day* of datetime_updated rather than the raw timestamp:
# an identity partition on a TIMESTAMP creates one partition per distinct
# value, i.e. practically one partition (and one tiny file) per row.
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.dim_customer (
    customer_id INT,
    email STRING,
    first_name STRING,
    datetime_updated TIMESTAMP,
    -- scd2 columns
    valid_from TIMESTAMP,
    valid_to TIMESTAMP,
    is_current BOOLEAN,
    is_active BOOLEAN
) USING iceberg
PARTITIONED BY (days(datetime_updated))
TBLPROPERTIES (
    'format-version' = '2'
);""")

DataFrame[]

In [69]:
%%sql
SELECT *
-- upstream source table, as seeded above
FROM prod.db.customer

customer_id,email,first_name,datetime_updated
1,john.doe@example.com,John,2023-01-15 08:30:00
2,jane.smith@example.com,Jane,2023-03-18 09:10:30
3,robert.brown@example.com,Robert,2023-02-10 11:05:45


In [70]:
%%sql
SELECT *
-- the dimension has just been created, so no rows are expected yet
FROM prod.db.dim_customer

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active


# Merge Into

`MERGE INTO` is an operation that inserts, updates, or deletes rows in a dataset called the target, based on a new dataset called the source. A single `MERGE INTO` statement can combine the following clauses:

1. WHEN MATCHED: Update & Delete
2. WHEN NOT MATCHED: Inserts
3. WHEN NOT MATCHED BY SOURCE: Update or Delete
 
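In addition to this, the `WHEN MATCHED` clause should modify each target row at most once, since a target row that is matched by several source rows makes the result ambiguous and causes the merge to fail.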

TODO: add a MERGE INTO image.

Note: the accept-any-schema option can't be used with `MERGE INTO`.

In [71]:
%%sql --show
SELECT src.*
FROM prod.db.customer AS src
INNER JOIN prod.db.dim_customer AS dim
    -- pair every upstream customer with its rows in the dimension
    ON src.customer_id = dim.customer_id
WHERE dim.is_current = true
    -- keep only customers whose upstream row is newer than their current version
    AND src.datetime_updated > dim.datetime_updated

customer_id,email,first_name,datetime_updated


In [73]:
%%sql --show
SELECT src.*
FROM prod.db.customer AS src
FULL OUTER JOIN prod.db.dim_customer AS dim
    -- only upstream columns are projected, so dimension-only rows would show up as all NULLs
    ON src.customer_id = dim.customer_id

customer_id,email,first_name,datetime_updated
1,john.doe@example.com,John,2023-01-15 08:30:00
3,robert.brown@example.com,Robert,2023-02-10 11:05:45
2,jane.smith@example.com,Jane,2023-03-18 09:10:30


In [77]:
%%sql
WITH rows_to_insert AS (
    -- Customers whose upstream row is newer than their current dimension row.
    -- They are fed into the merge a second time with a NULL join key, so the
    -- new version gets inserted while the keyed copy closes the old version.
    SELECT c.*
    FROM prod.db.customer c
    JOIN prod.db.dim_customer dc
        ON c.customer_id = dc.customer_id
    WHERE c.datetime_updated > dc.datetime_updated
        AND dc.is_current = true
)
MERGE INTO prod.db.dim_customer t
USING (
    -- Keyed copy: matches existing dimension rows, or is inserted for brand-new customers.
    SELECT customer_id AS join_key, * FROM prod.db.customer
    UNION ALL
    -- Unkeyed copy: a NULL join key never satisfies the ON clause, which forces an insert.
    -- The explicit cast keeps join_key typed as INT on both sides of the union.
    SELECT CAST(NULL AS INT) AS join_key, * FROM rows_to_insert
) s
ON t.customer_id = s.join_key

    -- Close the current version when the upstream row is newer.
    WHEN MATCHED AND t.is_current = true AND s.datetime_updated > t.datetime_updated THEN
        UPDATE SET is_current = false, valid_to = s.datetime_updated

    -- New customers, and the new version of every changed customer.
    WHEN NOT MATCHED THEN
        INSERT (customer_id, email, first_name, datetime_updated, valid_from, is_current, is_active)
        VALUES (s.customer_id, s.email, s.first_name, s.datetime_updated, s.datetime_updated, true, true)

    -- Customers that no longer exist upstream. Rows already flagged inactive are
    -- skipped so that re-running the merge does not rewrite them every time.
    WHEN NOT MATCHED BY SOURCE AND coalesce(t.is_active, true) THEN
        UPDATE SET is_active = false

In [78]:
%%sql
SELECT *
-- dimension contents after the first merge
FROM prod.db.dim_customer

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-03-18 09:10:30,,True,True
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-01-15 08:30:00,,True,True
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-02-10 11:05:45,,True,True


In [79]:
%%sql --show
SELECT *
-- same query as the previous cell, run with the --show flag
FROM prod.db.dim_customer

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-03-18 09:10:30,,True,True
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-01-15 08:30:00,,True,True
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-02-10 11:05:45,,True,True


In [80]:
# Add two customers that do not exist yet (ids 20 and 30) to the upstream table,
# so the next merge has brand-new rows to insert into the dimension.
spark.sql("""
-- Insert sample customers
INSERT INTO prod.db.customer VALUES
  (20, 'jane.smith@example.com', 'Jane', TIMESTAMP '2023-04-18 09:10:30'),
  (30, 'robert.brown@example.com', 'Robert', TIMESTAMP '2023-03-10 11:05:45');
""")


DataFrame[]

In [81]:

# Change customer 1's email and move datetime_updated forward, giving the next
# merge a changed row that is newer than the version stored in the dimension.
spark.sql("""
UPDATE prod.db.customer SET email =  'john.doe_new_email@example.com', datetime_updated = TIMESTAMP '2023-03-30 08:30:00' 
WHERE customer_id = 1
""")

DataFrame[]

In [82]:
%%sql
SELECT *
-- upstream table after the insert and update cells above
FROM prod.db.customer

customer_id,email,first_name,datetime_updated
20,jane.smith@example.com,Jane,2023-04-18 09:10:30
2,jane.smith@example.com,Jane,2023-03-18 09:10:30
30,robert.brown@example.com,Robert,2023-03-10 11:05:45
1,john.doe_new_email@example.com,John,2023-03-30 08:30:00
3,robert.brown@example.com,Robert,2023-02-10 11:05:45


In [83]:
%%sql --show
SELECT customer_id AS join_key, *
FROM prod.db.customer
-- append a second copy of every row with a NULL key to illustrate the shape of the MERGE source
UNION ALL
SELECT NULL AS join_key, *
FROM prod.db.customer

join_key,customer_id,email,first_name,datetime_updated
2.0,2,jane.smith@example.com,Jane,2023-03-18 09:10:30
3.0,3,robert.brown@example.com,Robert,2023-02-10 11:05:45
1.0,1,john.doe_new_email@example.com,John,2023-03-30 08:30:00
20.0,20,jane.smith@example.com,Jane,2023-04-18 09:10:30
30.0,30,robert.brown@example.com,Robert,2023-03-10 11:05:45
,2,jane.smith@example.com,Jane,2023-03-18 09:10:30
,1,john.doe_new_email@example.com,John,2023-03-30 08:30:00
,3,robert.brown@example.com,Robert,2023-02-10 11:05:45
,20,jane.smith@example.com,Jane,2023-04-18 09:10:30
,30,robert.brown@example.com,Robert,2023-03-10 11:05:45


In [41]:
%%sql --show
SELECT *
-- dimension contents before the second merge
FROM prod.db.dim_customer

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-03-18 09:10:30,,True,True
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-01-15 08:30:00,,True,True
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-02-10 11:05:45,,True,True


In [84]:
%%sql
WITH rows_to_insert AS (
    -- Customers whose upstream row is newer than their current dimension row.
    -- They are fed into the merge a second time with a NULL join key, so the
    -- new version gets inserted while the keyed copy closes the old version.
    SELECT c.*
    FROM prod.db.customer c
    JOIN prod.db.dim_customer dc
        ON c.customer_id = dc.customer_id
    WHERE c.datetime_updated > dc.datetime_updated
        AND dc.is_current = true
)
MERGE INTO prod.db.dim_customer t
USING (
    -- Keyed copy: matches existing dimension rows, or is inserted for brand-new customers.
    SELECT customer_id AS join_key, * FROM prod.db.customer
    UNION ALL
    -- Unkeyed copy: a NULL join key never satisfies the ON clause, which forces an insert.
    -- The explicit cast keeps join_key typed as INT on both sides of the union.
    SELECT CAST(NULL AS INT) AS join_key, * FROM rows_to_insert
) s
ON t.customer_id = s.join_key

    -- Close the current version when the upstream row is newer.
    WHEN MATCHED AND t.is_current = true AND s.datetime_updated > t.datetime_updated THEN
        UPDATE SET is_current = false, valid_to = s.datetime_updated

    -- New customers, and the new version of every changed customer.
    WHEN NOT MATCHED THEN
        INSERT (customer_id, email, first_name, datetime_updated, valid_from, is_current, is_active)
        VALUES (s.customer_id, s.email, s.first_name, s.datetime_updated, s.datetime_updated, true, true)

    -- Customers that no longer exist upstream. Rows already flagged inactive are
    -- skipped so that re-running the merge does not rewrite them every time.
    WHEN NOT MATCHED BY SOURCE AND coalesce(t.is_active, true) THEN
        UPDATE SET is_active = false

In [85]:
%%sql
SELECT *
-- upstream table, unchanged by the merge into the dimension
FROM prod.db.customer

customer_id,email,first_name,datetime_updated
1,john.doe_new_email@example.com,John,2023-03-30 08:30:00
20,jane.smith@example.com,Jane,2023-04-18 09:10:30
30,robert.brown@example.com,Robert,2023-03-10 11:05:45
2,jane.smith@example.com,Jane,2023-03-18 09:10:30
3,robert.brown@example.com,Robert,2023-02-10 11:05:45


In [86]:
%%sql
SELECT *
FROM prod.db.dim_customer
-- newest versions first
ORDER BY datetime_updated DESC

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active
20,jane.smith@example.com,Jane,2023-04-18 09:10:30,2023-04-18 09:10:30,,True,True
1,john.doe_new_email@example.com,John,2023-03-30 08:30:00,2023-03-30 08:30:00,,True,True
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-03-18 09:10:30,,True,True
30,robert.brown@example.com,Robert,2023-03-10 11:05:45,2023-03-10 11:05:45,,True,True
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-02-10 11:05:45,,True,True
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-01-15 08:30:00,2023-03-30 08:30:00,False,True


25/04/06 04:41:01 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 4967543 ms exceeds timeout 120000 ms
25/04/06 04:41:01 WARN SparkContext: Killing executors is not supported by current scheduler.
25/04/06 04:41:01 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$