# Setup

In [1]:
spark # We will have a sparksession available

# DDL

## Create raw customer upstream table

In [1]:
spark.sql("DROP TABLE IF EXISTS prod.db.customer")

# Table DDL for customer upstream table
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.customer (
    customer_id INT,
    email STRING,
    first_name STRING,
    datetime_created TIMESTAMP,
    datetime_updated TIMESTAMP
) USING iceberg
TBLPROPERTIES (
    'format-version' = '2'
)""")

# Insert some fake data for the OLTP tables
spark.sql("""
-- Insert sample customers
INSERT INTO prod.db.customer VALUES
  (1, 'john.doe@example.com', 'John', TIMESTAMP '2023-01-15 08:30:00', TIMESTAMP '2023-03-20 14:22:15'),
  (2, 'jane.smith@example.com', 'Jane', TIMESTAMP '2023-03-18 09:10:30', TIMESTAMP '2023-02-05 12:45:00'),
  (3, 'robert.brown@example.com', 'Robert', TIMESTAMP '2023-02-10 11:05:45', TIMESTAMP '2023-01-25 15:20:00');
""")

                                                                                

DataFrame[]

## Create customer_dim dimension table

In [41]:
spark.sql("DROP TABLE IF EXISTS prod.db.dim_customer")

spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.dim_customer (
    customer_id INT,
    email STRING,
    first_name STRING,
    datetime_created TIMESTAMP,
    datetime_updated TIMESTAMP,
    -- scd2 columns
    valid_from TIMESTAMP,
    valid_to TIMESTAMP,
    is_current boolean
) USING iceberg
PARTITIONED BY (datetime_updated)
TBLPROPERTIES (
    'format-version' = '2'
);""")

DataFrame[]

In [42]:
%%sql
select * from prod.db.customer

customer_id,email,first_name,datetime_created,datetime_updated
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-03-20 14:22:15
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-02-05 12:45:00
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-01-25 15:20:00


In [43]:
%%sql
select * from prod.db.dim_customer

customer_id,email,first_name,datetime_created,datetime_updated,valid_from,valid_to,is_current


# Merge Into

Merge into is an operation meant to insert/update/delete rows from a dataset called target given a new dataset called source. The MERGE INTO operation allows us to do updates/insert/deletes

1. WHEN MATCHED: Update & Delete
2. WHEN NOT MATCHED: Inserts
3. WHEN NOT MATCHED BY SOURCE: Update or Delete
 
I addition to this the when matched clause should only modify one target row, since ...

add: merge into image

In [29]:
%%sql
select *
from prod.db.dim_customer

customer_id,email,first_name,datetime_created,datetime_updated,valid_from,valid_to,is_current


In [30]:
%%sql
select * from prod.db.customer

customer_id,email,first_name,datetime_created,datetime_updated
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-03-20 14:22:15
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-02-05 12:45:00
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-01-25 15:20:00


cant have accept any schema on MERGE INTO

In [32]:
%%sql
MERGE INTO prod.db.dim_customer t
USING prod.db.customer s
ON t.customer_id = s.customer_id
WHEN MATCHED THEN UPDATE SET is_current = false, valid_to = s.datetime_updated
WHEN NOT MATCHED THEN INSERT (customer_id,email,first_name,datetime_created,datetime_updated,valid_from,is_current)
VALUES
(s.customer_id,s.email,s.first_name,s.datetime_created,s.datetime_updated,s.datetime_updated,true)

In [44]:
%%sql
select * from prod.db.dim_customer

customer_id,email,first_name,datetime_created,datetime_updated,valid_from,valid_to,is_current


In [47]:
%%sql
MERGE INTO prod.db.dim_customer t
USING (
    SELECT NULL as join_key, * FROM prod.db.customer
    UNION ALL
    SELECT customer_id as join_key, * FROM prod.db.customer
) s
ON t.customer_id = s.join_key
WHEN MATCHED THEN UPDATE SET is_current = false, valid_to = s.datetime_updated
WHEN NOT MATCHED and s.join_key is null THEN INSERT (customer_id,email,first_name,datetime_created,datetime_updated,valid_from,is_current)
VALUES
(s.customer_id,s.email,s.first_name,s.datetime_created,s.datetime_updated,s.datetime_updated,true)

In [48]:
%%sql
select *
from prod.db.dim_customer

customer_id,email,first_name,datetime_created,datetime_updated,valid_from,valid_to,is_current
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-01-25 15:20:00,2023-01-25 15:20:00,,True
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-01-25 15:20:00,2023-01-25 15:20:00,2023-01-25 15:20:00,False
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-02-05 12:45:00,2023-02-05 12:45:00,,True
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-02-05 12:45:00,2023-02-05 12:45:00,2023-02-05 12:45:00,False
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-03-20 14:22:15,2023-03-20 14:22:15,,True
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-03-20 14:22:15,2023-03-20 14:22:15,2023-03-20 14:22:15,False
