# Setup

In [95]:
spark # We will have a sparksession available

# DDL

## Create raw customer upstream table

In [96]:
spark.sql("DROP TABLE IF EXISTS prod.db.customer")

# Table DDL for customer upstream table
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.customer (
    customer_id INT,
    email STRING,
    first_name STRING,
    datetime_updated TIMESTAMP
) USING iceberg
TBLPROPERTIES (
    'format-version' = '2'
)""")

# Insert some fake data for the OLTP tables
spark.sql("""
-- Insert sample customers
INSERT INTO prod.db.customer VALUES
  (1, 'john.doe@example.com', 'John', TIMESTAMP '2023-01-15 08:30:00'),
  (2, 'jane.smith@example.com', 'Jane', TIMESTAMP '2023-03-18 09:10:30'),
  (3, 'robert.brown@example.com', 'Robert', TIMESTAMP '2023-02-10 11:05:45');
""")

DataFrame[]

## Create customer_dim dimension table

In [97]:
spark.sql("DROP TABLE IF EXISTS prod.db.dim_customer")

spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.dim_customer (
    customer_id INT,
    email STRING,
    first_name STRING,
    datetime_updated TIMESTAMP,
    -- scd2 columns
    valid_from TIMESTAMP,
    valid_to TIMESTAMP,
    is_current BOOLEAN,
    is_active BOOLEAN
) USING iceberg
PARTITIONED BY (datetime_updated)
TBLPROPERTIES (
    'format-version' = '2'
);""")

DataFrame[]

In [98]:
%%sql
select * from prod.db.customer

customer_id,email,first_name,datetime_updated
1,john.doe@example.com,John,2023-01-15 08:30:00
2,jane.smith@example.com,Jane,2023-03-18 09:10:30
3,robert.brown@example.com,Robert,2023-02-10 11:05:45


In [99]:
%%sql
select * from prod.db.dim_customer

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active


# Merge Into

Merge into is an operation meant to insert/update/delete rows from a dataset called target given a new dataset called source. The MERGE INTO operation allows us to do updates/insert/deletes

1. WHEN MATCHED: Update & Delete
2. WHEN NOT MATCHED: Inserts
3. WHEN NOT MATCHED BY SOURCE: Update or Delete
 
I addition to this the when matched clause should only modify one target row, since ...

add: merge into image

cant have accept any schema on MERGE INTO

In [100]:
%%sql
MERGE INTO prod.db.dim_customer t
USING (select customer_id as join_key, * from prod.db.customer 
    union all
    select NULL as join_key, * from prod.db.customer) s
ON t.customer_id = s.join_key

    WHEN MATCHED THEN UPDATE SET is_current = false, valid_to = s.datetime_updated
    
    WHEN NOT MATCHED and s.join_key is null THEN INSERT (customer_id,email,first_name,datetime_updated,valid_from,is_current,is_active) 
    VALUES (s.customer_id,s.email,s.first_name,s.datetime_updated,s.datetime_updated,true,true)

    WHEN NOT MATCHED BY SOURCE THEN UPDATE SET is_active = false

In [101]:
%%sql
select * from prod.db.dim_customer

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-03-18 09:10:30,,True,True
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-01-15 08:30:00,,True,True
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-02-10 11:05:45,,True,True


In [102]:
# Insert some fake data for the OLTP tables
spark.sql("""
-- Insert sample customers
INSERT INTO prod.db.customer VALUES
  (20, 'jane.smith@example.com', 'Jane', TIMESTAMP '2023-03-18 09:10:30'),
  (30, 'robert.brown@example.com', 'Robert', TIMESTAMP '2023-02-10 11:05:45');
""")


DataFrame[]

In [103]:

spark.sql("""
UPDATE prod.db.customer SET email =  'john.doe_new_email@example.com', datetime_updated = TIMESTAMP '2023-02-15 08:30:00' 
WHERE customer_id = 1
""")

DataFrame[]

In [104]:
%%sql
select * from prod.db.customer

customer_id,email,first_name,datetime_updated
2,jane.smith@example.com,Jane,2023-03-18 09:10:30
1,john.doe_new_email@example.com,John,2023-02-15 08:30:00
20,jane.smith@example.com,Jane,2023-03-18 09:10:30
3,robert.brown@example.com,Robert,2023-02-10 11:05:45
30,robert.brown@example.com,Robert,2023-02-10 11:05:45


In [106]:
%%sql
MERGE INTO prod.db.dim_customer t
USING (select customer_id as join_key, * from prod.db.customer 
    union all
    select NULL as join_key, * from prod.db.customer) s
ON t.customer_id = s.join_key

    WHEN MATCHED THEN UPDATE SET is_current = false, valid_to = s.datetime_updated
    
    WHEN NOT MATCHED and s.join_key is null THEN INSERT (customer_id,email,first_name,datetime_updated,valid_from,is_current,is_active) 
    VALUES (s.customer_id,s.email,s.first_name,s.datetime_updated,s.datetime_updated,true,true)

    WHEN NOT MATCHED BY SOURCE THEN UPDATE SET is_active = false

In [107]:
%%sql
select * from prod.db.customer

customer_id,email,first_name,datetime_updated
1,john.doe_new_email@example.com,John,2023-02-15 08:30:00
2,jane.smith@example.com,Jane,2023-03-18 09:10:30
3,robert.brown@example.com,Robert,2023-02-10 11:05:45
20,jane.smith@example.com,Jane,2023-03-18 09:10:30
30,robert.brown@example.com,Robert,2023-02-10 11:05:45


In [109]:
%%sql
select * from prod.db.dim_customer order by datetime_updated desc

customer_id,email,first_name,datetime_updated,valid_from,valid_to,is_current,is_active
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-03-18 09:10:30,,True,True
20,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-03-18 09:10:30,,True,True
2,jane.smith@example.com,Jane,2023-03-18 09:10:30,2023-03-18 09:10:30,2023-03-18 09:10:30,False,True
1,john.doe_new_email@example.com,John,2023-02-15 08:30:00,2023-02-15 08:30:00,,True,True
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-02-10 11:05:45,,True,True
30,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-02-10 11:05:45,,True,True
3,robert.brown@example.com,Robert,2023-02-10 11:05:45,2023-02-10 11:05:45,2023-02-10 11:05:45,False,True
1,john.doe@example.com,John,2023-01-15 08:30:00,2023-01-15 08:30:00,2023-02-15 08:30:00,False,True
