In [1]:
spark
# our spark session is available as spark variable

# DDL tables

Our catalog is named prod.

## Upstream tables

In [2]:
spark.sql("show databases;").show()

+---------+
|namespace|
+---------+
+---------+



In [49]:
spark.sql("DROP TABLE IF EXISTS prod.db.orders")
spark.sql("DROP TABLE IF EXISTS prod.db.orders")
spark.sql("DROP TABLE IF EXISTS prod.db.orders")

DataFrame[]

In [50]:
# Table DDL for OLTP tables
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.customer (
    customer_id INT,
    email STRING,
    first_name STRING,
    last_name STRING,
    phone STRING,
    status STRING,
    is_verified BOOLEAN,
    registration_date TIMESTAMP,
    last_login_date TIMESTAMP,
    datetime_created TIMESTAMP,
    datetime_updated TIMESTAMP
) USING iceberg
TBLPROPERTIES (
    'format-version' = '2'
)""")


spark.sql("""
-- Profile table (child to customer - many profiles per customer)
CREATE TABLE IF NOT EXISTS prod.db.profile (
    profile_id INT,
    customer_id INT,
    profile_name STRING,
    profile_type STRING,
    address_line1 STRING,
    address_line2 STRING,
    city STRING,
    state STRING,
    postal_code STRING,
    country STRING,
    is_default BOOLEAN,
    datetime_created TIMESTAMP,
    datetime_updated TIMESTAMP
) USING iceberg
TBLPROPERTIES (
    'format-version' = '2'
);""")

spark.sql("""
-- Orders table (references profile)
CREATE TABLE IF NOT EXISTS prod.db.orders (
    order_id INT,
    customer_id INT,
    order_date TIMESTAMP,
    order_status STRING,
    payment_method STRING,
    payment_status STRING,
    subtotal DECIMAL(10, 2),
    tax_amount DECIMAL(10, 2),
    shipping_amount DECIMAL(10, 2),
    discount_amount DECIMAL(10, 2),
    total_amount DECIMAL(10, 2),
    currency STRING,
    shipping_method STRING,
    tracking_number STRING,
    notes STRING,
    datetime_created TIMESTAMP
) USING iceberg
TBLPROPERTIES (
    'format-version' = '2'
);
""") 


DataFrame[]

In [51]:
# Insert some fake data for the OLTP tables
spark.sql("""
-- Insert sample customers
INSERT INTO prod.db.customer VALUES
  (1, 'john.doe@example.com', 'John', 'Doe', '555-123-4567', 'active', true, TIMESTAMP '2023-01-15 08:30:00', TIMESTAMP '2023-03-20 14:22:15', TIMESTAMP '2023-01-15 08:30:00', TIMESTAMP '2023-03-20 14:22:15'),
  (2, 'jane.smith@example.com', 'Jane', 'Smith', '555-987-6543', 'active', true, TIMESTAMP '2023-02-05 12:45:00', TIMESTAMP '2023-03-18 09:10:30', TIMESTAMP '2023-02-05 12:45:00', TIMESTAMP '2023-03-18 09:10:30'),
  (3, 'robert.brown@example.com', 'Robert', 'Brown', '555-456-7890', 'inactive', false, TIMESTAMP '2023-01-25 15:20:00', TIMESTAMP '2023-02-10 11:05:45', TIMESTAMP '2023-01-25 15:20:00', TIMESTAMP '2023-02-10 11:05:45');
""")

spark.sql("""
-- Insert sample profiles
INSERT INTO prod.db.profile VALUES
  (101, 1, 'Home Address', 'home', '123 Main St', 'Apt 4B', 'New York', 'NY', '10001', 'US', true, TIMESTAMP '2023-01-15 08:35:00', TIMESTAMP '2023-01-15 08:35:00'),
  (102, 1, 'Work Address', 'work', '456 Business Ave', 'Suite 200', 'New York', 'NY', '10002', 'US', false, TIMESTAMP '2023-02-10 09:20:00', TIMESTAMP '2023-02-10 09:20:00'),
  (103, 2, 'Shipping Address', 'shipping', '789 Residential Blvd', NULL, 'Los Angeles', 'CA', '90001', 'US', true, TIMESTAMP '2023-02-05 12:50:00', TIMESTAMP '2023-02-05 12:50:00'),
  (104, 2, 'Billing Address', 'billing', '101 Finance St', '15th Floor', 'Los Angeles', 'CA', '90002', 'US', false, TIMESTAMP '2023-02-05 12:55:00', TIMESTAMP '2023-02-05 12:55:00'),
  (105, 3, 'Home Address', 'home', '202 Cedar Lane', NULL, 'Chicago', 'IL', '60601', 'US', true, TIMESTAMP '2023-01-25 15:25:00', TIMESTAMP '2023-01-25 15:25:00');
""")

spark.sql("""
-- Insert sample orders
INSERT INTO prod.db.orders VALUES
  (1001, 1, TIMESTAMP '2023-02-01 10:15:00', 'delivered', 'credit_card', 'paid', 89.99, 7.20, 5.99, 0.00, 103.18, 'USD', 'standard', 'TRK123456789', NULL, TIMESTAMP '2023-02-01 10:15:00'),
  (1002, 1, TIMESTAMP '2023-03-10 14:30:00', 'shipped', 'paypal', 'paid', 45.50, 3.64, 5.99, 5.00, 50.13, 'USD', 'express', 'TRK987654321', NULL, TIMESTAMP '2023-03-10 14:30:00'),
  (1003, 2, TIMESTAMP '2023-02-20 09:45:00', 'delivered', 'credit_card', 'paid', 129.95, 10.40, 9.99, 15.00, 135.34, 'USD', 'standard', 'TRK456789123', 'Please leave at front door', TIMESTAMP '2023-02-20 09:45:00'),
  (1004, 2, TIMESTAMP '2023-03-15 16:20:00', 'processing', 'apple_pay', 'paid', 75.25, 6.02, 5.99, 0.00, 87.26, 'USD', 'standard', NULL, NULL, TIMESTAMP '2023-03-15 16:20:00'),
  (1005, 3, TIMESTAMP '2023-02-05 11:10:00', 'cancelled', 'credit_card', 'refunded', 199.99, 16.00, 12.99, 20.00, 208.98, 'USD', 'overnight', NULL, 'Customer requested cancellation', TIMESTAMP '2023-02-05 11:10:00');
  """)

DataFrame[]

## Dimensional tables

In [77]:
spark.sql("DROP TABLE IF EXISTS prod.db.dim_customer")
spark.sql("DROP TABLE IF EXISTS prod.db.fct_orders")

DataFrame[]

In [78]:
spark.sql("""
-- Customer Dimension Table with profiles as array of structs
CREATE TABLE IF NOT EXISTS prod.db.dim_customer (
    customer_id INT,
    email STRING,
    first_name STRING,
    last_name STRING,
    phone STRING,
    status STRING,
    is_verified BOOLEAN,
    registration_date TIMESTAMP,
    last_login_date TIMESTAMP,
    
    -- Profiles as array of structs
    profiles ARRAY<STRUCT<
        profile_id: INT,
        profile_name: STRING,
        profile_type: STRING,
        address_line1: STRING,
        address_line2: STRING,
        city: STRING,
        state: STRING,
        postal_code: STRING,
        country: STRING,
        is_default: BOOLEAN,
        datetime_created: TIMESTAMP,
        datetime_updated: TIMESTAMP
    >>,
    
    datetime_created TIMESTAMP,
    datetime_updated TIMESTAMP
) USING iceberg
TBLPROPERTIES (
    'format-version' = '2'
);""")


spark.sql("""
-- Orders Fact Table
CREATE TABLE IF NOT EXISTS prod.db.fct_orders (
    order_id INT,
    
    -- Foreign keys
    customer_id INT,
    
    -- Time dimensions
    order_date TIMESTAMP,
    
    -- Order attributes
    order_status STRING,
    payment_method STRING,
    payment_status STRING,
    
    -- Order metrics
    subtotal DECIMAL(10, 2),
    tax_amount DECIMAL(10, 2),
    shipping_amount DECIMAL(10, 2),
    discount_amount DECIMAL(10, 2),
    total_amount DECIMAL(10, 2),
    
    -- Order details
    currency STRING,
    shipping_method STRING,
    tracking_number STRING,
    notes STRING

) USING iceberg
TBLPROPERTIES (
    'format-version' = '2'
);
""")

DataFrame[]

## Gold tables

### OBT

In [53]:
spark.sql("DROP TABLE IF EXISTS prod.db.obt_orders")

DataFrame[]

In [54]:
# Order Business Table (OBT) - Denormalized view of orders with customer information
spark.sql("""
CREATE TABLE IF NOT EXISTS prod.db.obt_orders (
    -- Order primary key
    order_id INT,
    
    -- Order date and time attributes
    order_date TIMESTAMP,
    order_year INT,
    order_month INT,
    order_day INT,
    order_dayofweek INT,
    
    -- Order attributes
    order_status STRING,
    payment_method STRING,
    payment_status STRING,
    shipping_method STRING,
    tracking_number STRING,
    notes STRING,
    
    -- Order metrics
    subtotal DECIMAL(10, 2),
    tax_amount DECIMAL(10, 2),
    shipping_amount DECIMAL(10, 2),
    discount_amount DECIMAL(10, 2),
    total_amount DECIMAL(10, 2),
    
    -- Customer information
    
    customer ARRAY<STRUCT<
    customer_id: INT,
    email: STRING,
    first_name: STRING,
    last_name: STRING,
    phone: STRING,
    customer_status: STRING,
    is_verified: BOOLEAN,
    registration_date: TIMESTAMP,
    last_login_date: TIMESTAMP,
    -- Profiles as array of structs
    profiles ARRAY<STRUCT<
        profile_id: INT,
        profile_name: STRING,
        profile_type: STRING,
        address_line1: STRING,
        address_line2: STRING,
        city: STRING,
        state: STRING,
        postal_code: STRING,
        country: STRING,
        is_default: BOOLEAN,
        datetime_created: TIMESTAMP,
        datetime_updated: TIMESTAMP
    >>
    >>,
    
    -- ETL metadata
    source_system STRING,
    etl_batch_id STRING,
    etl_inserted TIMESTAMP,
    etl_updated TIMESTAMP
) USING iceberg
TBLPROPERTIES (
    'format-version' = '2'
);
""")

DataFrame[]

# Table

## Schema evolution

In [8]:
customer_df = spark.table("prod.db.customer")
profile_df = spark.table("prod.db.profile")

In [9]:
from pyspark.sql import functions as F

In [100]:
profile_df.columns

['profile_id',
 'customer_id',
 'profile_name',
 'profile_type',
 'address_line1',
 'address_line2',
 'city',
 'state',
 'postal_code',
 'country',
 'is_default',
 'datetime_created',
 'datetime_updated']

In [101]:
new_columns = [c for c in profile_df.columns if 'customer_id' not in c]
new_columns

['profile_id',
 'profile_name',
 'profile_type',
 'address_line1',
 'address_line2',
 'city',
 'state',
 'postal_code',
 'country',
 'is_default',
 'datetime_created',
 'datetime_updated']

In [102]:
existing_cols = new_columns[0:11:2]
existing_cols

['profile_id',
 'profile_type',
 'address_line2',
 'state',
 'country',
 'datetime_created']

In [103]:
cols_to_add = list(set(new_columns) - set(existing_cols))
cols_to_add # alter table; 

['postal_code',
 'profile_name',
 'city',
 'datetime_updated',
 'is_default',
 'address_line1']

In [104]:
new_cols_order = existing_cols + cols_to_add
new_cols_order # if existing cols are removed; they will be null going forward

['profile_id',
 'profile_type',
 'address_line2',
 'state',
 'country',
 'datetime_created',
 'postal_code',
 'profile_name',
 'city',
 'datetime_updated',
 'is_default',
 'address_line1']

In [78]:
profile_df.groupBy(profile_df.customer_id).agg(
    F.collect_list(
        F.struct(sorted([c for c in profile_df.columns if 'customer_id' not in c]))
    ).alias("profile_info")
).show(truncate=False)

+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|customer_id|profile_info                                                                                                                                                                                                                                                                                                                                                                                                                      

In [130]:
dim_customer = spark.table("prod.db.dim_customer")
[c.name for c in dim_customer.select("profiles").schema.fields[0].dataType.elementType.fields] # get existing schema
#[elt for elt in [r.dataType for r in dim_customer.select("profiles").schema]]

['profile_id',
 'profile_name',
 'profile_type',
 'address_line1',
 'address_line2',
 'city',
 'state',
 'postal_code',
 'country',
 'is_default',
 'datetime_created',
 'datetime_updated']

In [134]:
def get_struct_cols_to_add(new_col_list, old_col_list):
    return sorted(list(set(new_col_list) - set(old_col_list)))

last_col = old_col_list[-1]
for new_col in get_struct_cols_to_add(new_col_list, old_col_list):
    ddl = f"ALTER TABLE prod.db.sample ADD COLUMN nested.new_column bigint FIRST"
    spark.sql(ddl)

In [10]:
[(c.jsonValue().get('name'), c.jsonValue().get('type')) for c in spark.table("prod.db.dim_customer").select("profiles").schema.fields[0].dataType.elementType.fields] 

[('profile_id', 'integer'),
 ('profile_name', 'string'),
 ('profile_type', 'string'),
 ('address_line1', 'string'),
 ('address_line2', 'string'),
 ('city', 'string'),
 ('state', 'string'),
 ('postal_code', 'string'),
 ('country', 'string'),
 ('is_default', 'boolean'),
 ('datetime_created', 'timestamp'),
 ('datetime_updated', 'timestamp')]

In [79]:
from pyspark.sql import functions as F

class DimCustomer:
    def extract_upstream(self, start_ts, end_ts):
        print("Starting EXRACT...")
        customer_df = spark.table("prod.db.customer").where(F.col("datetime_updated").between(start_ts, end_ts))
        profile_df = spark.table("prod.db.profile").where(F.col("datetime_updated").between(start_ts, end_ts))
        return {
            "customer": customer_df,
            "profile": profile_df
        }
    
    def transform(self, input_dfs):
        print("Starting TRANSFORM...")
        customer_df = input_dfs.get("customer")
        profile_df = input_dfs.get("profile")
        profile_struct_columns = [c for c in profile_df.columns if 'customer_id' not in c]

        customer_cols = customer_df.columns

        grouped_profile_id = profile_df.groupBy(
            F.col("customer_id")
        ).agg(
            F.collect_list(
                F.struct(profile_struct_columns)
            )
        .alias("profiles"))
        
        return customer_df.join(
            grouped_profile_id,
            on="customer_id"
        ).select(
            *customer_cols
            ,grouped_profile_id['profiles']
        )
    
    def load(self, transformed_df):
        print("Starting LOAD...")
        existing_struct_cols = [c.name for c in spark.table("prod.db.dim_customer").select("profiles").schema.fields[0].dataType.elementType.fields] # get existing schema
        #new_struct_cols = [c.name for c in transformed_df.select("profiles").schema.fields[0].dataType.elementType.fields]
        new_col_data_types = [(c.jsonValue().get('name'), c.jsonValue().get('type')) for c in transformed_df.select("profiles").schema.fields[0].dataType.elementType.fields] 

        #new_cols =  sorted(list(set(new_struct_cols) - set(existing_struct_cols)))
        
        last_col = existing_struct_cols[-1]
        for new_col, data_type in new_col_data_types:
            if new_col not in existing_struct_cols:
                print(f'Updating dim_customer profiles schema...adding {new_col}')
                ddl = f"ALTER TABLE prod.db.dim_customer ADD COLUMN profiles.element.{new_col} {data_type} AFTER {last_col}"
                spark.sql(ddl)
                last_col = new_col

        print("Loading in data...")
        transformed_df.writeTo("prod.db.dim_customer").overwritePartitions()
        
    
    def run(self, start_ts, end_ts):
        print("Starting RUN...")
        # Log: run_time, start_ts, end_ts, START state
        self.load(self.transform(self.extract_upstream(start_ts, end_ts)))
        # Log: run_time, start_ts, end_ts, END state

In [80]:
dim_customer = DimCustomer()
dim_customer.run('2022-01-01', '2024-01-01')

Starting RUN...
Starting EXRACT...
Starting TRANSFORM...
Starting LOAD...
Loading in data...


In [82]:
spark.sql("select * from prod.db.dim_customer").show()

+-----------+--------------------+----------+---------+------------+--------+-----------+-------------------+-------------------+--------------------+-------------------+-------------------+
|customer_id|               email|first_name|last_name|       phone|  status|is_verified|  registration_date|    last_login_date|            profiles|   datetime_created|   datetime_updated|
+-----------+--------------------+----------+---------+------------+--------+-----------+-------------------+-------------------+--------------------+-------------------+-------------------+
|          1|john.doe@example.com|      John|      Doe|555-123-4567|  active|       true|2023-01-15 08:30:00|2023-03-20 14:22:15|[{101, Home Addre...|2023-01-15 08:30:00|2023-03-20 14:22:15|
|          1|john.doe@example.com|      John|      Doe|555-123-4567|  active|       true|2023-01-15 08:30:00|2023-03-20 14:22:15|[{101, Home Addre...|2023-01-15 08:30:00|2023-03-20 14:22:15|
|          2|jane.smith@exampl...|      Jane|

## Pipelines time chunked; single-run catchup; backfillable

In [83]:
# The inputs are usually provided by the orchestrator

# The logic depends on implementation; and some orchestrators provide backfill, but not catchup

# start and end times

# log table: catchup flag -> check log table for last end ts and run from them
# Note: Airflow's catchup means it will run the pipeline from pipeline start date (one run per schedule till today)
# backfill: run one time chunk at a time; favored over catchup for its ability to use prior chunks data for 
# current chunks computation

# overwrites by partitions