In [0]:
%run ../functions/utils_silver

In [0]:
from pyspark.sql.functions import col, year, month, dayofmonth, sum, when

In [0]:
# Fully qualified name (schema.table) of the silver table that this notebook
# creates (CREATE TABLE IF NOT EXISTS) and later writes the enriched rows into.
target_table = "milbom_silver.bakehouse_transactions_enrich"


In [0]:
%sql
-- Make sure the silver schema exists before the target table is created in it.
-- IF NOT EXISTS keeps this cell safe to re-run.
CREATE SCHEMA IF NOT EXISTS milbom_silver;


In [0]:
# Create the enriched-transactions silver table if it does not exist yet.
# The statement is a no-op when the table is already present, so re-running the
# notebook does not alter an existing table definition.
#
# Columns fall into five groups: transaction keys and facts, derived date parts
# (year/month/day), product category, denormalized store and customer
# attributes, and ingestion metadata carried over from the transaction row.
#
# NOTE(review): the PRIMARY KEY on transactionID is presumably informational
# only on this platform (not enforced) — uniqueness would then rely on the merge
# on transactionID performed at write time; confirm.
# NOTE(review): the `category` column comment says 'made or import', while the
# enrichment cell in this notebook writes the values 'Made' / 'Imported' —
# consider aligning the comment with the stored values.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {target_table} (
    transactionID BIGINT PRIMARY KEY COMMENT 'Unique identifier for each sales transaction',
    customerID BIGINT COMMENT 'Unique identifier of the customer',
    franchiseID BIGINT COMMENT 'Unique identifier of the sales point',
    dateTime TIMESTAMP COMMENT 'Date and time when the transaction occurred',
    year BIGINT COMMENT 'year when the transaction occurred',
    month BIGINT COMMENT 'month when the transaction occurred',
    day BIGINT COMMENT 'day when the transaction occurred',
    product STRING COMMENT 'Name of the product sold',
    category STRING COMMENT 'Category: made or import',
    quantity BIGINT COMMENT 'Number of units sold',
    unitPrice DECIMAL(10,2) COMMENT 'Price per unit',
    totalPrice DECIMAL(10,2) COMMENT 'Total transaction amount',
    paymentMethod STRING COMMENT 'Payment method used',
    store_name STRING COMMENT 'Name of the store',
    store_city STRING COMMENT 'City where the store is located',
    store_country STRING COMMENT 'Country of the store',
    store_size STRING COMMENT 'Size of the store (e.g., small, medium, large)',
    customer_city STRING COMMENT 'Customer city of residence',
    customer_state STRING COMMENT 'Customer state of residence',
    customer_country STRING COMMENT 'Customer country of residence',
    _ingestion_timestamp TIMESTAMP COMMENT 'Timestamp of data ingestion of Transaction',
    _ingestion_date DATE COMMENT 'Date of ingestion of Transaction'
)
USING DELTA
COMMENT 'Silver layer table containing the latest version of sales transactions enrichment';
""")


In [0]:
# Silver-layer inputs, in order: the sales transactions to enrich, then the
# customer and store tables whose attributes are joined onto each transaction.
silver_sources = [
    "milbom_silver.bakehouse_sales_transactions",
    "milbom_silver.bakehouse_customers",
    "milbom_silver.bakehouse_store",
]
df_tx, df_customers, df_stores = [spark.table(name) for name in silver_sources]



In [0]:
# Product -> category lookup. Any product not listed here (or a NULL product)
# ends up with a NULL category, because the `when` chain has no fallback branch.
made_products = [
    "Outback Oatmeal",
    "Orchard Oasis",
    "Golden Gate Ginger",
    "Pearly Pies",
]
imported_products = [
    "Tokyo Tidbits",
    "Austin Almond Biscotti",
]

# Enrich every sales transaction with customer and store attributes plus
# derived date parts. Left joins keep transactions whose customer or store has
# no match; the corresponding enrichment columns are then NULL.
df_enriched = (
    df_tx.alias("tx")
    .join(
        df_customers.alias("c"),
        col("tx.customerID") == col("c.customerID"),
        "left"
    )
    .join(
        df_stores.alias("s"),
        col("tx.franchiseID") == col("s.franchiseID"),
        "left"
    )
    .select(
        col("tx.transactionID"),
        col("tx.dateTime"),
        col("tx.product"),
        (
            when(col("tx.product").isin(made_products), "Made")
            .when(col("tx.product").isin(imported_products), "Imported")
        ).alias("category"),
        col("tx.quantity"),
        col("tx.unitPrice"),
        col("tx.totalPrice"),
        col("tx.paymentMethod"),

        # Customer attributes
        col("tx.customerID"),
        col("c.city").alias("customer_city"),
        col("c.state").alias("customer_state"),
        col("c.country").alias("customer_country"),

        # Store attributes
        col("tx.franchiseID"),
        col("s.name").alias("store_name"),
        col("s.city").alias("store_city"),
        col("s.country").alias("store_country"),
        col("s.size").alias("store_size"),

        # Time attributes.
        # year()/month()/dayofmonth() return 32-bit integers, while the target
        # table declares these columns as BIGINT; cast explicitly so the
        # DataFrame schema matches the table regardless of how the writer
        # (merge vs. append) handles type widening.
        year(col("tx.dateTime")).cast("bigint").alias("year"),
        month(col("tx.dateTime")).cast("bigint").alias("month"),
        dayofmonth(col("tx.dateTime")).cast("bigint").alias("day"),

        # Ingestion metadata (taken from the transaction row)
        col("tx._ingestion_timestamp"),
        col("tx._ingestion_date")
    )
)

# Rows are matched between the existing table ("target") and the new batch
# ("source") on the transaction identifier.
merge_condition = "target.transactionID = source.transactionID"

# write_silver_table is not defined in this notebook — presumably it comes from
# the %run of ../functions/utils_silver; confirm there how merge_condition and
# optimize=True are applied.
write_silver_table(
    df=df_enriched,
    target_table=target_table,
    merge_condition=merge_condition,
    optimize=True
)
