In [0]:
%run "./01_Project_Config"

In [0]:
from pyspark.sql.functions import *

In [0]:
def ingest_bronze_validated(source_path, table_name, mandatory_cols):
    """Stream CSV files from ``source_path`` into a bronze Delta table.

    The files are read with Auto Loader (``cloudFiles``), the resulting stream
    schema is checked for every name in ``mandatory_cols``, the metadata
    columns ``ingest_ts`` and ``source_file`` are appended, and the stream is
    written to ``{catalog}.bronze.{table_name}``.

    Relies on the notebook-level names ``spark``, ``paths`` and ``catalog``
    (presumably provided by the ``%run`` of the project config notebook).

    Args:
        source_path: Location handed to the stream reader's ``load``.
        table_name: Bronze table name. Also appended directly to
            ``paths['checkpoints']`` to build the schema and checkpoint
            locations, so that value is expected to end with a separator.
        mandatory_cols: Column names that must appear in the stream schema.
            The comparison is an exact, case-sensitive string match.

    Returns:
        The object returned by ``writeStream ... toTable(...)``, i.e. the
        handle of the started streaming query.

    Raises:
        ValueError: If one or more mandatory columns are absent; nothing is
            written in that case.
    """
    print(f":) Ingesting {table_name}...")
    print(f"   => Checking for Critical Columns: {mandatory_cols}")

    # 1. Build the Auto Loader reader. Options are applied in this order.
    autoloader_options = {
        "cloudFiles.format": "csv",
        "header": "true",
        "cloudFiles.inferColumnTypes": "true",
        "cloudFiles.schemaLocation": f"{paths['checkpoints']}{table_name}_schema",
        "cloudFiles.schemaEvolutionMode": "addNewColumns",
    }
    stream_reader = spark.readStream.format("cloudFiles")
    for option_name, option_value in autoloader_options.items():
        stream_reader = stream_reader.option(option_name, option_value)
    raw_stream = stream_reader.load(source_path)

    # 2. Validation guard: stop before any write if a required column is absent.
    available = raw_stream.schema.names
    absent = [required for required in mandatory_cols if required not in available]
    if absent:
        raise ValueError(f":) STOP! {table_name} invalid. Missing columns: {absent}")
    print("   => Validation Passed. Schema looks good.")

    # 3. Add lineage metadata: ingestion time and originating file path.
    with_ingest_ts = raw_stream.withColumn("ingest_ts", current_timestamp())
    enriched_stream = with_ingest_ts.withColumn("source_file", col("_metadata.file_path"))

    # 4. Append to the bronze table. ``mergeSchema`` pairs with the
    #    ``addNewColumns`` evolution mode above; ``availableNow`` makes this a
    #    run-to-completion job rather than a continuously running stream.
    stream_writer = (
        enriched_stream.writeStream
        .format("delta")
        .option("checkpointLocation", f"{paths['checkpoints']}{table_name}")
        .outputMode("append")
        .option("mergeSchema", "true")
        .trigger(availableNow=True)
    )
    return stream_writer.toTable(f"{catalog}.bronze.{table_name}")

In [0]:
# Fact feeds: each call validates the feed's required columns, starts the
# bronze stream and returns the query handle that is awaited in a later cell.

# 1. POS Sales
cols_sales = [
    "txn_id",
    "txn_ts",
    "channel",
    "store_id",
    "customer_id",
    "product_id",
    "quantity",
    "unit_price",
    "discount_pct",
    "payment_type",
    "currency",
    "city",
    "promo_code",
]
q1 = ingest_bronze_validated(paths["pos_sales"], "pos_sales", cols_sales)

# 2. Returns
cols_returns = [
    "return_id",
    "txn_id",
    "product_id",
    "return_qty",
    "return_ts",
    "return_reason",
    "channel",
]
q2 = ingest_bronze_validated(paths["returns"], "returns", cols_returns)

# 3. Inventory
cols_inventory = [
    "snapshot_date",
    "store_id",
    "product_id",
    "stock_on_hand",
    "reorder_point",
    "shrinkage_qty",
    "stockout_flag",
]
q3 = ingest_bronze_validated(paths["inventory"], "inventory", cols_inventory)

In [0]:
# 4. Dimensions Data
# Same pattern as the fact feeds: validate required columns, start the bronze
# stream, keep the query handle for the await cell.

# Product
cols_prod = [
    "product_id",
    "product_name",
    "list_price",
    "brand",
    "category",
    "cost_price",
    "launch_date",
]
q4 = ingest_bronze_validated(paths["products"], "dim_product", cols_prod)

# Store
cols_store = [
    "store_id",
    "store_name",
    "city",
    "store_type",
    "region",
]
q5 = ingest_bronze_validated(paths["stores"], "dim_store", cols_store)

# Customer
cols_cust = [
    "customer_id",
    "name",
    "email",
    "phone",
    "age_band",
    "city",
    "loyalty_tier",
]
q6 = ingest_bronze_validated(paths["customers"], "dim_customer", cols_cust)

# Promo
cols_promo = [
    "promo_code",
    "promo_type",
    "discount_rule",
    "start_date",
    "end_date",
]
q7 = ingest_bronze_validated(paths["promotions"], "dim_promotion", cols_promo)

In [0]:
print(" (: Waiting for streams to complete...")

# Block on every bronze query in the order it was started; the completion
# message below is only printed once all seven have terminated.
for bronze_query in (q1, q2, q3, q4, q5, q6, q7):
    bronze_query.awaitTermination()

print("=> Bronze Ingestion Complete (Permissive Mode).")

In [0]:
%sql
-- Preview the POS sales rows in the bronze layer.
-- NOTE(review): the catalog is hard-coded here, while the ingestion writes to
-- the `catalog` variable - confirm both point at the same catalog.
SELECT *
FROM retail_lakehouse.bronze.pos_sales;

In [0]:
%sql
-- Preview the returns rows in the bronze layer.
-- NOTE(review): the catalog is hard-coded here, while the ingestion writes to
-- the `catalog` variable - confirm both point at the same catalog.
SELECT *
FROM retail_lakehouse.bronze.returns;

In [0]:
%sql
-- Preview the inventory snapshot rows in the bronze layer.
-- NOTE(review): the catalog is hard-coded here, while the ingestion writes to
-- the `catalog` variable - confirm both point at the same catalog.
SELECT *
FROM retail_lakehouse.bronze.inventory;