In [0]:
from datetime import datetime, timezone

import pyspark.sql.functions as F

# Identifier stamped on every audit row written during this run: the UTC time
# at which this cell executed, formatted as YYYYMMDDHHMMSS.
# datetime.utcnow() is deprecated since Python 3.12; datetime.now(timezone.utc)
# produces the same formatted digits.
run_id = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
# Pipeline name recorded alongside each audit row.
pipeline = "demo_pipeline_audit"

def log_check(entity:str,check_name:str,status:str,metric_value, details:str=""):
    row=[(run_id,pipeline,entity,check_name,status,str(metric_value),details,datetime.utcnow())]
    audit_df=spark.createDataFrame(row,["run_id","pipeline","entity","check_name","status","metric_value","details","check_time"])
    audit_df.write.mode("append").saveAsTable("pipeline_audit")

def pass_fail(actual: int, op: str, threshold: int) -> str:
    """Compare ``actual`` against ``threshold`` and return a status label.

    Args:
        actual: Measured value.
        op: One of "==", "<=", ">=", ">", "<".
        threshold: Value ``actual`` is compared with.

    Returns:
        "Pass" when ``actual <op> threshold`` holds, otherwise "Fail".

    Raises:
        ValueError: If ``op`` is not one of the supported operators.
    """
    # Predicates are wrapped in lambdas so only the selected comparison runs.
    comparisons = (
        ("==", lambda: actual == threshold),
        ("<=", lambda: actual <= threshold),
        (">=", lambda: actual >= threshold),
        (">", lambda: actual > threshold),
        ("<", lambda: actual < threshold),
    )
    for symbol, holds in comparisons:
        if op == symbol:
            if holds():
                return "Pass"
            return "Fail"
    raise ValueError(f"Invalid operator {op}")

def check_row_count(df, entity: str):
    """Count the rows of ``df`` and record the total in the audit log.

    The count is informational: it is always logged with status "Pass" and
    no threshold is applied.

    Args:
        df: DataFrame-like object exposing ``count()``.
        entity: Name recorded as the audited entity.

    Returns:
        The row count.
    """
    total_rows = df.count()
    log_check(entity, "row_count", "Pass", total_rows, "Total Rows")
    return total_rows

def check_nulls(df,entity:str,cols:list,threshold:int=0,op:str="=="):
    """Count rows where any of ``cols`` is NULL and log the outcome.

    Args:
        df: Spark DataFrame to inspect.
        entity: Name recorded as the audited entity.
        cols: Column names to test; a row is counted if at least one is NULL.
        threshold: Value the null-row count is compared with.
        op: Comparison operator understood by ``pass_fail``.

    Returns:
        The number of rows with a NULL in at least one of ``cols``.

    Raises:
        ValueError: If ``cols`` is empty, or ``op`` is rejected by ``pass_fail``.
    """
    # Without this guard an empty list would pass None to df.filter and fail
    # with an unrelated-looking error.
    if not cols:
        raise ValueError("check_nulls requires at least one column name")

    # OR together one isNull() predicate per column.
    any_null = None
    for name in cols:
        is_null = F.col(name).isNull()
        any_null = is_null if any_null is None else any_null | is_null

    cnt = df.filter(any_null).count()
    status = pass_fail(cnt, op, threshold)
    # Single quotes inside the f-string: reusing the outer double quote only
    # parses on Python 3.12+, and this matches check_duplicates.
    log_check(entity, f"null_check({','.join(cols)})", status, cnt, f"Nulls in {cols}")
    return cnt

def check_duplicates(df, entity: str, key_cols: list, threshold: int = 0, op: str = "=="):
    """Count key combinations that occur more than once and log the outcome.

    Args:
        df: Spark DataFrame to inspect.
        entity: Name recorded as the audited entity.
        key_cols: Columns that together form the key being checked.
        threshold: Value the duplicate-group count is compared with.
        op: Comparison operator understood by ``pass_fail``.

    Returns:
        The number of distinct key groups having more than one row.
    """
    rows_per_key = df.groupBy(*key_cols).count()
    repeated_keys = rows_per_key.filter(F.col("count") > 1)
    duplicate_groups = repeated_keys.count()

    outcome = pass_fail(duplicate_groups, op, threshold)
    check_label = f"duplicate_keys({','.join(key_cols)})"
    expectation = f"Expected {op} {threshold} duplicate groups"
    log_check(entity, check_label, outcome, duplicate_groups, expectation)
    return duplicate_groups

def check_negative(df, entity: str, colname: str, threshold: int = 0, op: str = "=="):
    """Count rows whose ``colname`` value is below zero and log the outcome.

    Args:
        df: Spark DataFrame to inspect.
        entity: Name recorded as the audited entity.
        colname: Column tested for negative values.
        threshold: Value the negative-row count is compared with.
        op: Comparison operator understood by ``pass_fail``.

    Returns:
        The number of rows where ``colname < 0``.
    """
    is_negative = F.col(colname) < 0
    negative_rows = df.filter(is_negative).count()

    outcome = pass_fail(negative_rows, op, threshold)
    log_check(
        entity,
        f"negative({colname})",
        outcome,
        negative_rows,
        f"Expected {op} {threshold}",
    )
    return negative_rows


# Data-quality checks for the silver_orders table.
silver_orders = spark.table("workspace.default.silver_orders")
entity = "silver_orders"

check_row_count(silver_orders, entity)
check_nulls(silver_orders, entity, ["CustomerId", "OrderId"], threshold=0, op="==")
check_duplicates(silver_orders, entity, ["CustomerId", "OrderId"], threshold=0, op="==")
check_negative(silver_orders, entity, "Amount", threshold=0, op="==")

# Latest OrderDate, logged as an informational metric (no threshold applied).
max_dt = silver_orders.agg(F.max("OrderDate").alias("max_dt")).first()["max_dt"]
# Status is spelled "Pass" (not "PASS") so it matches the labels written by
# pass_fail and check_row_count; mixed casing splits the rows when the audit
# table is filtered or grouped by status.
log_check(entity, "max_order_date", "Pass", max_dt, "Freshness indicator")


# Data-quality checks for the silver_customers table.
silver_customers = spark.table("workspace.default.silver_customers")
entity = "silver_customers"

check_row_count(silver_customers, entity)
check_nulls(silver_customers, entity, ["CustomerId"], threshold=0, op="==")
check_duplicates(silver_customers, entity, ["CustomerId"], threshold=0, op="==")

# Threshold example: tolerate fewer than 5 customers whose name is NULL or
# empty after trimming.
customer_name = F.col("CustomerName")
name_is_missing = customer_name.isNull() | (F.trim(customer_name) == "")
missing_name = silver_customers.filter(name_is_missing).count()
status = pass_fail(missing_name, "<", 5)
log_check(entity, "missing_customer_name", status, missing_name, "Expected < 5")

# Display the audit table.
audit = spark.table("workspace.default.pipeline_audit")
display(audit)

In [0]:
# Reconciliation: silver_orders row count must equal the number of distinct,
# fully non-null (CustomerId, OrderId) pairs in bronze_orders.
bronze_orders = spark.table("workspace.default.bronze_orders")

bronze_distinct_keys = (bronze_orders
  .select("CustomerId","OrderId")
  .dropna()            # drop rows where either key column is NULL
  .dropDuplicates()
  .count()
)

silver_cnt = spark.table("workspace.default.silver_orders").count()

# Go through pass_fail so the status label ("Pass"/"Fail") matches the other
# audit rows instead of a differently cased "PASS"/"FAIL".
status = pass_fail(silver_cnt, "==", bronze_distinct_keys)
log_check(
    "recon_orders",
    "silver_count_equals_bronze_distinct_keys",
    status,
    f"silver={silver_cnt}, bronze_distinct_keys={bronze_distinct_keys}",
    "Expected equality (deduped business keys)"
)

# Reconciliation: silver_customers row count must equal the number of distinct
# non-null CustomerId values in bronze_customers.
bronze_customers = spark.table("workspace.default.bronze_customers")

bronze_distinct_cust = (bronze_customers
  .select("CustomerId")
  .dropna()            # drop rows where CustomerId is NULL
  .dropDuplicates()
  .count()
)

silver_cust_cnt = spark.table("workspace.default.silver_customers").count()

# Go through pass_fail so the status label ("Pass"/"Fail") matches the other
# audit rows instead of a differently cased "PASS"/"FAIL".
status = pass_fail(silver_cust_cnt, "==", bronze_distinct_cust)
log_check(
    "recon_customers",
    "silver_count_equals_bronze_distinct_customerid",
    status,
    f"silver={silver_cust_cnt}, bronze_distinct_customerid={bronze_distinct_cust}",
    "Expected equality (deduped CustomerId)"
)

# Show only the audit rows written by this run, oldest first.
# Reads the table and timestamp column that log_check actually writes
# (pipeline_audit / check_time); the previous etl_audit_results / created_at
# names are not produced anywhere in this notebook.
audit = spark.table("workspace.default.pipeline_audit")
display(audit.filter(F.col("run_id") == run_id).orderBy("check_time"))