In [0]:
# Setup cell: make sure the Delta table that receives data-quality results exists.
# IF NOT EXISTS makes this safe to re-run; an existing table is left untouched.
audit_table_ddl = """
CREATE TABLE IF NOT EXISTS workspace.default.etl_audit_results (
  run_id       STRING,
  pipeline     STRING,
  entity       STRING,
  check_name   STRING,
  status       STRING,   -- PASS / FAIL
  metric_value STRING,   -- store counts/values as string
  details      STRING,
  created_at   TIMESTAMP
)
USING DELTA
"""
spark.sql(audit_table_ddl)

In [0]:
from datetime import datetime, timezone

import pyspark.sql.functions as F

# Run-level identifiers that log_check() stamps onto every audit row.
# run_id is the UTC start time at one-second resolution (YYYYMMDD_HHMMSS).
# datetime.now(timezone.utc) is used instead of datetime.utcnow(), which is
# deprecated since Python 3.12 and returns a naive (zone-less) value; the
# formatted string is the same.
run_id = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
pipeline = "demo_pipeline"
entity = "silver_orders"

# Load the table under test. The table name is derived from `entity` so the
# label written to the audit rows cannot drift from the table actually checked.
df = spark.table(f"workspace.default.{entity}")

def log_check(check_name: str, status: str, metric_value: str, details: str = "") -> None:
    """Append one data-quality result row to ``workspace.default.etl_audit_results``.

    The row is tagged with the notebook-level ``run_id``, ``pipeline`` and
    ``entity`` globals and stamped with the current UTC time.

    Args:
        check_name: Identifier of the check, e.g. ``"row_count"``.
        status: Outcome label. The final gate of this notebook counts rows
            whose status is exactly ``"FAIL"``.
        metric_value: Measured value. Stored as text: non-string values are
            converted with ``str()`` and ``None`` is stored as NULL.
        details: Optional free-text description of the check.
    """
    # Explicit schema (same columns/types as the audit table DDL) instead of
    # inference: inference fails on None values and would produce a non-string
    # column if a caller passed a number as metric_value.
    audit_schema = (
        "run_id STRING, pipeline STRING, entity STRING, check_name STRING, "
        "status STRING, metric_value STRING, details STRING, created_at TIMESTAMP"
    )

    # Timezone-aware "now": PySpark converts naive datetimes using the driver's
    # local zone, so a naive utcnow() value is only correct on a UTC driver.
    created_at = datetime.now(timezone.utc)
    metric_text = None if metric_value is None else str(metric_value)

    record = (run_id, pipeline, entity, check_name, status, metric_text, details, created_at)
    audit_df = spark.createDataFrame([record], schema=audit_schema)
    audit_df.write.mode("append").saveAsTable("workspace.default.etl_audit_results")

# Data-quality checks on `df`. Each check writes exactly one audit row through
# log_check(); count-based checks FAIL when at least one offending row/key exists.

# Row count (informational: always logged as PASS)
row_count = df.count()
log_check("row_count", "PASS", str(row_count), "Total rows in silver_orders")
print("Row count:", row_count)

# Null check on key columns: rows where CustomerId or OrderId is NULL
null_check = df.filter(F.col("CustomerId").isNull() | F.col("OrderId").isNull()).count()
log_check("null_check", "PASS" if null_check == 0 else "FAIL", str(null_check), "Null count in silver_orders")
print("Null check:", null_check)

# Duplicate check on the business key: number of (CustomerId, OrderId) pairs
# that occur more than once (counts keys, not surplus rows)
duplicat_check = df.groupBy("CustomerId", "OrderId").count().filter(F.col("count") > 1).count()
log_check("duplicate_check", "PASS" if duplicat_check == 0 else "FAIL", str(duplicat_check), "Duplicate count in silver_orders")
print("Duplicate check:", duplicat_check)

# Amount should be non-negative (NULL amounts are not counted by this filter)
amount_check = df.filter(F.col("Amount") < 0).count()
log_check("amount_check", "PASS" if amount_check == 0 else "FAIL", str(amount_check), "Negative amount count in silver_orders")
print("Amount check:", amount_check)

# Freshness indicator: latest OrderDate. No maximum age is enforced here, but a
# missing value (empty table or all-NULL OrderDate) is reported as FAIL instead
# of a silent PASS with metric "None".
max_order_date = df.agg(F.max("OrderDate").alias("max_dt")).first()["max_dt"]
freshness_status = "FAIL" if max_order_date is None else "PASS"
log_check("max_order_date", freshness_status, str(max_order_date), "Freshness indicator")
print("Max OrderDate:", max_order_date)

# Results of this run only. run_id has one-second resolution and the audit
# table carries pipeline/entity columns, so another pipeline writing to it in
# the same second would share the run_id; match all three to keep the gate
# below scoped to this notebook's own rows.
audit = spark.table("workspace.default.etl_audit_results")
run_results = audit.filter(
    (F.col("run_id") == run_id)
    & (F.col("pipeline") == pipeline)
    & (F.col("entity") == entity)
)
display(run_results.orderBy("created_at"))

fails = run_results.filter(F.col("status") == "FAIL").count()

# Fail the notebook (and any job running it) when at least one check failed.
if fails > 0:
    raise Exception(f"Data Quality checks failed: {fails} checks failed. See etl_audit_results for run_id={run_id}")
