# Chicago Taxi – Quality & Governance (DBCE)

Purpose: Validate medallion outputs (bronze/silver/gold) with simple assertions and metadata inspection.

Exam coverage:
- Governance & Quality (checks, lineage signals).
- Delta fundamentals (DESCRIBE DETAIL, history).
- Productionizing mindset (fail fast).

In [0]:
# Core imports for quality & governance
from pyspark.sql.functions import col, count, sum as _sum, avg
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

# Reuse the active SparkSession when one exists; otherwise create one.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Pipeline parameters exposed as notebook widgets: (widget name, default value)
dbutils.widgets.text("catalog", "taxi_catalog")
dbutils.widgets.text("schema", "taxi_schema")
dbutils.widgets.text("volume", "taxi_volume")

# Resolve the current widget values (catalog, then schema, then volume)
catalog_name, schema_name, volume_name = (
    dbutils.widgets.get(widget) for widget in ("catalog", "schema", "volume")
)

# Storage root on the volume, with one folder per medallion layer
base_path = "/".join(("/Volumes", catalog_name, schema_name, volume_name))
bronze_path = base_path + "/bronze"
silver_path = base_path + "/silver"
gold_path = base_path + "/gold"

# Delta table locations for each layer
bronze_table_path = bronze_path + "/chicago_taxi_trips"
silver_table_path = silver_path + "/chicago_taxi_trips_silver"
gold_table_path = gold_path + "/chicago_taxi_kpis"

# UC-safe view names in <catalog>.<schema>.<view> form
bronze_view_name = ".".join((catalog_name, schema_name, "chicago_taxi_bronze_v"))
silver_view_name = ".".join((catalog_name, schema_name, "chicago_taxi_silver_v"))
gold_view_name = ".".join((catalog_name, schema_name, "chicago_taxi_gold_v"))


In [0]:
# Read the layers through their views (governance prefers stable interfaces)
bronze_df = spark.table(bronze_view_name)
silver_df = spark.table(silver_view_name)
gold_df = spark.table(gold_view_name)


def _require(condition, message):
    """Raise AssertionError(message) when condition is falsy.

    A bare ``assert`` statement is removed when Python runs with ``-O``;
    raising explicitly keeps every quality gate active while preserving the
    exception type the original checks produced.
    """
    if not condition:
        raise AssertionError(message)


def _negative_amount_probe(df):
    """Return 1 if df has a row with negative fare/tips/tolls/extras, else 0.

    ``limit(1)`` lets Spark stop at the first offending row instead of
    counting all of them.
    """
    has_negative = (
        (col("fare") < 0)
        | (col("tips") < 0)
        | (col("tolls") < 0)
        | (col("extras") < 0)
    )
    return df.filter(has_negative).limit(1).count()


# 1) Existence checks (fail fast)
_require(bronze_df.count() > 0, "Empty bronze view")
# Keep the silver row count: the NULL sanity check below needs it, and
# calling count() again would trigger another full Spark job.
silver_row_count = silver_df.count()
_require(silver_row_count > 0, "Empty silver view")
_require(gold_df.count() > 0, "Empty gold view")

# 2) Simple quality rules (non-negative amounts)
neg_bronze = _negative_amount_probe(bronze_df)
neg_silver = _negative_amount_probe(silver_df)
neg_gold = gold_df.filter(col("total_revenue") < 0).limit(1).count()

_require(neg_bronze == 0, "Negative monetary values in bronze")
_require(neg_silver == 0, "Negative monetary values in silver")
_require(neg_gold == 0, "Negative monetary values in gold")

# 3) Null sanity on business-critical columns (not exhaustive).
# All three NULL counts are computed in a single aggregation pass.
null_silver = silver_df.select(
    _sum(col("trip_start_timestamp").isNull().cast("int")).alias("null_trip_start"),
    _sum(col("payment_type").isNull().cast("int")).alias("null_payment_type"),
    _sum(col("company").isNull().cast("int")).alias("null_company"),
).collect()[0]
# Only trip_start_timestamp is gated: the check fails when every row is NULL.
# payment_type/company may be NULL; silver set defaults in a previous step.
_require(
    null_silver["null_trip_start"] < silver_row_count,
    "All trip_start_timestamp are NULL in silver",
)

# 4) Delta metadata: DESCRIBE DETAIL + last 5 history entries (silver, gold)
display(spark.sql(f"DESCRIBE DETAIL delta.`{silver_table_path}`"))
display(spark.sql(f"DESCRIBE DETAIL delta.`{gold_table_path}`"))

display(DeltaTable.forPath(spark, silver_table_path).history(5))
display(DeltaTable.forPath(spark, gold_table_path).history(5))

print("Quality & governance checks passed")