In [0]:
# Chicago Taxi – Silver Transformations (DBCE)
Purpose: Normalize the bronze data into a curated, path-based Delta silver dataset and expose it through a Unity Catalog (UC) view.


In [0]:
# Core imports for silver transformations
from pyspark.sql.functions import col, when, coalesce, lit, current_timestamp
from pyspark.sql.types import DoubleType, IntegerType, TimestampType, StringType
from pyspark.sql import SparkSession

# Obtain the Spark session via the builder's getOrCreate(); every read, write
# and SQL statement in this notebook goes through this `spark` handle.
spark = SparkSession.builder.getOrCreate()


In [0]:
# Notebook parameters for pipeline execution
dbutils.widgets.text("catalog", "taxi_catalog")
dbutils.widgets.text("schema", "taxi_schema")
dbutils.widgets.text("volume", "taxi_volume")


def _read_name_widget(widget_name):
    """Return a text widget's value, validated as a single catalog/schema/volume name.

    The value is interpolated into a /Volumes path, into a three-part view
    name, and into a backtick-quoted `delta.` path in SQL, so anything that
    would change the shape of those strings is rejected up front instead of
    failing later, after data has already been read or written.

    Args:
        widget_name: Name of the notebook text widget to read.

    Returns:
        The widget value with surrounding whitespace removed.

    Raises:
        ValueError: If the value is empty, or contains '/', '.', a backtick,
            whitespace, or a non-printable character.
    """
    value = dbutils.widgets.get(widget_name).strip()
    if not value:
        raise ValueError(f"Widget '{widget_name}' must not be empty.")
    has_forbidden_char = any(
        ch in "/.`" or ch.isspace() or not ch.isprintable() for ch in value
    )
    if has_forbidden_char:
        raise ValueError(
            f"Widget '{widget_name}' has an invalid value {value!r}: "
            "it must not contain '/', '.', backticks, whitespace or control characters."
        )
    return value


# Read and validate widget values
catalog_name = _read_name_widget("catalog")
schema_name = _read_name_widget("schema")
volume_name = _read_name_widget("volume")

# Base paths (bronze and silver live side by side under the same volume)
base_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}"
bronze_path = f"{base_path}/bronze"
silver_path = f"{base_path}/silver"

# Entities: bronze input path, silver output path, and the view exposed over silver
bronze_table_path = f"{bronze_path}/chicago_taxi_trips"
silver_table_path = f"{silver_path}/chicago_taxi_trips_silver"
silver_view_name = f"{catalog_name}.{schema_name}.chicago_taxi_silver_v"


In [0]:
# Load the bronze Delta dataset directly from its storage path
bronze_df = spark.read.format("delta").load(bronze_table_path)

# Target type per numeric column; the casts are applied in this order
numeric_casts = {
    "trip_seconds": IntegerType(),
    "trip_miles": DoubleType(),
    "fare": DoubleType(),
    "tips": DoubleType(),
    "tolls": DoubleType(),
    "extras": DoubleType(),
    "trip_total": DoubleType(),
}

# Columns whose nulls are replaced by "UNKNOWN" before being cast to string
unknown_filled_cols = ("payment_type", "company")

# Minimal normalization (example-safe for Databricks CE)
silver_df = bronze_df

# enforce basic types where safe
for column_name, target_type in numeric_casts.items():
    silver_df = silver_df.withColumn(column_name, col(column_name).cast(target_type))

# basic null-handling examples
for column_name in unknown_filled_cols:
    silver_df = silver_df.withColumn(
        column_name,
        coalesce(col(column_name), lit("UNKNOWN")).cast(StringType()),
    )

# processing metadata
silver_df = silver_df.withColumn("processed_ts", current_timestamp())

# Persist silver as Delta at its path. "overwrite" replaces the previous
# contents on every run (idempotent for the demo); use "append" instead for
# incremental loads.
silver_writer = silver_df.write.format("delta").mode("overwrite")
silver_writer.save(silver_table_path)

# Expose a UC VIEW over the Delta path so silver can be queried by name
create_view_sql = f"""
    CREATE OR REPLACE VIEW {silver_view_name} AS
    SELECT * from delta.`{silver_table_path}`
"""
spark.sql(create_view_sql)

# Sanity checks: silver must be non-empty, have the same row count by path and
# through the view, and contain no negative trip_total / fare values.
# NOTE(review): these checks run after the overwrite and view creation, so a
# failing assertion stops the notebook but leaves the written data in place.

# Row counts, taken once by path and once through the view
cnt_path = spark.read.format("delta").load(silver_table_path).count()
cnt_view = spark.table(silver_view_name).count()

# Count rows with negative amounts in a single pass over the view
negative_amounts_sql = f"""
  SELECT
    SUM(CASE WHEN trip_total < 0 THEN 1 ELSE 0 END) AS neg_trip_total,
    SUM(CASE WHEN fare < 0 THEN 1 ELSE 0 END)       AS neg_fare
  FROM {silver_view_name}
"""
neg_counts_row = spark.sql(negative_amounts_sql).collect()[0]
# Read the two aggregates by position rather than by alias
neg_trip_total, neg_fare = neg_counts_row[0], neg_counts_row[1]

print(
    "Counts -> path:", cnt_path,
    "| view:", cnt_view,
    "| neg_trip_total:", neg_trip_total,
    "| neg_fare:", neg_fare,
)

assert cnt_path > 0 and cnt_path == cnt_view, "Silver is empty or mismatched."
assert (neg_trip_total, neg_fare) == (0, 0), "Negative amounts found in silver."

# Report success to the caller of this notebook
dbutils.notebook.exit("OK")
