# Chicago Taxi Lakehouse – Bronze Ingestion (DBCE)

Purpose: Ingest raw Chicago Taxi files into a Delta bronze table on Databricks Community Edition, following medallion architecture (bronze → silver → gold) and adding basic governance metadata (ingestion timestamp, source file).

Exam coverage – Databricks Data Engineer Associate:
- Databricks platform basics and DBCE limitations (no Unity Catalog, no Lakeflow, no Auto Loader UI).
- Development and ingestion with notebooks, Spark, and Delta Lake.
- Medallion architecture and Delta bronze layer design.

CE simulation:
- Use classic Spark DataFrame reader/writer + Delta instead of Auto Loader / Lakeflow Jobs.
- Use a local database name to simulate catalogs/schemas.


In [0]:
# Core imports for bronze ingestion
from pyspark.sql.functions import col, current_timestamp
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()


In [0]:
# Notebook parameters for pipeline execution.
# Each widget is registered with its default value and then read back.
widget_defaults = {
    "catalog": "taxi_catalog",
    "schema": "taxi_schema",
    "volume": "taxi_volume",
}
for widget_name, default_value in widget_defaults.items():
    dbutils.widgets.text(widget_name, default_value)

# Read widget values (same order as the declarations above)
catalog_name, schema_name, volume_name = [
    dbutils.widgets.get(widget_name) for widget_name in widget_defaults
]

# Base volume path, with one sub-directory per medallion layer
base_path = f"/Volumes/{catalog_name}/{schema_name}/{volume_name}"
layer_paths = {
    layer: f"{base_path}/{layer}" for layer in ("bronze", "silver", "gold")
}
bronze_path = layer_paths["bronze"]
silver_path = layer_paths["silver"]
gold_path = layer_paths["gold"]

# Echo the resolved locations so a run log shows where data will land
print("Base path:", base_path)
for layer, layer_path in layer_paths.items():
    print(f"{layer.capitalize()} path:", layer_path)

In [0]:
# Make sure the bronze, silver and gold directories exist in the volume
for layer_dir in (bronze_path, silver_path, gold_path):
    dbutils.fs.mkdirs(layer_dir)

print("Medallion directories ensured under base path")

# List the base path so the resulting layout is visible in the cell output
base_listing = dbutils.fs.ls(base_path)
display(base_listing)

In [0]:
# Location of the raw CSV file inside the bronze directory
raw_source_path = f"{bronze_path}/m6dm-c72p.csv"

# Fail fast if the file is missing. Both sides of the comparison have any
# "dbfs:" text removed (presumably because listings may carry that scheme
# prefix -- confirm against the workspace's actual listing output).
listed_paths = {
    entry.path.replace("dbfs:", "") for entry in dbutils.fs.ls(bronze_path)
}
expected_path = raw_source_path.replace("dbfs:", "")
assert expected_path in listed_paths, \
    f"Source file not found: {raw_source_path}"

print("Ok - Source file found:", raw_source_path)

In [0]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# 1) Explicit schema (avoid inference). Every column is declared nullable.
# NOTE(review): the CSV header is not validated against this schema here --
# confirm that the column order matches the file.
_column_types = [
    ("taxi_id", StringType()),
    ("trip_start_timestamp", TimestampType()),
    ("trip_end_timestamp", TimestampType()),
    ("trip_seconds", IntegerType()),
    ("trip_miles", DoubleType()),
    ("pickup_census_tract", StringType()),
    ("dropoff_census_tract", StringType()),
    ("pickup_community_area", IntegerType()),
    ("dropoff_community_area", IntegerType()),
    ("fare", DoubleType()),
    ("tips", DoubleType()),
    ("tolls", DoubleType()),
    ("extras", DoubleType()),
    ("trip_total", DoubleType()),
    ("payment_type", StringType()),
    ("company", StringType()),
    ("pickup_centroid_latitude", DoubleType()),
    ("pickup_centroid_longitude", DoubleType()),
    ("dropoff_centroid_latitude", DoubleType()),
    ("dropoff_centroid_longitude", DoubleType()),
]
taxi_schema = StructType(
    [StructField(field_name, field_type, True) for field_name, field_type in _column_types]
)

# 2) Read the raw CSV file (first line is a header) with the schema above
csv_reader = spark.read.option("header", True).schema(taxi_schema)
df_raw = csv_reader.csv(raw_source_path)

# 3) Governance metadata: ingestion timestamp plus the originating file,
#    taken from _metadata.file_path (the UC-safe way to get the source file)
df_with_ts = df_raw.withColumn("ingestion_ts", current_timestamp())
df_bronze = df_with_ts.withColumn("source_file", col("_metadata.file_path"))

# 4) Write Delta by path, overwriting whatever is already there
bronze_table_path = f"{bronze_path}/chicago_taxi_trips"
bronze_writer = df_bronze.write.format("delta").mode("overwrite")
bronze_writer.save(bronze_table_path)

# 5) View named <catalog>.<schema>.chicago_taxi_bronze_v over the Delta path
bronze_view_name = f"{catalog_name}.{schema_name}.chicago_taxi_bronze_v"
create_view_sql = f"""
  CREATE OR REPLACE VIEW {bronze_view_name} AS
  SELECT * FROM delta.`{bronze_table_path}`
"""
spark.sql(create_view_sql)


# 6) Quick checks: the path and the view must agree and must not be empty
def _count_rows(relation):
    """Return COUNT(*) for *relation* (a view name or a delta.`path` reference)."""
    return spark.sql(f"SELECT COUNT(*) c FROM {relation}").collect()[0]["c"]


cnt_path = _count_rows(f"delta.`{bronze_table_path}`")
cnt_view = _count_rows(bronze_view_name)
print("Counts -> path:", cnt_path, "| view:", cnt_view)
assert cnt_path > 0 and cnt_path == cnt_view, "Bronze ingestion failed or empty."


In [0]:
from delta.tables import DeltaTable
# Show the latest commits (at most 5) of the bronze Delta table.
bronze_delta_path = f"{bronze_path}/chicago_taxi_trips"
dt = DeltaTable.forPath(spark, bronze_delta_path)
display(dt.history(5))

# Optional: time travel (try to read version 0).
# This is a best-effort probe, so a failure must not abort the notebook; it
# is reported with the exception type and message instead of being swallowed,
# so the real cause (missing version, bad path, permissions, ...) is visible.
try:
    _ = (
        spark.read.format("delta")
        .option("versionAsOf", 0)
        .load(bronze_delta_path)
        .limit(1)
        .count()
    )
except Exception as exc:
    print(f"Time travel to version 0 unavailable: {type(exc).__name__}: {exc}")
else:
    print("Time travel Ok (version 0).")

In [0]:
# Minimal data quality assertions for bronze

bronze_df = spark.read.format("delta").load(f"{bronze_path}/chicago_taxi_trips")

# The table must contain at least one row
assert bronze_df.count() > 0, "Empty bronze."

# Measures that must be present and must never be negative
non_negative_cols = ["trip_total", "fare", "tips", "tolls", "extras", "trip_seconds", "trip_miles"]

for required_col in non_negative_cols:
    assert required_col in bronze_df.columns, f"Missing expected column: {required_col}"

# Probe each column for a negative value; limit(1) caps every probe at 0 or 1,
# so the stored number flags a violation rather than counting all of them.
probe_hits = {
    checked_col: bronze_df.filter(col(checked_col) < 0).limit(1).count()
    for checked_col in non_negative_cols
}
violations = {checked_col: hits for checked_col, hits in probe_hits.items() if hits > 0}

assert not violations, f"Found negative values in: {list(violations.keys())}"
print("Bronze DQ check passed.")

In [0]:
import posixpath

# Look up the path of the notebook that is currently running
ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
current_path = ctx.notebookPath().get()

# The silver notebook is addressed as a sibling in the same folder
notebook_dir = posixpath.dirname(current_path)
target_path = posixpath.join(notebook_dir, "02_silver_transformations")

# Forward this notebook's widget values to the child notebook
args = dict(catalog=catalog_name, schema=schema_name, volume=volume_name)
print("Running:", target_path)
# Second positional argument is the timeout; 0 presumably means "no timeout" -- confirm
res = dbutils.notebook.run(target_path, 0, arguments=args)
print("02_silver_transformations ->", res)