In [0]:
# Databricks notebook source
# ------------------------------------------------------------------------------
# Notebook: 05_pipeline_ops_sim
# Purpose : Simulate productionizing the medallion pipeline by orchestrating
#           other notebooks with dbutils.notebook.run and basic logging.
#
# Exam Coverage (Databricks Certified Data Engineer Associate – Exam Guide 2025-07-30)
# - Section 4: Productionizing Data Pipelines
#   - Databricks Workflows and Lakeflow Jobs concepts.
#   - Job deployment, retries and repair/rerun patterns (conceptual).
#   - Using serverless compute for auto-optimized jobs (conceptual).
# - Section 1: Databricks Intelligence Platform
#   - Understanding job behavior and compute choices at a high level.
#
# Key Practices
# - Use dbutils.notebook.run to orchestrate steps with shared parameters.
# - Implement fail-fast behavior when a step returns a non-OK status.
# - Log start and end timestamps for basic observability.
# ------------------------------------------------------------------------------


## Exam Focus – Productionizing Pipelines (Section 4)

This notebook simulates how medallion pipelines are productionized:

- Orchestrating multiple notebooks with `dbutils.notebook.run`.
- Passing parameters such as `catalog`, `schema`, and `volume`.
- Implementing fail-fast behavior when a task fails.
- Mapping this pattern to Databricks **Workflows** and **Lakeflow Jobs**.


# Chicago Taxi – Pipeline Orchestration (DBCE)

Purpose: Simulate production-style orchestration by chaining notebooks with parameters, minimal logging, and status propagation.

Exam coverage:
- Productionizing pipelines (Jobs/Workflows concepts via `dbutils.notebook.run`).
- Notebook capabilities (widgets, parameter passing).
- Fail-fast and simple run logs.


In [0]:
#Orchestration imports and small helpers
from datetime import datetime, UTC
import posixpath

# Simple log printer (kept minimal for CE)
def log_event(event: str) -> None:
    print(f"[{datetime.now(UTC).isoformat(timespec='seconds')}] {event}")

# Build the path of a notebook that lives next to the current one
def sibling_notebook(current_path: str, target_name: str) -> str:
    """Return ``target_name`` joined onto the folder part of ``current_path``.

    Both values are treated as POSIX-style paths. Following
    ``posixpath.join`` semantics, an absolute ``target_name`` is returned
    unchanged.
    """
    parent_folder, _current_name = posixpath.split(current_path)
    return posixpath.join(parent_folder, target_name)

In [0]:
# Pipeline parameters and base paths
# Register the three text widgets together with their default values.
dbutils.widgets.text("catalog", "taxi_catalog")
dbutils.widgets.text("schema", "taxi_schema")
dbutils.widgets.text("volume", "taxi_volume")

# Read the widget values back in the same order they were declared.
catalog_name, schema_name, volume_name = (
    dbutils.widgets.get(widget) for widget in ("catalog", "schema", "volume")
)

# Layer folders under the /Volumes/<catalog>/<schema>/<volume> root.
base_path = "/".join(("/Volumes", catalog_name, schema_name, volume_name))
bronze_path, silver_path, gold_path = (
    f"{base_path}/{layer}" for layer in ("bronze", "silver", "gold")
)

# Arguments forwarded unchanged to every downstream notebook.
run_args = dict(catalog=catalog_name, schema=schema_name, volume=volume_name)


In [0]:
from datetime import datetime

# Widget values that run_step forwards to each child notebook.
catalog, schema, volume = (
    dbutils.widgets.get(key) for key in ("catalog", "schema", "volume")
)

def run_step(notebook_path: str, step_name: str, timeout_seconds: int = 0) -> None:
    """Run one child notebook and fail fast unless it returns ``"OK"``.

    Start, finish and failure messages go through ``log_event`` so that every
    log line in this notebook carries the same timezone-aware UTC timestamp
    (the previous ``datetime.utcnow()`` calls are deprecated and produced
    naive timestamps in a different format).

    Args:
        notebook_path: Path handed to ``dbutils.notebook.run``.
        step_name: Human-readable label used in log lines and error messages.
        timeout_seconds: Timeout forwarded to ``dbutils.notebook.run``.
            Defaults to ``0``, the value this function always used before.

    Raises:
        RuntimeError: If the child notebook returns anything other than the
            exact string ``"OK"``.
    """
    log_event(f"Starting step: {step_name}")
    result = dbutils.notebook.run(
        notebook_path,
        timeout_seconds=timeout_seconds,
        arguments={
            "catalog": catalog,
            "schema": schema,
            "volume": volume,
        },
    )
    if result != "OK":
        # Record the failure in the run log before propagating it.
        log_event(f"Failed step: {step_name} (result: {result!r})")
        raise RuntimeError(f"Step {step_name} failed with result: {result}")
    log_event(f"Finished step: {step_name}")

# Orchestration
# The pipeline is executed once, by the loop in the final cell of this notebook,
# which resolves the sibling notebooks (00-04) relative to this notebook's path,
# validates each result and then runs the gold sanity check.
#
# Do not invoke the steps or call dbutils.notebook.exit("OK") in this cell:
# exiting here stops the notebook before the final cell can run, and invoking
# the steps here as well would execute the whole pipeline twice.
#
# `run_step` stays available for ad-hoc, single-step runs, for example:
#   run_step("/Workspace/01_bronze_ingestion_autoloader", "bronze_ingestion")


### Workflows and Lakeflow (exam notes)

- Databricks Workflows:
  - Native orchestrator for data and AI pipelines.
  - Jobs with one or more tasks (notebooks, SQL, Python scripts, DLT).
  - Supports schedules, retries, alerts and different cluster types, including serverless.
- Lakeflow Jobs:
  - The current name for Databricks Workflows within Lakeflow; orchestrates data pipelines across ingestion, transformation and delivery.
  - Deep integration with Unity Catalog and Delta Live Tables.
- Community Edition:
  - Does not expose full Workflows / Lakeflow features.
  - Concepts still apply and are tested in the exam; this notebook simulates them using `dbutils.notebook.run`.


In [0]:
# Chain notebooks with parameters, minimal logging, and fail-fast behavior

# Resolve current notebook path once
ctx = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
current_path = str(ctx.notebookPath().get())  # ensure plain str

# Sibling notebooks, executed in this order
steps = [
    "00_setup_project",
    "01_bronze_ingestion_autoloader",
    "02_silver_transformations",
    "03_gold_analytics",
    "04_quality_governance",
]

for step in steps:
    target = sibling_notebook(current_path, step)
    log_event(f"RUN -> {target} with args {run_args}")
    try:
        res = dbutils.notebook.run(target, 0, arguments=run_args)

        # Normalize return and validate: a missing return value counts as OK,
        # otherwise the value is compared case-insensitively after trimming.
        res_norm = "OK" if res is None else str(res).strip().upper()
        if res_norm not in {"OK", "SUCCESS"}:
            raise RuntimeError(f"Unexpected return from {step}: {res}")
    except Exception as e:
        log_event(f"FAIL <- {step}: {e}")
        # Fail fast by raising. dbutils.notebook.exit(...) would finish this
        # notebook *successfully* with a "FAILED ..." string, so a job running
        # it would report success and never trigger retries or alerts.
        raise RuntimeError(f"FAILED at {step}: {e}") from e
    else:
        # Log success only after the return value has been validated.
        log_event(f"OK <- {step}: {res}")

# End-to-end sanity: reading one row fails loudly if the gold view is missing
_ = spark.table(f"{catalog_name}.{schema_name}.chicago_taxi_gold_v").limit(1).count()
log_event("Sanity gold view ok")

dbutils.notebook.exit("OK")
