
### Order Items Medallion Pipeline

This notebook assumes `raw_data` has already run. It creates a medallion-architecture declarative pipeline that normalizes the event stream and creates summary tables.

In [0]:
%pip install --upgrade databricks-sdk

In [0]:
# Notebook parameters are read from widgets; each value is bound to a
# module-level name identical to the widget it came from.
CATALOG, EVENTS_VOLUME, SIMULATOR_SCHEMA = (
    dbutils.widgets.get(widget_name)
    for widget_name in ("CATALOG", "EVENTS_VOLUME", "SIMULATOR_SCHEMA")
)

In [0]:
import os
import time

from databricks.sdk import WorkspaceClient
from databricks.sdk.service import pipelines

# Databricks workspace API client used below to create the pipeline.
# It is constructed with no explicit arguments, so host and credentials are
# whatever the SDK resolves by default (presumably the notebook's own
# context - confirm if this is ever run outside a workspace).
w = WorkspaceClient()

def to_workspace_path(abs_path, workspace_root):
    """Rewrite a leading ``workspace_root`` in ``abs_path`` to ``/Workspace``.

    Only a prefix match is rewritten. A blanket ``str.replace`` would also
    rewrite occurrences in the middle of the path, and with an empty
    ``workspace_root`` it would insert ``/Workspace`` between every character.

    Args:
        abs_path: Absolute filesystem path to translate.
        workspace_root: Prefix under which the workspace is visible locally.

    Returns:
        The path with the leading ``workspace_root`` replaced by
        ``/Workspace``, or ``abs_path`` unchanged when ``workspace_root`` is
        empty or is not a prefix of ``abs_path``.
    """
    if workspace_root and abs_path.startswith(workspace_root):
        return "/Workspace" + abs_path[len(workspace_root):]
    return abs_path


# Absolute location of the pipeline sources, resolved relative to the current
# working directory.
root_abs_path = os.path.abspath("../pipelines/order_items")

# Same location expressed under /Workspace. DATABRICKS_WORKSPACE_ROOT is read
# from the environment and defaults to "/Workspace", in which case the path is
# left as-is.
root_dbx_path = to_workspace_path(
    root_abs_path,
    os.environ.get("DATABRICKS_WORKSPACE_ROOT", "/Workspace"),
)

# Create the continuous, serverless declarative pipeline whose sources live
# under `root_dbx_path`.
#
# NOTE(review): `allow_duplicate_names=True` together with an unconditional
# `create` means every run of this cell registers another pipeline with the
# same name. Confirm that this is intended before re-running.

# Values handed to the pipeline's configuration so its source code can locate
# the raw event data.
pipeline_configuration = {
    "RAW_DATA_CATALOG": CATALOG,
    "RAW_DATA_SCHEMA": SIMULATOR_SCHEMA,
    "RAW_DATA_VOLUME": EVENTS_VOLUME,
}

# Include every file below the pipeline root as pipeline source.
pipeline_libraries = [
    pipelines.PipelineLibrary(
        glob=pipelines.PathPattern(include=f"{root_dbx_path}/**")
    )
]

created = w.pipelines.create(
    catalog=CATALOG,
    schema='lakeflow',
    continuous=True,
    # Plain string: the original f-string had no placeholders.
    name="Order Items Medallion Declarative Pipeline",
    serverless=True,
    configuration=pipeline_configuration,
    root_path=root_dbx_path,
    libraries=pipeline_libraries,
    allow_duplicate_names=True,
)

print(f"Created pipeline_id={created.pipeline_id}")



In [0]:
# Wait for the tables to be created.
# Future stages may require their existence before being able to be run.
#
# The wait is bounded: if the table never appears (for example because the
# pipeline failed to start), the cell raises instead of polling forever.

import time

ALL_EVENTS_TABLE = f"{CATALOG}.lakeflow.all_events"
TABLE_WAIT_TIMEOUT_SECONDS = 30 * 60  # upper bound on the total wait
TABLE_POLL_INTERVAL_SECONDS = 5       # pause between existence checks

# A monotonic clock keeps the deadline immune to wall-clock adjustments.
wait_deadline = time.monotonic() + TABLE_WAIT_TIMEOUT_SECONDS
last_lookup_error = None

while True:
    try:
        if spark.catalog.tableExists(ALL_EVENTS_TABLE):
            break
    except Exception as exc:
        # A failed lookup is treated as "not there yet" and retried, but the
        # error is kept so that a timeout can report what went wrong.
        last_lookup_error = exc

    if time.monotonic() >= wait_deadline:
        raise TimeoutError(
            f"Table {ALL_EVENTS_TABLE} was not created within "
            f"{TABLE_WAIT_TIMEOUT_SECONDS} seconds"
        ) from last_lookup_error

    time.sleep(TABLE_POLL_INTERVAL_SECONDS)

In [None]:
# Record the new pipeline id in the local state file, one id per line.
state_dir = "../.state"
# Opening a file in append mode does not create missing parent directories.
os.makedirs(state_dir, exist_ok=True)
pipelines_file_path = os.path.join(state_dir, "pipelines")
with open(pipelines_file_path, "a") as f:
    f.write(f"{created.pipeline_id}\n")

# Also add to UC-state
import sys

utils_dir = '../utils'
# Guard the append so re-running this cell does not pile up duplicate entries.
if utils_dir not in sys.path:
    sys.path.append(utils_dir)
from uc_state import create_state_manager

state_manager = create_state_manager(CATALOG)
state_manager.add("pipelines", created)